From 86931fef8538008a1a92036732b3eb7fe47b25d0 Mon Sep 17 00:00:00 2001 From: Andrew Kerr Date: Mon, 8 Jun 2020 16:17:35 -0700 Subject: [PATCH] CUTLASS 2.2 (#96) Adds support for NVIDIA Ampere Architecture features. CUDA 11 Toolkit recommended. --- CHANGELOG.md | 16 + CMakeLists.txt | 17 +- CONTRIBUTORS.md | 14 +- CUDA.cmake | 6 +- LICENSE.txt | 2 +- README.md | 98 +- cmake/nop.cu | 2 +- cuBLAS.cmake | 68 +- examples/00_basic_gemm/CMakeLists.txt | 2 +- examples/00_basic_gemm/basic_gemm.cu | 2 +- examples/01_cutlass_utilities/CMakeLists.txt | 2 +- .../01_cutlass_utilities/cutlass_utilities.cu | 2 +- examples/02_dump_reg_shmem/CMakeLists.txt | 2 +- examples/02_dump_reg_shmem/dump_reg_shmem.cu | 2 +- examples/03_visualize_layout/CMakeLists.txt | 2 +- examples/03_visualize_layout/options.h | 2 +- .../03_visualize_layout/register_layout.cu | 28 +- .../03_visualize_layout/register_layout.h | 2 +- .../03_visualize_layout/visualize_layout.cpp | 14 +- .../03_visualize_layout/visualize_layout.h | 2 +- examples/04_tile_iterator/CMakeLists.txt | 2 +- examples/04_tile_iterator/tile_iterator.cu | 2 +- examples/05_batched_gemm/CMakeLists.txt | 2 +- examples/05_batched_gemm/batched_gemm.cu | 2 +- examples/06_splitK_gemm/CMakeLists.txt | 2 +- examples/06_splitK_gemm/splitk_gemm.cu | 49 +- .../07_volta_tensorop_gemm/CMakeLists.txt | 2 +- .../volta_tensorop_gemm.cu | 46 +- .../08_turing_tensorop_gemm/CMakeLists.txt | 2 +- .../turing_tensorop_gemm.cu | 40 +- examples/10_planar_complex/planar_complex.cu | 12 +- .../planar_complex_array.cu | 12 +- examples/12_gemm_bias_relu/CMakeLists.txt | 27 + examples/12_gemm_bias_relu/gemm_bias_relu.cu | 282 ++ examples/13_fused_two_gemms/CMakeLists.txt | 33 + ...b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h | 190 ++ examples/13_fused_two_gemms/b2b_gemm_run.h | 608 ++++ .../b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h | 190 ++ .../b2b_interleaved_gemm_run.h | 633 +++++ examples/13_fused_two_gemms/device/b2b_gemm.h | 439 +++ 
examples/13_fused_two_gemms/fused_gemm.cu | 74 + examples/13_fused_two_gemms/kernel/b2b_gemm.h | 407 +++ .../kernel/default_b2b_gemm.h | 296 ++ .../threadblock/b2b_mma_base.h | 230 ++ .../threadblock/b2b_mma_pipelined.h | 509 ++++ .../threadblock/default_b2b_mma.h | 289 ++ examples/CMakeLists.txt | 4 +- include/cutlass/aligned_buffer.h | 2 +- include/cutlass/arch/arch.h | 6 +- include/cutlass/arch/cache_operation.h | 60 + include/cutlass/arch/memory.h | 262 +- include/cutlass/arch/memory_sm75.h | 70 +- include/cutlass/arch/memory_sm80.h | 238 ++ include/cutlass/arch/mma.h | 17 +- include/cutlass/arch/mma_sm50.h | 2 +- include/cutlass/arch/mma_sm60.h | 2 +- include/cutlass/arch/mma_sm61.h | 2 +- include/cutlass/arch/mma_sm70.h | 2 +- include/cutlass/arch/mma_sm75.h | 2 +- include/cutlass/arch/mma_sm80.h | 2091 ++++++++++++++ include/cutlass/arch/simd.h | 2 +- include/cutlass/arch/simd_sm60.h | 2 +- include/cutlass/arch/simd_sm61.h | 2 +- include/cutlass/arch/wmma.h | 2 +- include/cutlass/arch/wmma_sm70.h | 2 +- include/cutlass/arch/wmma_sm72.h | 2 +- include/cutlass/arch/wmma_sm75.h | 5 +- include/cutlass/array.h | 4 +- include/cutlass/array_subbyte.h | 2 +- include/cutlass/bfloat16.h | 461 ++++ include/cutlass/complex.h | 15 +- include/cutlass/coord.h | 26 +- include/cutlass/core_io.h | 68 +- include/cutlass/cutlass.h | 8 +- include/cutlass/device_kernel.h | 2 +- include/cutlass/epilogue/thread/activation.h | 119 + .../cutlass/epilogue/thread/conversion_op.h | 4 +- .../epilogue/thread/linear_combination.h | 24 +- .../thread/linear_combination_clamp.h | 106 +- .../linear_combination_planar_complex.h | 35 +- .../epilogue/thread/linear_combination_relu.h | 159 +- .../thread/linear_combination_sigmoid.h | 206 ++ .../cutlass/epilogue/thread/reduction_op.h | 2 +- .../default_epilogue_complex_tensor_op.h | 89 +- .../default_epilogue_planar_complex.h | 40 +- .../threadblock/default_epilogue_simt.h | 3 +- .../threadblock/default_epilogue_tensor_op.h | 216 +- 
.../default_epilogue_volta_tensor_op.h | 3 +- .../default_epilogue_wmma_tensor_op.h | 3 +- .../threadblock/default_thread_map_simt.h | 2 +- .../default_thread_map_tensor_op.h | 50 +- .../default_thread_map_volta_tensor_op.h | 2 +- .../default_thread_map_wmma_tensor_op.h | 2 +- .../threadblock/direct_epilogue_tensor_op.h | 2 +- .../cutlass/epilogue/threadblock/epilogue.h | 133 +- .../epilogue/threadblock/epilogue_base.h | 2 +- .../threadblock/epilogue_planar_complex.h | 2 +- .../epilogue/threadblock/epilogue_workspace.h | 2 +- .../threadblock/interleaved_epilogue.h | 2 +- .../threadblock/output_tile_thread_map.h | 63 +- .../threadblock/predicated_tile_iterator.h | 44 +- .../threadblock/shared_load_iterator.h | 22 +- .../threadblock/shared_load_iterator_mixed.h | 559 ++++ .../fragment_iterator_complex_tensor_op.h | 2 +- ...ment_iterator_gaussian_complex_tensor_op.h | 188 ++ .../epilogue/warp/fragment_iterator_simt.h | 2 +- .../warp/fragment_iterator_tensor_op.h | 2 +- .../warp/fragment_iterator_volta_tensor_op.h | 2 +- .../warp/fragment_iterator_wmma_tensor_op.h | 2 +- include/cutlass/epilogue/warp/simt_policy.h | 2 +- .../cutlass/epilogue/warp/tensor_op_policy.h | 2 +- .../epilogue/warp/tile_iterator_simt.h | 2 +- .../epilogue/warp/tile_iterator_tensor_op.h | 2 +- .../warp/tile_iterator_tensor_op_mixed.h | 675 +++++ .../warp/tile_iterator_volta_tensor_op.h | 2 +- .../warp/tile_iterator_wmma_tensor_op.h | 2 +- .../epilogue/warp/volta_tensor_op_policy.h | 2 +- .../epilogue/warp/wmma_tensor_op_policy.h | 2 +- include/cutlass/fast_math.h | 2 +- include/cutlass/functional.h | 218 +- .../gemm/device/default_gemm_configuration.h | 338 ++- include/cutlass/gemm/device/gemm.h | 4 +- include/cutlass/gemm/device/gemm_array.h | 2 +- include/cutlass/gemm/device/gemm_batched.h | 2 +- include/cutlass/gemm/device/gemm_complex.h | 8 +- .../gemm/device/gemm_splitk_parallel.h | 2 +- include/cutlass/gemm/device/gemm_universal.h | 4 +- .../gemm/device/gemm_universal_adapter.h | 116 +- 
.../cutlass/gemm/device/gemm_universal_base.h | 2 +- include/cutlass/gemm/gemm.h | 5 +- include/cutlass/gemm/kernel/default_gemm.h | 209 +- .../gemm/kernel/default_gemm_complex.h | 63 +- .../default_gemm_planar_complex_universal.h | 119 +- .../kernel/default_gemm_splitk_parallel.h | 2 +- .../gemm/kernel/default_gemm_universal.h | 2 +- include/cutlass/gemm/kernel/default_gemv.h | 2 +- include/cutlass/gemm/kernel/gemm.h | 2 +- include/cutlass/gemm/kernel/gemm_array.h | 2 +- include/cutlass/gemm/kernel/gemm_batched.h | 2 +- include/cutlass/gemm/kernel/gemm_pipelined.h | 2 +- .../cutlass/gemm/kernel/gemm_planar_complex.h | 9 +- .../gemm/kernel/gemm_planar_complex_array.h | 10 +- .../gemm/kernel/gemm_splitk_parallel.h | 2 +- include/cutlass/gemm/kernel/gemm_universal.h | 14 +- .../gemm/kernel/gemv_batched_strided.h | 2 +- include/cutlass/gemm/thread/mma.h | 2 +- include/cutlass/gemm/thread/mma_sm50.h | 2 +- include/cutlass/gemm/thread/mma_sm60.h | 2 +- include/cutlass/gemm/thread/mma_sm61.h | 2 +- .../gemm/threadblock/default_gemv_core.h | 2 +- .../cutlass/gemm/threadblock/default_mma.h | 264 +- .../gemm/threadblock/default_mma_core.h | 15 +- .../gemm/threadblock/default_mma_core_simt.h | 2 +- .../gemm/threadblock/default_mma_core_sm50.h | 2 +- .../gemm/threadblock/default_mma_core_sm70.h | 2 +- .../gemm/threadblock/default_mma_core_sm75.h | 519 +++- .../gemm/threadblock/default_mma_core_sm80.h | 2130 ++++++++++++++ .../gemm/threadblock/default_mma_core_wmma.h | 2 +- .../default_mma_planar_complex_multistage.h | 130 + .../default_multistage_mma_complex.h | 154 ++ .../default_multistage_mma_complex_core.h | 113 + ...default_multistage_mma_complex_core_sm80.h | 1113 ++++++++ include/cutlass/gemm/threadblock/gemv.h | 140 - include/cutlass/gemm/threadblock/mma_base.h | 2 +- .../cutlass/gemm/threadblock/mma_multistage.h | 526 ++++ .../cutlass/gemm/threadblock/mma_pipelined.h | 4 +- .../threadblock/mma_planar_complex_base.h | 2 +- .../mma_planar_complex_multistage.h | 642 
+++++ .../gemm/threadblock/mma_singlestage.h | 2 +- .../gemm/threadblock/threadblock_swizzle.h | 67 +- .../gemm/warp/default_mma_complex_tensor_op.h | 401 +++ .../cutlass/gemm/warp/default_mma_tensor_op.h | 16 +- .../gemm/warp/default_mma_tensor_op_sm80.h | 186 ++ .../gemm/warp/default_mma_wmma_tensor_op.h | 14 +- include/cutlass/gemm/warp/mma.h | 2 +- .../cutlass/gemm/warp/mma_complex_tensor_op.h | 843 ++++++ ...mma_complex_tensor_op_tile_iterator_sm80.h | 2448 +++++++++++++++++ .../warp/mma_gaussian_complex_tensor_op.h | 357 +++ ...ian_complex_tensor_op_tile_iterator_sm80.h | 384 +++ include/cutlass/gemm/warp/mma_simt.h | 5 +- include/cutlass/gemm/warp/mma_simt_policy.h | 2 +- .../gemm/warp/mma_simt_tile_iterator.h | 2 +- include/cutlass/gemm/warp/mma_tensor_op.h | 105 +- .../warp/mma_tensor_op_fragment_iterator.h | 428 +++ .../cutlass/gemm/warp/mma_tensor_op_policy.h | 2 +- .../cutlass/gemm/warp/mma_tensor_op_sm70.h | 8 +- .../gemm/warp/mma_tensor_op_tile_iterator.h | 519 +++- .../warp/mma_tensor_op_tile_iterator_sm70.h | 2 +- .../warp/mma_tensor_op_tile_iterator_sm80.h | 1579 +++++++++++ .../warp/mma_tensor_op_tile_iterator_wmma.h | 2 +- .../cutlass/gemm/warp/mma_tensor_op_wmma.h | 21 +- include/cutlass/half.h | 2 +- include/cutlass/integer_subbyte.h | 2 +- include/cutlass/kernel_launch.h | 2 +- include/cutlass/layout/layout.h | 2 +- include/cutlass/layout/matrix.h | 2 +- include/cutlass/layout/pitch_linear.h | 2 +- include/cutlass/layout/tensor.h | 2 +- .../layout/tensor_op_multiplicand_sm70.h | 2 +- .../layout/tensor_op_multiplicand_sm75.h | 2 +- .../layout/tensor_op_multiplicand_sm80.h | 1133 ++++++++ include/cutlass/layout/vector.h | 2 +- include/cutlass/matrix_coord.h | 2 +- include/cutlass/matrix_shape.h | 2 +- include/cutlass/matrix_traits.h | 2 +- include/cutlass/numeric_conversion.h | 309 ++- include/cutlass/numeric_types.h | 6 +- include/cutlass/platform/platform.h | 2 +- include/cutlass/predicate_vector.h | 2 +- include/cutlass/real.h | 3 +- 
include/cutlass/reduction/batched_reduction.h | 2 +- .../reduction/batched_reduction_traits.h | 2 +- .../cutlass/reduction/device/reduce_split_k.h | 215 ++ .../cutlass/reduction/kernel/reduce_split_k.h | 10 +- include/cutlass/reduction/thread/reduce.h | 2 +- .../reduction/thread/reduction_operators.h | 2 +- .../cutlass/reduction/threadblock_swizzle.h | 2 +- include/cutlass/relatively_equal.h | 24 +- include/cutlass/semaphore.h | 10 +- include/cutlass/subbyte_reference.h | 2 +- include/cutlass/tensor_coord.h | 2 +- include/cutlass/tensor_ref.h | 2 +- include/cutlass/tensor_view.h | 14 +- include/cutlass/tfloat32.h | 453 +++ include/cutlass/thread/matrix.h | 2 +- .../transform/pitch_linear_thread_map.h | 2 +- include/cutlass/transform/thread/transpose.h | 2 +- include/cutlass/transform/thread/unaryOp.h | 101 + .../predicated_tile_access_iterator.h | 3 +- ...icated_tile_access_iterator_2dthreadtile.h | 2 +- .../threadblock/predicated_tile_iterator.h | 10 +- .../predicated_tile_iterator_2dthreadtile.h | 2 +- .../regular_tile_access_iterator.h | 2 +- ...egular_tile_access_iterator_pitch_linear.h | 2 +- .../regular_tile_access_iterator_tensor_op.h | 265 +- ...ular_tile_access_iterator_tensor_op_sm80.h | 1522 ++++++++++ .../threadblock/regular_tile_iterator.h | 2 +- .../regular_tile_iterator_pitch_linear.h | 2 +- ..._tile_iterator_pitch_linear_2dthreadtile.h | 2 +- .../regular_tile_iterator_tensor_op.h | 265 +- .../regular_tile_iterator_tensor_op_sm70.h | 2 +- include/cutlass/util/debug.h | 122 - include/cutlass/wmma_array.h | 2 +- media/docs/code_organization.md | 11 +- media/docs/doxygen_mainpage.md | 2 +- media/docs/efficient_gemm.md | 3 +- media/docs/functionality.md | 65 +- media/docs/fundamental_types.md | 37 +- media/docs/gemm_api.md | 2 +- media/docs/layout.md | 2 +- media/docs/profiler.md | 78 +- media/docs/programming_guidelines.md | 25 +- media/docs/quickstart.md | 90 +- media/docs/terminology.md | 2 +- media/docs/tile_iterator_concept.md | 2 +- 
media/docs/utilities.md | 11 +- media/images/cutlass-performance-plot.png | Bin 98106 -> 69902 bytes ...gemm-hierarchy-with-epilogue-no-labels.png | Bin 132936 -> 184294 bytes test/CMakeLists.txt | 2 +- test/unit/CMakeLists.txt | 5 +- test/unit/common/cutlass_unit_test.h | 2 +- test/unit/common/filter_architecture.cpp | 3 +- test/unit/core/CMakeLists.txt | 4 +- test/unit/core/array.cu | 10 +- test/unit/core/bfloat16.cu | 209 ++ test/unit/core/complex.cu | 2 +- test/unit/core/functional.cu | 12 +- test/unit/core/half.cu | 2 +- test/unit/core/matrix_coord.cu | 2 +- test/unit/core/numeric_conversion.cu | 2 +- test/unit/core/predicate_vector.cu | 2 +- test/unit/core/tensor_ref.cu | 2 +- test/unit/core/tensor_view.cu | 2 +- test/unit/core/test_unit_core.cpp | 2 +- test/unit/core/tfloat32.cu | 197 ++ test/unit/epilogue/CMakeLists.txt | 2 +- test/unit/epilogue/thread/CMakeLists.txt | 2 +- .../epilogue/thread/linear_combination.cu | 2 +- .../linear_combination_planar_complex.cu | 2 +- test/unit/epilogue/threadblock/CMakeLists.txt | 2 +- .../threadblock/epilogue_planar_complex.cu | 2 +- .../epilogue/threadblock/epilogue_simt.cu | 2 +- .../threadblock/epilogue_simt_sm60.cu | 2 +- .../threadblock/epilogue_simt_sm61.cu | 2 +- .../threadblock/epilogue_tensor_op.cu | 304 +- .../threadblock/epilogue_volta_tensor_op.cu | 2 +- .../epilogue_wmma_tensor_op_sm70.cu | 2 +- .../threadblock/output_tile_threadmap.cu | 2 +- .../threadblock/predicated_tile_iterator.cu | 2 +- test/unit/epilogue/threadblock/testbed.h | 2 +- .../threadblock/testbed_planar_complex.h | 2 +- test/unit/epilogue/warp/CMakeLists.txt | 2 +- .../warp/fragment_iterator_tensor_op.cu | 2 +- .../warp/fragment_iterator_volta_tensor_op.cu | 2 +- .../warp/fragment_iterator_wmma_tensor_op.cu | 2 +- test/unit/gemm/CMakeLists.txt | 2 +- test/unit/gemm/device/CMakeLists.txt | 61 +- .../gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu | 14 +- .../gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu | 373 +++ 
...mm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu | 14 +- .../gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu | 14 +- .../gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu | 374 +++ ...mm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu | 14 +- ...emm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu | 353 +++ ...mm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu | 337 +++ ...32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu | 253 ++ ...32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu | 252 ++ ...cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu | 192 ++ ...mm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu | 246 ++ ...cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu | 191 ++ ...mm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu | 299 ++ ..._f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu | 8 +- ..._f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu | 8 +- .../gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu | 338 +++ ...f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu | 16 +- ..._f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu | 24 +- ..._f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu | 24 +- .../gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu | 337 +++ ..._f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu | 8 +- .../gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu | 340 +++ ...f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu | 16 +- ..._f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu | 20 +- ..._f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu | 8 +- ..._f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu | 8 +- ...6n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu | 4 +- ...6n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu | 82 + .../gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu | 14 +- .../gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu | 338 +++ .../gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu | 77 + ...f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu | 16 +- ..._f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu | 24 +- ..._f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu | 4 +- ..._f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu | 
8 +- .../gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu | 339 +++ ...f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu | 16 +- ..._f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu | 20 +- ...16n_singlestage_wmma_tensor_op_f16_sm70.cu | 18 +- ..._f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu | 8 +- ..._f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu | 8 +- ...16t_singlestage_wmma_tensor_op_f16_sm70.cu | 18 +- ...6t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu | 4 +- ...6t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu | 83 + .../gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu | 14 +- .../gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu | 339 +++ ...f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu | 16 +- ..._f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu | 24 +- ..._f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu | 24 +- ..._f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu | 8 +- ...32t_singlestage_wmma_tensor_op_f32_sm70.cu | 12 +- .../gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu | 338 +++ ...f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu | 16 +- ..._f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu | 20 +- ..._f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu | 8 +- ..._f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu | 8 +- ..._f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu | 24 +- ..._f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu | 24 +- .../gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu | 338 +++ ..._f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu | 8 +- .../gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu | 338 +++ ...f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu | 14 +- ..._f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu | 20 +- ..._f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu | 87 + .../gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu | 82 + .../gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu | 212 ++ .../gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu | 212 ++ ...anar_complex_f16_f16_f32_tensor_op_sm70.cu | 
6 +- ...anar_complex_f16_f16_f32_tensor_op_sm75.cu | 217 ++ ...anar_complex_f16_f16_f32_tensor_op_sm80.cu | 216 ++ .../gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu | 10 +- .../gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu | 213 ++ .../gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu | 14 +- .../gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu | 354 +++ ...mm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu | 14 +- .../gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu | 14 +- .../gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu | 357 +++ ...mm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu | 14 +- .../gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu | 14 +- .../gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu | 2 +- .../gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu | 16 +- .../gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu | 361 +++ .../gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu | 14 +- ...mm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu | 8 +- .../gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu | 14 +- .../gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu | 355 +++ ...mm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu | 10 +- .../gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu | 14 +- .../gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu | 368 +++ ...emm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu | 10 +- .../gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu | 14 +- .../gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu | 368 +++ ...emm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu | 10 +- .../gemm_splitk_serial_tensor_op_sm75.cu | 4 +- .../unit/gemm/device/gemm_splitk_simt_sm50.cu | 2 +- .../gemm/device/gemm_splitk_tensor_op_sm70.cu | 4 +- .../gemm/device/gemm_splitk_tensor_op_sm75.cu | 4 +- ...emm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu | 549 ++++ ...emm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu | 549 ++++ ...emm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu | 487 ++++ ...emm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu | 550 ++++ ...mm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu | 10 +- ...al_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu | 193 ++ ...cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu | 194 ++ ...al_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu | 194 ++ 
...ersal_f16n_f16t_f32t_tensor_op_f32_sm80.cu | 111 + test/unit/gemm/device/multistage_testbed.h | 251 ++ .../device/multistage_testbed_interleaved.h | 303 ++ test/unit/gemm/device/simt_cgemm_nn_sm50.cu | 86 +- test/unit/gemm/device/simt_cgemm_nt_sm50.cu | 86 +- test/unit/gemm/device/simt_cgemm_tn_sm50.cu | 86 +- test/unit/gemm/device/simt_cgemm_tt_sm50.cu | 86 +- test/unit/gemm/device/simt_dgemm_nn_sm50.cu | 74 +- test/unit/gemm/device/simt_dgemm_nt_sm50.cu | 74 +- test/unit/gemm/device/simt_dgemm_tn_sm50.cu | 74 +- test/unit/gemm/device/simt_dgemm_tt_sm50.cu | 74 +- test/unit/gemm/device/simt_hgemm_nn_sm50.cu | 144 +- test/unit/gemm/device/simt_hgemm_nt_sm50.cu | 144 +- test/unit/gemm/device/simt_hgemm_tn_sm50.cu | 144 +- test/unit/gemm/device/simt_hgemm_tt_sm50.cu | 144 +- test/unit/gemm/device/simt_igemm_nn_sm50.cu | 116 +- test/unit/gemm/device/simt_igemm_nt_sm50.cu | 116 +- test/unit/gemm/device/simt_igemm_tn_sm50.cu | 116 +- test/unit/gemm/device/simt_igemm_tt_sm50.cu | 116 +- test/unit/gemm/device/simt_int8_igemm_sm61.cu | 4 +- .../gemm/device/simt_int8_igemm_sm61_perf.cu | 10 +- .../device/simt_int8_igemm_sm61_sliced_k.cu | 18 +- test/unit/gemm/device/simt_sgemm_nn_sm50.cu | 116 +- test/unit/gemm/device/simt_sgemm_nt_sm50.cu | 116 +- test/unit/gemm/device/simt_sgemm_nt_sm80.cu | 249 ++ test/unit/gemm/device/simt_sgemm_tn_sm50.cu | 116 +- test/unit/gemm/device/simt_sgemm_tn_sm80.cu | 249 ++ test/unit/gemm/device/simt_sgemm_tt_sm50.cu | 116 +- test/unit/gemm/device/simt_sm50.py | 4 +- test/unit/gemm/device/simt_zgemm_nn_sm50.cu | 52 +- test/unit/gemm/device/simt_zgemm_nt_sm50.cu | 52 +- test/unit/gemm/device/simt_zgemm_tn_sm50.cu | 52 +- test/unit/gemm/device/simt_zgemm_tt_sm50.cu | 52 +- test/unit/gemm/device/testbed.h | 2 +- test/unit/gemm/device/testbed_complex.h | 2 +- test/unit/gemm/device/testbed_interleaved.h | 2 +- .../unit/gemm/device/testbed_planar_complex.h | 2 +- test/unit/gemm/device/testbed_sanity.h | 233 ++ 
test/unit/gemm/device/testbed_splitk.h | 2 +- test/unit/gemm/device/testbed_universal.h | 2 +- test/unit/gemm/device/testbed_utils.h | 3 +- test/unit/gemm/thread/CMakeLists.txt | 2 +- test/unit/gemm/thread/gemm_sm50.cu | 2 +- test/unit/gemm/thread/gemm_sm60.cu | 2 +- test/unit/gemm/thread/gemm_sm61.cu | 2 +- test/unit/gemm/thread/host/CMakeLists.txt | 2 +- test/unit/gemm/thread/host/gemm_sm60_host.cu | 2 +- test/unit/gemm/thread/host/testbed_host.h | 2 +- test/unit/gemm/thread/testbed.h | 2 +- test/unit/gemm/threadblock/CMakeLists.txt | 2 +- test/unit/gemm/threadblock/batched_gemv.cu | 2 +- .../gemm/threadblock/epilogue_workspace.cu | 2 +- .../gemm/threadblock/mma_pipelined_simt.cu | 2 +- .../gemm/threadblock/mma_pipelined_sm70.cu | 2 +- .../gemm/threadblock/mma_pipelined_sm75.cu | 337 ++- .../gemm/threadblock/mma_pipelined_testbed.h | 2 +- .../threadblock/mma_pipelined_wmma_sm70.cu | 2 +- .../threadblock/mma_pipelined_wmma_sm75.cu | 2 +- .../threadblock/mma_planar_complex_testbed.h | 2 +- .../threadblock/mma_singlestage_wmma_sm70.cu | 2 +- .../threadblock/mma_singlestage_wmma_sm75.cu | 2 +- test/unit/gemm/warp/CMakeLists.txt | 5 +- test/unit/gemm/warp/gemm_complex_sm80.cu | 635 +++++ .../gemm/warp/gemm_gaussian_complex_sm80.cu | 281 ++ test/unit/gemm/warp/gemm_sm50.cu | 2 +- test/unit/gemm/warp/gemm_sm60.cu | 2 +- test/unit/gemm/warp/gemm_sm61.cu | 2 +- test/unit/gemm/warp/gemm_sm70.cu | 2 +- test/unit/gemm/warp/gemm_sm75.cu | 6 +- test/unit/gemm/warp/gemm_sm80.cu | 1782 ++++++++++++ test/unit/gemm/warp/testbed.h | 3 +- test/unit/gemm/warp/wmma_sm70.cu | 2 +- test/unit/gemm/warp/wmma_sm72.cu | 2 +- test/unit/gemm/warp/wmma_sm75.cu | 2 +- test/unit/layout/CMakeLists.txt | 2 +- test/unit/layout/matrix.cu | 2 +- test/unit/layout/tensor.cu | 2 +- test/unit/layout/tensor_nhwc.cu | 2 +- test/unit/nvrtc/CMakeLists.txt | 2 +- test/unit/nvrtc/cutlass/nvrtc/environment.h | 2 +- .../unit/nvrtc/kernel/thread/testbed_kernel.h | 2 +- test/unit/nvrtc/stdlib/stdint.h | 2 +- 
test/unit/nvrtc/thread/CMakeLists.txt | 2 +- test/unit/nvrtc/thread/gemm_nvrtc.cu | 2 +- test/unit/nvrtc/thread/testbed.h | 2 +- test/unit/reduction/CMakeLists.txt | 2 +- test/unit/reduction/kernel/CMakeLists.txt | 2 +- test/unit/reduction/kernel/reduce_splitk.cu | 2 +- .../reduction/kernel/reduce_splitk_testbed.h | 2 +- test/unit/reduction/thread/CMakeLists.txt | 2 +- .../unit/reduction/thread/reduction_thread.cu | 2 +- test/unit/reduction/thread/testbed.h | 2 +- test/unit/test_unit.cpp | 2 +- test/unit/transform/CMakeLists.txt | 2 +- .../unit/transform/threadblock/CMakeLists.txt | 2 +- .../threadblock/predicated_tile_iterator.cu | 2 +- .../regular_tile_iterator_tensor_op.cu | 2 +- test/unit/util/complex.cu | 2 +- tools/CMakeLists.txt | 2 +- tools/library/CMakeLists.txt | 6 +- .../library/include/cutlass/library/handle.h | 60 +- .../library/include/cutlass/library/library.h | 112 +- .../include/cutlass/library/manifest.h | 7 + .../include/cutlass/library/operation_table.h | 69 +- tools/library/include/cutlass/library/util.h | 11 + tools/library/scripts/gemm_operation.py | 151 +- tools/library/scripts/generator.py | 868 +++++- tools/library/scripts/library.py | 178 +- tools/library/scripts/manifest.py | 2 +- tools/library/src/gemm_operation.h | 204 +- tools/library/src/handle.cu | 213 +- tools/library/src/library_internal.h | 36 +- tools/library/src/manifest.cpp | 16 +- tools/library/src/operation_table.cu | 114 +- tools/library/src/util.cu | 204 +- tools/profiler/CMakeLists.txt | 2 +- tools/profiler/src/cublas_helpers.cpp | 136 +- tools/profiler/src/cublas_helpers.h | 70 +- tools/profiler/src/cutlass_profiler.cu | 13 +- tools/profiler/src/cutlass_profiler.h | 3 +- tools/profiler/src/debug.h | 2 +- tools/profiler/src/device_allocation.cu | 49 +- tools/profiler/src/device_allocation.h | 2 +- tools/profiler/src/device_context.cu | 2 +- tools/profiler/src/device_context.h | 2 +- tools/profiler/src/enumerated_types.cpp | 2 +- tools/profiler/src/enumerated_types.h | 4 
+- tools/profiler/src/gemm_operation_profiler.cu | 210 +- tools/profiler/src/gemm_operation_profiler.h | 21 +- tools/profiler/src/gpu_timer.cpp | 2 +- tools/profiler/src/gpu_timer.h | 2 +- tools/profiler/src/main.cpp | 2 +- tools/profiler/src/operation_profiler.cu | 119 +- tools/profiler/src/operation_profiler.h | 41 +- tools/profiler/src/options.cu | 124 +- tools/profiler/src/options.h | 2 +- tools/profiler/src/performance_report.cpp | 43 +- tools/profiler/src/performance_report.h | 2 +- tools/profiler/src/performance_result.cu | 55 + tools/profiler/src/performance_result.h | 5 +- tools/profiler/src/problem_space.cpp | 44 +- tools/profiler/src/problem_space.h | 13 +- tools/util/CMakeLists.txt | 2 +- .../util/include/cutlass/util/command_line.h | 2 +- tools/util/include/cutlass/util/debug.h | 2 +- tools/util/include/cutlass/util/device_dump.h | 2 +- .../util/include/cutlass/util/device_memory.h | 2 +- .../util/include/cutlass/util/distribution.h | 2 +- tools/util/include/cutlass/util/exceptions.h | 2 +- .../util/include/cutlass/util/host_reorder.h | 2 +- tools/util/include/cutlass/util/host_tensor.h | 2 +- .../cutlass/util/host_tensor_planar_complex.h | 2 +- .../util/reference/detail/inner_product.h | 2 +- .../cutlass/util/reference/device/gemm.h | 2 +- .../reference/device/gemm_planar_complex.h | 2 +- .../util/reference/device/kernel/gemm.h | 2 +- .../device/kernel/tensor_elementwise.h | 2 +- .../reference/device/kernel/tensor_foreach.h | 2 +- .../util/reference/device/tensor_compare.h | 2 +- .../util/reference/device/tensor_fill.h | 2 +- .../util/reference/device/tensor_foreach.h | 2 +- .../util/reference/device/tensor_relu.h | 135 + .../util/reference/device/thread/gemm.h | 2 +- .../cutlass/util/reference/host/gemm.h | 37 +- .../util/reference/host/gemm_complex.h | 2 +- .../util/reference/host/gemm_planar_complex.h | 2 +- .../util/reference/host/tensor_compare.h | 2 +- .../cutlass/util/reference/host/tensor_copy.h | 2 +- 
.../util/reference/host/tensor_elementwise.h | 2 +- .../cutlass/util/reference/host/tensor_fill.h | 2 +- .../util/reference/host/tensor_foreach.h | 2 +- .../cutlass/util/reference/host/tensor_norm.h | 2 +- .../include/cutlass/util/tensor_view_io.h | 2 +- tools/util/include/cutlass/util/type_traits.h | 2 +- 584 files changed, 51080 insertions(+), 3373 deletions(-) create mode 100644 examples/12_gemm_bias_relu/CMakeLists.txt create mode 100644 examples/12_gemm_bias_relu/gemm_bias_relu.cu create mode 100644 examples/13_fused_two_gemms/CMakeLists.txt create mode 100644 examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h create mode 100644 examples/13_fused_two_gemms/b2b_gemm_run.h create mode 100644 examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h create mode 100644 examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h create mode 100644 examples/13_fused_two_gemms/device/b2b_gemm.h create mode 100644 examples/13_fused_two_gemms/fused_gemm.cu create mode 100644 examples/13_fused_two_gemms/kernel/b2b_gemm.h create mode 100644 examples/13_fused_two_gemms/kernel/default_b2b_gemm.h create mode 100644 examples/13_fused_two_gemms/threadblock/b2b_mma_base.h create mode 100644 examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h create mode 100644 examples/13_fused_two_gemms/threadblock/default_b2b_mma.h create mode 100644 include/cutlass/arch/cache_operation.h create mode 100644 include/cutlass/arch/memory_sm80.h create mode 100644 include/cutlass/arch/mma_sm80.h create mode 100644 include/cutlass/bfloat16.h create mode 100644 include/cutlass/epilogue/thread/activation.h create mode 100644 include/cutlass/epilogue/thread/linear_combination_sigmoid.h create mode 100644 include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h create mode 100644 include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h create mode 100644 include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h create mode 100644 
include/cutlass/gemm/threadblock/default_mma_core_sm80.h create mode 100644 include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h create mode 100644 include/cutlass/gemm/threadblock/default_multistage_mma_complex.h create mode 100644 include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h create mode 100644 include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h delete mode 100755 include/cutlass/gemm/threadblock/gemv.h create mode 100644 include/cutlass/gemm/threadblock/mma_multistage.h create mode 100644 include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h create mode 100644 include/cutlass/gemm/warp/default_mma_complex_tensor_op.h create mode 100644 include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h create mode 100644 include/cutlass/gemm/warp/mma_complex_tensor_op.h create mode 100644 include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h create mode 100644 include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h create mode 100644 include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h create mode 100644 include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h create mode 100644 include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h create mode 100644 include/cutlass/layout/tensor_op_multiplicand_sm80.h create mode 100644 include/cutlass/reduction/device/reduce_split_k.h create mode 100644 include/cutlass/tfloat32.h create mode 100644 include/cutlass/transform/thread/unaryOp.h create mode 100644 include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h delete mode 100644 include/cutlass/util/debug.h create mode 100644 test/unit/core/bfloat16.cu create mode 100644 test/unit/core/tfloat32.cu create mode 100644 test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu create mode 100644 
test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu create mode 100644 test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu create mode 100644 test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu create mode 100644 test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu create mode 100644 
test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu create mode 100644 test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu create mode 100644 test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu create mode 100644 test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu create mode 100644 test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/multistage_testbed.h create mode 100644 test/unit/gemm/device/multistage_testbed_interleaved.h create mode 100644 test/unit/gemm/device/simt_sgemm_nt_sm80.cu create mode 100644 test/unit/gemm/device/simt_sgemm_tn_sm80.cu create mode 100644 test/unit/gemm/device/testbed_sanity.h create mode 100644 test/unit/gemm/warp/gemm_complex_sm80.cu create mode 100644 test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu create mode 
100644 test/unit/gemm/warp/gemm_sm80.cu create mode 100644 tools/profiler/src/performance_result.cu create mode 100644 tools/util/include/cutlass/util/reference/device/tensor_relu.h diff --git a/CHANGELOG.md b/CHANGELOG.md index 367d69350..b92893e85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,22 @@ # CUTLASS 2.x +## [2.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.2.0) (2020-06-08) + * [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/) + * Fast Tensor Core operations: + * Maximum performance via [`mma.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma-and-friends) + * Tensor Float 32, BFloat16, and double-precision data types + * Mixed integer data types (int8, int4, bin1) + * Asynchronous copy for deep software pipelines via [`cp.async`](https://docs.nvidia.com/cuda/parallel-thread-execution) + * Features: + * SDK examples showing GEMM fused with bias+relu and fused GEMM+GEMM + * Complex-valued GEMMs targeting NVIDIA Ampere Tensor Cores in double-precision and Tensor Float 32 + * Gaussian complex GEMMs using 3m complex multiply algorithm + * Universal GEMM kernel supporting two batch modes and two algorithms for parallel reductions + * Policy updates: + * [CUDA 11 Toolkit](https://developer.nvidia.com/cuda-toolkit) needed to enable NVIDIA Ampere Architecture features + * Disabled F16C by default for compatibility - enable on cmake command line with `-DCUTLASS_ENABLE_F16C=ON` + ## [2.1.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.1.0) (2020-04-06) * BLAS-style host-side API added to [CUTLASS Library](/media/docs/quickstart.md#cutlass-library) * API to launch compiled kernel instances for GEMM and planar complex GEMM diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b7bbc488..b6583747c 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -32,7 +32,7 @@ endif() message(STATUS "CMake Version: ${CMAKE_VERSION}") -project(CUTLASS VERSION 2.1.0 LANGUAGES CXX) +project(CUTLASS VERSION 2.2.0 LANGUAGES CXX) include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake) find_package(Doxygen QUIET) @@ -84,7 +84,7 @@ endif() set(CUTLASS_NVCC_ARCHS_SUPPORTED "") if (NOT CUDA_VERSION VERSION_LESS 7.5) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 50) + list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 53) endif() if (NOT CUDA_VERSION VERSION_LESS 8.0) list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 60 61) @@ -98,6 +98,9 @@ endif() if (NOT CUDA_VERSION VERSION_LESS 10.0) list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 75) endif() +if (NOT CUDA_VERSION VERSION_LESS 11.0) + list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 80) +endif() set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.") set(CUTLASS_NVCC_ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS} CACHE STRING "The SM architectures to build code for.") @@ -154,7 +157,7 @@ endif() set(CUTLASS_NVCC_EMBED_CUBIN ON CACHE BOOL "Embed compiled CUDA kernel binaries into executables.") set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.") set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.") -set(CUTLASS_ENABLE_F16C ON CACHE BOOL "Enable F16C x86 extensions in host code.") +set(CUTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.") # # CUTLASS generator cmake configuration @@ -248,8 +251,8 @@ if(CUDA_COMPILER MATCHES "[Cc]lang") endif() list(APPEND CUTLASS_CUDA_CLANG_FLAGS --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) - list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm=-pragma-unroll-threshold=100000) - list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm=-unroll-threshold=5000) + list(APPEND 
CUTLASS_CUDA_CLANG_FLAGS -mllvm -pragma-unroll-threshold=100000) + list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -unroll-threshold=5000) list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wno-unused-command-line-argument) string(REPLACE "." ";" CUDA_VERSION_PARTS ${CMAKE_CUDA_COMPILER_VERSION}) @@ -271,7 +274,7 @@ function(cutlass_apply_cuda_gencode_flags TARGET) set(NVCC_FLAGS) set(CLANG_FLAGS) foreach(ARCH ${CUTLASS_NVCC_ARCHS_ENABLED}) - list(APPEND CUTLASS_CUDA_CLANG_FLAGS --cuda-gpu-arch=sm_${ARCH}) + list(APPEND CLANG_FLAGS --cuda-gpu-arch=sm_${ARCH}) set(CODES) if(CUTLASS_NVCC_EMBED_CUBIN) list(APPEND CODES sm_${ARCH}) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index fc95674d2..f8778b80e 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -9,15 +9,17 @@ This is the official list of CUTLASS developers and contributors. ## DEVELOPERS Andrew Kerr Haicheng Wu -Naila Farooqui +Manish Gupta Dustyn Blasig Pradeep Ramani -Manish Gupta -Aditya Atluri +Naila Farooqui +Piotr Majcher Paul Springer -David Tanner -Scott Yokim Jin Wang +Scott Yokim +Markus Hohnerbach +Aditya Atluri +David Tanner ## CONTRIBUTORS Timothy Costa @@ -25,12 +27,10 @@ Julien Demouth Brian Fahs Michael Goldfarb Mostafa Hagog -Markus Hohnerbach Fei Hu Alan Kaatz Tina Li Timmy Liu -Piotr Majcher Duane Merrill Kevin Siu Markus Tavenrath diff --git a/CUDA.cmake b/CUDA.cmake index d1eb4dbc4..b8b343a72 100644 --- a/CUDA.cmake +++ b/CUDA.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -206,14 +206,14 @@ include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) function(cutlass_correct_source_file_language_property) if(CUDA_COMPILER MATCHES "clang") foreach(File ${ARGN}) - if(${File} MATCHES ".*\.cu$") + if(File MATCHES ".*\.cu$") set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) endif() endforeach() endif() endfunction() -set(CUTLASS_UNITY_BUILD_ENABLED ON CACHE BOOL "Enable combined source compilation") +set(CUTLASS_UNITY_BUILD_ENABLED OFF CACHE BOOL "Enable combined source compilation") set(CUTLASS_UNITY_BUILD_BATCH_SIZE 16 CACHE STRING "Batch size for unified source files") function(cutlass_unify_source_files TARGET_ARGS_VAR) diff --git a/LICENSE.txt b/LICENSE.txt index 283345b55..64a49d680 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2017 - 2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017 - 2020, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/README.md b/README.md index dd1c4c65f..c1507c036 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ ![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") -# CUTLASS 2.1 +# CUTLASS 2.2 -_CUTLASS 2.1 - April 2020_ +_CUTLASS 2.2 - June 2020_ CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA. @@ -17,14 +17,28 @@ and applications. 
To support a wide variety of applications, CUTLASS provides extensive support for mixed-precision computations, providing specialized data-movement and multiply-accumulate abstractions for half-precision floating -point (FP16), single-precision floating point (FP32), double-precision floating +point (FP16), BFloat16 (BF16), Tensor Float 32 (TF32), +single-precision floating point (FP32), double-precision floating point (FP64) types, integer data types (4b and 8b), and binary data types (1b). -Furthermore, CUTLASS demonstrates warp-synchronous matrix multiply operations for + +Furthermore, CUTLASS demonstrates warp-synchronous matrix multiply operations targeting the programmable, high-throughput _Tensor Cores_ implemented by -NVIDIA's Volta and Turing architectures. +NVIDIA's Volta, Turing, and Ampere architectures. See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly. +See the [functionality listing](media/docs/functionality.md) for the list of operations +supported at each level of the execution model hierarchy. 
+ +# What's New in CUTLASS 2.2 + +CUTLASS 2.2 is a significant update to CUTLASS adding: + +- Coverage of [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/) +- Tensor Core-accelerated GEMMs targeting Tensor Float 32, BFloat16, and double-precision data types +- Deep software pipelines using asynchronous copy +- Intended to be compiled with [CUDA 11 Toolkit](https://developer.nvidia.com/cuda-toolkit) + # What's New in CUTLASS 2.1 CUTLASS 2.1 is a minor update to CUTLASS 2.0 adding: @@ -32,7 +46,6 @@ CUTLASS 2.1 is a minor update to CUTLASS 2.0 adding: - [Planar complex GEMM kernels](/examples/10_planar_complex/planar_complex.cu) targeting Volta and Turing Tensor Cores - BLAS-style API to launch kernels compiled into the [CUTLASS Library](/media/docs/quickstart.md#cutlass-library) - # What's New in CUTLASS 2.0 CUTLASS 2.0 is a substantial refactoring from the previous version, intended to offer: @@ -43,9 +56,6 @@ CUTLASS 2.0 is a substantial refactoring from the previous version, intended to **See the [CHANGELOG](CHANGELOG.md) for more details.** -See the [functionality listing](media/docs/functionality.md) for the list of operations -supported at each level of the execution model hierarchy. - # Performance

@@ -53,15 +63,15 @@ supported at each level of the execution model hierarchy. CUTLASS primitives are very efficient. When used to construct device-wide GEMM kernels, they exhibit performance comparable to cuBLAS for scalar GEMM computations. The above figure shows CUTLASS performance relative to cuBLAS -for large matrix dimensions on an NVIDIA GeForce 2080 Ti and an NVIDIA TitanV -using CUDA 10.2. Tensor Core operations are implemented using CUDA's +for large matrix dimensions on an NVIDIA GeForce 2080 Ti, an NVIDIA A100, and an NVIDIA TitanV +using CUDA 11.0 Toolkit. Tensor Core operations are implemented using CUDA's [mma instruction](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma). # Compatibility CUTLASS requires a C++11 host compiler and -performs best when built with the [CUDA 10.2 Toolkit](https://developer.nvidia.com/cuda-toolkit). -It is compatible with CUDA 9.2, CUDA 10.0, and CUDA 10.1. +performs best when built with the [CUDA 11.0 Toolkit](https://developer.nvidia.com/cuda-toolkit). +It is compatible with CUDA 9.2, CUDA 10.0, CUDA 10.1, and CUDA 10.2. We have tested the following environments. @@ -70,27 +80,28 @@ We have tested the following environments. | Windows 10 | Microsoft Visual Studio 2015| | | Microsoft Visual Studio 2017| | Ubuntu 16.04 | GCC 5.4.0 | -| Ubuntu 18.04 | GCC 7.3.0 | +| Ubuntu 18.04 | GCC 7.5.0 | Additionally, CUTLASS may be built with clang. See [these instructions](media/docs/quickstart.md#clang) for more details. CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on -any Maxwell-, Pascal-, Volta-, or Turing- architecture NVIDIA GPU. +any Maxwell-, Pascal-, Volta-, Turing-, or NVIDIA Ampere- architecture NVIDIA GPU. 
-|**GPU**|**Minimum CUDA Toolkit**|**CUDA Toolkit Enabling Native Tensor Cores**| -|---|---|---| -|NVIDIA GeForce 1080|9.2| | -|NVIDIA TitanXP|9.2| | -|NVIDIA Tesla P100|9.2| | -|NVIDIA Tesla V100|9.2|10.1| -|NVIDIA TitanV|9.2|10.1| -|NVIDIA GeForce RTX 2080 TI, 2080, 2070|10.0|10.2| -|NVIDIA Tesla T4|10.0|10.2| +|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit**|**CUDA Toolkit Enabling Native Tensor Cores**| +|---|---|---|---| +|NVIDIA Tesla P100|6.0|9.2| | +|NVIDIA GeForce 1080|6.1|9.2| | +|NVIDIA TitanXP|6.1|9.2| | +|NVIDIA Tesla V100|7.0|9.2|10.1| +|NVIDIA TitanV|7.0|9.2|10.1| +|NVIDIA GeForce RTX 2080 TI, 2080, 2070|7.5|10.0|10.2| +|NVIDIA Tesla T4|7.5|10.0|10.2| +|NVIDIA A100|8.0|11.0|11.0| # Documentation -CUTLASS 2.1 is described in the following documents and the accompanying +CUTLASS 2.2 is described in the following documents and the accompanying [Doxygen documentation](https://nvidia.github.io/cutlass). - [Quick Start Guide](/media/docs/quickstart.md) - build and run CUTLASS @@ -124,7 +135,7 @@ $ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc ``` Create a build directory within the CUTLASS project, then run CMake. By default CUTLASS will build kernels -for CUDA architecture versions 5.0, 6.0, 6.1, 7.0 and 7.5. To reduce compile time you can specify +for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, and 8.0. To reduce compile time you can specify the architectures to build CUTLASS for by changing the CMake configuration setting `CUTLASS_NVCC_ARCHS`. 
@@ -210,6 +221,10 @@ examples/ 10_planar_complex/ # example demonstrating planar complex GEMM kernels 11_planar_complex_array/ # example demonstrating planar complex kernels with batch-specific problem sizes + + 12_gemm_bias_relu/ # example demonstrating GEMM fused with bias and relu + + 13_fused_two_gemms/ # example demonstrating two GEMms fused in one kernel ``` ### Tools @@ -255,29 +270,32 @@ $ make cutlass_profiler -j Example command line for profiling SGEMM kernels is as follows: ``` -$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=4352 --n=4096 --k=4096 +$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 ============================= Problem ID: 1 - Provider: CUTLASS - Operation: cutlass_simt_sgemm_128x128_nn + Provider: CUTLASS + OperationKind: gemm + Operation: cutlass_simt_sgemm_128x128_8x2_nn_align1 - Disposition: Passed - Status: Success + Status: Success + Verification: ON + Disposition: Passed - Arguments: --m=4352 --n=4096 --k=4096 --A=f32:column --B=f32:column --C=f32:column --alpha=1 --beta=0 \ - --split_k_slices=1 --batch_count=1 --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 \ - --stages=2 --warps_m=2 --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 \ - --max_cc=1024 + cuBLAS: Passed - Bytes: 52428800 bytes - FLOPs: 146064539648 flops + Arguments: --m=3456 --n=4096 --k=4096 --A=f32:column --B=f32:column --C=f32:column --alpha=1 --beta=0 --split_k_slices=1 \ + --batch_count=1 --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \ + --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024 - Runtime: 10.5424 ms - Memory: 4.63158 GiB/s + Bytes: 180355072 bytes + FLOPs: 115992428544 flops - Math: 13854.9 GFLOP/s + Runtime: 6.73655 ms + Memory: 24.934 GiB/s + + Math: 17218.4 GFLOP/s ``` [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md) diff --git a/cmake/nop.cu b/cmake/nop.cu index 
571c6c7c0..518a582b8 100644 --- a/cmake/nop.cu +++ b/cmake/nop.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/cuBLAS.cmake b/cuBLAS.cmake index d7f330cf3..4c73a1db4 100644 --- a/cuBLAS.cmake +++ b/cuBLAS.cmake @@ -10,28 +10,35 @@ if((DEFINED CUTLASS_ENABLE_CUBLAS AND NOT CUTLASS_ENABLE_CUBLAS) OR message(STATUS "cuBLAS Disabled.") elseif(NOT TARGET cublas) - + find_path( - _CUBLAS_INCLUDE_DIR cublas.h - PATHS - ${CUDA_TOOLKIT_ROOT_DIR}/include - $ENV{CUBLAS_PATH}/include - $ENV{CUDA_PATH}/include - ${CUBLAS_PATH}/include - /usr/include) + _CUBLAS_INCLUDE_DIR + NAMES cublas.h + HINTS + ${CUBLAS_INCLUDE_PATH} + ENV CUBLAS_INCLUDE_PATH + ${CUBLAS_PATH} + ENV CUBLAS_PATH + ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES + include + ) find_library( - _CUBLAS_LIBRARY cublas + _CUBLAS_LIBRARY + NAMES cublas HINTS - ${CUDA_TOOLKIT_ROOT_DIR}/lib64 - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 - $ENV{CUBLAS_PATH}/lib64 - $ENV{CUBLAS_PATH}/lib/x64 - $ENV{CUDA_PATH}/lib64 - $ENV{CUDA_PATH}/lib/x64 - ${CUBLAS_PATH}/lib64 - ${CUBLAS_PATH}/lib/x64 - /usr/lib/x86_64-linux-gnu) + ${CUBLAS_LIBRARY_PATH} + ENV CUBLAS_LIBRARY_PATH + ${_CUBLAS_INCLUDE_DIR}/.. 
+ ${CUBLAS_PATH} + ENV CUBLAS_PATH + ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES + lib64 + lib/x64 + lib + ) if(_CUBLAS_INCLUDE_DIR AND _CUBLAS_LIBRARY) @@ -79,17 +86,20 @@ if(CUTLASS_ENABLE_CUBLAS AND NOT TARGET cublas) $) find_library( - _CUBLASLT_LIBRARY cublasLt + _CUBLASLT_LIBRARY + NAMES cublasLt HINTS - ${CUDA_TOOLKIT_ROOT_DIR}/lib64 - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 - $ENV{CUBLAS_PATH}/lib64 - $ENV{CUBLAS_PATH}/lib/x64 - $ENV{CUDA_PATH}/lib64 - $ENV{CUDA_PATH}/lib/x64 - ${CUBLAS_PATH}/lib64 - ${CUBLAS_PATH}/lib/x64 - /usr/lib/x86_64-linux-gnu) + ${CUBLAS_LIBRARY_PATH} + ENV CUBLAS_LIBRARY_PATH + ${_CUBLAS_INCLUDE_DIR}/.. + ${CUBLAS_PATH} + ENV CUBLAS_PATH + ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES + lib64 + lib/x64 + lib + ) if(_CUBLASLT_LIBRARY AND NOT TARGET cublasLt) @@ -106,6 +116,8 @@ if(CUTLASS_ENABLE_CUBLAS AND NOT TARGET cublas) add_library(nvidia::cublasLt ALIAS cublasLt) + target_link_libraries(cublas INTERFACE cublasLt) + endif() endif() diff --git a/examples/00_basic_gemm/CMakeLists.txt b/examples/00_basic_gemm/CMakeLists.txt index 5b833b85d..9ae257d9a 100644 --- a/examples/00_basic_gemm/CMakeLists.txt +++ b/examples/00_basic_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu index 415646327..bda012abe 100644 --- a/examples/00_basic_gemm/basic_gemm.cu +++ b/examples/00_basic_gemm/basic_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/01_cutlass_utilities/CMakeLists.txt b/examples/01_cutlass_utilities/CMakeLists.txt index 2dfa083c1..5f22b7b1c 100644 --- a/examples/01_cutlass_utilities/CMakeLists.txt +++ b/examples/01_cutlass_utilities/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/01_cutlass_utilities/cutlass_utilities.cu b/examples/01_cutlass_utilities/cutlass_utilities.cu index 0b6aa3867..d1eaa57fe 100644 --- a/examples/01_cutlass_utilities/cutlass_utilities.cu +++ b/examples/01_cutlass_utilities/cutlass_utilities.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/02_dump_reg_shmem/CMakeLists.txt b/examples/02_dump_reg_shmem/CMakeLists.txt index 4e9af4fbb..5e6112e02 100644 --- a/examples/02_dump_reg_shmem/CMakeLists.txt +++ b/examples/02_dump_reg_shmem/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/02_dump_reg_shmem/dump_reg_shmem.cu b/examples/02_dump_reg_shmem/dump_reg_shmem.cu index 39d58db87..ed712aa84 100644 --- a/examples/02_dump_reg_shmem/dump_reg_shmem.cu +++ b/examples/02_dump_reg_shmem/dump_reg_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/examples/03_visualize_layout/CMakeLists.txt b/examples/03_visualize_layout/CMakeLists.txt index 81211df90..5a08c0f8d 100644 --- a/examples/03_visualize_layout/CMakeLists.txt +++ b/examples/03_visualize_layout/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/03_visualize_layout/options.h b/examples/03_visualize_layout/options.h index c72b1228f..dd7de198a 100644 --- a/examples/03_visualize_layout/options.h +++ b/examples/03_visualize_layout/options.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/03_visualize_layout/register_layout.cu b/examples/03_visualize_layout/register_layout.cu index 655d1f37d..0d2b25eb3 100644 --- a/examples/03_visualize_layout/register_layout.cu +++ b/examples/03_visualize_layout/register_layout.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -34,6 +34,8 @@ #include "cutlass/layout/pitch_linear.h" #include "cutlass/layout/tensor_op_multiplicand_sm70.h" #include "cutlass/layout/tensor_op_multiplicand_sm75.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + #include "visualize_layout.h" #include "register_layout.h" @@ -59,18 +61,40 @@ void RegisterLayouts(std::map // Integer matrix multiply.int4 8832 TN kblock128 {"TensorOpMultiplicand<4,128>", new VisualizeLayout>}, + // Integer matrix multiply.int4 16864 TN kblock256 + {"TensorOpMultiplicand<4,256>", + new VisualizeLayout>}, // Integer matrix multiply 8816 Interleaved-32 {"TensorOpMultiplicand<8,32>", new VisualizeLayout>}, // Integer matrix multiply 8816 TN kblock64 {"TensorOpMultiplicand<8,64>", new VisualizeLayout>}, + {"TensorOpMultiplicand<8,128>", + new VisualizeLayout>}, // Matrix Multiply 1688 TN kblock32 {"TensorOpMultiplicand<16,32>", new VisualizeLayout>}, // Matrix multiply 1688 NT {"TensorOpMultiplicand<16,64>", new VisualizeLayout>}, + // Matrix multiply 1688.TF32 TN kblock16 + {"TensorOpMultiplicand<32,16>", + new VisualizeLayout>}, + // Matrix multiply 1688.TF32 TN kblock32 + {"TensorOpMultiplicand<32,32>", + new VisualizeLayout>}, 
+ // Matrix multiply 1688 NT + {"TensorOpMultiplicandCongruous<32,32>", + new VisualizeLayout< + cutlass::layout::TensorOpMultiplicandCongruous<32, 32>>}, + // Matrix multiply 884 NT + {"TensorOpMultiplicandCongruous<64,16>", + new VisualizeLayout< + cutlass::layout::TensorOpMultiplicandCongruous<64, 16>>}, + // Matrix multiply 884 TN + {"TensorOpMultiplicand64bCrosswise", + new VisualizeLayout}, {"TensorOpMultiplicandCongruous<128,4>", new VisualizeLayout< cutlass::layout::TensorOpMultiplicandCongruous<128, 4>>}, @@ -82,7 +106,7 @@ void RegisterLayouts(std::map cutlass::layout::VoltaTensorOpMultiplicandCongruous<16>>}, {"VoltaTensorOpMultiplicandCrosswise<16,32>", new VisualizeLayout< - cutlass::layout::VoltaTensorOpMultiplicandCrosswise<16, 32>>}, + cutlass::layout::VoltaTensorOpMultiplicandCrosswise<16, 32>>} }; for (auto layout : layout_pairs) { diff --git a/examples/03_visualize_layout/register_layout.h b/examples/03_visualize_layout/register_layout.h index fee911f79..1518e433c 100644 --- a/examples/03_visualize_layout/register_layout.h +++ b/examples/03_visualize_layout/register_layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/03_visualize_layout/visualize_layout.cpp b/examples/03_visualize_layout/visualize_layout.cpp index 8908d2c1f..a0f271812 100644 --- a/examples/03_visualize_layout/visualize_layout.cpp +++ b/examples/03_visualize_layout/visualize_layout.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -65,14 +65,26 @@ void print_usage(std::ostream &out) { "--extent=64,64 --vectorize=32 --output-shape=256,4\n" << "$ 03_visualize_layout \"TensorOpMultiplicand<4,128>\" " "--extent=128,32 --vectorize=32 --output-shape=256,4\n" + << "$ 03_visualize_layout \"TensorOpMultiplicand<4,256>\" " + "--extent=256,16 --vectorize=32 --output-shape=256,4\n" << "$ 03_visualize_layout \"TensorOpMultiplicand<8,32>\" " "--extent=32,64 --vectorize=16 --output-shape=128,4\n" << "$ 03_visualize_layout \"TensorOpMultiplicand<8,64>\" " "--extent=64,32 --vectorize=16 --output-shape=128,4\n" + << "$ 03_visualize_layout \"TensorOpMultiplicand<8,128>\" " + "--extent=128,16 --vectorize=16 --output-shape=128,4\n" << "$ 03_visualize_layout \"TensorOpMultiplicand<16,32>\" " "--extent=32,32 --vectorize=8 --output-shape=64,4\n" << "$ 03_visualize_layout \"TensorOpMultiplicand<16,64>\" " "--extent=64,16 --vectorize=8 --output-shape=64,4\n" + << "$ 03_visualize_layout \"TensorOpMultiplicand<32,16>\" " + "--extent=16,32 --vectorize=4 --output-shape=32,4\n" + << "$ 03_visualize_layout \"TensorOpMultiplicand<32,32>\" " + "--extent=32,16 --vectorize=4 --output-shape=32,4\n" + << "$ 03_visualize_layout \"TensorOpMultiplicandCongruous<32,32>\" " + "--extent=32,16 --vectorize=4 --output-shape=32,4\n" + << "$ 03_visualize_layout \"TensorOpMultiplicandCongruous<64, 16>\" " + "--extent=16,16 --vectorize=2 --output-shape=16,4\n" << "$ 03_visualize_layout \"VoltaTensorOpMultiplicandCrosswise<16,32>\" " "--extent=32,64 --vectorize=4 --output-shape=64,4\n" << "$ 03_visualize_layout \"VotlaTensorOpMultiplicandCongruous<16>\" " diff --git a/examples/03_visualize_layout/visualize_layout.h b/examples/03_visualize_layout/visualize_layout.h index 031916c74..4093d2772 100644 --- 
a/examples/03_visualize_layout/visualize_layout.h +++ b/examples/03_visualize_layout/visualize_layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/04_tile_iterator/CMakeLists.txt b/examples/04_tile_iterator/CMakeLists.txt index cef156249..cd32e2287 100644 --- a/examples/04_tile_iterator/CMakeLists.txt +++ b/examples/04_tile_iterator/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/04_tile_iterator/tile_iterator.cu b/examples/04_tile_iterator/tile_iterator.cu index e63157608..5c56f33bd 100644 --- a/examples/04_tile_iterator/tile_iterator.cu +++ b/examples/04_tile_iterator/tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/05_batched_gemm/CMakeLists.txt b/examples/05_batched_gemm/CMakeLists.txt index 6c9bf5046..6cd0ca8db 100644 --- a/examples/05_batched_gemm/CMakeLists.txt +++ b/examples/05_batched_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/05_batched_gemm/batched_gemm.cu b/examples/05_batched_gemm/batched_gemm.cu index d1fecda6e..a9d8a9c68 100644 --- a/examples/05_batched_gemm/batched_gemm.cu +++ b/examples/05_batched_gemm/batched_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/06_splitK_gemm/CMakeLists.txt b/examples/06_splitK_gemm/CMakeLists.txt index 750c6205b..7b30ae166 100644 --- a/examples/06_splitK_gemm/CMakeLists.txt +++ b/examples/06_splitK_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/06_splitK_gemm/splitk_gemm.cu b/examples/06_splitK_gemm/splitk_gemm.cu index 5fb513cb9..f0e1d5783 100644 --- a/examples/06_splitK_gemm/splitk_gemm.cu +++ b/examples/06_splitK_gemm/splitk_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,7 +39,7 @@ inner product (1/16th of output), they accumulate to single output matrix. Writing a single high performance matrix multiplication kernel is hard but do-able. Whereas writing high performance kernels at scale which works for multiple problem sizes with good abstractions is -really hard. CUTLASS solves this problem by providing simplified abstractions (knobs) to compose +really hard. CUTLASS solves this problem by providing simplified abstractions to compose multiple sections of gemm kernel. When used properly, the kernels can hit peak performance of GPU easily. @@ -144,7 +144,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 4>; // <- MMA Op tile M = 8, N = 8, K = 4 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; // <- ?? +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? // This code section describes ? using EpilogueOp = cutlass::epilogue::thread::LinearCombination< @@ -172,17 +172,7 @@ using Gemm = cutlass::gemm::device::GemmSplitKParallel; -int main() { - - // - // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1. - // - // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. - // - if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { - std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." 
<< std::endl; - return -1; - } +int run() { cudaDeviceProp props; @@ -316,11 +306,30 @@ int main() { tensor_ref_d.sync_host(); // Check if output from CUTLASS kernel and reference kernel are equal or not - std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(), - tensor_ref_d.host_view()) - ? "Passed" - : "Failed") - << std::endl; + bool passed = cutlass::reference::host::TensorEquals( + tensor_d.host_view(), + tensor_ref_d.host_view()); - CUTLASS_CHECK(status); + std::cout << (passed ? "Passed" : "Failed") << std::endl; + + return (passed ? 0 : -1); } + +int main() { + + // + // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1. + // + // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; + + // Returning zero, so this test passes when built with older CUDA Toolkits. Its action are no-op. + return 0; + } + else { + return run(); + } +} + diff --git a/examples/07_volta_tensorop_gemm/CMakeLists.txt b/examples/07_volta_tensorop_gemm/CMakeLists.txt index 56dfce9ec..82e817227 100644 --- a/examples/07_volta_tensorop_gemm/CMakeLists.txt +++ b/examples/07_volta_tensorop_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu index 447cc1cc5..208c4f645 100644 --- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu +++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -156,7 +156,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 4>; // <- MMA Op tile M = 8, N = 8, K = 4 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; // <- ?? +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? // This code section describes ? using EpilogueOp = cutlass::epilogue::thread::LinearCombination< @@ -188,15 +188,7 @@ using Gemm = cutlass::gemm::device::Gemm; -int main() { - - // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1. - // - // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. - if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { - std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." 
<< std::endl; - return -1; - } +int run() { cudaDeviceProp props; @@ -223,7 +215,7 @@ int main() { cutlass::HostTensor tensor_a( problem_size.mk()); // <- Create matrix A with dimensions M x K cutlass::HostTensor tensor_b( - problem_size.nk()); // <- Create matrix B with dimensions N x K + problem_size.kn()); // <- Create matrix B with dimensions K x N cutlass::HostTensor tensor_c( problem_size.mn()); // <- Create matrix C with dimensions M x N cutlass::HostTensor tensor_d( @@ -326,12 +318,28 @@ int main() { tensor_ref_d.sync_host(); // Check if output from CUTLASS kernel and reference kernel are equal or not - std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(), - tensor_ref_d.host_view()) - ? "Passed" - : "Failed") - << std::endl; + bool passed = cutlass::reference::host::TensorEquals( + tensor_d.host_view(), + tensor_ref_d.host_view()); - CUTLASS_CHECK(status); - return 0; + std::cout << (passed ? "Passed" : "Failed") << std::endl; + + return (passed ? 0 : -1); } + +int main() { + + // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1. + // + // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; + + // Returning zero when built on older Toolkits so tests pass. The actions of this SDK example are no-op. + return 0; + } + else { + return run(); + } +} + diff --git a/examples/08_turing_tensorop_gemm/CMakeLists.txt b/examples/08_turing_tensorop_gemm/CMakeLists.txt index 9e011a1ed..b4e4fe82f 100644 --- a/examples/08_turing_tensorop_gemm/CMakeLists.txt +++ b/examples/08_turing_tensorop_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu index 3440d82f2..d7ba83319 100644 --- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu +++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -150,12 +150,12 @@ using SmArch = cutlass::arch::Sm75; using ShapeMMAThreadBlock = cutlass::gemm::GemmShape<128, 256, 64>; // <- threadblock tile M = 128, N = 256, K = 64 // This code section describes tile size a warp will compute -using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 64>; // <- warp tile M = 64, N = 64, K = 16 +using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 64>; // <- warp tile M = 64, N = 64, K = 64 // This code section describes the size of MMA op using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 16>; // <- MMA Op tile M = 8, N = 8, K = 16 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; // <- ?? +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? 
// This code section describes the epilogue part of the kernel using EpilogueOp = cutlass::epilogue::thread::LinearCombination< @@ -186,7 +186,7 @@ using Gemm = cutlass::gemm::device::Gemm; -int main() { +int run() { // Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available // in CUDA 10.2. @@ -222,7 +222,7 @@ int main() { cutlass::HostTensor tensor_a( problem_size.mk()); // <- Create matrix A with dimensions M x K cutlass::HostTensor tensor_b( - problem_size.nk()); // <- Create matrix B with dimensions N x K + problem_size.kn()); // <- Create matrix B with dimensions K x N cutlass::HostTensor tensor_c( problem_size.mn()); // <- Create matrix C with dimensions M x N cutlass::HostTensor tensor_d( @@ -325,12 +325,28 @@ int main() { tensor_ref_d.sync_host(); // Check if output from CUTLASS kernel and reference kernel are equal or not - std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(), - tensor_ref_d.host_view()) - ? "Passed" - : "Failed") - << std::endl; + bool passed = cutlass::reference::host::TensorEquals( + tensor_d.host_view(), + tensor_ref_d.host_view()); - CUTLASS_CHECK(status); - return 0; + std::cout << (passed ? "Passed" : "Failed") << std::endl; + + return (passed ? 0 : -1); } + +int main() { + // Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available + // in CUDA 10.2. + // + // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this test passes when built on older Toolkits. 
+ return 0; + } + else { + return run(); + } +} + diff --git a/examples/10_planar_complex/planar_complex.cu b/examples/10_planar_complex/planar_complex.cu index 7fc92870f..b7318b99c 100644 --- a/examples/10_planar_complex/planar_complex.cu +++ b/examples/10_planar_complex/planar_complex.cu @@ -500,7 +500,9 @@ int main(int argc, char const **args) { if (props.major < 7) { std::cerr << "Volta Tensor Core operations must be run on a machine with compute capability at least 70." << std::endl; - return -1; + + // Returning zero so this test passes on older architectures even though its actions are no-op. + return 0; } else if (props.major == 7 && props.minor <= 2) { // @@ -508,7 +510,9 @@ int main(int argc, char const **args) { // if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; - return -1; + + // Returning zero so this test passes on older Toolkits even though its actions are no-op. + return 0; } } else if (props.major == 7 && props.minor >= 5) { @@ -517,7 +521,9 @@ int main(int argc, char const **args) { // if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; - return -1; + + // Returning zero so this test passes on older Toolkits even though its actions are no-op. + return 0; } } diff --git a/examples/11_planar_complex_array/planar_complex_array.cu b/examples/11_planar_complex_array/planar_complex_array.cu index 3003a9009..6a0270533 100644 --- a/examples/11_planar_complex_array/planar_complex_array.cu +++ b/examples/11_planar_complex_array/planar_complex_array.cu @@ -560,7 +560,9 @@ int main(int argc, char const **args) { if (props.major < 7) { std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70." 
<< std::endl; - return -1; + + // Returning zero so this passes on older architectures. Its actions are no-op. + return 0; } else if (props.major == 7 && props.minor <= 2) { // @@ -568,7 +570,9 @@ int main(int argc, char const **args) { // if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; - return -1; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; } } else if (props.major == 7 && props.minor >= 5) { @@ -577,7 +581,9 @@ int main(int argc, char const **args) { // if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; - return -1; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; } } diff --git a/examples/12_gemm_bias_relu/CMakeLists.txt b/examples/12_gemm_bias_relu/CMakeLists.txt new file mode 100644 index 000000000..fb78d77fa --- /dev/null +++ b/examples/12_gemm_bias_relu/CMakeLists.txt @@ -0,0 +1,27 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + 12_gemm_bias_relu + gemm_bias_relu.cu + ) + diff --git a/examples/12_gemm_bias_relu/gemm_bias_relu.cu b/examples/12_gemm_bias_relu/gemm_bias_relu.cu new file mode 100644 index 000000000..7faaa98aa --- /dev/null +++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu @@ -0,0 +1,282 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" +#include "helper.h" + +// The code section below describes datatype for input, output matrices and computation between +// elements in input matrices. +using ElementAccumulator = float; // <- data type of accumulator +using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations +using ElementInputA = cutlass::half_t; // <- data type of elements in input matrix A +using ElementInputB = cutlass::half_t; // <- data type of elements in input matrix B +using ElementOutput = float; // <- data type of elements in output matrix D + +// The code section below describes matrix layout of input and output matrices. 
Column Major for +// Matrix A, Column Major for Matrix B and Row Major for Matrix C +using LayoutInputA = cutlass::layout::ColumnMajor; +using LayoutInputB = cutlass::layout::ColumnMajor; +using LayoutOutput = cutlass::layout::RowMajor; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassTensorOp; + +// This code section describes CUDA SM architecture number +using SmArch = cutlass::arch::Sm75; + +// This code section describes the tile size a thread block will compute +using ShapeMMAThreadBlock = + cutlass::gemm::GemmShape<128, 128, 32>; // <- threadblock tile M = 128, N = 128, K = 32 +// This code section describes tile size a warp will compute +using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = 64, N = 64, K = 32 +// This code section describes the size of MMA op +using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA Op tile M = 16, N = 8, K = 8 + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? + +// Define the epilogue operation as LinearCombinationRelu. This is approximately equal to +// +// d_ij = max(0, alpha * sum_k(a_ik * b_kj) + beta * c_ij ) +// +using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- this is the number of elements per + // vectorized memory access. For half + // precision, it's 8 elements.
This becomes + // the vector width of math instructions in + // epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + +// Number of pipelines you want to use +constexpr int NumStages = 2; + +using Gemm = cutlass::gemm::device::Gemm; + +int run() { + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!(props.major * 10 + props.minor >= 75)) { + std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." + << std::endl; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + const int length_m = 5120; + const int length_n = 4096; + const int length_k = 4096; + + // Create a tuple of problem size for matrix multiplication + cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k); + + // Initialize tensors using CUTLASS helper functions + cutlass::HostTensor tensor_a( + problem_size.mk()); // <- Create matrix A with dimensions M x K + cutlass::HostTensor tensor_b( + problem_size.nk()); // <- Create matrix B with dimensions N x K + + cutlass::HostTensor tensor_c_bias( + {problem_size.m(), 1}); // <- Create matrix C with dimensions M x 1 + + cutlass::HostTensor tensor_d( + problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from + // CUTLASS kernel + cutlass::HostTensor tensor_ref_d( + problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from + // reference kernel + + // Fill input and output matrices on host using CUTLASS helper functions + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(4), + ElementInputA(-4), + 0); // <- Fill matrix A on host with uniform-distribution 
random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), + 1, + ElementInputB(4), + ElementInputB(-4), + 0); // <- Fill matrix B on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_c_bias.host_view(), + 1, + ElementOutput(4), + ElementOutput(-4), + 0); // <- Fill matrix C on host with uniform-distribution random data + cutlass::reference::host::TensorFill( + tensor_d.host_view()); // <- fill matrix D on host with zeros + cutlass::reference::host::TensorFill( + tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros + + // Copy data from host to GPU + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_c_bias.sync_device(); + tensor_d.sync_device(); + tensor_ref_d.sync_device(); + + // Initialize alpha and beta for dot product computation + ElementComputeEpilogue alpha = ElementComputeEpilogue(1); + ElementComputeEpilogue beta = ElementComputeEpilogue(0); + + // Split K dimension into 1 partitions + int split_k_slices = 1; + + // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch + // instantiated CUTLASS kernel + typename Gemm::Arguments arguments{ + problem_size, // <- problem size of matrix multiplication + tensor_a.device_ref(), // <- reference to matrix A on device + tensor_b.device_ref(), // <- reference to matrix B on device + + {tensor_c_bias.device_data(), 0}, // <- the C matrix is treated as the bias vector. We can enable the GEMM + // to project away the N dimension by setting the stride to zero. 
+ + tensor_d.device_ref(), // <- reference to matrix D on device + {alpha, beta}, // <- tuple of alpha and beta + split_k_slices}; // <- k-dimension split factor + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm_op; + + // Initialize CUTLASS kernel with arguments and workspace pointer + cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + + // Launch initialized CUTLASS kernel + status = gemm_op(); + CUTLASS_CHECK(status); + + // + // Create instantiation for device reference gemm kernel + // + + cutlass::reference::device::Gemm + gemm_device_reference; + + // Launch device reference to compute strictly the product A * B + gemm_device_reference( + problem_size, + alpha, + tensor_a.device_ref(), + tensor_b.device_ref(), + 0, + tensor_c_bias.device_ref(), + tensor_ref_d.device_ref()); + + // Wait for kernels to finish + cudaDeviceSynchronize(); + + // Copy output data from CUTLASS and reference kernel to host for comparison + tensor_d.sync_host(); + tensor_ref_d.sync_host(); + + // Compute bias + relu in host code + for (int i = 0; i < problem_size.m(); ++i) { + for (int j = 0; j < problem_size.n(); ++j) { + tensor_ref_d.at({i, j}) = std::max( + ElementOutput(0), + ElementOutput(tensor_ref_d.at({i, j}) + beta * tensor_c_bias.at({i, 0})) + ); + } + } + + // Check if output from CUTLASS kernel and reference kernel are equal or not + std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(), + tensor_ref_d.host_view()) + ? "Passed" + : "Failed") + << std::endl; + + CUTLASS_CHECK(status); + return 0; +} + +int main() { + // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. 
+ // + // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + else { + return run(); + } +} + diff --git a/examples/13_fused_two_gemms/CMakeLists.txt b/examples/13_fused_two_gemms/CMakeLists.txt new file mode 100644 index 000000000..ba51537ca --- /dev/null +++ b/examples/13_fused_two_gemms/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + 13_fused_two_gemms + fused_gemm.cu + ) + +target_include_directories( + 13_fused_two_gemms + PRIVATE + . + ) + diff --git a/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h b/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h new file mode 100644 index 000000000..10a0d4bf9 --- /dev/null +++ b/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h @@ -0,0 +1,190 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "device/b2b_gemm.h" +#include "b2b_gemm_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +void run_nonfused_gemm_f16() { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); + cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); + ElementCompute alpha0 = ElementCompute(2); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(2); + ElementCompute beta1 = ElementCompute(1); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>; 
+ using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Gemm0 = cutlass::gemm::device::Gemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2 + >; + using Gemm1 = cutlass::gemm::device::Gemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2 + >; + + B2bNonFusedGemmRun nonFusedGemm; + + std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n"; + bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; +} + +void run_fused_gemm_f16() { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); + cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); + ElementCompute alpha0 = ElementCompute(2); + ElementCompute beta0 = ElementCompute(0); + 
ElementCompute alpha1 = ElementCompute(2); + ElementCompute beta1 = ElementCompute(1); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bGemm = cutlass::gemm::device::B2bGemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2 + >; + + B2bFusedGemmRun fusedGemm; + + std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n"; + bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + if(passed) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} +//////////////////////////////////////////////////////////////////////////////// + +#endif //#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) diff --git a/examples/13_fused_two_gemms/b2b_gemm_run.h b/examples/13_fused_two_gemms/b2b_gemm_run.h new file mode 100644 index 000000000..053064d75 --- /dev/null +++ b/examples/13_fused_two_gemms/b2b_gemm_run.h @@ -0,0 +1,608 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include +#include +#include + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/tensor_relu.h" + +#include "helper.h" + +#define CHECK_GT(val1, val2) \ + if((val1) <= (val2)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n"; +#define CHECK_TRUE(val) \ + if(!(val)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n"; + +//////////////////////////////////////////////////////////////////////////////// + +template +struct B2bNonFusedGemmRun +{ + + using Gemm0 = Gemm0_; + using Gemm1 = Gemm1_; + using ElementAccumulator = typename Gemm0::ElementAccumulator; + using ElementCompute = typename Gemm0::GemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + // + // Methods + // + + B2bNonFusedGemmRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, 
seed, 2, -2, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + // TODO: Implement the rest + std::cerr << "Not implemented\n"; + return false; + } + + return true; + } + + + + + /// Executes one test + bool run( + cutlass::gemm::GemmCoord problem_size_0, + cutlass::gemm::GemmCoord problem_size_1, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true) { + + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor< + typename Gemm0::ElementA, + typename Gemm0::LayoutA> tensor_A0(problem_size_0.mk()); + + cutlass::HostTensor< + typename Gemm0::ElementB, + typename Gemm0::LayoutB> tensor_B0(problem_size_0.kn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> tensor_C0(problem_size_0.mn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> tensor_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> reference_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementB, + typename Gemm1::LayoutB> tensor_B1(problem_size_1.kn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> tensor_C1(problem_size_1.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> tensor_D1(problem_size_1.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> reference_D1(problem_size_1.mn()); + + + CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, 
seed + 2019)); + CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018)); + CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017)); + CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016)); + CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015)); + + cutlass::reference::host::TensorFill( + tensor_D0.host_view()); + cutlass::reference::host::TensorFill( + tensor_D1.host_view()); + cutlass::reference::host::TensorFill( + reference_D0.host_view()); + cutlass::reference::host::TensorFill( + reference_D1.host_view()); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_C0.sync_device(); + tensor_D0.sync_device(); + tensor_B1.sync_device(); + tensor_C1.sync_device(); + tensor_D1.sync_device(); + reference_D0.sync_device(); + reference_D1.sync_device(); + + // + // Initialize the GEMM operator + // + + typename Gemm0::Arguments arguments_0{ + problem_size_0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_D0.device_ref(), + {alpha0, beta0} + }; + + typename Gemm1::Arguments arguments_1{ + problem_size_1, + tensor_D0.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1.device_ref(), + {alpha1, beta1} + }; + + + Gemm0 gemm_op_0; + Gemm1 gemm_op_1; + + cutlass::Status status = gemm_op_0.initialize(arguments_0); + + CUTLASS_CHECK(status); + + status = gemm_op_1.initialize(arguments_1); + + CUTLASS_CHECK(status); + // + // Run the GEMM + // + + cudaEvent_t start, stop1, stop2; + cudaEventCreate(&start); + cudaEventCreate(&stop1); + cudaEventCreate(&stop2); + + cudaEventRecord(start); + + for(int i = 0; i < 100; i++) { + status = gemm_op_0(); + + CUTLASS_CHECK(status); + } + cudaEventRecord(stop1); + for(int i = 0; i < 100; i++) { + + status = gemm_op_1(); + + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop2); + cudaDeviceSynchronize(); + float gemm0Time, gemm1Time, totalTime; + cudaEventElapsedTime(&gemm0Time, start, 
stop1); + cudaEventElapsedTime(&gemm1Time, stop1, stop2); + cudaEventElapsedTime(&totalTime, start, stop2); + std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n"; + std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n"; + std::cout << "total time " << totalTime / 100.0 << " ms\n"; + + tensor_D0.sync_host(); + tensor_D1.sync_host(); + + // + // Verify + // + cutlass::reference::device::Gemm< + typename Gemm0::ElementA, typename Gemm0::LayoutA, + typename Gemm0::ElementB, typename Gemm0::LayoutB, + typename Gemm0::ElementC, typename Gemm0::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm0::Operator> + reference_gemm_0; + + cutlass::reference::device::Gemm< + typename Gemm1::ElementA, typename Gemm1::LayoutA, + typename Gemm1::ElementB, typename Gemm1::LayoutB, + typename Gemm1::ElementC, typename Gemm1::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm1::Operator> + reference_gemm_1; + + reference_gemm_0( + problem_size_0, + alpha0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + beta0, + tensor_C0.device_ref(), + reference_D0.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D0.device_view()); + } + + reference_gemm_1( + problem_size_1, + alpha1, + reference_D0.device_ref(), + tensor_B1.device_ref(), + beta1, + tensor_C1.device_ref(), + reference_D1.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D1.device_view()); + } + + // Wait for kernels to finish + cudaDeviceSynchronize(); + reference_D0.sync_host(); + reference_D1.sync_host(); + + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + reference_D1.host_view(), + tensor_D1.host_view()); + + 
CHECK_TRUE(passed); + if (!passed) { + + std::stringstream fname; + + fname << "error_B2bGemm_device_nonfused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream file(fname.str()); + + file + << "A0 =\n" << tensor_A0.host_view() + << "\nB0 =\n" << tensor_B0.host_view() + << "\nC0 =\n" << tensor_C0.host_view() + << "\nD0 =\n" << tensor_D0.host_view() + << "\nB1 =\n" << tensor_B1.host_view() + << "\nC1 =\n" << tensor_C1.host_view() + << "\n\nReference =\n" << reference_D1.host_view() + << "\nComputed =\n" << tensor_D1.host_view(); + } + + return passed; + } +}; + +template +struct B2bFusedGemmRun +{ + + using B2bGemm = B2bGemm_; + using ElementAccumulator = typename B2bGemm::ElementAccumulator; + using ElementCompute = typename B2bGemm::B2bGemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + // + // Methods + // + + B2bFusedGemmRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, 2, -2, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + 
cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + // TODO: Implement the rest + std::cerr << "Not implemented\n"; + return false; + } + + return true; + } + + + + + /// Executes one test + bool run( + cutlass::gemm::GemmCoord problem_size_0, + cutlass::gemm::GemmCoord problem_size_1, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true) { + + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor< + typename B2bGemm::ElementA, + typename B2bGemm::LayoutA> tensor_A0(problem_size_0.mk()); + + cutlass::HostTensor< + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B0(problem_size_0.kn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn()); + +// cutlass::HostTensor< +// typename B2bGemm::ElementC, +// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B1(problem_size_1.kn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_C1(problem_size_1.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_D1(problem_size_1.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> reference_D1(problem_size_1.mn()); + + + CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019)); + CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018)); + CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017)); + CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016)); + CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), 
init_C, seed + 2015)); + + cutlass::reference::host::TensorFill( + tensor_D1.host_view()); + cutlass::reference::host::TensorFill( + reference_D0.host_view()); + cutlass::reference::host::TensorFill( + reference_D1.host_view()); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_C0.sync_device(); + tensor_B1.sync_device(); + tensor_C1.sync_device(); + tensor_D1.sync_device(); + reference_D0.sync_device(); + reference_D1.sync_device(); + + // + // Initialize the GEMM operator + // + + typename B2bGemm::Arguments arguments{ + problem_size_0, + problem_size_1, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1.device_ref(), + {alpha0, beta0}, + {alpha1, beta1}, + }; + + B2bGemm b2b_gemm_op; + + cutlass::Status status = b2b_gemm_op.initialize(arguments); + + CUTLASS_CHECK(status); + + // + // Run the GEMM + // + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + for(int i = 0; i < 100; i++) { + status = b2b_gemm_op(); + + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop); + cudaDeviceSynchronize(); + float gemmTime; + cudaEventElapsedTime(&gemmTime, start, stop); + std::cout << "time " << gemmTime / 100.0 << " ms\n"; + + //tensor_D0.sync_host(); + tensor_D1.sync_host(); + + // + // Verify + // + cutlass::reference::device::Gemm< + typename B2bGemm::ElementA, typename B2bGemm::LayoutA, + typename B2bGemm::ElementB, typename B2bGemm::LayoutB, + typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute, + ElementAccumulator, typename B2bGemm::Operator> + reference_gemm_0, reference_gemm_1; + + reference_gemm_0( + problem_size_0, + alpha0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + beta0, + tensor_C0.device_ref(), + reference_D0.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D0.device_view()); + } + + reference_gemm_1( + problem_size_1, + alpha1, + 
reference_D0.device_ref(), + tensor_B1.device_ref(), + beta1, + tensor_C1.device_ref(), + reference_D1.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D1.device_view()); + } + + cudaDeviceSynchronize(); + reference_D0.sync_host(); + reference_D1.sync_host(); + + + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + reference_D1.host_view(), + tensor_D1.host_view()); + + CHECK_TRUE(passed); + if (!passed) { + + std::stringstream fname; + + fname << "error_B2bGemm_device_fused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream file(fname.str()); + + file + << "A0 =\n" << tensor_A0.host_view() + << "\nB0 =\n" << tensor_B0.host_view() + << "\nC0 =\n" << tensor_C0.host_view() +// << "\nD0 =\n" << tensor_D0.host_view() + << "\nB1 =\n" << tensor_B1.host_view() + << "\nC1 =\n" << tensor_C1.host_view() + << "\n\nReference =\n" << reference_D1.host_view() + << "\nComputed =\n" << tensor_D1.host_view(); + } + + return passed; + } + +}; + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h b/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h new file mode 100644 index 000000000..1c3f15c2c --- /dev/null +++ b/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h @@ -0,0 +1,190 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "device/b2b_gemm.h" +#include "b2b_interleaved_gemm_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +void run_nonfused_gemm_s8() { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); + cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); + ElementCompute alpha0 = ElementCompute(2); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(2); + ElementCompute beta1 = ElementCompute(1); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + + using Gemm0 = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2 + >; + using Gemm1 = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2 + >; + + B2bInterleavedNonFusedGemmRun nonFusedGemm; + + std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n"; + bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; +} + +void run_fused_gemm_s8() { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); + cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); + ElementCompute alpha0 = ElementCompute(2); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(2); + ElementCompute beta1 = ElementCompute(1); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 64>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 64 
/ cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bGemm = cutlass::gemm::device::B2bGemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2 + >; + + B2bInterleavedFusedGemmRun fusedGemm; + + std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n"; + bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + if(passed) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) diff --git a/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h b/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h new file mode 100644 index 000000000..906cabb40 --- /dev/null +++ b/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h @@ -0,0 +1,633 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include +#include +#include + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/host_reorder.h" +#include "cutlass/util/reference/device/gemm.h" +#include "helper.h" + +#define CHECK_GT(val1, val2) \ + if((val1) <= (val2)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n"; +#define CHECK_TRUE(val) \ + if(!(val)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n"; + +template +struct B2bInterleavedNonFusedGemmRun +{ + + using Gemm0 = Gemm0_; + using Gemm1 = Gemm1_; + using ElementAccumulator = typename Gemm0::ElementAccumulator; + using 
ElementCompute = typename Gemm0::GemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + // + // Methods + // + + B2bInterleavedNonFusedGemmRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, 2, -2, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + // TODO: Implement the rest + std::cerr << "Not implemented\n"; + return false; + } + + return true; + } + + + + + /// Executes one test + bool run( + cutlass::gemm::GemmCoord problem_size_0, + cutlass::gemm::GemmCoord problem_size_1, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true) { + + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor< + typename Gemm0::ElementA, + typename Gemm0::LayoutA> tensor_A0(problem_size_0.mk()); + + cutlass::HostTensor< + typename Gemm0::ElementB, + typename Gemm0::LayoutB> tensor_B0(problem_size_0.kn()); + + cutlass::HostTensor< + typename Gemm0::ElementB, + typename Gemm0::LayoutB> 
tensor_B0_reordered(problem_size_0.kn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> tensor_C0(problem_size_0.mn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> tensor_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> reference_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementB, + typename Gemm1::LayoutB> tensor_B1(problem_size_1.kn()); + + cutlass::HostTensor< + typename Gemm1::ElementB, + typename Gemm1::LayoutB> tensor_B1_reordered(problem_size_1.kn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> tensor_C1(problem_size_1.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> tensor_D1(problem_size_1.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> reference_D1(problem_size_1.mn()); + + + CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019)); + CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018)); + CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017)); + CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016)); + CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015)); + + //Reorder B0 and B1 + cutlass::reorder_column( + tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), problem_size_0); + cutlass::reorder_column( + tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), problem_size_1); + + cutlass::reference::host::TensorFill( + tensor_D0.host_view()); + cutlass::reference::host::TensorFill( + tensor_D1.host_view()); + cutlass::reference::host::TensorFill( + reference_D0.host_view()); + cutlass::reference::host::TensorFill( + reference_D1.host_view()); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_B0_reordered.sync_device(); + tensor_C0.sync_device(); + tensor_D0.sync_device(); + 
tensor_B1.sync_device(); + tensor_B1_reordered.sync_device(); + tensor_C1.sync_device(); + tensor_D1.sync_device(); + reference_D0.sync_device(); + reference_D1.sync_device(); + + // + // Initialize the GEMM operator + // + + typename Gemm0::Arguments arguments_0{ + problem_size_0, + tensor_A0.device_ref(), + tensor_B0_reordered.device_ref(), + tensor_C0.device_ref(), + tensor_D0.device_ref(), + {alpha0, beta0} + }; + + typename Gemm1::Arguments arguments_1{ + problem_size_1, + tensor_D0.device_ref(), + tensor_B1_reordered.device_ref(), + tensor_C1.device_ref(), + tensor_D1.device_ref(), + {alpha1, beta1} + }; + + + Gemm0 gemm_op_0; + Gemm1 gemm_op_1; + + cutlass::Status status = gemm_op_0.initialize(arguments_0); + + CUTLASS_CHECK(status); + + status = gemm_op_1.initialize(arguments_1); + + CUTLASS_CHECK(status); + // + // Run the GEMM + // + cudaEvent_t start, stop1, stop2; + cudaEventCreate(&start); + cudaEventCreate(&stop1); + cudaEventCreate(&stop2); + + cudaEventRecord(start); + + for(int i = 0; i < 100; i++) { + status = gemm_op_0(); + + CUTLASS_CHECK(status); + } + cudaEventRecord(stop1); + + for(int i = 0; i < 100; i++) { + status = gemm_op_1(); + + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop2); + cudaDeviceSynchronize(); + float gemm0Time, gemm1Time, totalTime; + cudaEventElapsedTime(&gemm0Time, start, stop1); + cudaEventElapsedTime(&gemm1Time, stop1, stop2); + cudaEventElapsedTime(&totalTime, start, stop2); + std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n"; + std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n"; + std::cout << "total time " << totalTime / 100.0 << " ms\n"; + + tensor_D0.sync_host(); + tensor_D1.sync_host(); + + // + // Verify + // + cutlass::reference::device::Gemm< + typename Gemm0::ElementA, typename Gemm0::LayoutA, + typename Gemm0::ElementB, typename Gemm0::LayoutB, + typename Gemm0::ElementC, typename Gemm0::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm0::Operator> + reference_gemm_0; + + 
cutlass::reference::device::Gemm< + typename Gemm1::ElementA, typename Gemm1::LayoutA, + typename Gemm1::ElementB, typename Gemm1::LayoutB, + typename Gemm1::ElementC, typename Gemm1::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm1::Operator> + reference_gemm_1; + + reference_gemm_0( + problem_size_0, + alpha0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + beta0, + tensor_C0.device_ref(), + reference_D0.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D0.device_view()); + } + + reference_gemm_1( + problem_size_1, + alpha1, + tensor_D0.device_ref(), + tensor_B1.device_ref(), + beta1, + tensor_C1.device_ref(), + reference_D1.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D1.device_view()); + } + + cudaDeviceSynchronize(); + reference_D0.sync_host(); + reference_D1.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + reference_D1.host_view(), + tensor_D1.host_view()); + + CHECK_TRUE(passed); + if (!passed) { + + std::stringstream fname; + + fname << "error_B2bGemm_device_interleaved_nonfused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream file(fname.str()); + + file + << "A0 =\n" << tensor_A0.host_view() + << "\nB0 =\n" << tensor_B0.host_view() + << "\nB0_reordered =\n" << tensor_B0_reordered.host_view() + << "\nC0 =\n" << tensor_C0.host_view() + << "\nD0 =\n" << tensor_D0.host_view() + << "\nB1 =\n" << tensor_B1.host_view() + << "\nB1_reordered =\n" << tensor_B1_reordered.host_view() + << "\nC1 =\n" << tensor_C1.host_view() + << "\n\nReference =\n" << reference_D1.host_view() + << "\nComputed =\n" << 
tensor_D1.host_view(); + } + + return passed; + } +}; + +template +struct B2bInterleavedFusedGemmRun +{ + + using B2bGemm = B2bGemm_; + using ElementAccumulator = typename B2bGemm::ElementAccumulator; + using ElementCompute = typename B2bGemm::B2bGemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + // + // Methods + // + + B2bInterleavedFusedGemmRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, 2, -2, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + // TODO: Implement the rest + std::cerr << "Not implemented\n"; + return false; + } + + return true; + } + + + + + /// Executes one test + bool run( + cutlass::gemm::GemmCoord problem_size_0, + cutlass::gemm::GemmCoord problem_size_1, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true) { + + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor< + typename B2bGemm::ElementA, + typename B2bGemm::LayoutA> 
tensor_A0(problem_size_0.mk()); + + cutlass::HostTensor< + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B0(problem_size_0.kn()); + + cutlass::HostTensor< + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B0_reordered(problem_size_0.kn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn()); + +// cutlass::HostTensor< +// typename B2bGemm::ElementC, +// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B1(problem_size_1.kn()); + + cutlass::HostTensor< + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B1_reordered(problem_size_1.kn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_C1(problem_size_1.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_D1(problem_size_1.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> reference_D1(problem_size_1.mn()); + + + CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019)); + CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018)); + CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017)); + CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016)); + CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015)); + + //Reorder B0 + cutlass::reorder_column( + tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), problem_size_0); + cutlass::reorder_column( + tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), problem_size_1); + + cutlass::reference::host::TensorFill( + tensor_D1.host_view()); + cutlass::reference::host::TensorFill( + reference_D0.host_view()); + 
cutlass::reference::host::TensorFill( + reference_D1.host_view()); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_B0_reordered.sync_device(); + tensor_C0.sync_device(); + //tensor_D0.sync_device(); + tensor_B1.sync_device(); + tensor_B1_reordered.sync_device(); + tensor_C1.sync_device(); + tensor_D1.sync_device(); + reference_D0.sync_device(); + reference_D1.sync_device(); + + // + // Initialize the GEMM operator + // + + typename B2bGemm::Arguments arguments{ + problem_size_0, + problem_size_1, + tensor_A0.device_ref(), + tensor_B0_reordered.device_ref(), + tensor_C0.device_ref(), + tensor_B1_reordered.device_ref(), + tensor_C1.device_ref(), + tensor_D1.device_ref(), + {alpha0, beta0}, + {alpha1, beta1}, + 1, /*threadblock_swizzle_k_tile*/ + }; + + B2bGemm b2b_gemm_op; + + cutlass::Status status = b2b_gemm_op.initialize(arguments); + + CUTLASS_CHECK(status); + + // + // Run the GEMM + // + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + for(int i = 0; i < 100; i++) { + status = b2b_gemm_op(); + + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop); + cudaDeviceSynchronize(); + float gemmTime; + cudaEventElapsedTime(&gemmTime, start, stop); + std::cout << "time " << gemmTime / 100.0 << " ms\n"; + + //tensor_D0.sync_host(); + tensor_D1.sync_host(); + + // + // Verify + // + cutlass::reference::device::Gemm< + typename B2bGemm::ElementA, typename B2bGemm::LayoutA, + typename B2bGemm::ElementB, typename B2bGemm::LayoutB, + typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute, + ElementAccumulator, typename B2bGemm::Operator> + reference_gemm_0, reference_gemm_1; + + reference_gemm_0( + problem_size_0, + alpha0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + beta0, + tensor_C0.device_ref(), + reference_D0.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D0.device_view()); + } + + reference_gemm_1( + problem_size_1, + alpha1, + 
reference_D0.device_ref(), + tensor_B1.device_ref(), + beta1, + tensor_C1.device_ref(), + reference_D1.device_ref() + ); + + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D1.device_view()); + } + + cudaDeviceSynchronize(); + reference_D0.sync_host(); + reference_D1.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + reference_D1.host_view(), + tensor_D1.host_view()); + + CHECK_TRUE(passed); + if (!passed) { + + std::stringstream fname; + + fname << "error_B2bGemm_device_interleaved_fused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream file(fname.str()); + + file + << "A0 =\n" << tensor_A0.host_view() + << "\nB0 =\n" << tensor_B0.host_view() + << "\nB0_reordered =\n" << tensor_B0_reordered.host_view() + << "\nC0 =\n" << tensor_C0.host_view() +// << "\nD0 =\n" << tensor_D0.host_view() + << "\nB1 =\n" << tensor_B1.host_view() + << "\nB1_reordered =\n" << tensor_B1_reordered.host_view() + << "\nC1 =\n" << tensor_C1.host_view() + << "\n\nReference =\n" << reference_D1.host_view() + << "\nComputed =\n" << tensor_D1.host_view(); + } + + return passed; + } + +}; + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/device/b2b_gemm.h b/examples/13_fused_two_gemms/device/b2b_gemm.h new file mode 100644 index 000000000..3f161435d --- /dev/null +++ b/examples/13_fused_two_gemms/device/b2b_gemm.h @@ -0,0 +1,439 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/arch.h" +#include "cutlass/device_kernel.h" + +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" + +#include "kernel/b2b_gemm.h" +#include "kernel/default_b2b_gemm.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator_ = ElementC_, + /// Operator class tag + typename OperatorClass_ = arch::OpClassSimt, + /// Tag indicating architecture to tune for + typename ArchTag_ = arch::Sm70, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::ThreadblockShape, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::WarpShape, + /// Warp-level tile size (concept: GemmShape) + 
typename WarpShape1_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp0_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::EpilogueOutputOp, + /// Epilogue output operator + typename EpilogueOutputOp1_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>, + /// Number of stages used in the pipelined mainloop + int Stages = + DefaultGemmConfiguration::kStages, + /// Access granularity of A matrix in units of elements + int AlignmentA = + DefaultGemmConfiguration::kAlignmentA, + /// Access granularity of B matrix in units of elements + int AlignmentB = + DefaultGemmConfiguration::kAlignmentB, + /// If true, kernel supports split-K with serial reduction + bool SplitKSerial = false, + /// Operation performed by GEMM + typename Operator_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::Operator, + /// Whether Beta is zero or not + bool IsBetaZero = false> +class B2bGemm { + public: + + using ElementA = ElementA_; + using LayoutA = LayoutA_; + using TensorRefA = TensorRef; + using ElementB = ElementB_; + using LayoutB = LayoutB_; + using TensorRefB = TensorRef; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using TensorRefC = TensorRef; + using TensorRefD = TensorRef; + using ElementAccumulator = ElementAccumulator_; + using OperatorClass = 
OperatorClass_; + using ArchTag = ArchTag_; + using ThreadblockShape0 = ThreadblockShape0_; + using ThreadblockShape1 = ThreadblockShape1_; + using WarpShape0 = WarpShape0_; + using WarpShape1 = WarpShape1_; + using InstructionShape = InstructionShape_; + using EpilogueOutputOp0 = EpilogueOutputOp0_; + using EpilogueOutputOp1 = EpilogueOutputOp1_; + using ThreadblockSwizzle = ThreadblockSwizzle_; + using Operator = Operator_; + static int const kStages = Stages; + static int const kAlignmentA = AlignmentA; + static int const kAlignmentB = AlignmentB; + static int const kAlignmentC = EpilogueOutputOp1::kCount; + static bool const kSplitKSerial = SplitKSerial; + static bool const kIsBetaZero = IsBetaZero; + static ComplexTransform const kTransformA = ComplexTransform::kNone; + static ComplexTransform const kTransformB = ComplexTransform::kNone; + + /// Define the kernel + using B2bGemmKernel = typename kernel::DefaultB2bGemm< + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + kStages, + kSplitKSerial, + Operator, + kIsBetaZero + >::B2bGemmKernel; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + GemmCoord problem_size_0; + GemmCoord problem_size_1; + TensorRef ref_A0; + TensorRef ref_B0; + TensorRef ref_C0; + TensorRef ref_B1; + TensorRef ref_C1; + TensorRef ref_D1; + typename EpilogueOutputOp0::Params epilogue0; + typename EpilogueOutputOp1::Params epilogue1; + int split_k_slices; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments(): problem_size_0(0, 0, 0), problem_size_1(0, 0, 0), split_k_slices(1) { + + } + + /// Constructs an Arguments structure + CUTLASS_HOST_DEVICE + Arguments( + GemmCoord problem_size_0_, + GemmCoord problem_size_1_, + TensorRef 
ref_A0_, + TensorRef ref_B0_, + TensorRef ref_C0_, + TensorRef ref_B1_, + TensorRef ref_C1_, + TensorRef ref_D1_, + typename EpilogueOutputOp0::Params epilogue0_ = + typename EpilogueOutputOp0::Params(), + typename EpilogueOutputOp1::Params epilogue1_ = + typename EpilogueOutputOp1::Params(), + int split_k_slices_ = 1 + ): + problem_size_0(problem_size_0_), + problem_size_1(problem_size_1_), + ref_A0(ref_A0_), + ref_B0(ref_B0_), + ref_C0(ref_C0_), + ref_B1(ref_B1_), + ref_C1(ref_C1_), + ref_D1(ref_D1_), + epilogue0(epilogue0_), + epilogue1(epilogue1_), + split_k_slices(split_k_slices_) { + + } + }; + +private: + + /// Kernel parameters object + typename B2bGemmKernel::Params params_; + +public: + + /// Constructs the GEMM. + B2bGemm() { } + + /// Determines whether the GEMM can execute the given problem. + static Status can_implement(Arguments const &args) { + + if (!kSplitKSerial && args.split_k_slices > 1) { + return Status::kErrorInvalidProblem; + } + + Status status = B2bGemmKernel::can_implement( + args.problem_size_0, + args.problem_size_1, + args.ref_A0.non_const_ref(), + args.ref_B0.non_const_ref(), + args.ref_C0.non_const_ref(), + args.ref_B1.non_const_ref(), + args.ref_C1.non_const_ref(), + args.ref_D1 + ); + + if (status != Status::kSuccess) { + return status; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + + size_t bytes = 0; + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size_0, + {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK}, + args.split_k_slices); + + if (kSplitKSerial && args.split_k_slices > 1) { + + + bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n()); + } + + return bytes; + } + + /// Initializes GEMM state from arguments. 
+ Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) { + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size_0, + {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK}, + args.split_k_slices); +// cutlass::gemm::GemmCoord grid_shape_1 = threadblock_swizzle.get_tiled_shape( +// args.problem_size_1, +// {ThreadblockShape1::kM, ThreadblockShape1::kN, ThreadblockShape1::kK}, +// args.split_k_slices); + + if (kSplitKSerial) { + if (args.split_k_slices > 1) { + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + + size_t bytes = get_workspace_size(args); + + cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + } + else { + + if (args.split_k_slices > 1) { + return Status::kErrorInvalidProblem; + } + } + + // Initialize the Params structure + params_ = typename B2bGemmKernel::Params{ + args.problem_size_0, + args.problem_size_1, + grid_shape, + args.ref_A0.non_const_ref(), + args.ref_B0.non_const_ref(), + args.ref_C0.non_const_ref(), + args.ref_B1.non_const_ref(), + args.ref_C1.non_const_ref(), + args.ref_D1, + args.epilogue0, + args.epilogue1, + static_cast(workspace), + }; + + return Status::kSuccess; + } + + /// Lightweight update given a subset of arguments + Status update(Arguments const &args, void *workspace = nullptr) { + + if (kSplitKSerial && args.split_k_slices > 1) { + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + } + + params_.ref_A0.reset(args.ref_A.non_const_ref().data()); + params_.ref_B0.reset(args.ref_B.non_const_ref().data()); + params_.ref_C0.reset(args.ref_C.non_const_ref().data()); + params_.ref_B1.reset(args.ref_B.non_const_ref().data()); + params_.ref_C1.reset(args.ref_C.non_const_ref().data()); + params_.ref_D1.reset(args.ref_D.data()); + params_.output_op_0 
= args.epilogue0; + params_.output_op_1 = args.epilogue1; + params_.semaphore = static_cast(workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(B2bGemmKernel::kThreadCount, 1, 1); + + cudaError_t result; + + int smem_size = int(sizeof(typename B2bGemmKernel::SharedStorage)); + if (smem_size >= (48 << 10)) { + result = cudaFuncSetAttribute(Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + result = cudaFuncSetAttribute( + Kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + + cutlass::Kernel<<>>(params_); + + result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +} // namespace device +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/fused_gemm.cu b/examples/13_fused_two_gemms/fused_gemm.cu new file mode 100644 index 000000000..8f5d4f2cc --- /dev/null +++ b/examples/13_fused_two_gemms/fused_gemm.cu @@ -0,0 +1,74 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 *
 **************************************************************************************************/

/**
  Example 13: runs non-fused and fused back-to-back GEMM pairs (f16 and s8
  variants) using Turing Tensor Core operations and reports their timings.
  Requires a device of compute capability 7.5 or higher.
*/

#include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h"
#include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h"

// Queries device 0 and, when it is at least SM75, runs the four example GEMM
// drivers. Returns 0 on success or no-op skip, -1 on a CUDA API error.
int run() {

  cudaDeviceProp props;

  cudaError_t error = cudaGetDeviceProperties(&props, 0);
  if (error != cudaSuccess) {
    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
    return -1;
  }

  if (!(props.major * 10 + props.minor >= 75)) {
    std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
              << std::endl;

    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }

#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
  run_nonfused_gemm_f16();
  run_fused_gemm_f16();
  run_nonfused_gemm_s8();
  run_fused_gemm_s8();
#endif

  return 0;
}

int main() {

  // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
  //
  // CUTLASS must be compiled with the CUDA 10.2 Toolkit (or later) to run these examples,
  // matching the version check below.
  if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
    std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;

    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }
  else {
    return run();
  }
}

diff --git a/examples/13_fused_two_gemms/kernel/b2b_gemm.h b/examples/13_fused_two_gemms/kernel/b2b_gemm.h
new file mode 100644
index 000000000..d106fa46a
--- /dev/null
+++ b/examples/13_fused_two_gemms/kernel/b2b_gemm.h
@@ -0,0 +1,407 @@
/***************************************************************************************************
 * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K. 
*/

#pragma once

#include "cutlass/cutlass.h"

#include "cutlass/gemm/gemm.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/semaphore.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace gemm {
namespace kernel {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Kernel-level back-to-back fused GEMM.
///
/// Two dependent GEMMs execute in one kernel: the accumulator of GEMM0 (A0 x B0),
/// transformed by OutputOp0, feeds GEMM1 as its A operand, and the epilogue writes D1.
/// When kSplitKSerial is enabled, partial results along K are reduced serially across
/// threadblocks, ordered by a per-output-tile semaphore.
template <
  typename B2bMma_,               ///! Threadblock-scoped fused matrix multiply-accumulate
  typename Epilogue_,             ///! Epilogue applied after the second GEMM
  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
  bool SplitKSerial               ///! If true, code supporting split-K via serial reduction is enabled.
>
struct B2bGemm {

  using B2bMma = B2bMma_;
  using Epilogue = Epilogue_;
  using OutputOp0 = typename B2bMma::OutputOp;
  using OutputOp1 = typename Epilogue::OutputOp;
  using ThreadblockSwizzle = ThreadblockSwizzle_;
  static bool const kSplitKSerial = SplitKSerial;

  /// Warp count (concept: GemmShape)
  using WarpCount0 = typename B2bMma::WarpCount0;
  static int const kThreadCount = 32 * WarpCount0::kCount;

  /// Parameters structure passed from host code to the kernel
  struct Params {
    cutlass::gemm::GemmCoord problem_size_0;    // GEMM0 extents
    cutlass::gemm::GemmCoord problem_size_1;    // GEMM1 extents
    cutlass::gemm::GemmCoord grid_tiled_shape;  // grid shape in units of threadblock tiles
    typename B2bMma::IteratorA0::Params params_A0;
    typename B2bMma::IteratorA0::TensorRef ref_A0;
    typename B2bMma::IteratorB0::Params params_B0;
    typename B2bMma::IteratorB0::TensorRef ref_B0;
    typename Epilogue::OutputTileIterator::Params params_C0;
    typename Epilogue::OutputTileIterator::TensorRef ref_C0;
    typename B2bMma::IteratorB1::Params params_B1;
    typename B2bMma::IteratorB1::TensorRef ref_B1;
    typename Epilogue::OutputTileIterator::Params params_C1;
    typename Epilogue::OutputTileIterator::TensorRef ref_C1;
    typename Epilogue::OutputTileIterator::Params params_D1;
    typename Epilogue::OutputTileIterator::TensorRef ref_D1;
    typename OutputOp0::Params output_op_0;
    typename OutputOp1::Params output_op_1;
    int *semaphore;            // one lock per output tile; used only for serial split-K
    int gemm_k_iterations_0;   // mainloop iterations per K-split of GEMM0
    int gemm_k_size_0;         // K extent (elements) covered by one K-split of GEMM0
    int gemm_k_iterations_1;   // mainloop iterations per K-split of GEMM1
    int gemm_k_size_1;         // K extent (elements) covered by one K-split of GEMM1

    //
    // Methods
    //

    CUTLASS_HOST_DEVICE
    Params():
      semaphore(nullptr),
      gemm_k_iterations_0(0), gemm_k_size_0(0),
      gemm_k_iterations_1(0), gemm_k_size_1(0) { }

    CUTLASS_HOST_DEVICE
    Params(
      cutlass::gemm::GemmCoord const & problem_size_0,
      cutlass::gemm::GemmCoord const & problem_size_1,
      cutlass::gemm::GemmCoord const & grid_tiled_shape,
      typename B2bMma::IteratorA0::TensorRef ref_A0,
      typename B2bMma::IteratorB0::TensorRef ref_B0,
      typename Epilogue::OutputTileIterator::TensorRef ref_C0,
      typename B2bMma::IteratorB1::TensorRef ref_B1,
      typename Epilogue::OutputTileIterator::TensorRef ref_C1,
      typename Epilogue::OutputTileIterator::TensorRef ref_D1,
      typename OutputOp0::Params output_op_0 = typename OutputOp0::Params(),
      typename OutputOp1::Params output_op_1 = typename OutputOp1::Params(),
      int *workspace = nullptr
    ):
      problem_size_0(problem_size_0),
      problem_size_1(problem_size_1),
      grid_tiled_shape(grid_tiled_shape),
      params_A0(ref_A0.layout()),
      ref_A0(ref_A0),
      params_B0(ref_B0.layout()),
      ref_B0(ref_B0),
      params_C0(ref_C0.layout()),
      ref_C0(ref_C0),
      params_B1(ref_B1.layout()),
      ref_B1(ref_B1),
      params_C1(ref_C1.layout()),
      ref_C1(ref_C1),
      params_D1(ref_D1.layout()),
      ref_D1(ref_D1),
      output_op_0(output_op_0),
      output_op_1(output_op_1) {

      // Fix: the original declared local variables `int gemm_k_iterations_0` and
      // `int gemm_k_iterations_1` here, shadowing the members of the same name and
      // leaving the members unset. Assign the members directly.
      int total_gemm_k_iterations_0 =
          (problem_size_0.k() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK;
      gemm_k_iterations_0 =
          (total_gemm_k_iterations_0 + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
      gemm_k_size_0 = gemm_k_iterations_0 * B2bMma::Shape0::kK;

      int total_gemm_k_iterations_1 =
          (problem_size_1.k() + B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK;
      gemm_k_iterations_1 =
          (total_gemm_k_iterations_1 + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
      gemm_k_size_1 = gemm_k_iterations_1 * B2bMma::Shape1::kK;

      semaphore = workspace;
    }
  };

  /// Shared memory storage structure. The mainloop and epilogue storage alias
  /// each other in a union, so the epilogue reuses the mainloop's shared memory.
  union SharedStorage {
    typename B2bMma::B2bMmaSharedStorage main_loop;
    typename Epilogue::SharedStorage epilogue;
  };

  //
  // Methods
  //

  CUTLASS_HOST_DEVICE
  B2bGemm() { }

  /// Determines whether the kernel can execute the given problem: every tensor
  /// reference and every problem extent must satisfy the iterators' vectorized
  /// access-width (alignment) requirements.
  static Status can_implement(
    cutlass::gemm::GemmCoord const & problem_size_0,
    cutlass::gemm::GemmCoord const & problem_size_1,
    typename B2bMma::IteratorA0::TensorRef ref_A0,
    typename B2bMma::IteratorB0::TensorRef ref_B0,
    typename Epilogue::OutputTileIterator::TensorRef ref_C0,
    typename B2bMma::IteratorB1::TensorRef ref_B1,
    typename Epilogue::OutputTileIterator::TensorRef ref_C1,
    typename Epilogue::OutputTileIterator::TensorRef ref_D1) {

    static int const kAlignmentA = B2bMma::IteratorA0::AccessType::kElements;
    static int const kAlignmentB = B2bMma::IteratorB0::AccessType::kElements;
    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;

    if (!TensorRef_aligned(ref_A0, kAlignmentA)) {
      return Status::kErrorMisalignedOperand;
    }

    if (!TensorRef_aligned(ref_B0, kAlignmentB)) {
      return Status::kErrorMisalignedOperand;
    }

    if (!TensorRef_aligned(ref_C0, kAlignmentC)) {
      return Status::kErrorMisalignedOperand;
    }

    if (!TensorRef_aligned(ref_B1, kAlignmentB)) {
      return Status::kErrorMisalignedOperand;
    }

    if (!TensorRef_aligned(ref_C1, kAlignmentC)) {
      return Status::kErrorMisalignedOperand;
    }

    if (!TensorRef_aligned(ref_D1, kAlignmentC)) {
      return Status::kErrorMisalignedOperand;
    }

    // Extents of both problems must be multiples of the access widths.
    if ((problem_size_0.m() % kAlignmentA) || (problem_size_0.k() % kAlignmentA) ||
        (problem_size_0.n() % kAlignmentB) || (problem_size_0.k() % kAlignmentB) ||
        (problem_size_0.m() % kAlignmentC) || (problem_size_0.n() % kAlignmentC) ||
        (problem_size_1.m() % kAlignmentA) || (problem_size_1.k() % kAlignmentA) ||
        (problem_size_1.n() % kAlignmentB) || (problem_size_1.k() % kAlignmentB) ||
        (problem_size_1.m() % kAlignmentC) || (problem_size_1.n() % kAlignmentC)) {

      return Status::kErrorMisalignedOperand;
    }

    return Status::kSuccess;
  }

  /// Executes one fused GEMM
  CUTLASS_DEVICE
  void operator()(Params const &params, SharedStorage &shared_storage) {

    // Compute threadblock location
    ThreadblockSwizzle threadblock_swizzle;

    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset();

    // Early exit if CTA is out of range
    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
        params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {

      return;
    }

    // Compute initial location in logical coordinates
    cutlass::MatrixCoord tb_offset_A0{
      threadblock_tile_offset.m() * B2bMma::Shape0::kM,
      threadblock_tile_offset.k() * params.gemm_k_size_0,
    };

    cutlass::MatrixCoord tb_offset_B0{
      threadblock_tile_offset.k() * params.gemm_k_size_0,
      threadblock_tile_offset.n() * B2bMma::Shape0::kN
    };

    cutlass::MatrixCoord tb_offset_B1{
      threadblock_tile_offset.k() * params.gemm_k_size_1,
      threadblock_tile_offset.n() * B2bMma::Shape1::kN
    };

    // GEMM0 K extent is a function of threadblock index in the K dimension
    int problem_size_k_0 = min(
      params.problem_size_0.k(),
      (threadblock_tile_offset.k() + 1) * params.gemm_k_size_0);

    // Number of mainloop iterations for GEMM0
    int gemm_k_iterations_0 =
      (problem_size_k_0 - tb_offset_A0.column() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK;

    // GEMM1 K extent is a function of threadblock index in the K dimension
    int problem_size_k_1 = min(
      params.problem_size_1.k(),
      (threadblock_tile_offset.k() + 1) * params.gemm_k_size_1);

    // Note: an earlier revision computed GEMM1's iteration count here:
    // int gemm_k_iterations_1 =
    //   (problem_size_k_1 - tb_offset_B1.row() + B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK;

    // Compute position within threadblock
    int thread_idx = threadIdx.x;

    // Construct iterators over global-memory tiles of the A0, B0, and B1 operands
    typename B2bMma::IteratorA0 iterator_A0(
      params.params_A0,
      params.ref_A0.data(),
      {params.problem_size_0.m(), problem_size_k_0},
      thread_idx,
      tb_offset_A0);

    typename B2bMma::IteratorB0 iterator_B0(
      params.params_B0,
      params.ref_B0.data(),
      {problem_size_k_0, params.problem_size_0.n()},
      thread_idx,
      tb_offset_B0);

    typename B2bMma::IteratorB1 iterator_B1(
      params.params_B1,
      params.ref_B1.data(),
      {problem_size_k_1, params.problem_size_1.n()},
      thread_idx,
      tb_offset_B1);

    // Broadcast the warp_id computed by lane 0 to ensure dependent code
    // is compiled as warp-uniform.
    // Fix: the member mask must name every lane that executes the shuffle.
    // The original passed 0x1f (lanes 0-4 only), which is undefined behavior
    // when all 32 lanes of the warp participate.
    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
    int lane_idx = threadIdx.x % 32;

    //
    // Main loop
    //

    OutputOp0 output_op_0(params.output_op_0);

    // Construct thread-scoped matrix multiply
    B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);

    typename B2bMma::FragmentC0 src_accum;
    typename B2bMma::FragmentC1 accumulators;

    src_accum.clear();
    accumulators.clear();

    if (!kSplitKSerial || gemm_k_iterations_0 > 0) {
      // Compute threadblock-scoped fused matrix multiply-add
      b2bMma(gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0,
             iterator_B1, src_accum, output_op_0);
    }

    //
    // Epilogue
    //

    OutputOp1 output_op_1(params.output_op_1);

    //
    // Masked tile iterators constructed from members
    //

    threadblock_tile_offset = threadblock_swizzle.get_tile_offset();

    // Assume identity swizzle
    MatrixCoord threadblock_offset(
      threadblock_tile_offset.m() * B2bMma::Shape1::kM,
      threadblock_tile_offset.n() * B2bMma::Shape1::kN
    );

    int block_idx = threadblock_tile_offset.m() +
                    threadblock_tile_offset.n() * params.grid_tiled_shape.m();

    // Construct the semaphore.
    Semaphore semaphore(params.semaphore + block_idx, thread_idx);

    // If performing a reduction via split-K, fetch the initial synchronization
    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {

      // Fetch the synchronization lock initially but do not block.
      semaphore.fetch();

      // Indicate which position in a serial reduction the output operator is currently updating
      output_op_1.set_k_partition(threadblock_tile_offset.k());
    }

    // Tile iterator loading from source tensor.
    typename Epilogue::OutputTileIterator iterator_C1(
      params.params_C1,
      params.ref_C1.data(),
      params.problem_size_1.mn(),
      thread_idx,
      threadblock_offset
    );

    // Tile iterator writing to destination tensor.
    typename Epilogue::OutputTileIterator iterator_D1(
      params.params_D1,
      params.ref_D1.data(),
      params.problem_size_1.mn(),
      thread_idx,
      threadblock_offset
    );

    Epilogue epilogue(
      shared_storage.epilogue,
      thread_idx,
      warp_idx,
      lane_idx);

    // Wait on the semaphore - this latency may have been covered by iterator construction
    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {

      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
      if (threadblock_tile_offset.k()) {
        iterator_C1 = iterator_D1;
      }

      semaphore.wait(threadblock_tile_offset.k());

      // Acquire-side fence: make the prior partition's global stores visible.
      __threadfence();
    }

    // Execute the epilogue operator to update the destination tensor.
    epilogue(output_op_1, iterator_D1, accumulators, iterator_C1);

    //
    // Release the semaphore
    //

    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {

      int lock = 0;
      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
        // The final threadblock resets the semaphore for subsequent grids.
        lock = 0;
      }
      else {
        // Otherwise, the semaphore is incremented
        lock = threadblock_tile_offset.k() + 1;
      }

      // Release-side fence: flush this partition's stores before publishing the lock.
      __threadfence();
      semaphore.release(lock);
    }
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace kernel
} // namespace gemm
} // namespace cutlass
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, + *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief + Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with + the appropriate threadblock-scoped epilogue. + + Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are + accommodated by exchanging A and B operands and assuming transposed layouts. Partial + specializations here choose 'device::GemmTransposed' to implement this functionality. 
*/

#pragma once

#include "cutlass/cutlass.h"

#include "cutlass/layout/matrix.h"
#include "cutlass/numeric_types.h"

#include "cutlass/epilogue/threadblock/epilogue.h"
#include "cutlass/epilogue/thread/linear_combination.h"

#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/kernel/gemm_pipelined.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"

#include "cutlass/transform/threadblock/predicated_tile_iterator.h"

#include "kernel/b2b_gemm.h"
#include "threadblock/default_b2b_mma.h"

////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace gemm {
namespace kernel {

////////////////////////////////////////////////////////////////////////////////

/// Selects the mainloop, epilogue, and kernel for a fused back-to-back GEMM
/// from the problem description. Only declared here; partial specializations
/// below provide the definitions.
template <
  /// Element type for A matrix operand
  typename ElementA_,
  /// Layout type for A matrix operand
  typename LayoutA_,
  /// Access granularity of A matrix in units of elements
  int kAlignmentA,
  /// Element type for B matrix operand
  typename ElementB_,
  /// Layout type for B matrix operand
  typename LayoutB_,
  /// Access granularity of B matrix in units of elements
  int kAlignmentB,
  /// Element type for C and D matrix operands
  typename ElementC_,
  /// Layout type for C and D matrix operands
  typename LayoutC_,
  /// Element type for internal accumulation
  typename ElementAccumulator,
  /// Operator class tag
  typename OperatorClass,
  /// Tag indicating architecture to tune for
  typename ArchTag,
  /// Threadblock-level tile size of the first GEMM (concept: GemmShape)
  typename ThreadblockShape0,
  /// Threadblock-level tile size of the second GEMM (concept: GemmShape)
  typename ThreadblockShape1,
  /// Warp-level tile size of the first GEMM (concept: GemmShape)
  typename WarpShape0,
  /// Warp-level tile size of the second GEMM (concept: GemmShape)
  typename WarpShape1,
  /// Instruction-level tile size (concept: GemmShape)
  typename InstructionShape,
  /// Epilogue output operator applied between the two GEMMs
  typename EpilogueOutputOp0,
  /// Epilogue output operator applied after the second GEMM
  typename EpilogueOutputOp1,
  /// Threadblock-level swizzling operator
  typename ThreadblockSwizzle,
  /// Number of stages used in the pipelined mainloop
  int Stages,
  /// If true, kernel is configured to support serial reduction in the epilogue
  bool SplitKSerial,
  /// Operation performed by GEMM
  typename Operator,
  /// Beta is zero or not
  bool IsBetaZero = false
>
struct DefaultB2bGemm;

////////////////////////////////////////////////////////////////////////////////

/// Partial specialization for Turing (SM75) TensorOp with row-major output
template <
  /// Element type for A matrix operand
  typename ElementA,
  /// Layout type for A matrix operand
  typename LayoutA,
  /// Access granularity of A matrix in units of elements
  int kAlignmentA,
  /// Element type for B matrix operand
  typename ElementB,
  /// Layout type for B matrix operand
  typename LayoutB,
  /// Access granularity of B matrix in units of elements
  int kAlignmentB,
  /// Element type for C and D matrix operands
  typename ElementC,
  /// Element type for internal accumulation
  typename ElementAccumulator,
  /// Threadblock-level tile sizes (concept: GemmShape)
  typename ThreadblockShape0,
  typename ThreadblockShape1,
  /// Warp-level tile sizes (concept: GemmShape)
  typename WarpShape0,
  typename WarpShape1,
  /// Instruction-level tile size (concept: GemmShape)
  typename InstructionShape,
  /// Epilogue output operators
  typename EpilogueOutputOp0,
  typename EpilogueOutputOp1,
  /// Threadblock-level swizzling operator
  typename ThreadblockSwizzle,
  /// If true, kernel is configured to support serial reduction in the epilogue
  bool SplitKSerial,
  /// Operation performed by GEMM
  typename Operator
>
struct DefaultB2bGemm<
  ElementA, LayoutA, kAlignmentA,
  ElementB, LayoutB, kAlignmentB,
  ElementC, layout::RowMajor,
  ElementAccumulator,
  arch::OpClassTensorOp,
  arch::Sm75,
  ThreadblockShape0,
  ThreadblockShape1,
  WarpShape0,
  WarpShape1,
  InstructionShape,
  EpilogueOutputOp0,
  EpilogueOutputOp1,
  ThreadblockSwizzle,
  2,
  SplitKSerial,
  Operator
> {

  /// Define the threadblock-scoped fused matrix multiply-accumulate
  using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
    ElementA,
    LayoutA,
    kAlignmentA,
    ElementB,
    LayoutB,
    kAlignmentB,
    ElementAccumulator,
    layout::RowMajor,
    arch::OpClassTensorOp,
    arch::Sm75,
    ThreadblockShape0,
    ThreadblockShape1,
    WarpShape0,
    WarpShape1,
    InstructionShape,
    2,
    Operator,
    EpilogueOutputOp0
  >::ThreadblockB2bMma;

  static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;

  /// Define the epilogue for the second GEMM
  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
    ThreadblockShape1,
    typename B2bMma::Operator1,
    kPartitionsK1,
    EpilogueOutputOp1,
    EpilogueOutputOp1::kCount
  >::Epilogue;

  /// Define the kernel-level GEMM operator.
  // NOTE(review): the extracted text dropped this template-argument list;
  // reconstructed from the kernel::B2bGemm parameter list — confirm against the
  // original file.
  using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
};

/// Partial specialization for Turing IMMA interleaved layout
template <
  /// Element type for A matrix operand
  typename ElementA,
  /// Access granularity of A matrix in units of elements
  int kAlignmentA,
  /// Element type for B matrix operand
  typename ElementB,
  /// Access granularity of B matrix in units of elements
  int kAlignmentB,
  /// Element type for C and D matrix operands
  typename ElementC,
  /// Threadblock-level tile sizes (concept: GemmShape)
  typename ThreadblockShape0,
  typename ThreadblockShape1,
  /// Warp-level tile sizes (concept: GemmShape)
  typename WarpShape0,
  typename WarpShape1,
  /// Instruction-level tile size (concept: GemmShape)
  typename InstructionShape,
  /// Epilogue output operators
  typename EpilogueOutputOp0,
  typename EpilogueOutputOp1,
  /// Threadblock-level swizzling operator
  typename ThreadblockSwizzle,
  /// Number of Interleaved k
  int InterleavedK,
  /// If true, kernel is configured to support serial reduction in the
  /// epilogue
  bool SplitKSerial,
  /// Operation performed by GEMM
  typename Operator,
  /// Is Beta zero or not
  bool IsBetaZero>
// NOTE(review): the interleaved layout template arguments below were stripped by
// extraction; reconstructed as <InterleavedK> from the aliases in the body.
struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
                      kAlignmentA, ElementB,
                      layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
                      ElementC, layout::ColumnMajorInterleaved<InterleavedK>,
                      int32_t, arch::OpClassTensorOp, arch::Sm75,
                      ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
                      InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
                      ThreadblockSwizzle, 2, SplitKSerial, Operator, IsBetaZero> {
  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
  using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;

  using ElementAccumulator = int32_t;

  /// Define the threadblock-scoped fused matrix multiply-accumulate
  using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
      ElementAccumulator, LayoutC,
      arch::OpClassTensorOp, arch::Sm75, ThreadblockShape0, ThreadblockShape1,
      WarpShape0, WarpShape1, InstructionShape, 2, Operator, EpilogueOutputOp0,
      true>::ThreadblockB2bMma;

  static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;

  /// Define the epilogue for the 2nd Gemm
  using Epilogue = typename cutlass::epilogue::threadblock::
      DefaultInterleavedEpilogueTensorOp<
          ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1,
          EpilogueOutputOp1,
          64 / sizeof_bits<ElementC>::value, InterleavedK,
          IsBetaZero>::Epilogue;

  /// Define the kernel-level GEMM operator.
  using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
};

////////////////////////////////////////////////////////////////////////////////

} // namespace kernel
} // namespace gemm
} // namespace cutlass
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
/// Base class holding the shared-memory layout and warp-level iterators common
/// to the pipelined back-to-back MMA mainloops.
template <
  /// Size of the first Gemm problem - concept: gemm::GemmShape<>
  typename Shape0_,
  /// Size of the second Gemm problem - concept: gemm::GemmShape<>
  typename Shape1_,
  /// Policy describing tuning details of the first Gemm (concept: MmaPolicy)
  typename Policy0_,
  /// Policy describing tuning details of the second Gemm (concept: MmaPolicy)
  typename Policy1_,
  /// Number of stages
  int Stages,
  /// Used for partial specialization
  typename Enable = bool>
class B2bMmaBase {
 public:
  ///< Size of the Gemm problems - concept: gemm::GemmShape<>
  using Shape0 = Shape0_;
  using Shape1 = Shape1_;

  ///< Policies describing tuning details
  using Policy0 = Policy0_;
  using Policy1 = Policy1_;

  //
  // Dependent types
  //

  /// Warp-level Mma
  using Operator0 = typename Policy0::Operator;
  using Operator1 = typename Policy1::Operator;

  /// Shape describing the overall GEMM computed from shared memory
  /// by each warp.
  using WarpGemm0 = typename Policy0::Operator::Shape;
  using WarpGemm1 = typename Policy1::Operator::Shape;

  /// Shape describing the number of warps filling the CTA.
  // NOTE(review): the GemmShape<> argument lists were stripped by extraction;
  // reconstructed as threadblock-shape / warp-shape per dimension — confirm
  // against the original file.
  using WarpCount0 = GemmShape<Shape0::kM / WarpGemm0::kM,
                               Shape0::kN / WarpGemm0::kN,
                               Shape0::kK / WarpGemm0::kK>;
  using WarpCount1 = GemmShape<Shape1::kM / WarpGemm1::kM,
                               Shape1::kN / WarpGemm1::kN,
                               Shape1::kK / WarpGemm1::kK>;

  /// Number of warp-level GEMM operations
  static int const kWarpGemmIterations0 =
      (WarpGemm0::kK / Operator0::Policy::MmaShape::kK);
  static int const kWarpGemmIterations1 =
      (WarpGemm1::kK / Operator1::Policy::MmaShape::kK);

  /// Number of stages
  static int const kStages = Stages;

  //
  // Nested structs
  //

  /// Shared storage object needed by threadblock-scoped GEMM
  template<
    typename Shape_,
    typename Policy_
  >
  class SharedStorage {
   public:
    //
    // Type definitions
    //
    using Shape = Shape_;
    using Policy = Policy_;
    using Operator = typename Policy::Operator;

    /// Tensor references to the A and B operands
    using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;
    using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;

    /// Shapes of the operands in shared memory.
    // NOTE(review): MatrixShape<> argument lists were stripped by extraction;
    // reconstructed following cutlass::gemm::threadblock::MmaBase (tile extent
    // plus policy padding, K dimension staged) — confirm against the original.
    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,
                               Shape::kK * kStages + Policy::SmemPaddingA::kColumn>;
    using ShapeB = MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
                               Shape::kN + Policy::SmemPaddingB::kColumn>;

   public:
    //
    // Data members
    //

    /// Buffer for A operand
    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;

    /// Buffer for B operand
    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;

   public:

    //
    // Methods
    //

    /// Returns a layout object for the A matrix.
    // Fix: the original marked this CUTLASS_DEVICE while its twin LayoutB() was
    // CUTLASS_HOST_DEVICE; both are pure layout computations, so make them match.
    CUTLASS_HOST_DEVICE
    static typename Operator::LayoutA LayoutA() {
      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
    }

    /// Returns a layout object for the B matrix
    CUTLASS_HOST_DEVICE
    static typename Operator::LayoutB LayoutB() {
      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
    }

    /// Returns a TensorRef to the A operand
    CUTLASS_HOST_DEVICE
    TensorRefA operand_A_ref() {
      return TensorRefA{operand_A.data(), LayoutA()};
    }

    /// Returns a TensorRef to the B operand
    CUTLASS_HOST_DEVICE
    TensorRefB operand_B_ref() {
      return TensorRefB{operand_B.data(), LayoutB()};
    }
  };

  using SharedStorage0 = SharedStorage<Shape0, Policy0>;
  using SharedStorage1 = SharedStorage<Shape1, Policy1>;

  /// The two GEMMs alias the same shared memory; this relies on the mainloop
  /// synchronizing before the second GEMM's tiles are stored — confirm against
  /// the pipelined mainloop implementation.
  union B2bMmaSharedStorage {
    SharedStorage0 sharedStorage0;
    SharedStorage1 sharedStorage1;
  };

 protected:

  //
  // Data members
  //

  /// Iterator to load a warp-scoped tile of A0 operand from shared memory
  typename Operator0::IteratorA warp_tile_iterator_A0_;

  /// Iterator to load a warp-scoped tile of B0 operand from shared memory
  typename Operator0::IteratorB warp_tile_iterator_B0_;

  /// Iterator to load a warp-scoped tile of B1 operand from shared memory
  typename Operator1::IteratorB warp_tile_iterator_B1_;

 public:

  /// Construct from tensor references
  CUTLASS_DEVICE
  B2bMmaBase(
    ///< Shared storage needed for internal use by threadblock-scoped GEMM
    B2bMmaSharedStorage &shared_storage,
    ///< ID within the threadblock
    int thread_idx,
    ///< ID of warp
    int warp_idx,
    ///< ID of each thread within a warp
    int lane_idx
  ):
    warp_tile_iterator_A0_(shared_storage.sharedStorage0.operand_A_ref(), lane_idx),
    warp_tile_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), lane_idx),
    warp_tile_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), lane_idx) {

  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace gemm
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped Back-to-back fused GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" + +#include "threadblock/b2b_mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////////////////////// +template +struct chk_val { + static_assert(a==0, "check value"); +}; + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. 
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape0_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorA0_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA0_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB0_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB0_, + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape1_, + /// Iterates over the intermediate accumulator tile + // (concept::MmaTensorOpFragmentIterator) + typename FragmentIteratorA1_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB1_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB1_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...) 
+ typename OutputOp_, + /// Policy describing tuning details (concept: MmaPipelinedPolicy) + typename Policy0_, + /// Policy describing tuning details (concept: MmaPipelinedPolicy) + typename Policy1_, + /// Transformation applied to A0 operand + typename TransformA0_ = NumericArrayConverter< + typename SmemIteratorA0_::Element, + typename IteratorA0_::Element, + IteratorA0_::Fragment::kElements>, + /// + /// Transformation applied to B0 operand + typename TransformB0_ = NumericArrayConverter< + typename SmemIteratorB0_::Element, + typename IteratorB0_::Element, + IteratorB0_::Fragment::kElements>, + /// + /// Transformation applied to B1 operand + typename TransformB1_ = NumericArrayConverter< + typename SmemIteratorB1_::Element, + typename IteratorB1_::Element, + IteratorB1_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool +> +class B2bMmaPipelined : public B2bMmaBase { +public: + + ///< Base class + using Base = B2bMmaBase; + + using Shape0 = Shape0_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA0 = IteratorA0_; ///< Iterates over tiles of A operand in global memory + using IteratorB0 = IteratorB0_; ///< Iterates over tiles of B operand in global memory + using Policy0 = Policy0_; ///< Policy describing tuning details + + using SmemIteratorA0 = SmemIteratorA0_; + using SmemIteratorB0 = SmemIteratorB0_; + + using Shape1 = Shape1_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using FragmentIteratorA1 = FragmentIteratorA1_; ///< Iterates over intermediate accumulator tile + using IteratorB1 = IteratorB1_; ///< Iterates over tiles of B operand in global memory + using Policy1 = Policy1_; ///< Policy describing tuning details + + using SmemIteratorB1 = SmemIteratorB1_; + + + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + + using OutputOp = OutputOp_; ///< Epilogue after 1st Gemm + + using TransformA0 = 
TransformA0_; + using TransformB0 = TransformB0_; + using TransformB1 = TransformB1_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA0 = typename IteratorA0::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB0 = typename IteratorB0::Fragment; + + /// Fragment of accumulator tile + using FragmentC0 = typename Policy0::Operator::FragmentC; + + /// Warp-level Mma + using Operator0 = typename Policy0::Operator; + + /// Fragment of operand B loaded from global memory + using FragmentB1 = typename IteratorB1::Fragment; + + /// Fragment of accumulator tile + using FragmentC1 = typename Policy1::Operator::FragmentC; + + /// Warp-level Mma + using Operator1 = typename Policy1::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy0::Operator::ArchTag; + + /// Complex transform on A0 operand + static ComplexTransform const kTransformA0 = Operator0::kTransformA; + + /// Complex transform on B0 operand + static ComplexTransform const kTransformB0 = Operator0::kTransformB; + + /// Complex transform on B1 operand + static ComplexTransform const kTransformB1 = Operator1::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2"); + +private: + + using WarpFragmentA0 = typename Operator0::FragmentA; + using WarpFragmentB0 = typename Operator0::FragmentB; + /// Warp Fragment of operand A1 loaded from accmulator tile + using WarpFragmentA1 = typename FragmentIteratorA1::Fragment; + using WarpFragmentB1 = typename Operator1::FragmentB; + +protected: + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA0 smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B0 operand to shared memory + SmemIteratorB0 smem_iterator_B0_; + + /// Iterator to write threadblock-scoped tile of 
B1 operand to shared memory + SmemIteratorB1 smem_iterator_B1_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + B2bMmaPipelined( + typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx), + smem_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), thread_idx), + smem_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), thread_idx) { + + + // Compute warp location within threadblock tile by mapping the warp_id to three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + //These should stay the same across different GEMM layers + int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN); + int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM; + + //These may change across different GEMM layers + int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k; + int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m, tile_offset_k_0}); + this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n}); + this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n}); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations_0, ///< number of iterations of 
the mainloop + FragmentC1 &accum, ///< destination accumulator tile + IteratorA0 iterator_A, ///< iterator over A operand in global memory + IteratorB0 iterator_B0, ///< iterator over B0 operand in global memory + IteratorB1 iterator_B1, ///< iterator over B1 operand in global memory + FragmentC0 const &src_accum, ///< source accumualtor tile + OutputOp output_op_0, ///< epilogue operation after 1st Gemm + TransformA0 transform_A0 = TransformA0(), ///< transformation applied to A0 fragment + TransformB0 transform_B0 = TransformB0(), ///< transformation applied to B0 fragment + TransformB1 transform_B1 = TransformB1()) { ///< transformation applied to B1 fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + FragmentC0 accum0 = src_accum; + + FragmentA0 tb_frag_A; + FragmentB0 tb_frag_B0; + + tb_frag_A.clear(); + tb_frag_B0.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B0.load(tb_frag_B0); + + ++iterator_A; + ++iterator_B0; + + this->smem_iterator_A_.store(tb_frag_A); + this->smem_iterator_B0_.store(tb_frag_B0); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B0_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA0 warp_frag_A0[2]; + WarpFragmentB0 warp_frag_B0[2]; + + this->warp_tile_iterator_A0_.set_kgroup_index(0); + this->warp_tile_iterator_B0_.set_kgroup_index(0); + + this->warp_tile_iterator_A0_.load(warp_frag_A0[0]); + this->warp_tile_iterator_B0_.load(warp_frag_B0[0]); + + ++this->warp_tile_iterator_A0_; + ++this->warp_tile_iterator_B0_; + + Operator0 warp_mma0; + + int smem_write_stage_idx = 1; + + // Avoid reading out of bounds + if (gemm_k_iterations_0 <= 1) { + iterator_A.clear_mask(); + iterator_B0.clear_mask(); + } + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tighest latency requirement). 
+ iterator_A.load(tb_frag_A); + + // + // Mainloop + // + + // Note: The main loop does not support Base::WarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) { + + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. + + if (warp_mma_k == Base::kWarpGemmIterations0 - 1) { + + // Write fragments to shared memory + this->smem_iterator_A_.store(tb_frag_A); + + this->smem_iterator_B0_.store(tb_frag_B0); + + __syncthreads(); + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tighest latency requirement). + iterator_A.load(tb_frag_A); + + ++this->smem_iterator_B0_; + ++this->smem_iterator_A_; + + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_A0_.add_tile_offset( + {0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0}); + this->warp_tile_iterator_B0_.add_tile_offset( + {-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0); + this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0); + + this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A0_; + ++this->warp_tile_iterator_B0_; + + if (warp_mma_k == 0) { + + iterator_B0.load(tb_frag_B0); + + ++iterator_A; + 
++iterator_B0; + + // Avoid reading out of bounds if this was the last loop iteration + if (gemm_k_iterations_0 <= 2) { + iterator_A.clear_mask(); + iterator_B0.clear_mask(); + } + } + + warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2], warp_frag_B0[warp_mma_k % 2], accum0); + } + } + + //2nd Gemm + + /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile + FragmentIteratorA1 warp_tile_iterator_A1_(accum0); + + // + // Prologue + // + + FragmentB1 tb_frag_B1; + + tb_frag_B1.clear(); + + // The last kblock is loaded in the prolog + iterator_B1.load(tb_frag_B1); + + ++iterator_B1; + + this->smem_iterator_B1_.store(tb_frag_B1); + + ++this->smem_iterator_B1_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA1 warp_frag_A1[2]; + WarpFragmentB1 warp_frag_B1[2]; + + //warp_tile_iterator_A1_.set_kgroup_index(0); + this->warp_tile_iterator_B1_.set_kgroup_index(0); + + warp_tile_iterator_A1_.load(warp_frag_A1[0], output_op_0); + this->warp_tile_iterator_B1_.load(warp_frag_B1[0]); + + ++warp_tile_iterator_A1_; + ++this->warp_tile_iterator_B1_; + + Operator1 warp_mma1; + + smem_write_stage_idx = 1; + + int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1; + + // Avoid reading out of bounds + if (gemm_k_iterations_1 <= 1) { + iterator_B1.clear_mask(); + } + + // + // Mainloop + // + + // Note: The main loop does not support Base::WarpGemmIterations == 2. + CUTLASS_PRAGMA_UNROLL + for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) { + + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. 
+ + if (warp_mma_k == Base::kWarpGemmIterations1 - 1) { + + // Write fragments to shared memory + + this->smem_iterator_B1_.store(tb_frag_B1); + + __syncthreads(); + ++smem_iterator_B1_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_B1_.add_tile_offset( + {-Base::kStages * Policy1::kPartitionsK * + Base::kWarpGemmIterations1, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1); + + warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2], output_op_0); + this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]); + + + ++warp_tile_iterator_A1_; + ++this->warp_tile_iterator_B1_; + + if (warp_mma_k == 0) { + + iterator_B1.load(tb_frag_B1); + ++iterator_B1; + + + // Avoid reading out of bounds if this was the last loop iteration + if (gemm_k_iterations_1 <= 2) { + iterator_B1.clear_mask(); + } + } + + warp_mma1(accum, warp_frag_A1[warp_mma_k % 2], warp_frag_B1[warp_mma_k % 2], accum); + } + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h b/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h new file mode 100644 index 000000000..cd1403c79 --- /dev/null +++ b/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h @@ -0,0 +1,289 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/arch.h" + +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" + +#include "threadblock/b2b_mma_pipelined.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Operator class tag + typename OperatorClass_, + /// Tag indicating architecture to tune for + typename ArchTag_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1_, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation perfomed by GEMM + typename Operator, + 
/// Epilogue output operator + typename EpilogueOutputOp, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. + bool AccumulatorsInRowMajor = false> +struct DefaultB2bMma; + +//////////////////////////////////////////////////////////////////////////////// +/// Specialization for row-major output +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// Epilogue output operator + typename EpilogueOutputOp> +struct DefaultB2bMma { + // Define the MmaCore components + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, + OperatorClass, 2, Operator>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, + 
OperatorClass, 2, Operator>; + + // Define iterators over tiles from the A operand + using IteratorA0 = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, typename MmaCore0::IteratorThreadMapA, kAlignmentA>; + + // Define iterators over tiles from the B operand + using IteratorB0 = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, typename MmaCore0::IteratorThreadMapB, kAlignmentB>; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::ColumnMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp, true>; + + // Define iterators over tiles from the B operand + using IteratorB1 = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, typename MmaCore1::IteratorThreadMapB>; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelined< + typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA, + IteratorB0, typename MmaCore0::SmemIteratorB, + typename MmaCore1::Shape, FragmentIteratorA1, + IteratorB1, typename MmaCore1::SmemIteratorB, + ElementAccumulator, layout::RowMajor, + EpilogueOutputOp, + typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>; + +}; +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for column-major-interleaved output +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename 
ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Number of Interleaved K + int InterleavedK> +struct DefaultB2bMma, OperatorClass, ArchTag, + ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, + InstructionShape, 2, Operator, EpilogueOutputOp, true> { + // Define the MmaCore components + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, + layout::ColumnMajorInterleaved, OperatorClass, 2, Operator, + true>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, + layout::ColumnMajorInterleaved, OperatorClass, 2, Operator, + true>; + + static_assert(kAlignmentA == 128 / sizeof_bits::value, + "Alignment must match thread data map's vector length"); + + static_assert(kAlignmentB ==128 / sizeof_bits::value, + "Alignment must match thread data map's vector length"); + + // Define iterators over tiles from the A operand + using IteratorA0 = 
cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, ElementA, + LayoutA, 1, typename MmaCore0::IteratorThreadMapA>; + + // Define iterators over tiles from the B operand + using IteratorB0 = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, ElementB, + LayoutB, 0, typename MmaCore0::IteratorThreadMapB>; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::RowMajor; //AccumulatorsInRowMajor = true + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, + InstructionShape, EpilogueOutputOp, true /*only handle beta=0 for 1st Gemm epilogue*/>; + + // Define iterators over tiles from the B operand + using IteratorB1 = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, typename MmaCore1::IteratorThreadMapB>; + + + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelined< + typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA, + IteratorB0, typename MmaCore0::SmemIteratorB, + typename MmaCore1::Shape, FragmentIteratorA1, + IteratorB1, typename MmaCore1::SmemIteratorB, + ElementAccumulator, layout::ColumnMajorInterleaved, + EpilogueOutputOp, + typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d5c503e90..3da7ae452 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 
2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -60,6 +60,8 @@ foreach(EXAMPLE 08_turing_tensorop_gemm 10_planar_complex 11_planar_complex_array + 12_gemm_bias_relu + 13_fused_two_gemms ) add_subdirectory(${EXAMPLE}) diff --git a/include/cutlass/aligned_buffer.h b/include/cutlass/aligned_buffer.h index 3232ef87d..8b3bb0713 100644 --- a/include/cutlass/aligned_buffer.h +++ b/include/cutlass/aligned_buffer.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/arch.h b/include/cutlass/arch/arch.h index b38a347a4..faf01cc65 100644 --- a/include/cutlass/arch/arch.h +++ b/include/cutlass/arch/arch.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -52,6 +52,10 @@ struct Sm72 { struct Sm75 { static int const kMinComputeCapability = 75; }; +struct Sm80 { + static int const kMinComputeCapability = 80; +}; + //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace arch diff --git a/include/cutlass/arch/cache_operation.h b/include/cutlass/arch/cache_operation.h new file mode 100644 index 000000000..646b51ded --- /dev/null +++ b/include/cutlass/arch/cache_operation.h @@ -0,0 +1,60 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Directives related to cache operations +*/ +#pragma once + +#include "cutlass/cutlass.h" + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Controls PTX cache operations +struct CacheOperation { + enum Kind { + /// Cache at all levels - accessed again + Always, + /// Cache at global level + Global, + /// Streaming - likely to be accessed once + Streaming, + /// Indicates the line will not be used again + LastUse, + /// Don't cache, and fetch again + Volatile, + /// Write back at all coherent levels + WriteBack, + /// Write through to system memory + WriteThrough + }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass diff --git a/include/cutlass/arch/memory.h b/include/cutlass/arch/memory.h index fc939053d..48ef02cd0 100644 --- a/include/cutlass/arch/memory.h +++ b/include/cutlass/arch/memory.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -28,13 +28,271 @@ #pragma once +#include "cutlass/cutlass.h" + namespace cutlass { namespace arch { ///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Fragment type to store loaded data + typename AccessType, + /// The bytes of loading + int LoadBytes + > +struct global_load; ///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Specializations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint4 *data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %9, 0;\n" + " mov.b32 %0, %10;\n" + " mov.b32 %1, %11;\n" + " mov.b32 %2, %12;\n" + " mov.b32 %3, %13;\n" + " mov.b32 %4, %14;\n" + " mov.b32 %5, %15;\n" + " mov.b32 %6, %16;\n" + " mov.b32 %7, %17;\n" + " @p ld.global.v4.u32 {%0, %1, %2, %3}, [%8];\n" + " @p ld.global.v4.u32 {%4, %5, %6, %7}, [%18];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w), + "=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w) + : "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y), + "r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y), + "r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16)); + } +}; + + +template +struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint4 &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " mov.b32 %0, %6;\n" + " mov.b32 %1, %7;\n" + " mov.b32 %2, %8;\n" 
+ " mov.b32 %3, %9;\n" + " @p ld.global.v4.u32 {%0, %1, %2, %3}, [%4];\n" + "}\n" + : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w) + : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w)); + } +}; + +template +struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint2 &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " mov.b32 %0, %4;\n" + " mov.b32 %1, %5;\n" + " @p ld.global.v2.u32 {%0, %1}, [%2];\n" + "}\n" + : "=r"(data.x), "=r"(data.y) + : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y)); + } +}; + +template +struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + unsigned &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " mov.b32 %0, %3;\n" + " @p ld.global.u32 %0, [%1];\n" + "}\n" + : "=r"(data) + : "l"(ptr), "r"((int)pred_guard), "r"(data)); + } +}; + +template +struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint16_t &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " mov.b16 %0, %3;\n" + " @p ld.global.u16 %0, [%1];\n" + "}\n" + : "=h"(data) + : "l"(ptr), "r"((int)pred_guard), "h"(data)); + } +}; + +template +struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + if (pred_guard) D = *(reinterpret_cast(ptr)); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Fragment type to store loaded data + typename AccessType, + /// The bytes of loading + int LoadBytes + > +struct global_store; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Specializations +// 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct global_store { + CUTLASS_DEVICE + global_store(AccessType const &D, void *ptr, bool pred_guard) { + uint4 const *data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" + " @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n" + "}\n" + : + : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z), + "r"(data[0].w), "r"((int)pred_guard), "l"(((uint8_t *)ptr) + 16), + "r"(data[1].x), "r"(data[1].y), "r"(data[1].z), "r"(data[1].w)); + } +}; + +template +struct global_store { + CUTLASS_DEVICE + global_store(AccessType const &D, void *ptr, bool pred_guard) { + uint4 const &data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" + "}\n" + : + : "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + CUTLASS_DEVICE + global_store(AccessType const &D, void *ptr, bool pred_guard) { + uint2 const &data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " @p st.global.v2.u32 [%0], {%1, %2};\n" + "}\n" + : + : "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + CUTLASS_DEVICE + global_store(AccessType const &D, void *ptr, bool pred_guard) { + uint32_t const &data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " @p st.global.u32 [%0], %1;\n" + "}\n" + : + : "l"(ptr), "r"(data), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + CUTLASS_DEVICE + global_store(AccessType const &D, void *ptr, bool pred_guard) { + uint16_t const &data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " @p 
st.global.u16 [%0], %1;\n" + "}\n" + : + : "l"(ptr), "h"(data), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + CUTLASS_DEVICE + global_store(AccessType const &D, void *ptr, bool pred_guard) { + if (pred_guard) *(reinterpret_cast(ptr)) = D; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace arch } // namespace cutlass @@ -42,4 +300,6 @@ namespace arch { ///////////////////////////////////////////////////////////////////////////////////////////////// #include "memory_sm75.h" +#include "memory_sm80.h" + ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/memory_sm75.h b/include/cutlass/arch/memory_sm75.h index 195f8abf1..3fd121b90 100644 --- a/include/cutlass/arch/memory_sm75.h +++ b/include/cutlass/arch/memory_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -50,20 +50,20 @@ inline __device__ void ldsm(Array & D, void const* ptr); // ///////////////////////////////////////////////////////////////////////////////////////////////// -#if ! defined(CUDA_LDMATRIX_SUPPORTED) - #define CUDA_LDMATRIX_SUPPORTED ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 2)) +#if (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) || (__CUDACC_VER_MAJOR__ >= 11) + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) +#define CUDA_LDMATRIX_ACTIVATED 1 #endif -#if ! 
defined(CUDA_LDMATRIX_ENABLED) - #define CUDA_LDMATRIX_ENABLED CUDA_LDMATRIX_SUPPORTED -#endif - -#if CUDA_LDMATRIX_ENABLED && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) - #define CUDA_LDMATRIX_ACTIVATED 1 +#define CUDA_LDMATRIX_SUPPORTED 1 #endif ///////////////////////////////////////////////////////////////////////////////////////////////// - +/* +#if ! defined(CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED) && (__CUDACC_VER_MAJOR__ > 10) + #define CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED 1 +#endif #if ! defined(CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED) #define CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 1)) #endif @@ -71,8 +71,9 @@ inline __device__ void ldsm(Array & D, void const* ptr); #if ! defined(CUDA_NVVM_GET_SMEM_POINTER_ENABLED) #define CUDA_NVVM_GET_SMEM_POINTER_ENABLED CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED #endif +*/ -#if CUDA_NVVM_GET_SMEM_POINTER_ENABLED +#if (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) extern "C" { // // This NVVM intrinsic is subject to change in future versions of CUDA. @@ -85,19 +86,49 @@ inline __device__ void ldsm(Array & D, void const* ptr); ///////////////////////////////////////////////////////////////////////////////////////////////// -#if CUDA_NVVM_GET_SMEM_POINTER_ENABLED +/// CUTLASS helper to get SMEM pointer +inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) { + +// We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to +// the previous internal intrinsics if they are available. +#if (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11) + // + // This NVVM intrinsic converts an address in shared memory to a plain + // unsigned integer. This is necessary to pass to shared memory instructions + // in inline PTX. + // + // In CUDA 11 and beyond, this replaces __nvvm_get_smem_pointer() [only available in 10.2]. 
+ // + //__device__ size_t __cvta_generic_to_shared(void* ptr); /// CUTLASS helper to get SMEM pointer - inline __device__ unsigned cutlass_get_smem_pointer(void const *ptr) { - return __nvvm_get_smem_pointer(const_cast(ptr)); - } + return static_cast(__cvta_generic_to_shared(ptr)); - /// CUTLASS helper to get SMEM pointer - inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) { - return __nvvm_get_smem_pointer(ptr); - } +#elif (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) + return __nvvm_get_smem_pointer(ptr); + +#elif defined(__CUDA_ARCH__) + + uint32_t smem_ptr; + + asm( + "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n" + : "=r"(smem_ptr) : "l"(ptr)); + + return smem_ptr; + +#else + + return 0; #endif +} + +/// CUTLASS helper to get SMEM pointer +inline __device__ unsigned cutlass_get_smem_pointer(void const *ptr) { + return cutlass_get_smem_pointer(const_cast(ptr)); +} + ///////////////////////////////////////////////////////////////////////////////////////////////// template <> @@ -235,5 +266,6 @@ inline __device__ void ldsm( } ///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace arch } // namespace cutlass diff --git a/include/cutlass/arch/memory_sm80.h b/include/cutlass/arch/memory_sm80.h new file mode 100644 index 000000000..04c568760 --- /dev/null +++ b/include/cutlass/arch/memory_sm80.h @@ -0,0 +1,238 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Architecture-specific operators on memory added for SM80 +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/arch/cache_operation.h" + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + #define CUDA_CP_ASYNC_ACTIVATED 1 +#else + #define CUDA_CP_ASYNC_ACTIVATED 0 +#endif + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Initiates an asynchronous copy from global memory to shared memory. 
+/// +/// LDGSTS +/// +template < + /// Size of the access in bytes + int SizeInBytes, + /// Cache operation + CacheOperation::Kind cache_op = CacheOperation::Always> +struct cp_async; + +/// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate +/// the entire transfer, zeros are written to SMEM if the guard predicate is false. +/// +/// LDGSTS +/// +template < + /// Size of the access in bytes + int SizeInBytes, + /// Cache operation + CacheOperation::Kind cache_op = CacheOperation::Always> +struct cp_async_zfill; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization +template < + /// Size of the access in bytes + int SizeInBytes> +struct cp_async { + /// Copy + CUTLASS_DEVICE + cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { + #if CUDA_CP_ASYNC_ACTIVATED + + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred_guard), + "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes)); + + #else + using AccessType = Array; + + if (pred_guard) { + *static_cast(smem_ptr) = *static_cast(global_ptr); + } + #endif + } +}; + +/// Partial specialization +template < + /// Size of the access in bytes + int SizeInBytes> +struct cp_async_zfill { + /// Copy with zero fill + CUTLASS_DEVICE + cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) { + #if CUDA_CP_ASYNC_ACTIVATED + + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); + int src_in_bytes = (pred_guard ? 
SizeInBytes : 0); + + asm volatile( + "cp.async.ca.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr), + "l"(global_ptr), "n"(SizeInBytes), "r"(src_in_bytes)); + + #else + using AccessType = Array; + + if (pred_guard) { + *static_cast(smem_ptr) = *static_cast(global_ptr); + } + else { + AccessType zeros; + zeros.clear(); + *static_cast(smem_ptr) = zeros; + } + #endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization +template < + /// Size of the access in bytes + int SizeInBytes> +struct cp_async { + /// Copy + CUTLASS_DEVICE + cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { + #if CUDA_CP_ASYNC_ACTIVATED + + static_assert(SizeInBytes == 16, + "cp.async only supports CacheOperation::Global when access size is 16B."); + + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred_guard), + "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes)); + + #else + using AccessType = Array; + + if (pred_guard) { + *static_cast(smem_ptr) = *static_cast(global_ptr); + } + #endif + } +}; + +/// Partial specialization +template < + /// Size of the access in bytes + int SizeInBytes> +struct cp_async_zfill { + /// Copy with zero fill + CUTLASS_DEVICE + cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { + #if CUDA_CP_ASYNC_ACTIVATED + + static_assert(SizeInBytes == 16, + "cp.async only supports CacheOperation::Global when access size is 16B."); + + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); + int src_in_bytes = (pred_guard ? 
SizeInBytes : 0); + + asm volatile( + "cp.async.cg.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr), + "l"(global_ptr), "n"(SizeInBytes), "r"(src_in_bytes)); + + #else + using AccessType = Array; + + if (pred_guard) { + *static_cast(smem_ptr) = *static_cast(global_ptr); + } + else { + AccessType zeros; + zeros.clear(); + *static_cast(smem_ptr) = zeros; + } + #endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block. +CUTLASS_DEVICE +void cp_async_fence() { + #if CUDA_CP_ASYNC_ACTIVATED + asm volatile("cp.async.commit_group;\n" ::); + #endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Blocks until all but previous cp.async.commit_group operations have committed. +template +CUTLASS_DEVICE void cp_async_wait() { + #if CUDA_CP_ASYNC_ACTIVATED + asm volatile("cp.async.wait_group %0;\n" ::"n"(N)); + #endif +} + +/// Blocks until all previous cp.async.commit_group operations have committed. +template <> +CUTLASS_DEVICE void cp_async_wait<0>() { + #if CUDA_CP_ASYNC_ACTIVATED + asm volatile("cp.async.wait_all;\n" ::); + #endif +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/arch/mma.h b/include/cutlass/arch/mma.h index e59b710fb..74c246954 100644 --- a/include/cutlass/arch/mma.h +++ b/include/cutlass/arch/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -51,11 +51,26 @@ struct OpMultiplyAddSaturate; ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Tag indicating the input is converted to a narrower type (BF16) +struct OpMultiplyAddFastBF16; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tag indicating the input is converted to a narrower type (F16) +struct OpMultiplyAddFastF16; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Tag indicating the complex multiply-add operation struct OpMultiplyAddComplex; ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Tag indicating the gaussian complex multiply-add operation +struct OpMultiplyAddGaussianComplex; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Tag indicating the inner product is defined by (XOR, POPC) struct OpXorPopc; diff --git a/include/cutlass/arch/mma_sm50.h b/include/cutlass/arch/mma_sm50.h index 8698a8b3c..fce521dce 100644 --- a/include/cutlass/arch/mma_sm50.h +++ b/include/cutlass/arch/mma_sm50.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm60.h b/include/cutlass/arch/mma_sm60.h index 6e513cedc..ab0481ae4 100644 --- a/include/cutlass/arch/mma_sm60.h +++ b/include/cutlass/arch/mma_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm61.h b/include/cutlass/arch/mma_sm61.h index 68a1b145f..9ec8857e8 100644 --- a/include/cutlass/arch/mma_sm61.h +++ b/include/cutlass/arch/mma_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm70.h b/include/cutlass/arch/mma_sm70.h index 57b50e000..b03ce2c1d 100644 --- a/include/cutlass/arch/mma_sm70.h +++ b/include/cutlass/arch/mma_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h index fb8a3dc52..ef65f20b9 100644 --- a/include/cutlass/arch/mma_sm75.h +++ b/include/cutlass/arch/mma_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm80.h b/include/cutlass/arch/mma_sm80.h new file mode 100644 index 000000000..445ec388d --- /dev/null +++ b/include/cutlass/arch/mma_sm80.h @@ -0,0 +1,2091 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Matrix multiply +*/ + +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0)) + +#define CUTLASS_ARCH_MMA_SM80_SUPPORTED 1 + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) +#define CUTLASS_ARCH_MMA_SM80_ENABLED +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 1688 - Float BF16, FP32 accumulation +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation - F32 = bf16 * bf16 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 8>, + 32, + bfloat16_t, + layout::RowMajor, + bfloat16_t, + layout::ColumnMajor, + float, + 
layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 8>; + + using ElementA = bfloat16_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = bfloat16_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : + "r"(A[0]), "r"(A[1]), + "r"(B[0]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]) + ); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 1684 - Float TF32 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 4>, + 32, + tfloat32_t, + layout::RowMajor, + tfloat32_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 4>; + + using ElementA = tfloat32_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = tfloat32_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA 
const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), + "r"(B[0]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 1688 - Float TF32 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32 +template <> +struct Mma, 32, tfloat32_t, layout::RowMajor, + tfloat32_t, layout::ColumnMajor, float, layout::RowMajor, + OpMultiplyAdd> { + using Shape = gemm::GemmShape<16, 8, 8>; + + using ElementA = tfloat32_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = tfloat32_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "f"(C[0]), 
"f"(C[1]), "f"(C[2]), "f"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16816 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F16 = F16 * F16 + F16 +template <> +struct Mma< + gemm::GemmShape<16, 8, 16>, + 32, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + half_t, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 16>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]) + ); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = bf16 * bf16 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 16>, + 32, + bfloat16_t, + layout::RowMajor, + bfloat16_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 16>; + + using ElementA = bfloat16_t; + using LayoutA = layout::RowMajor; + using 
FragmentA = Array; + + using ElementB = bfloat16_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = F16 * F16 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 16>, + 32, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 16>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = 
reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, " + "{%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 884 - F64 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F64 = F64 * F64 + F64 +template <> +struct Mma< + gemm::GemmShape<8,8,4>, + 32, + double, + layout::RowMajor, + double, + layout::ColumnMajor, + double, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<8,8,4>; + + using ElementA = double; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = double; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = double; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + + using ArchTag = arch::Sm80; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint64_t const & A = reinterpret_cast(a); + uint64_t const & B = reinterpret_cast(b); + + uint64_t const *C = reinterpret_cast(&c); + uint64_t *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=l"(D[0]), "=l"(D[1]) + : "l"(A), "l"(B), "l"(C[0]), "l"(C[1])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16816 - S8 input, S32 accumulation +// 
////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation: S32 = S8 * S8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 16>,
  32,
  int8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 16>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 8>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAdd;

  using ArchTag = arch::Sm80;

  /// Computes multiply-add; on targets without SM80 mma support this asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
    // A is two packed 32-bit registers; B is a single packed 32-bit register.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0,%1,%2,%3}, {%4,%5}, {%6}, "
      "{%7,%8,%9,%10};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
          "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U8 * S8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 16>,
  32,
  uint8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 16>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 8>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add; on targets without SM80 mma support this asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32 {%0,%1,%2,%3}, {%4,%5}, {%6}, "
      "{%7,%8,%9,%10};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
          "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = S8 * U8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 16>,
  32,
  int8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 16>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 8>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add; on targets without SM80 mma support this asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32 {%0,%1,%2,%3}, {%4,%5}, {%6}, "
      "{%7,%8,%9,%10};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
          "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U8 * U8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 16>,
  32,
  uint8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 16>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 8>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add; on targets without SM80 mma support this asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32 {%0,%1,%2,%3}, {%4,%5}, {%6}, "
      "{%7,%8,%9,%10};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
          "r"(C[3]));

#else
    assert(0);
#endif
  }
};

////////////////////////////////////////////////////////////////////////////////
//
// Matrix Multiply 16816 - S8 input, S32 accumulation - SATURATE
//
////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation: S32 = S8 * S8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 16>,
  32,
  int8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<16, 8, 16>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 8>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add with saturation; asserts without SM80 mma support.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
      "{%6}, {%7,%8,%9,%10};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
          "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U8 * S8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 16>,
  32,
  uint8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<16, 8, 16>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 8>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add with saturation; asserts without SM80 mma support.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
      "{%6}, {%7,%8,%9,%10};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
          "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = S8 * U8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 16>,
  32,
  int8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<16, 8, 16>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 8>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add with saturation; asserts without SM80 mma support.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
      "{%6}, {%7,%8,%9,%10};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
          "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U8 * U8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 16>,
  32,
  uint8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<16, 8, 16>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 8>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 4>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add with saturation; asserts without SM80 mma support.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const &B = reinterpret_cast<uint32_t const &>(b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
      "{%6}, {%7,%8,%9,%10};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
          "r"(C[3]));

#else
    assert(0);
#endif
  }
};

////////////////////////////////////////////////////////////////////////////////
//
// Matrix Multiply 16832 - S8 input, S32 accumulation
//
////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation: S32 = S8 * S8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 32>,
  32,
  int8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 32>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 16>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add; on targets without SM80 mma support this asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    // k=32 doubles the operand footprint: A spans four registers, B spans two.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
      "{%8,%9}, {%10,%11,%12,%13};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U8 * S8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 32>,
  32,
  uint8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 32>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 16>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add; on targets without SM80 mma support this asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
      "{%8,%9}, {%10,%11,%12,%13};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = S8 * U8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 32>,
  32,
  int8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 32>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 16>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add; on targets without SM80 mma support this asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
      "{%8,%9}, {%10,%11,%12,%13};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U8 * U8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 32>,
  32,
  uint8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 32>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 16>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add; on targets without SM80 mma support this asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
      "{%8,%9}, {%10,%11,%12,%13};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));

#else
    assert(0);
#endif
  }
};

////////////////////////////////////////////////////////////////////////////////
//
// Matrix Multiply 16832 - S8 input, S32 accumulation - SATURATE
//
////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation: S32 = S8 * S8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 32>,
  32,
  int8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<16, 8, 32>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 16>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  // Fixed: was OpMultiplyAdd, inconsistent with the OpMultiplyAddSaturate
  // template argument and the .satfinite PTX mnemonic below.
  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add with saturation; asserts without SM80 mma support.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, "
      "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U8 * S8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 32>,
  32,
  uint8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<16, 8, 32>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 16>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add with saturation; asserts without SM80 mma support.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, "
      "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = S8 * U8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 32>,
  32,
  int8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<16, 8, 32>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 16>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  // Fixed: was OpMultiplyAdd, inconsistent with the OpMultiplyAddSaturate
  // template argument and the .satfinite PTX mnemonic below.
  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add with saturation; asserts without SM80 mma support.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, "
      "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U8 * U8 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 32>,
  32,
  uint8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  using Shape = gemm::GemmShape<16, 8, 32>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 16>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 8>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add with saturation; asserts without SM80 mma support.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, "
      "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));

#else
    assert(0);
#endif
  }
};

////////////////////////////////////////////////////////////////////////////////
//
// Matrix Multiply 16864 - S4 input, S32 accumulation
//
////////////////////////////////////////////////////////////////////////////////

/// Matrix multiply-add operation: S32 = S4 * S4 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 64>,
  32,
  cutlass::int4b_t,
  layout::RowMajor,
  cutlass::int4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 64>;

  using ElementA = cutlass::int4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<cutlass::int4b_t, 32>;

  using ElementB = cutlass::int4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<cutlass::int4b_t, 16>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add; on targets without SM80 mma support this asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    // 4-bit operands are packed: A spans four 32-bit registers, B spans two.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
      "{%8,%9}, {%10,%11,%12,%13};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U4 * S4 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 64>,
  32,
  cutlass::uint4b_t,
  layout::RowMajor,
  cutlass::int4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 64>;

  using ElementA = cutlass::uint4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<cutlass::uint4b_t, 32>;

  using ElementB = cutlass::int4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<cutlass::int4b_t, 16>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add; on targets without SM80 mma support this asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
      "{%8,%9}, {%10,%11,%12,%13};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = S4 * U4 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 64>,
  32,
  cutlass::int4b_t,
  layout::RowMajor,
  cutlass::uint4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 64>;

  using ElementA = cutlass::int4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<cutlass::int4b_t, 32>;

  using ElementB = cutlass::uint4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<cutlass::uint4b_t, 16>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add; on targets without SM80 mma support this asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
      "{%8,%9}, {%10,%11,%12,%13};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));

#else
    assert(0);
#endif
  }
};

/// Matrix multiply-add operation: S32 = U4 * U4 + S32
template <>
struct Mma<
  gemm::GemmShape<16, 8, 64>,
  32,
  cutlass::uint4b_t,
  layout::RowMajor,
  cutlass::uint4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAdd> {

  using Shape = gemm::GemmShape<16, 8, 64>;

  using ElementA = cutlass::uint4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<cutlass::uint4b_t, 32>;

  using ElementB = cutlass::uint4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<cutlass::uint4b_t, 16>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add; on targets without SM80 mma support this asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

    asm volatile(
      "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
      "{%8,%9}, {%10,%11,%12,%13};\n"
        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));

#else
    assert(0);
#endif
  }
};

////////////////////////////////////////////////////////////////////////////////
//
// Matrix Multiply 16864 - S4 input, S32 accumulation - SATURATE
//
////////////////////////////////////////////////////////////////////////////////

/// Matrix
multiply-add operation: S32 = S4 * S4 + S32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 64>, + 32, + cutlass::int4b_t, + layout::RowMajor, + cutlass::int4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16, 8, 64>; + + using ElementA = cutlass::int4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::int4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const * A = reinterpret_cast(&a); + uint32_t const * B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U4 * S4 + S32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 64>, + 32, + cutlass::uint4b_t, + layout::RowMajor, + cutlass::int4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16, 8, 64>; + + using ElementA = cutlass::uint4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::int4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = 
OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = S4 * U4 + S32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 64>, + 32, + cutlass::int4b_t, + layout::RowMajor, + cutlass::uint4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16, 8, 64>; + + using ElementA = cutlass::int4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), 
"=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U4 * U4 + S32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 64>, + 32, + cutlass::uint4b_t, + layout::RowMajor, + cutlass::uint4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16, 8, 64>; + + using ElementA = cutlass::uint4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = B1 & B1 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,256>, + 32, + cutlass::uint1b_t, + layout::RowMajor, + cutlass::uint1b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,256>; + + using ElementA = cutlass::uint1b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = 
cutlass::uint1b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int32_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 168256 - B1 input, S32 accumulation - XOR,POPC +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = B1 & B1 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,256>, + 32, + cutlass::uint1b_t, + layout::RowMajor, + cutlass::uint1b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpXorPopc> { + + using Shape = gemm::GemmShape<16,8,256>; + + using ElementA = cutlass::uint1b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint1b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpXorPopc; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + 
FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/simd.h b/include/cutlass/arch/simd.h index 75b38001f..4520acc9b 100644 --- a/include/cutlass/arch/simd.h +++ b/include/cutlass/arch/simd.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/simd_sm60.h b/include/cutlass/arch/simd_sm60.h index cd0babd54..36030a366 100644 --- a/include/cutlass/arch/simd_sm60.h +++ b/include/cutlass/arch/simd_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/simd_sm61.h b/include/cutlass/arch/simd_sm61.h index e8d5c8897..94f1c617c 100644 --- a/include/cutlass/arch/simd_sm61.h +++ b/include/cutlass/arch/simd_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma.h b/include/cutlass/arch/wmma.h index 9843e1349..88968abdc 100644 --- a/include/cutlass/arch/wmma.h +++ b/include/cutlass/arch/wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma_sm70.h b/include/cutlass/arch/wmma_sm70.h index 6c989c9a1..94eeb93de 100644 --- a/include/cutlass/arch/wmma_sm70.h +++ b/include/cutlass/arch/wmma_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma_sm72.h b/include/cutlass/arch/wmma_sm72.h index 477a72c3b..1b8cc1161 100644 --- a/include/cutlass/arch/wmma_sm72.h +++ b/include/cutlass/arch/wmma_sm72.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma_sm75.h b/include/cutlass/arch/wmma_sm75.h index 2985be580..f630712fc 100644 --- a/include/cutlass/arch/wmma_sm75.h +++ b/include/cutlass/arch/wmma_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -120,8 +120,7 @@ struct Wmma< //////////////////////////////////////////////////////////////////////////////// // // WMMA template structure defines nvcuda::wmma::fragments and static assert for -// wmma native instruction sizes supported for cutlass::uint1b_t (experimental::b1) -// (nvcuda::wmma targetting SASS instruction BMMA) +// wmma native instruction sizes supported for cutlass::uint1b_t (experimental::b1). 
// //////////////////////////////////////////////////////////////////////////////// template < diff --git a/include/cutlass/array.h b/include/cutlass/array.h index be14a879e..0018b76f5 100644 --- a/include/cutlass/array.h +++ b/include/cutlass/array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -167,7 +167,7 @@ public: class const_iterator { /// Pointer to object - T *ptr_; + const T *ptr_; public: diff --git a/include/cutlass/array_subbyte.h b/include/cutlass/array_subbyte.h index b340c890f..78081facc 100644 --- a/include/cutlass/array_subbyte.h +++ b/include/cutlass/array_subbyte.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/bfloat16.h b/include/cutlass/bfloat16.h new file mode 100644 index 000000000..c3bd1782b --- /dev/null +++ b/include/cutlass/bfloat16.h @@ -0,0 +1,461 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Defines a proxy class for storing non-standard 16-bit floating point values with + 8 bits of exponent and 7 bits of mantissa.
*/
#pragma once

#if !defined(__CUDACC_RTC__)
#include <cmath>
#include <limits>
#include <cstdint>
#endif

#include "cutlass/cutlass.h"

namespace cutlass {

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Floating-point type with 8 bits of exponent and 7 bits of mantissa.
/// Storage-compatible with the upper half of an IEEE-754 binary32 value.
struct alignas(2) bfloat16_t {

  //
  // Data members
  //

  /// Storage type: raw bf16 bit pattern (sign:1, exponent:8, mantissa:7)
  uint16_t storage;

  //
  // Methods
  //

  /// Constructs from a raw 16-bit pattern without conversion
  CUTLASS_HOST_DEVICE
  static bfloat16_t bitcast(uint16_t x) {
    bfloat16_t h;
    h.storage = x;
    return h;
  }

  /// Default constructor (storage intentionally uninitialized)
  CUTLASS_HOST_DEVICE
  bfloat16_t() { }

  /// Floating-point conversion - round to nearest-even; non-NaN/Inf inputs only
  /// take the software path; SM80+ uses the native cvt.rn.bf16.f32 instruction.
  CUTLASS_HOST_DEVICE
  explicit bfloat16_t(float x) {

    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)

    asm("cvt.rn.bf16.f32 %0, %1;\n" : "=h"(storage) : "f"(x));

    #else
    uint32_t bits = reinterpret_cast<uint32_t &>(x);

    if ((bits & 0x7f800000) != 0x7f800000) {

      // Finite value: round-to-nearest-even on the bit that will become the LSB.
      bool mantissa_bit = ((bits & (1 << 16)) != 0);
      bool round_bit = ((bits & (1 << 15)) != 0);
      bool sticky_bit = ((bits & ((1 << 15) - 1)) != 0);

      if ((round_bit && sticky_bit) || (round_bit && mantissa_bit)) {
        bits += uint32_t(1 << 16);
      }
    }
    else if (bits & ~0xff800000) {
      // NaN: force a quiet-NaN pattern before truncation.
      bits = 0x7fffffff;
    }

    storage = uint16_t((bits >> 16) & 0xffff);
    #endif
  }

  /// Floating-point conversion - via float, round to nearest
  CUTLASS_HOST_DEVICE
  explicit bfloat16_t(double x): bfloat16_t(float(x)) {

  }

  /// Integer conversion
  /// NOTE(review): this truncates the low 16 bits of the float representation
  /// (round toward zero in the mantissa), not round-to-nearest — confirm intent.
  CUTLASS_HOST_DEVICE
  explicit bfloat16_t(int x) {
    float flt = static_cast<float>(x);
    storage = uint16_t(reinterpret_cast<uint32_t const &>(flt) >> 16);
  }

  /// Converts to float (exact: bf16 is a prefix of binary32)
  CUTLASS_HOST_DEVICE
  operator float() const {
    unsigned bits = (unsigned(storage) << 16);
    return reinterpret_cast<float const &>(bits);
  }

  /// Converts to double via float
  CUTLASS_HOST_DEVICE
  operator double() const {
    return double(float(*this));
  }

  /// Converts to int (truncating, via float)
  CUTLASS_HOST_DEVICE
  explicit operator int() const {
    return int(float(*this));
  }

  /// Casts to bool: true for any non-zero value
  CUTLASS_HOST_DEVICE
  operator bool() const {
    return (float(*this) != 0.0f);
  }

  /// Obtains raw bits
  CUTLASS_HOST_DEVICE
  uint16_t raw() const {
    return storage;
  }

  /// Returns the sign bit
  CUTLASS_HOST_DEVICE
  bool signbit() const {
    return ((raw() & 0x8000) != 0);
  }

  /// Returns the biased exponent (8-bit field)
  CUTLASS_HOST_DEVICE
  int exponent_biased() const {
    return int((raw() >> 7) & 0x0ff);
  }

  /// Returns the unbiased exponent (bias 127)
  CUTLASS_HOST_DEVICE
  int exponent() const {
    return exponent_biased() - 127;
  }

  /// Returns the mantissa (7-bit field)
  CUTLASS_HOST_DEVICE
  int mantissa() const {
    return int(raw() & 0x7f);
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Returns true if the sign bit is set
CUTLASS_HOST_DEVICE
bool signbit(cutlass::bfloat16_t const& h) {
  return h.signbit();
}

/// Absolute value: clears the sign bit.
/// Fixed: mask is 0x7fff for a 16-bit pattern (the previous 0x7fffffff was
/// only correct via implicit truncation to uint16_t).
CUTLASS_HOST_DEVICE
cutlass::bfloat16_t abs(cutlass::bfloat16_t const& h) {
  return cutlass::bfloat16_t::bitcast(h.raw() & 0x7fff);
}

/// True if h is NaN (max exponent, non-zero mantissa)
CUTLASS_HOST_DEVICE
bool isnan(cutlass::bfloat16_t const& h) {
  return (h.exponent_biased() == 0x0ff) && h.mantissa();
}

/// True if h is neither infinity nor NaN
CUTLASS_HOST_DEVICE
bool isfinite(cutlass::bfloat16_t const& h) {
  return (h.exponent_biased() != 0x0ff);
}

/// Returns the canonical NVIDIA quiet NaN (payload argument ignored)
CUTLASS_HOST_DEVICE
cutlass::bfloat16_t nan_bf16(const char*) {
  // NVIDIA canonical NaN
  return cutlass::bfloat16_t::bitcast(0x7fff);
}

/// True if h is +/- infinity (max exponent, zero mantissa)
CUTLASS_HOST_DEVICE
bool isinf(cutlass::bfloat16_t const& h) {
  return (h.exponent_biased() == 0x0ff) && !h.mantissa();
}

/// True if h is a normal value (exponent neither all-zero nor all-one)
CUTLASS_HOST_DEVICE
bool isnormal(cutlass::bfloat16_t const& h) {
  return h.exponent_biased() && h.exponent_biased() != 0x0ff;
}

/// Classifies h into the standard FP_* categories
CUTLASS_HOST_DEVICE
int fpclassify(cutlass::bfloat16_t const& h) {
  int exp = h.exponent_biased();
  int mantissa = h.mantissa();
  if (exp == 0x0ff) {
    if (mantissa) {
      return FP_NAN;
    }
    else {
      return FP_INFINITE;
    }
  }
  else if (!exp) {
    if (mantissa) {
      return FP_SUBNORMAL;
    }
    else {
      return FP_ZERO;
    }
  }
  return FP_NORMAL;
}

/// Square root computed in single precision
CUTLASS_HOST_DEVICE
cutlass::bfloat16_t sqrt(cutlass::bfloat16_t const& h) {
#if defined(__CUDACC_RTC__)
  return cutlass::bfloat16_t(sqrtf(float(h)));
#else
  return cutlass::bfloat16_t(std::sqrt(float(h)));
#endif
}

/// Composes a value with the magnitude of a and the sign of b
CUTLASS_HOST_DEVICE
bfloat16_t copysign(bfloat16_t const& a, bfloat16_t const& b) {

  uint16_t a_mag = (reinterpret_cast<uint16_t const &>(a) & 0x7fff);
  uint16_t b_sign = (reinterpret_cast<uint16_t const &>(b) & 0x8000);
  uint16_t result = (a_mag | b_sign);

  return reinterpret_cast<bfloat16_t const &>(result);
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace cutlass

///////////////////////////////////////////////////////////////////////////////////////////////////
//
// Standard Library operations and definitions
//
///////////////////////////////////////////////////////////////////////////////////////////////////

namespace std {

#if !defined(__CUDACC_RTC__)
/// Numeric limits
template <>
struct numeric_limits<cutlass::bfloat16_t> {
  static bool const is_specialized = true;
  static bool const is_signed = true;
  static bool const is_integer = false;
  static bool const is_exact = false;
  static bool const has_infinity = true;
  static bool const has_quiet_NaN = true;
  static bool const has_signaling_NaN = false;
  static std::float_denorm_style const has_denorm = std::denorm_present;
  static bool const has_denorm_loss = true;
  static std::float_round_style const round_style = std::round_to_nearest;
  static bool const is_iec559 = false;
  static bool const is_bounded = true;
  static bool const is_modulo = false;
  static int const digits = 7;

  /// Least positive value (smallest subnormal, bit pattern 0x0001)
  CUTLASS_HOST_DEVICE
  static cutlass::bfloat16_t min() { return cutlass::bfloat16_t::bitcast(0x01); }

  /// Minimum finite value (most negative)
  CUTLASS_HOST_DEVICE
  static cutlass::bfloat16_t lowest() { return cutlass::bfloat16_t::bitcast(0xff7f); }

  /// Maximum finite value
  CUTLASS_HOST_DEVICE
  static cutlass::bfloat16_t max() { return cutlass::bfloat16_t::bitcast(0x7f7f); }

  /// Machine epsilon as defined by this implementation
  /// NOTE(review): 0x1000 encodes 2^-95, not the conventional bf16 epsilon
  /// (2^-7 == 0x3c00) — confirm upstream intent before relying on this value.
  CUTLASS_HOST_DEVICE
  static cutlass::bfloat16_t epsilon() { return cutlass::bfloat16_t::bitcast(0x1000); }

  /// Largest possible rounding error (0.5 ulp)
  CUTLASS_HOST_DEVICE
  static cutlass::bfloat16_t round_error() { return cutlass::bfloat16_t(0.5f); }

  /// Positive infinity
  CUTLASS_HOST_DEVICE
  static cutlass::bfloat16_t infinity() { return cutlass::bfloat16_t::bitcast(0x7f80); }

  /// Canonical quiet NaN
  CUTLASS_HOST_DEVICE
  static cutlass::bfloat16_t quiet_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }

  /// Signaling NaN (same pattern as quiet_NaN; has_signaling_NaN is false)
  CUTLASS_HOST_DEVICE
  static cutlass::bfloat16_t signaling_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }

  /// Smallest positive subnormal value
  CUTLASS_HOST_DEVICE
  static cutlass::bfloat16_t denorm_min() { return cutlass::bfloat16_t::bitcast(0x1); }
};
#endif

} // namespace std

///////////////////////////////////////////////////////////////////////////////////////////////////
//
// Arithmetic operators (all computed in single precision, then converted back)
//
///////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {

///////////////////////////////////////////////////////////////////////////////////////////////////

CUTLASS_HOST_DEVICE
bool operator==(bfloat16_t const& lhs, bfloat16_t const& rhs) {
  return float(lhs) == float(rhs);
}

CUTLASS_HOST_DEVICE
bool operator!=(bfloat16_t const& lhs, bfloat16_t const& rhs) {
  return float(lhs) != float(rhs);
}

CUTLASS_HOST_DEVICE
bool operator<(bfloat16_t const& lhs, bfloat16_t const& rhs) {
  return float(lhs) < float(rhs);
}

CUTLASS_HOST_DEVICE
bool operator<=(bfloat16_t const& lhs, bfloat16_t const& rhs) {
  return float(lhs) <= float(rhs);
}

CUTLASS_HOST_DEVICE
bool operator>(bfloat16_t const& lhs, bfloat16_t const& rhs) {
  return float(lhs) > float(rhs);
}

CUTLASS_HOST_DEVICE
bool operator>=(bfloat16_t const& lhs, bfloat16_t const& rhs) {
  return float(lhs) >= float(rhs);
}

CUTLASS_HOST_DEVICE
bfloat16_t operator+(bfloat16_t const& lhs, bfloat16_t const& rhs) {
  return bfloat16_t(float(lhs) + float(rhs));
}

CUTLASS_HOST_DEVICE
bfloat16_t operator-(bfloat16_t const& lhs) {
  return bfloat16_t(-float(lhs));
}

CUTLASS_HOST_DEVICE
bfloat16_t operator-(bfloat16_t const& lhs, bfloat16_t const& rhs) {
  return bfloat16_t(float(lhs) - float(rhs));
}

CUTLASS_HOST_DEVICE
bfloat16_t operator*(bfloat16_t const& lhs, bfloat16_t const& rhs) {
  return bfloat16_t(float(lhs) * float(rhs));
}

CUTLASS_HOST_DEVICE
bfloat16_t operator/(bfloat16_t const& lhs, bfloat16_t const& rhs) {
  return bfloat16_t(float(lhs) / float(rhs));
}

CUTLASS_HOST_DEVICE
bfloat16_t& operator+=(bfloat16_t & lhs, bfloat16_t const& rhs) {
  lhs = bfloat16_t(float(lhs) + float(rhs));
  return lhs;
}

CUTLASS_HOST_DEVICE
bfloat16_t& operator-=(bfloat16_t & lhs, bfloat16_t const& rhs) {
  lhs = bfloat16_t(float(lhs) - float(rhs));
  return lhs;
}

CUTLASS_HOST_DEVICE
bfloat16_t& operator*=(bfloat16_t & lhs, bfloat16_t const& rhs) {
  lhs = bfloat16_t(float(lhs) * float(rhs));
  return lhs;
}

CUTLASS_HOST_DEVICE
bfloat16_t& operator/=(bfloat16_t & lhs, bfloat16_t const& rhs) {
  lhs = bfloat16_t(float(lhs) / float(rhs));
  return lhs;
}

CUTLASS_HOST_DEVICE
bfloat16_t& operator++(bfloat16_t & lhs) {
  float tmp(lhs);
  ++tmp;
  lhs = bfloat16_t(tmp);
  return lhs;
}

CUTLASS_HOST_DEVICE
bfloat16_t& operator--(bfloat16_t & lhs) {
  float tmp(lhs);
  --tmp;
  lhs = bfloat16_t(tmp);
  return lhs;
}

CUTLASS_HOST_DEVICE
bfloat16_t operator++(bfloat16_t & lhs, int) {
  bfloat16_t ret(lhs);
  float tmp(lhs);
  tmp++;
  lhs = bfloat16_t(tmp);
  return ret;
}

CUTLASS_HOST_DEVICE
bfloat16_t operator--(bfloat16_t & lhs, int) {
  bfloat16_t ret(lhs);
  float tmp(lhs);
  tmp--;
  lhs = bfloat16_t(tmp);
  return ret;
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace cutlass

///////////////////////////////////////////////////////////////////////////////////////////////////

//
// User-defined literals
//

CUTLASS_HOST_DEVICE
cutlass::bfloat16_t operator "" _bf16(long double x) {
  return cutlass::bfloat16_t(float(x));
}

CUTLASS_HOST_DEVICE
cutlass::bfloat16_t operator "" _bf16(unsigned long long int x) {
  return cutlass::bfloat16_t(int(x));
}

/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/complex.h b/include/cutlass/complex.h index 20c4a64a7..6f7d73bb9 100644 --- a/include/cutlass/complex.h +++ b/include/cutlass/complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -35,6 +35,9 @@ #include "cutlass/half.h" #include "cutlass/real.h" +#include "cutlass/bfloat16.h" +#include "cutlass/tfloat32.h" + #if !defined(__CUDACC_RTC__) #include #endif @@ -370,6 +373,15 @@ template CUTLASS_HOST_DEVICE complex conj(complex const &z) { return complex(real(z), -imag(z)); } +/// Indentity transform for non-complex types +template +CUTLASS_HOST_DEVICE T conj(T const &z) { + static_assert( !std::is_same::value && + !std::is_same::value && + !std::is_same>::value && + !std::is_same>::value, "May not be a complex data type"); + return z; +} /// Projects the complex number z onto the Riemann sphere template @@ -429,6 +441,7 @@ template struct RealType< complex > { using Type = T; +CUTLASS_HOST_DEVICE static complex from_real(double x) { return complex(static_cast(x)); } diff --git a/include/cutlass/coord.h b/include/cutlass/coord.h index e2615755d..82613c245 100644 --- a/include/cutlass/coord.h +++ b/include/cutlass/coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -360,6 +360,29 @@ public: namespace cutlass { + +/// Scalar multiplication +template +CUTLASS_HOST_DEVICE +Coord operator*(Index s, Coord coord) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Rank; ++i) { + coord[i] *= s; + } + return coord; +} + +/// Scalar multiplication +template +CUTLASS_HOST_DEVICE +Coord operator*(Coord coord, Index s) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Rank; ++i) { + coord[i] *= s; + } + return coord; +} + /// Scalar division template CUTLASS_HOST_DEVICE @@ -419,3 +442,4 @@ Coord<4> make_Coord(int _0, int _1, int _2, int _3) { //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass + diff --git a/include/cutlass/core_io.h b/include/cutlass/core_io.h index d9dc78905..a87ecfa70 100644 --- a/include/cutlass/core_io.h +++ b/include/cutlass/core_io.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -33,9 +33,14 @@ #include "cutlass/coord.h" #include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/gemm/gemm.h" namespace cutlass { +/////////////////////////////////////////////////////////////////////////////////////////////////// +// stream operators for cutlass namespace // /////////////////////////////////////////////////////////////////////////////////////////////////// template @@ -47,8 +52,6 @@ std::ostream& operator<<(std::ostream& out, Coord const& coord) { return out; } -/////////////////////////////////////////////////////////////////////////////////////////////////// - inline std::istream & operator>>(std::istream &stream, half_t &x) { float tmp; @@ -62,6 +65,16 @@ std::ostream & operator<<(std::ostream &out, half_t const &x) { return out << float(x); } +inline +std::ostream & operator<<(std::ostream &out, bfloat16_t const &x) { + return out << float(x); +} + +inline +std::ostream & operator<<(std::ostream &out, tfloat32_t const &x) { + return out << float(x); +} + /////////////////////////////////////////////////////////////////////////////////////////////////// /// Helper to enable formatted printing of CUTLASS scalar types to an ostream @@ -98,7 +111,54 @@ inline std::ostream &operator<<(std::ostream &out, ScalarIO const &scal return out << unsigned(scalar.value); } + +/// Default printing to ostream for MatrixShape +template +inline +std::ostream & operator<<(std::ostream &out, cutlass::MatrixShape const &matrix_shape) { + out << "cutlass::MatrixShape::(kRow, kColumn) {" + << cutlass::MatrixShape::kRow <<"," + << cutlass::MatrixShape::kColumn <<"}"; + return out; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// stream operators for cutlass::gemm namespace // 
+/////////////////////////////////////////////////////////////////////////////////////////////////// +namespace gemm { + +/// Default printing to ostream for GemmShape +template +inline +std::ostream & operator<<(std::ostream &out, cutlass::gemm::GemmShape const &gemm_shape) { + out << "cutlass::GemmShape::(kM, kN, kK) {" + << cutlass::gemm::GemmShape::kM <<"," + << cutlass::gemm::GemmShape::kN <<"," + << cutlass::gemm::GemmShape::kK << "}"; + return out; +} + +} //namespace gemm +/////////////////////////////////////////////////////////////////////////////////////////////////// + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// stream operators for cutlass::layout namespace // +/////////////////////////////////////////////////////////////////////////////////////////////////// +namespace layout { + +/// Default printing to ostream for PitchLinearShape +template < int Contiguous, int Strided> +inline +std::ostream & operator<<(std::ostream &out, cutlass::layout::PitchLinearShape const &pitch_linear_shape) { + out << "cutlass::layout::PitchLinearShape::(kContiguous, kStrided) {" + << cutlass::layout::PitchLinearShape::kContiguous <<"," + << cutlass::layout::PitchLinearShape::kStrided <<"}"; + return out; +} + +} //namespace layout /////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass - +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h index b5a0e5f41..860dc3e56 100644 --- a/include/cutlass/cutlass.h +++ b/include/cutlass/cutlass.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -55,6 +55,8 @@ enum class Status { kErrorNotSupported, ///< Operation is not supported on current device. kErrorWorkspaceNull, ///< The given workspace is null when it is required to be non-null. kErrorInternal, ///< An error within CUTLASS occurred. + kErrorArchMismatch, ///< CUTLASS runs on a device that it was not compiled for. + kErrorInsufficientDriver, ///< CUTLASS runs with a driver that is too old. kInvalid ///< Status is unspecified. }; @@ -78,6 +80,10 @@ static char const* cutlassGetStatusString(cutlass::Status status) { return "Error Workspace Null"; case cutlass::Status::kErrorInternal: return "Error Internal"; + case cutlass::Status::kErrorInsufficientDriver: + return "Error Insufficient Driver"; + case cutlass::Status::kErrorArchMismatch: + return "Error Architecture Mismatch"; case cutlass::Status::kInvalid: break; } diff --git a/include/cutlass/device_kernel.h b/include/cutlass/device_kernel.h index 4a992bb3c..f5166ab16 100644 --- a/include/cutlass/device_kernel.h +++ b/include/cutlass/device_kernel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h new file mode 100644 index 000000000..c0f42146e --- /dev/null +++ b/include/cutlass/epilogue/thread/activation.h @@ -0,0 +1,119 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief This extends the contents of cutlass/functional.h with frequently used activation functions. 
+ +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/complex.h" + +#include "cutlass/array.h" +#include "cutlass/half.h" +#include "cutlass/functional.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// ReLu operator - propagates NaNs +template +struct ReLu { + CUTLASS_HOST_DEVICE + T operator()(T const & threshold, T const &value) const { + if (value < threshold) { + value = threshold; + } + return value; + } +}; + +template +struct ReLu> { + CUTLASS_HOST_DEVICE + Array operator()(T const & threshold, Array const &frag) const { + Array result; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + T value = frag[i]; + if (value < threshold) { + value = threshold; + } + result[i] = value; + } + return result; + } +}; + +// Sigmoid operator +template +struct Sigmoid { + CUTLASS_HOST_DEVICE + T operator()(T const &scalar) const { + return T(1) / (T(1) + exp(-scalar)); + } +}; + +template <> +struct Sigmoid { + CUTLASS_HOST_DEVICE + float operator()(float const &scalar) const { + return 1.0f / (1.0f + expf(-scalar)); + } +}; + +template +struct Sigmoid > { + CUTLASS_HOST_DEVICE + Array operator()(Array const &rhs) const { + Array y; + Sigmoid sigmoid_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < int(rhs.size()); ++i) { + y[i] = sigmoid_op(rhs[i]); + } + + return y; + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/epilogue/thread/conversion_op.h b/include/cutlass/epilogue/thread/conversion_op.h index 
32b885bc6..ad17d4149 100644 --- a/include/cutlass/epilogue/thread/conversion_op.h +++ b/include/cutlass/epilogue/thread/conversion_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -101,7 +101,7 @@ public: CUTLASS_HOST_DEVICE FragmentOutput operator()( FragmentAccumulator const &accumulator, - FragmentOutput const &source, + FragmentOutput const &source = FragmentOutput(), ElementCompute uniform = ElementCompute(0)) const { // Convert to destination numeric type diff --git a/include/cutlass/epilogue/thread/linear_combination.h b/include/cutlass/epilogue/thread/linear_combination.h index dd8236b3c..8b5f6ead1 100644 --- a/include/cutlass/epilogue/thread/linear_combination.h +++ b/include/cutlass/epilogue/thread/linear_combination.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -165,6 +165,28 @@ public: return destination_converter(intermediate); } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + ComputeFragment intermediate; + multiplies mul_accumulator; + + intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/linear_combination_clamp.h b/include/cutlass/epilogue/thread/linear_combination_clamp.h index 9fe4b2b37..25611bd36 100644 --- a/include/cutlass/epilogue/thread/linear_combination_clamp.h +++ b/include/cutlass/epilogue/thread/linear_combination_clamp.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -178,6 +178,40 @@ public: return destination_converter(intermediate); } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + + ComputeFragment intermediate; + + multiplies mul_accumulator; + + minimum min_accumulator; + maximum max_accumulator; + + intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + /// Clamping constant value + ElementCompute const kClamp = + ElementCompute((1U << (sizeof_bits::value - 1)) - 1); + + intermediate = max_accumulator(intermediate, -kClamp - ElementCompute(1)); + intermediate = min_accumulator(intermediate, kClamp); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } }; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -278,7 +312,7 @@ public: beta_ = ElementCompute(1); } } - + /// Computes linear scaling: D = alpha * accumulator + beta * source CUTLASS_HOST_DEVICE FragmentOutput operator()( @@ -316,6 +350,37 @@ public: return destination_converter(scaled_accumulator); } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Compute linear scaling in floating point + ComputeFragment intermediate; + + multiplies 
mul_add_accumulator; + + // Float min-max + intermediate = mul_add_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + // Convert floats back to INT + FragmentAccumulator scaled_accumulator; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + scaled_accumulator[i] = static_cast(intermediate[i]); + } + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(scaled_accumulator); + } }; #endif // Conditional guards to enable partial specialization for packed integers @@ -410,7 +475,7 @@ class FastLinearCombinationClamp { beta_ = ElementCompute(1); } } - + /// Computes linear scaling: D = alpha * accumulator + beta * source CUTLASS_HOST_DEVICE FragmentOutput operator()(FragmentAccumulator const &accumulator, @@ -453,6 +518,41 @@ class FastLinearCombinationClamp { return destination_converter(intermediate); } + + /// Computes linear scaling: D = alpha * accumulator + beta * source + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + FastNumericArrayConverter + accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Compute linear scaling in floating point + ComputeFragment intermediate; + + multiplies mul_accumulator; + + minimum min_accumulator; + maximum max_accumulator; + + // Float min-max + intermediate = mul_accumulator(alpha_, converted_accumulator); + + /// Clamping constant value + ElementCompute const kClamp = + ElementCompute(1 << (sizeof_bits::value - 1)); + + intermediate = max_accumulator(intermediate, -kClamp); + intermediate = min_accumulator(intermediate, kClamp - ElementCompute(1)); + + // Convert to destination numeric type + FastNumericArrayConverter + destination_converter; + + return destination_converter(intermediate); + } }; 
//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h index bfe6be787..3934af104 100644 --- a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h +++ b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -185,6 +185,39 @@ public: destination_converter(intermediate.real), destination_converter(intermediate.imag)); } + + /// Computes linear scaling: D = alpha * accumulator + beta * source + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator( + accumulator_converter(accumulator.real), + accumulator_converter(accumulator.imag)); + + // Perform binary operations + ComputeFragment intermediate; + + multiplies > mul_op; + multiply_add > mul_add_op; + + // complex multiply-add: I = alpha * AB + I + intermediate.real = mul_add_op(alpha_.real(), converted_accumulator.real); + intermediate.imag = mul_add_op(alpha_.real(), converted_accumulator.imag); + + intermediate.real = mul_add_op(-alpha_.imag(), converted_accumulator.imag, intermediate.real); + intermediate.imag = mul_add_op( alpha_.imag(), converted_accumulator.real, intermediate.imag); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return FragmentOutput( + 
destination_converter(intermediate.real), + destination_converter(intermediate.imag)); + } }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/linear_combination_relu.h b/include/cutlass/epilogue/thread/linear_combination_relu.h index 9afeb3eb8..7a2fa9e8a 100644 --- a/include/cutlass/epilogue/thread/linear_combination_relu.h +++ b/include/cutlass/epilogue/thread/linear_combination_relu.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -23,8 +23,7 @@ * **************************************************************************************************/ /*! \file - \brief Functor performing linear combination operations used by epilogues. Values are clamped before - converting to the output element type. + \brief Functor performing linear combination with a maximum operation used by epilogues. */ #pragma once @@ -34,6 +33,7 @@ #include "cutlass/array.h" #include "cutlass/functional.h" #include "cutlass/numeric_conversion.h" +#include "cutlass/epilogue/thread/activation.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -43,8 +43,7 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Applies a linear combination operator to an array of elements then clamps the output before -/// converting to the output element type. +/// Applies a linear combination operator to an array of elements. 
/// /// D = alpha * accumulator + beta * source + uniform /// @@ -75,10 +74,10 @@ public: ElementCompute alpha; ///< scales accumulators ElementCompute beta; ///< scales source tensor - ElementCompute threshold; ///< Relu threshold + ElementCompute threshold; ///< minimum value that is output ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - + ElementCompute const *threshold_ptr; ///< pointer to threshold scalar - if not null, loads from memory // // Methods // @@ -87,16 +86,17 @@ public: Params(): alpha(ElementCompute(1)), beta(ElementCompute(0)), - threshold(ElementCompute(0)), + threshold(ElementCompute(0)), alpha_ptr(nullptr), - beta_ptr(nullptr) { } + beta_ptr(nullptr), + threshold_ptr(nullptr) { } CUTLASS_HOST_DEVICE Params( ElementCompute alpha, ElementCompute beta, - ElementCompute threshold = ElementCompute(0) - ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) { + ElementCompute threshold = ElementCompute(0) + ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr), threshold_ptr(nullptr) { } @@ -104,8 +104,8 @@ public: Params( ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr, - ElementCompute threshold = ElementCompute(0) - ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { + ElementCompute const *threshold_ptr = nullptr + ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), threshold_ptr(threshold_ptr) { } }; @@ -128,7 +128,7 @@ public: alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta); - threshold_ = params.threshold; + threshold_ = (params.threshold_ptr ? 
*params.threshold_ptr : params.threshold); } /// Returns true if source is needed @@ -144,13 +144,12 @@ public: beta_ = ElementCompute(1); } } - + /// Computes linear scaling: D = alpha * accumulator + beta * source CUTLASS_HOST_DEVICE FragmentOutput operator()( FragmentAccumulator const &accumulator, - FragmentOutput const &source, - ElementCompute uniform = ElementCompute(0)) const { + FragmentOutput const &source) const { // Convert source to interal compute numeric type NumericArrayConverter source_converter; @@ -160,18 +159,44 @@ public: ComputeFragment converted_accumulator = accumulator_converter(accumulator); // Perform binary operations - ComputeFragment intermediate; multiplies mul_add_source; multiply_add mul_add_accumulator; - - maximum max_accumulator; + ReLu relu; intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X - - intermediate = max_accumulator(intermediate, threshold_); + + // Compute threshold optionally + intermediate = relu(threshold_, intermediate); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + ComputeFragment intermediate; + + multiplies mul_accumulator; + ReLu relu; + + intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + // Compute threshold optionally + intermediate = relu(threshold_, intermediate); // Convert to destination numeric type NumericArrayConverter destination_converter; @@ -180,24 +205,24 @@ 
public: } }; + ///////////////////////////////////////////////////////////////////////////////////////////////// // Conditional guards to enable partial specialization for packed integers -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && \ - ((__CUDACC_VER_MAJOR__ > 10) || \ - ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) -/// Applies a linear combination operator to an array of elements then clamps the output before -/// converting to the output element type. +/// Applies a linear combination operator to an array of elements. /// /// D = alpha * accumulator + beta * source + uniform /// +/// Special handling for int types + template < typename ElementOutput_, ///< Data type used to load and store tensors int Count, ///< Number of elements computed per operation FloatRoundStyle Round > -class LinearCombinationRelu { +class LinearCombinationRelu { public: using ElementOutput = ElementOutput_; @@ -217,10 +242,10 @@ public: ElementCompute alpha; ///< scales accumulators ElementCompute beta; ///< scales source tensor - ElementCompute threshold; ///< Relu threshold + ElementCompute threshold; ///< minimum value that is output ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - + ElementCompute const *threshold_ptr; ///< pointer to threshold scalar - if not null, loads from memory // // Methods // @@ -229,16 +254,17 @@ public: Params(): alpha(ElementCompute(1)), beta(ElementCompute(0)), - threshold(ElementCompute(0)), + threshold(ElementCompute(0)), alpha_ptr(nullptr), - beta_ptr(nullptr) { } + beta_ptr(nullptr), + threshold_ptr(nullptr) { } CUTLASS_HOST_DEVICE Params( ElementCompute alpha, ElementCompute beta, - ElementCompute threshold = 
ElementCompute(0) - ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) { + ElementCompute threshold = ElementCompute(0) + ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr), threshold_ptr(nullptr) { } @@ -246,8 +272,8 @@ public: Params( ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr, - ElementCompute threshold = ElementCompute(0) - ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { + ElementCompute const *threshold_ptr = nullptr + ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), threshold_ptr(threshold_ptr) { } }; @@ -270,7 +296,7 @@ public: alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta); - threshold_ = params.threshold; + threshold_ = (params.threshold_ptr ? *params.threshold_ptr : params.threshold); } /// Returns true if source is needed @@ -286,13 +312,12 @@ public: beta_ = ElementCompute(1); } } - + /// Computes linear scaling: D = alpha * accumulator + beta * source CUTLASS_HOST_DEVICE FragmentOutput operator()( FragmentAccumulator const &accumulator, - FragmentOutput const &source, - ElementCompute uniform = ElementCompute(0)) const { + FragmentOutput const &source) const { // Convert source to interal compute numeric type NumericArrayConverter source_converter; @@ -302,21 +327,16 @@ public: ComputeFragment converted_accumulator = accumulator_converter(accumulator); // Perform binary operations - ComputeFragment intermediate; multiplies mul_add_source; multiply_add mul_add_accumulator; - - maximum max_accumulator; + ReLu relu; intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X - - // Clamp to theshold - intermediate = max_accumulator(intermediate, threshold_); - // Convert back to accumulator data type + // 
Convert floats back to INT FragmentAccumulator scaled_accumulator; CUTLASS_PRAGMA_UNROLL @@ -324,8 +344,46 @@ public: scaled_accumulator[i] = static_cast(intermediate[i]); } - // Convert to destination numeric type and pack - NumericArrayConverter destination_converter; + // Compute threshold optionally + scaled_accumulator = relu(threshold_, scaled_accumulator); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(scaled_accumulator); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + ComputeFragment intermediate; + + multiplies mul_accumulator; + ReLu relu; + + intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + // Convert floats back to INT + FragmentAccumulator scaled_accumulator; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + scaled_accumulator[i] = static_cast(intermediate[i]); + } + + // Compute threshold optionally + scaled_accumulator = relu(threshold_, scaled_accumulator); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; return destination_converter(scaled_accumulator); } @@ -338,3 +396,6 @@ public: } // namespace thread } // namespace epilogue } // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h new file mode 100644 index 000000000..3a65c49ac --- /dev/null +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -0,0 +1,206 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing linear combination operations used by epilogues. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/epilogue/thread/activation.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies a linear combination operator to an array of elements. +/// +/// D = alpha * accumulator + beta * source + uniform +/// +template < + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest +> +class LinearCombinationSigmoid { +public: + + using ElementOutput = ElementOutput_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + + static int const kCount = Count; + + using FragmentOutput = Array; + using FragmentAccumulator = Array; + using ComputeFragment = Array; + + static FloatRoundStyle const kRound = Round; + + /// Host-constructable parameters structure + struct Params { + + ElementCompute alpha; ///< scales accumulators + ElementCompute beta; ///< scales source tensor + ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory + ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): + alpha(ElementCompute(1)), + beta(ElementCompute(0)), + alpha_ptr(nullptr), + beta_ptr(nullptr) { } + + CUTLASS_HOST_DEVICE + Params( + ElementCompute alpha, + ElementCompute 
beta + ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { + + } + + CUTLASS_HOST_DEVICE + Params( + ElementCompute const *alpha_ptr, + ElementCompute const *beta_ptr + ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { + + } + }; + +private: + + // + // Data members + // + + ElementCompute alpha_; + ElementCompute beta_; + +public: + + /// Constructs the function object, possibly loading from pointers in host memory + CUTLASS_HOST_DEVICE + LinearCombinationSigmoid(Params const &params) { + + alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); + beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta); + } + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + return beta_ != ElementCompute(0); + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition) { + if (k_partition) { + beta_ = ElementCompute(1); + } + } + + /// Computes linear scaling: D = alpha * accumulator + beta * source + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator, + FragmentOutput const &source) const { + + // Convert source to internal compute numeric type + NumericArrayConverter source_converter; + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_source = source_converter(source); + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + + ComputeFragment intermediate; + + multiplies mul_add_source; + multiply_add mul_add_accumulator; + Sigmoid sigmoid; + + intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform + intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X + + intermediate = sigmoid(intermediate); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return 
destination_converter(intermediate); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to internal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + + ComputeFragment intermediate; + + multiplies mul_add_accumulator; + Sigmoid sigmoid; + + intermediate = mul_add_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + intermediate = sigmoid(intermediate); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass diff --git a/include/cutlass/epilogue/thread/reduction_op.h b/include/cutlass/epilogue/thread/reduction_op.h index b33332e93..0331f0fad 100644 --- a/include/cutlass/epilogue/thread/reduction_op.h +++ b/include/cutlass/epilogue/thread/reduction_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h index c3c40babf..67fccf05c 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -45,6 +45,7 @@ #include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h" #include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h" +#include "cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h" #include "cutlass/epilogue/warp/tile_iterator_tensor_op.h" #include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h" #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" @@ -76,6 +77,7 @@ template < /// Elements accessed by inner-most loop of AccumulatorFragmentIterator::load() int ElementsPerAccess, /// Multiply-add operator + /// Selects between (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) typename Operator_ = arch::OpMultiplyAddComplex> struct DefaultEpilogueComplexTensorOp { @@ -146,6 +148,91 @@ struct DefaultEpilogueComplexTensorOp { >; }; +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Partial specialization and defines sensible defaults for epilogues for complex*complex case +// 3 real-valued mma operations (Gaussian 
Complex) +// A = (ar + j ai), B = (br +j bi), D = AB +// P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi) +// D = dr + j di = (P1 - P3) + j (P1 + P2) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename Shape_, + typename WarpMmaTensorOp_, + int PartitionsK, + typename OutputOp_, + int ElementsPerAccess +> +struct DefaultEpilogueComplexTensorOp { + + using Shape = Shape_; + using WarpMmaTensorOp = WarpMmaTensorOp_; + static int const kPartitionsK = PartitionsK; + using OutputOp = OutputOp_; + static int const kElementsPerAccess = ElementsPerAccess; + using Operator = arch::OpMultiplyAddGaussianComplex; + + using ElementOutput = typename OutputOp::ElementOutput; + using LayoutC = typename WarpMmaTensorOp::LayoutC; + using ElementAccumulator = typename WarpMmaTensorOp::ElementC; + + // + // Thread map + // + + using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp< + Shape, + typename WarpMmaTensorOp::Shape, + kPartitionsK, + ElementOutput, + kElementsPerAccess + >::Type; + + using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator< + OutputTileThreadMap, + ElementOutput + >; + + using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorGaussianComplexTensorOp< + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + typename WarpMmaTensorOp::Policy::Operator::ElementC, + typename WarpMmaTensorOp::Policy::Operator::FragmentC, + LayoutC + >; + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp< + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + ElementAccumulator, + LayoutC + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator< + typename OutputTileThreadMap::CompactedThreadMap, + ElementAccumulator + >; + + /// Hard-coded padding elements added + using Padding = 
cutlass::MatrixShape<0, 0>; + + // + // Define the epilogue + // + using Epilogue = cutlass::epilogue::threadblock::Epilogue< + Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputTileIterator, + AccumulatorFragmentIterator, + WarpTileIterator, + SharedLoadIterator, + OutputOp, + Padding + >; +}; + //////////////////////////////////////////////////////////////////////////////// } // namespace threadblock diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h b/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h index 081bcbac2..bb2fdb6b8 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -148,6 +148,44 @@ struct DefaultEpiloguePlanarComplex< ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines sensible defaults for epilogues. 
+template < + typename ThreadblockShape_, + typename WarpMmaOperator_, + int PartitionsK, + typename OutputOp_, + int ElementsPerAccess +> +struct DefaultEpiloguePlanarComplex< + ThreadblockShape_, + WarpMmaOperator_, + arch::OpClassTensorOp, + arch::Sm80, + PartitionsK, + OutputOp_, + ElementsPerAccess> { + + using RealEpilogue = DefaultEpilogueTensorOp< + ThreadblockShape_, + WarpMmaOperator_, + PartitionsK, + OutputOp_, + ElementsPerAccess + >; + + using Epilogue = EpiloguePlanarComplex< + ThreadblockShape_, + WarpMmaOperator_, + PartitionsK, + typename RealEpilogue::OutputTileIterator, + typename RealEpilogue::AccumulatorFragmentIterator, + typename RealEpilogue::WarpTileIterator, + typename RealEpilogue::SharedLoadIterator, + OutputOp_, + typename RealEpilogue::Padding + >; +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Defines sensible defaults for epilogues. diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_simt.h b/include/cutlass/epilogue/threadblock/default_epilogue_simt.h index d39ad1d94..00bf26d35 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_simt.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,6 +39,7 @@ #include "cutlass/gemm/gemm.h" #include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_clamp.h" #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h index 5afb1f22c..51ebab37d 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,16 +39,20 @@ #include "cutlass/gemm/gemm.h" #include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_clamp.h" #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" #include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h" #include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h" +#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h" #include "cutlass/epilogue/warp/tile_iterator_tensor_op.h" +#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h" #include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h" #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" #include "cutlass/epilogue/threadblock/shared_load_iterator.h" +#include 
"cutlass/epilogue/threadblock/shared_load_iterator_mixed.h" #include "cutlass/epilogue/threadblock/epilogue.h" #include "cutlass/epilogue/threadblock/interleaved_epilogue.h" @@ -61,6 +65,177 @@ namespace threadblock { //////////////////////////////////////////////////////////////////////////////// +namespace detail { + +template < + typename ElementOutput, + typename ElementAccumulator, + int ElementsPerAccess, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename ThreadMap +> +struct DefaultIteratorsTensorOp { + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp< + WarpShape, + InstructionShape, + ElementAccumulator, + layout::RowMajor + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator< + ThreadMap, + ElementAccumulator + >; +}; + +/// Partial specialization for half <= float x 8 epilogues avoids shared memory bank conflicts. +template < + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename ThreadMap +> +struct DefaultIteratorsTensorOp< + half_t, + float, + 8, + ThreadblockShape, + WarpShape, + InstructionShape, + ThreadMap> { + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed< + WarpShape, + InstructionShape, + float, + 32, + 16, + 8, + 8 + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed< + ThreadMap, + float, + 32, + 16, + 8, + 8 + >; +}; + +/// Partial specialization for int8_t x 16 <= int32_t x 16 epilogues avoids shared memory bank conflicts. 
+template < + int K, + typename InstructionShape, + typename ThreadMap +> +struct DefaultIteratorsTensorOp< + int8_t, + int32_t, + 16, + gemm::GemmShape<128, 128, K>, + gemm::GemmShape<64, 64, K>, + InstructionShape, + ThreadMap> { + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed< + gemm::GemmShape<64, 64, K>, + InstructionShape, + int32_t, + 32, + 8, + 16, + 8 + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed< + ThreadMap, + int32_t, + 32, + 8, + 16, + 8 + >; +}; + +/// Partial specialization for int8_t x 8 <= int32_t x 8 epilogues avoids shared memory bank conflicts. +template < + int K, + typename InstructionShape, + typename ThreadMap +> +struct DefaultIteratorsTensorOp< + int8_t, + int32_t, + 8, + gemm::GemmShape<128, 64, K>, + gemm::GemmShape<64, 32, K>, + InstructionShape, + ThreadMap> { + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed< + gemm::GemmShape<64, 32, K>, + InstructionShape, + int32_t, + 32, + 8, + 8, + 8 + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed< + ThreadMap, + int32_t, + 32, + 8, + 8, + 8 + >; +}; + +/// Partial specialization for int8_t x 8 <= int32_t x 8 epilogues avoids shared memory bank conflicts. 
+template < + int K, + typename InstructionShape, + typename ThreadMap +> +struct DefaultIteratorsTensorOp< + int8_t, + int32_t, + 8, + gemm::GemmShape<64, 64, K>, + gemm::GemmShape<32, 32, K>, + InstructionShape, + ThreadMap> { + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed< + gemm::GemmShape<32, 32, K>, + InstructionShape, + int32_t, + 32, + 8, + 8, + 8 + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed< + ThreadMap, + int32_t, + 32, + 8, + 8, + 8 + >; +}; + +} // namespace detail + +//////////////////////////////////////////////////////////////////////////////// + /// Defines sensible defaults for epilogues for TensorOps. template < typename Shape_, @@ -98,25 +273,33 @@ struct DefaultEpilogueTensorOp { ElementOutput >; - using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorTensorOp< - typename WarpMmaTensorOp::Shape, - typename WarpMmaTensorOp::Policy::Operator::Shape, - typename WarpMmaTensorOp::Policy::Operator::ElementC, - typename WarpMmaTensorOp::Policy::Operator::FragmentC, - LayoutC - >; + using AccumulatorFragmentIterator = typename std::conditional::value, + cutlass::epilogue::warp::FragmentIteratorComplexTensorOp< + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + typename WarpMmaTensorOp::Policy::Operator::ElementC, + typename WarpMmaTensorOp::Policy::Operator::FragmentC, + LayoutC>, + cutlass::epilogue::warp::FragmentIteratorTensorOp< + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + typename WarpMmaTensorOp::Policy::Operator::ElementC, + typename WarpMmaTensorOp::Policy::Operator::FragmentC, + LayoutC> >::type; - using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp< - typename WarpMmaTensorOp::Shape, - typename WarpMmaTensorOp::Policy::Operator::Shape, + /// Support several implementations depending on structure of epilogue + using DefaultIterators = 
detail::DefaultIteratorsTensorOp< + ElementOutput, ElementAccumulator, - LayoutC + kElementsPerAccess, + Shape, + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + typename OutputTileThreadMap::CompactedThreadMap >; - using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator< - typename OutputTileThreadMap::CompactedThreadMap, - ElementAccumulator - >; + using WarpTileIterator = typename DefaultIterators::WarpTileIterator; + using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator; /// Hard-coded padding elements added using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits::value * 4>; @@ -184,6 +367,7 @@ struct DefaultInterleavedEpilogueTensorOp { }; //////////////////////////////////////////////////////////////////////////////// + } // namespace threadblock } // namespace epilogue } // namespace cutlass diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h index 8a08e0362..7fec5110f 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,6 +39,7 @@ #include "cutlass/gemm/gemm.h" #include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_clamp.h" #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h index f0435e92f..58425c286 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,6 +39,7 @@ #include "cutlass/gemm/gemm.h" #include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_clamp.h" #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_simt.h b/include/cutlass/epilogue/threadblock/default_thread_map_simt.h index 788e07a7d..8e8f4d339 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_simt.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h index 6f4bd2ad9..736e55253 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -146,54 +146,6 @@ struct DefaultInterleavedThreadMapTensorOp { //////////////////////////////////////////////////////////////////////////////// -/// Defines the optimal thread map for TensorOp accumulator layouts -template -struct DefaultInterleavedConvThreadMapTensorOp { - using ThreadblockShape = ThreadblockShape_; - using WarpShape = WarpShape_; - static int const kPartitionsK = PartitionsK; - using Element = Element_; - static int const kElementsPerAccess = ElementsPerAccess; - static int const kInterleavedK = InterleavedK; - - // - // Definitions - // - - struct Detail { - /// Tensor Operations fundamentally perform operations on 8 rows - static int const kTensorOpRows = 8; - static int const kWarpSize = 32; - - static_assert(!(ThreadblockShape::kM % WarpShape::kM) && - !(ThreadblockShape::kM % WarpShape::kM), - "Divisibility"); - - /// Number of warps - using WarpCount = - gemm::GemmShape; - - /// Number of participating threads - static int const kThreads = WarpCount::kCount * kWarpSize; - }; - - // - // ThreadMap - // - - /// ThreadMap to be used by 
epilogue::MaskedTileIterator satisfying concept - /// InterleavedOutputTileThreadMap - using Type = InterleavedConvOutputTileThreadMap< - MatrixShape, - MatrixShape, - Detail::kThreads, kElementsPerAccess, sizeof_bits::value>; -}; - -//////////////////////////////////////////////////////////////////////////////// - } // namespace threadblock } // namespace epilogue } // namespace cutlass diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h index 4c4068a37..45aba393c 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h index 376887c39..34ec750d2 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h b/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h index f197112b6..f14be1ff8 100644 --- a/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/epilogue.h b/include/cutlass/epilogue/threadblock/epilogue.h index fe6877aab..078684201 100644 --- a/include/cutlass/epilogue/threadblock/epilogue.h +++ b/include/cutlass/epilogue/threadblock/epilogue.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -175,15 +175,106 @@ public: OutputOp const &output_op, ///< Output operator OutputTileIterator destination_iterator, ///< Tile iterator for destination AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile - OutputTileIterator source_iterator, ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) - int64_t imag_stride_dest = 0, ///< Arguments required for planar complex case - not used in real-valued case - int64_t imag_stride_src = 0) { ///< - - typename OutputTileIterator::Fragment source_fragment; - + OutputTileIterator source_iterator) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + if (!output_op.is_source_needed()) { - source_iterator.clear_mask(); + compute_source_not_needed_(output_op, destination_iterator, accumulators); } + else { + compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator); + } + } + +private: + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_not_needed_( + OutputOp const &output_op, ///< Output operator + OutputTileIterator destination_iterator, ///< Tile iterator for destination + AccumulatorTile const &accumulators) { ///< Complete warp-level accumulator tile + + // + // Iterator over warp-level accumulator fragment + // + + AccumulatorFragmentIterator accum_fragment_iterator(accumulators); + + // + // Iterate over accumulator tile + // + + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) { + + // + // Convert and store fragment + // + + __syncthreads(); + + typename AccumulatorFragmentIterator::Fragment accum_fragment; + + accum_fragment_iterator.load(accum_fragment); + ++accum_fragment_iterator; + + this->warp_tile_iterator_.store(accum_fragment); + + __syncthreads(); + + // + // Load fragments from shared 
memory + // + + typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK]; + + shared_load_iterator_.load(aligned_accum_fragment[0]); + + // If the number of k-slices is > 1 - perform a reduction amongst the k-slices + if (kPartitionsK > 1) + { + plus add_fragments; + const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK; + + CUTLASS_PRAGMA_UNROLL + for ( int i = 1; i < kPartitionsK; ++i) { + shared_load_iterator_.add_tile_offset({tile_row_offset , 0}); + shared_load_iterator_.load(aligned_accum_fragment[i]); + aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]); + } + + shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0}); + } + + // + // Compute the output result + // + + typename OutputTileIterator::Fragment output_fragment; + + apply_output_operator_source_not_needed_(output_fragment, output_op, aligned_accum_fragment[0]); + + + // + // Store the final result + // + + destination_iterator.store(output_fragment); + ++destination_iterator; + + } + } + + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_needed_( + OutputOp const &output_op, ///< Output operator + OutputTileIterator destination_iterator, ///< Tile iterator for destination + AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile + OutputTileIterator source_iterator) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + + typename OutputTileIterator::Fragment source_fragment; source_fragment.clear(); @@ -265,8 +356,6 @@ public: } } -private: - /// Helper to invoke the output functor over each vector of output CUTLASS_DEVICE void apply_output_operator_( @@ -294,6 +383,30 @@ private: output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]); } } + + /// Helper to invoke the output functor over each vector of output + CUTLASS_DEVICE + void apply_output_operator_source_not_needed_( + typename 
OutputTileIterator::Fragment &output_fragment, + OutputOp const &output_op, ///< Output operator + typename SharedLoadIterator::Fragment const &aligned_accum_fragment) { + + OutputAccessType *output_frag_ptr = + reinterpret_cast(&output_fragment); + + AccumulatorAccessType const *compute_frag_ptr = + reinterpret_cast(&aligned_accum_fragment); + + int const kOutputOpIterations = + OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kOutputOpIterations; ++i) { + + // Call the output operator + output_frag_ptr[i] = output_op(compute_frag_ptr[i]); + } + } }; //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/threadblock/epilogue_base.h b/include/cutlass/epilogue/threadblock/epilogue_base.h index a8a0dc49f..a9b5a4140 100644 --- a/include/cutlass/epilogue/threadblock/epilogue_base.h +++ b/include/cutlass/epilogue/threadblock/epilogue_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h b/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h index 8362748e1..6cb996361 100644 --- a/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h +++ b/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/epilogue_workspace.h b/include/cutlass/epilogue/threadblock/epilogue_workspace.h index 72eb8d2e4..36d196a37 100644 --- a/include/cutlass/epilogue/threadblock/epilogue_workspace.h +++ b/include/cutlass/epilogue/threadblock/epilogue_workspace.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/interleaved_epilogue.h b/include/cutlass/epilogue/threadblock/interleaved_epilogue.h index 0a730ef1c..b616545b9 100644 --- a/include/cutlass/epilogue/threadblock/interleaved_epilogue.h +++ b/include/cutlass/epilogue/threadblock/interleaved_epilogue.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h index fd28ac751..4eb5e3784 100644 --- a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h +++ b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -490,67 +490,6 @@ struct InterleavedOutputTileThreadMap { //////////////////////////////////////////////////////////////////////////////// -/// Template metaprogram for partitioning a 4D interleaved layout across warps -/// to achieve several performance objectives: -/// -/// - coalesced memory accesses in units of 64 Byte lines -/// - minimal address arithmetic -/// - minimal predicate calculations -/// -template -struct InterleavedConvOutputTileThreadMap { - using WarpCount = WarpCount_; - - static int const kWarpSize = 32; - static int const kThreads = Threads; - static int const kWarpCount = kThreads / kWarpSize; - - static int const kElementsPerAccess = ElementsPerAccess; - static int const kElementSize = ElementSize; - - // - // Metaprogram computation - // - - struct Detail {}; - - // - // Output - // - - using Iterations = Iterations_; - - using Delta = MatrixShape; - - /// Initial offset function - CUTLASS_HOST_DEVICE - static MatrixCoord initial_offset(int thread_idx) { - int warp_idx = thread_idx / kWarpSize; - int lane_idx = thread_idx % kWarpSize; - - // Compute warp location - MatrixCoord 
warp_footprint{ - Delta::kRow * Iterations::kRow, - Delta::kColumn * Iterations::kColumn, - }; - - MatrixCoord warp_offset{warp_idx % WarpCount::kRow, - warp_idx / WarpCount::kRow}; - - // Compute per-lane offset - MatrixCoord thread_offset_in_warp{lane_idx / 4, - (lane_idx % 4) * kElementsPerAccess}; - - MatrixCoord thread_offset_in_threadblock_tile = - warp_footprint * warp_offset + thread_offset_in_warp; - - return thread_offset_in_threadblock_tile; - } -}; - -//////////////////////////////////////////////////////////////////////////////// - } // namespace threadblock } // namespace epilogue } // namespace cutlass diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h index 486d16c7c..f3c88300b 100644 --- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h +++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -41,7 +41,7 @@ #include "cutlass/tensor_ref.h" #include "cutlass/transform/pitch_linear_thread_map.h" #include "cutlass/epilogue/threadblock/output_tile_thread_map.h" - +#include "cutlass/arch/memory.h" //////////////////////////////////////////////////////////////////////////////// @@ -306,10 +306,15 @@ public: bool guard = row_guard && mask_.predicates[column]; - if (guard) { - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column] = - memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess]; - } + cutlass::arch::global_load< + AccessType, + sizeof(AccessType) + >( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + + column], + (void *)&memory_pointer[column * ThreadMap::Delta::kColumn / + kElementsPerAccess], + guard); } if (row + 1 < ThreadMap::Iterations::kRow) { @@ -365,11 +370,12 @@ public: bool guard = row_guard && mask_.predicates[column]; - if (guard) { - - memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess] = - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column]; - } + cutlass::arch::global_store( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + + column], + (void *)&memory_pointer[column * ThreadMap::Delta::kColumn / + kElementsPerAccess], + guard); } if (row + 1 < ThreadMap::Iterations::kRow) { @@ -660,9 +666,13 @@ public: bool guard = col_guard && mask_.predicates[iteration_contiguous_]; - if (guard) { - *frag_ptr = *memory_pointer; - } + cutlass::arch::global_load< + AccessType, + sizeof(AccessType) + >( + *frag_ptr, + (void *)memory_pointer, + guard); } /// Stores a fragment to memory @@ -678,9 +688,8 @@ public: bool guard = col_guard && mask_.predicates[iteration_contiguous_]; - if (guard) { - *memory_pointer = *frag_ptr; - } + cutlass::arch::global_store( + *frag_ptr, (void *)memory_pointer, guard); } /// Overrides the 
internal iteration index @@ -732,6 +741,7 @@ public: } }; +/////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// } // namespace threadblock diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator.h b/include/cutlass/epilogue/threadblock/shared_load_iterator.h index 5e4a64b1b..0aa3dbb19 100644 --- a/include/cutlass/epilogue/threadblock/shared_load_iterator.h +++ b/include/cutlass/epilogue/threadblock/shared_load_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -96,6 +96,15 @@ public: ThreadMap::kElementsPerAccess, kAlignment>; + /// Vector type used for SMEM loads + using LoadType = AlignedArray< + Element, + const_min(128 / sizeof_bits::value, ThreadMap::kElementsPerAccess), + const_min(16, kAlignment) + >; + + static int const kLoadsPerAccess = AccessType::kElements / LoadType::kElements; + private: // @@ -149,7 +158,6 @@ public: CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { - AccessType *frag_ptr = reinterpret_cast(&frag); CUTLASS_PRAGMA_UNROLL for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { @@ -169,15 +177,19 @@ public: int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - AccessType const *memory_pointer = reinterpret_cast(byte_pointer); + LoadType *frag_ptr = reinterpret_cast(&frag); + LoadType const *memory_pointer = reinterpret_cast(byte_pointer); CUTLASS_PRAGMA_UNROLL for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { int 
frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column; - frag_ptr[frag_idx] = - memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess]; + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kLoadsPerAccess; ++v) { + frag_ptr[frag_idx * kLoadsPerAccess + v] = + memory_pointer[(column * ThreadMap::Delta::kColumn / kElementsPerAccess) * kLoadsPerAccess + v]; + } } } } diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h b/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h new file mode 100644 index 000000000..d37b07d56 --- /dev/null +++ b/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h @@ -0,0 +1,559 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops optimized for mixed-precision. + + This assumes the shared memory tile is in a permuted layout which avoids bank conflicts on loading. + + When the fragment is loaded into registers, it matches the row-major thread map assumed by + the predicated tile iterator writing to global memory. + + The epilogue rearranges the result of a matrix product through shared memory to match canonical + tensor layouts in global memory. Epilogues support conversion and reduction operations. + +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/tensor_ref.h" + +#include "cutlass/epilogue/threadblock/output_tile_thread_map.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator used to load output tile from shared memory in epilogue. 
+/// +/// Satisfies: ReadableTileIterator +/// +template < + typename ThreadMap_, ///< Thread map (concept: OutputTileThreadMap) + typename Element_, ///< Accumulator data type + int ElementSizeBits_, ///< Size of accumulator in bits + int OutputSizeBits_, ///< Size of output element in bits + int ElementsPerAccess, ///< Vector length of output vector + int ContiguousLanes ///< Number of lanes in the warp writing to contiguous elements + /// in the global memory tensor +> +class SharedLoadIteratorMixed; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator used to load output tile from shared memory in epilogue. +/// +/// Satisfies: ReadableTileIterator +/// +template < + typename ThreadMap_, ///< Thread map (concept: OutputTileThreadMap) + typename Element_ ///< Accumulator data type +> +class SharedLoadIteratorMixed { +public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = Element_; + + using Layout = layout::RowMajor; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + + static int const kAlignment = ThreadMap::kElementsPerAccess * sizeof_bits::value / 8; + + static int const kThreads = ThreadMap::kThreads; + + /// Fragment object + using Fragment = Array< + Element, + ThreadMap::Iterations::kColumn * + ThreadMap::Iterations::kRow * + ThreadMap::Iterations::kGroup * + ThreadMap::Iterations::kCluster * + ThreadMap::kElementsPerAccess>; + + /// Memory access size + using AccessType = AlignedArray< + Element, + ThreadMap::kElementsPerAccess, + kAlignment>; + + /// Vector type used for SMEM loads + using LoadType = AlignedArray< + Element, + const_min(128 / sizeof_bits::value, ThreadMap::kElementsPerAccess), + 
    const_min(16, kAlignment) + >; + + static int const kLoadsPerAccess = AccessType::kElements / LoadType::kElements; + +private: + + // + // Data members + // + + /// Byte-level pointer + LoadType const *pointers_[kLoadsPerAccess]; + + /// Stride along adjacent rows in units of LoadType + int stride_; + +public: + + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + SharedLoadIteratorMixed( + TensorRef ref, + int thread_idx + ): + stride_((ref.stride(0) / LoadType::kElements)) { + + TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx); + + // Initialize pointers + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] = reinterpret_cast(ref.data()); + + int col_idx = (thread_offset.column() / kElementsPerAccess) * kLoadsPerAccess; + int bank_offset = (col_idx * sizeof(LoadType) / 128) % kLoadsPerAccess; + + col_idx += (bank_offset + i) % kLoadsPerAccess; + + pointers_[i] += thread_offset.row() * stride_ + col_idx; + } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += pointer_offset / LoadType::kElements; + } + } + + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += offset.row() * stride_ + offset.column() / LoadType::kElements; + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + + int row_ptr_offset = + row * ThreadMap::Delta::kRow * stride_ + + group * ThreadMap::Delta::kGroup* 
stride_ + + cluster * ThreadMap::Delta::kCluster * stride_ + + pointer_offset / LoadType::kElements; + + int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); + + LoadType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + + int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kLoadsPerAccess; ++v) { + + int vector_idx = (column * ThreadMap::Delta::kColumn / kElementsPerAccess * kLoadsPerAccess); + + LoadType const *memory_pointer = pointers_[v] + row_ptr_offset; + + frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[vector_idx]; + } + } + } + } + } + } + + /// Loads a fragment + CUTLASS_DEVICE + void load(Fragment &frag) { + + load_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for int32_t x 16 => int8_t x 16 +template < + typename ThreadMap_ ///< Thread map (concept: OutputTileThreadMap) +> +class SharedLoadIteratorMixed { +public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = int32_t; + + using Layout = layout::RowMajor; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + + static int const kAlignment = 16; + + static int const kThreads = ThreadMap::kThreads; + + /// Fragment object + using Fragment = Array< + Element, + ThreadMap::Iterations::kColumn * + ThreadMap::Iterations::kRow * + ThreadMap::Iterations::kGroup * + ThreadMap::Iterations::kCluster * + ThreadMap::kElementsPerAccess>; + + /// Memory access size + using AccessType = AlignedArray< 
+ Element, + 16, + kAlignment>; + + /// Vector type used for SMEM loads + using LoadType = AlignedArray< + Element, + 4, + 16 + >; + + static int const kLoadsPerAccess = 4; + +private: + + // + // Data members + // + + /// Byte-level pointer + LoadType const *pointers_[kLoadsPerAccess]; + + /// Stride along adjacent rows in units of LoadType + int stride_; + +public: + + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + SharedLoadIteratorMixed( + TensorRef ref, + int thread_idx + ): + stride_((ref.stride(0) / LoadType::kElements)) { + + TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx); + + // Initialize pointers + LoadType const *base_ptr = reinterpret_cast(ref.data()) + thread_offset.row() * stride_; + + int lane_col_idx = thread_offset.column() / 16; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + int lane_offset = (lane_col_idx % 2) * 4 | ((lane_col_idx / 2) * 8) | ((lane_col_idx / 2) ^ i); + + pointers_[i] = base_ptr + lane_offset; + } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += pointer_offset / LoadType::kElements; + } + } + + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += offset.row() * stride_ + offset.column() / LoadType::kElements; + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + + int row_ptr_offset = + row * ThreadMap::Delta::kRow * stride_ + + group * 
ThreadMap::Delta::kGroup* stride_ + + cluster * ThreadMap::Delta::kCluster * stride_ + + pointer_offset / LoadType::kElements; + + int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); + + LoadType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + + int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kLoadsPerAccess; ++v) { + + LoadType const *memory_pointer = pointers_[v]; + + frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[row_ptr_offset]; + } + } + } + } + } + } + + /// Loads a fragment + CUTLASS_DEVICE + void load(Fragment &frag) { + + load_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for int32_t x 8 => int8_t x 8 +template < + typename ThreadMap_ ///< Thread map (concept: OutputTileThreadMap) +> +class SharedLoadIteratorMixed { +public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = int32_t; + + using Layout = layout::RowMajor; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + + static int const kAlignment = 8; + + static int const kThreads = ThreadMap::kThreads; + + /// Fragment object + using Fragment = Array< + Element, + ThreadMap::Iterations::kColumn * + ThreadMap::Iterations::kRow * + ThreadMap::Iterations::kGroup * + ThreadMap::Iterations::kCluster * + ThreadMap::kElementsPerAccess>; + + /// Memory access size + using AccessType = AlignedArray< + Element, + 8, + kAlignment>; + + /// Vector type used for SMEM loads + using LoadType 
= AlignedArray< + Element, + 4, + 16 + >; + + static int const kLoadsPerAccess = 2; + +private: + + // + // Data members + // + + /// Byte-level pointer + LoadType const *pointers_[kLoadsPerAccess]; + + /// Stride along adjacent rows in units of LoadType + int stride_; + +public: + + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + SharedLoadIteratorMixed( + TensorRef ref, + int thread_idx + ): + stride_((ref.stride(0) / LoadType::kElements)) { + + TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx); + + // Initialize pointers + LoadType const *base_ptr = reinterpret_cast(ref.data()) + thread_offset.row() * stride_; + + int lane_col_idx = thread_offset.column() / 8; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + int lane_offset = (lane_col_idx % 8) * 2 | ((lane_col_idx / 4) ^ i); + + pointers_[i] = base_ptr + lane_offset; + } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += pointer_offset / LoadType::kElements; + } + } + + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += offset.row() * stride_ + offset.column() / LoadType::kElements; + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + + int row_ptr_offset = + row * ThreadMap::Delta::kRow * stride_ + + group * ThreadMap::Delta::kGroup* stride_ + + cluster * ThreadMap::Delta::kCluster * stride_ + + pointer_offset / 
LoadType::kElements; + + int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); + + LoadType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + + int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kLoadsPerAccess; ++v) { + + LoadType const *memory_pointer = pointers_[v]; + + frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[row_ptr_offset]; + } + } + } + } + } + } + + /// Loads a fragment + CUTLASS_DEVICE + void load(Fragment &frag) { + + load_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h index d369a835d..1bab9104c 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h new file mode 100644 index 000000000..4c9564924 --- /dev/null +++ b/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h @@ -0,0 +1,188 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile + that participate in one warp-level store operation. + + Typically, the accumulator tile is the largest single block of register-backed storage + within the kernel. Storing it to memory is best accomplished by partitioning it into + smaller tiles and storing these sequentially. + + Round trips through shared memory during the Epilogue phase require partitioning, as + shared memory capacity is typically insufficient for a threadblock's total accumulator + size. 
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/layout/matrix.h" + +#include "cutlass/epilogue/warp/tensor_op_policy.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace warp { + +//////////////////////////////////////////////////////////////////////////////// + +/// +template < + typename WarpShape, ///< shape of warp-level GEMM (concept: MatrixShape) + typename OperatorShape, ///< matrix multiply operation shape (concept: gemm::GemmShape) + typename OperatorElementC, ///< matrix multiply operation data type (concept: data type) + typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: Array) + typename Layout ///< target shared memory layout +> +class FragmentIteratorGaussianComplexTensorOp; + +//////////////////////////////////////////////////////////////////////////////// + + +/// Partial specialization for row-major shared memory +template < + typename WarpShape_, ///< shape of the warp-level GEMM tile + typename OperatorShape_, ///< underlying real-valued matrix multiply operation shape (concept: gemm::GemmShape) + typename OperatorElementC_, ///< underlying real-valued matrix multiply operation data type + typename OperatorFragmentC_ ///< underlying real-valued matrix multiply operation fragment (concept: Array) +> +class FragmentIteratorGaussianComplexTensorOp { +public: + + using WarpShape = WarpShape_; + using OperatorShape = OperatorShape_; + using OperatorElementC = OperatorElementC_; + using OperatorFragmentC = OperatorFragmentC_; + using Layout = layout::RowMajor; + + using Policy = TensorOpPolicy; + + /// This is the fragment size produced by one access of the iterator. 
+ using Fragment = Array< + complex, + Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>; + + /// Size of one part of accumulator of 3-part accumulator in units of number of OperatorElementC + static int const kElementsAccumulatorPerPart = + OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn; + + /// Offset into the accumulator fragment part 1 + static int const kPart1Index = kElementsAccumulatorPerPart * 0; + + /// Offset into the accumulator fragment part 2 + static int const kPart2Index = kElementsAccumulatorPerPart * 1; + + /// Offset into the accumulator fragment part 3 + static int const kPart3Index = kElementsAccumulatorPerPart * 2; + + /// This is the complete warp-level accumulator tile holding part1, part2, and part3 + using AccumulatorTile = Array; + + /// This is the complete warp-level accumulator tile holding final output of complex type + using OutputAccumulatorTile = Array, kElementsAccumulatorPerPart>; + + /// Number of times this iterator can be incremented + static int const kIterations = Policy::kIterations; + +private: + + /// Internal access type + using AccessType = Array; + + using FragmentAccessType = Array, Policy::kElementsPerAccess>; + +private: + + // + // Data members + // + + /// Accumulator tile + AccessType const *accumulators_; + + /// Internal index + int index_; + +public: + + /// Constructs an iterator + CUTLASS_HOST_DEVICE + FragmentIteratorGaussianComplexTensorOp(AccumulatorTile const &accum): + accumulators_(reinterpret_cast(&accum)), + index_(0) { + } + + /// Increments + CUTLASS_HOST_DEVICE + FragmentIteratorGaussianComplexTensorOp &operator++() { + ++index_; + return *this; + } + + /// Decrements + CUTLASS_HOST_DEVICE + FragmentIteratorGaussianComplexTensorOp &operator--() { + --index_; + return *this; + } + + /// Loads a fragment from the referenced part of the accumulator tile + CUTLASS_HOST_DEVICE + void load(Fragment &frag, int index_offset = 0) const { + + int index 
= index_ + index_offset; + + FragmentAccessType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) { + + int accumulator_access_offset = + index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess; + + auto const & part1_accum_array = accumulators_[accumulator_access_offset + kPart1Index]; + auto const & part2_accum_array = accumulators_[accumulator_access_offset + kPart2Index / Policy::kElementsPerAccess]; + auto const & part3_accum_array = accumulators_[accumulator_access_offset + kPart3Index / Policy::kElementsPerAccess]; + + // Pack parts 1, 2, and 3 into a structure. This is likely to result in MOVs + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Policy::kElementsPerAccess; ++i) { + + frag_ptr[n][i].real() = part1_accum_array[i] - part3_accum_array[i]; + frag_ptr[n][i].imag() = part1_accum_array[i] + part2_accum_array[i]; + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/warp/fragment_iterator_simt.h b/include/cutlass/epilogue/warp/fragment_iterator_simt.h index 160844203..6d75e5697 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_simt.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h index e19f12b93..f620e4bdd 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h index 15c095ffc..1abbbdc03 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h index b96b4c5bc..79106b111 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/simt_policy.h b/include/cutlass/epilogue/warp/simt_policy.h index 1d010c684..3e096978d 100644 --- a/include/cutlass/epilogue/warp/simt_policy.h +++ b/include/cutlass/epilogue/warp/simt_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tensor_op_policy.h b/include/cutlass/epilogue/warp/tensor_op_policy.h index c02656a52..82e685b84 100644 --- a/include/cutlass/epilogue/warp/tensor_op_policy.h +++ b/include/cutlass/epilogue/warp/tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_simt.h b/include/cutlass/epilogue/warp/tile_iterator_simt.h index 2bf92e017..a9d03db1c 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_simt.h +++ b/include/cutlass/epilogue/warp/tile_iterator_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h index d934c05ad..04c361f5e 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h +++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h new file mode 100644 index 000000000..82a93e2d0 --- /dev/null +++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h @@ -0,0 +1,675 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/epilogue/warp/tensor_op_policy.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Template for reading and writing tiles of accumulators to shared memory. This is optimized +/// for mixed-precision epilogues in which the accumulators are 32b in width, but the output +/// data type is smaller. 
+template < + typename WarpShape_, ///< shape of warp-level GEMM (concept: GemmShape) + typename OperatorShape_, ///< matrix multiply operation shape (concept: gemm::GemmShape) + typename Element_, ///< data type of accumulator element + int ElementSizeBits, ///< Size of accumulator element in bits + int OutputSizeBits, ///< Size of output element in bits + int OutputElementCount, ///< number of elements in output vector + int ContiguousLanes ///< Number of consecutive lanes writing to contiguous memory +> +class TileIteratorTensorOpMixed { +public: + + using WarpShape = WarpShape_; + using OperatorShape = OperatorShape_; + using Element = Element_; + using Layout = layout::RowMajor; + static int const kOutputElementCount = OutputElementCount; + + using TensorRef = TensorRef; ///< Tensor Reference object + using TensorCoord = MatrixCoord; ///< Logical coordinate in referenced tensor + using Index = typename TensorRef::Index; + using LongIndex = typename TensorRef::LongIndex; + + using Policy = TensorOpPolicy; + + /// Shape of the tile in memory + using Shape = MatrixShape< + Policy::kRowsPerIteration, + WarpShape::kN + >; + + /// This is the fragment size produced by one access of the iterator. + using Fragment = Array< + Element, + Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>; + + /// This is the complete warp-level accumulator tile. 
+ //using AccumulatorTile = typename Operator::FragmentC; + + /// Number of times this iterator can be incremented + static int const kIterations = Policy::kIterations; + + // Internal constants + struct Detail { + static int const kLanesInQuad = 4; + + /// Number of pointers needed to write accumulators + static int const kPointerCount = + (OutputElementCount * sizeof_bits::value) / (const_min(128, OutputElementCount * sizeof_bits::value)); + + static_assert(kPointerCount <= 4, "Can only accommodate four pointers at present."); + static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32)."); + }; + + /// Padding quantity + using Padding = MatrixShape< + 0, + Detail::kLanesInQuad * Policy::kElementsPerAccess>; + +private: + + /// Storage type for accessing memory + using AccessType = AlignedArray; + + // + // Data members + // + + /// Internal pointer to memory + AccessType *pointers_[Detail::kPointerCount]; + + /// Stride in units of AccessType + int stride_; + + /// Logical column in which warp tile is aligned + int warp_column_; + +public: + + /// Default constructor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed() { + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] = nullptr; + } + } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed( + TensorRef const &ref, + unsigned lane_id + ): + stride_(ref.stride()[0] / Policy::kElementsPerAccess), + warp_column_(0) { + + int quad_id = (lane_id / Detail::kLanesInQuad); + int lane_in_quad = (lane_id % Detail::kLanesInQuad); + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + AccessType *ptr = reinterpret_cast(ref.data()) + quad_id * stride_; + int column_idx = (lane_in_quad % 2) + (((lane_in_quad / 2) + i) % Detail::kPointerCount) * 2; + + ptr += column_idx; + + if (i == 0) { + pointers_[0 % Detail::kPointerCount] = ptr; + } + else if (i == 1) { + pointers_[1 % 
Detail::kPointerCount] = ptr; + } + else if (i == 2) { + pointers_[2 % Detail::kPointerCount] = ptr; + } + else if (i == 3) { + pointers_[3 % Detail::kPointerCount] = ptr; + } + } + } + + /// Adds a pointer offset + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] += pointer_offset / Policy::kElementsPerAccess; + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] += tile_offset.row() * Shape::kRow * stride_ + + tile_offset.column() * Shape::kColumn / Policy::kElementsPerAccess; + } + + warp_column_ += tile_offset.column() * Shape::kColumn; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) { + return add_tile_offset(tile_offset); + } + + /// Store + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) { + + AccessType const *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int64_t n = 0; n < Policy::OperatorCount::kColumn; ++n) { + + int column_idx = warp_column_ + n * Detail::kLanesInQuad * Policy::kElementsPerAccess; + int ptr_idx = ((column_idx * sizeof_bits::value) / 1024) % Detail::kPointerCount; + + AccessType *ptr; + if (ptr_idx == 0) { + ptr = pointers_[0 % Detail::kPointerCount]; + } + else if (ptr_idx == 1) { + ptr = pointers_[1 % Detail::kPointerCount]; + } + else if (ptr_idx == 2) { + ptr = pointers_[2 % Detail::kPointerCount]; + } + else if (ptr_idx == 3) { + ptr = pointers_[3 % Detail::kPointerCount]; + } + + int offset = n * 
Detail::kLanesInQuad + pointer_offset / Policy::kElementsPerAccess; +#if 0 + // Using inline PTX to avoid generic memory + AccessType *smem_ptr = pointers_[ptr_idx]; + smem_ptr[offset] = frag_ptr[n]; +#else + uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr); + uint32_t const *data = reinterpret_cast(frag_ptr + n); + uint32_t offset_in_bytes = offset * sizeof(AccessType); + + asm volatile( + "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n" + : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1]) + ); +#endif + } + } + + /// Store + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) { + store_with_pointer_offset(frag, 0); + } + + /// Load + CUTLASS_HOST_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const { + + AccessType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int64_t n = 0; n < Policy::OperatorCount::kColumn; ++n) { + + int column_idx = warp_column_ + n * Detail::kLanesInQuad * Policy::kElementsPerAccess; + int ptr_idx = ((column_idx * sizeof_bits::value) / 1024) % Detail::kPointerCount; + + AccessType const *smem_ptr = pointers_[ptr_idx]; + frag_ptr[n] = smem_ptr[n * Detail::kLanesInQuad + pointer_offset / Policy::kElementsPerAccess]; + } + } + + /// Load + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for int32_t x 16 => int8_t x 16 +template < + typename WarpShape_, ///< shape of warp-level GEMM (concept: GemmShape) + typename OperatorShape_ ///< matrix multiply operation shape (concept: gemm::GemmShape) +> +class TileIteratorTensorOpMixed { +public: + + using WarpShape = WarpShape_; + using OperatorShape = OperatorShape_; + using Element = int32_t; + using Layout = layout::RowMajor; + static int const kOutputElementCount = 16; + + using TensorRef = 
TensorRef; ///< Tensor Reference object + using TensorCoord = MatrixCoord; ///< Logical coordinate in referenced tensor + using Index = typename TensorRef::Index; + using LongIndex = typename TensorRef::LongIndex; + + using Policy = TensorOpPolicy; + + /// Shape of the tile in memory + using Shape = MatrixShape< + Policy::kRowsPerIteration, + WarpShape::kN + >; + + /// This is the fragment size produced by one access of the iterator. + using Fragment = Array< + Element, + Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>; + + /// This is the complete warp-level accumulator tile. + //using AccumulatorTile = typename Operator::FragmentC; + + /// Number of times this iterator can be incremented + static int const kIterations = Policy::kIterations; + + // Internal constants + struct Detail { + static int const kLanesInQuad = 4; + + /// Number of pointers needed to write accumulators + static int const kPointerCount = 2; + + /// Offsets added + static int const kOffsetCount = 4; + + static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32)."); + }; + + /// Padding quantity + using Padding = MatrixShape<0, Detail::kLanesInQuad * 2>; + +private: + + /// Storage type for accessing memory + using AccessType = AlignedArray; + + // + // Data members + // + + /// Internal pointer to memory + AccessType *pointers_[Detail::kPointerCount]; + + /// Stride in units of AccessType + int stride_; + + /// Uniform offset in bytes added to warp tile iterator + int uniform_offset_[Detail::kOffsetCount]; + +public: + + /// Default constructor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed() { + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] = nullptr; + } + } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed( + TensorRef const &ref, + unsigned lane_id + ): + stride_(ref.stride()[0] / AccessType::kElements) { + + int quad_id = (lane_id / 
Detail::kLanesInQuad); + int lane_in_quad = (lane_id % Detail::kLanesInQuad); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Detail::kPointerCount; ++i) { + AccessType *ptr = reinterpret_cast(ref.data()) + quad_id * stride_; + int column_idx = lane_in_quad ^ (i * 2); + + ptr += column_idx; + + if (i == 0) { + pointers_[0] = ptr; + } + else if (i == 1) { + pointers_[1] = ptr; + } + } + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Detail::kOffsetCount; ++i) { + uniform_offset_[i] = (i ^ 0) * 4 * sizeof(AccessType); + } + } + + /// Adds a pointer offset + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] += pointer_offset / AccessType::kElements; + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) { + + int ptr_offset = tile_offset.row() * Shape::kRow * stride_ + + tile_offset.column() * Shape::kColumn / AccessType::kElements; + + pointers_[0] += ptr_offset; + pointers_[1] += ptr_offset; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Detail::kOffsetCount; ++i) { + uniform_offset_[i] = (i ^ tile_offset.column()) * 4 * sizeof(AccessType); + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) { + return add_tile_offset(tile_offset); + } + + /// Store + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) { + + AccessType const *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) { + + int ptr_idx = (n / 4); + int offset_idx = (n % 4); + + AccessType *ptr; + if (ptr_idx == 0) { + ptr = pointers_[0]; + } + 
else if (ptr_idx == 1) { + ptr = pointers_[1]; + } + + int offset = (n / 4) * 16 + pointer_offset / AccessType::kElements; + +#if 0 + // + // Using inline PTX to avoid generic memory + // + AccessType *smem_ptr = pointers_[ptr_idx]; + smem_ptr[offset] = frag_ptr[n]; +#else + uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr); + uint32_t const *data = reinterpret_cast(frag_ptr + n); + uint32_t offset_in_bytes = offset * sizeof(AccessType) + uniform_offset_[offset_idx]; + + asm volatile( + "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n" + : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1]) + ); +#endif + } + } + + /// Store + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) { + store_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for int32_t x 8 => int8_t x 8 +template < + typename WarpShape_, ///< shape of warp-level GEMM (concept: GemmShape) + typename OperatorShape_ ///< matrix multiply operation shape (concept: gemm::GemmShape) +> +class TileIteratorTensorOpMixed { +public: + + using WarpShape = WarpShape_; + using OperatorShape = OperatorShape_; + using Element = int32_t; + using Layout = layout::RowMajor; + static int const kOutputElementCount = 8; + + using TensorRef = TensorRef; ///< Tensor Reference object + using TensorCoord = MatrixCoord; ///< Logical coordinate in referenced tensor + using Index = typename TensorRef::Index; + using LongIndex = typename TensorRef::LongIndex; + + using Policy = TensorOpPolicy; + + /// Shape of the tile in memory + using Shape = MatrixShape< + Policy::kRowsPerIteration, + WarpShape::kN + >; + + /// This is the fragment size produced by one access of the iterator. + using Fragment = Array< + Element, + Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>; + + /// This is the complete warp-level accumulator tile. 
+ //using AccumulatorTile = typename Operator::FragmentC; + + /// Number of times this iterator can be incremented + static int const kIterations = Policy::kIterations; + + // Internal constants + struct Detail { + static int const kLanesInQuad = 4; + + /// Number of pointers needed to write accumulators + static int const kPointerCount = 2; + + static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32)."); + }; + + /// Padding quantity + using Padding = MatrixShape<0, Detail::kLanesInQuad * 2>; + +private: + + /// Storage type for accessing memory + using AccessType = AlignedArray; + + // + // Data members + // + + /// Internal pointer to memory + AccessType *pointers_[Detail::kPointerCount]; + + /// Stride in units of AccessType + int stride_; + +public: + + /// Default constructor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed() { + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] = nullptr; + } + } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed( + TensorRef const &ref, + unsigned lane_id + ): + stride_(ref.stride()[0] / AccessType::kElements) { + + int quad_id = (lane_id / Detail::kLanesInQuad); + int lane_in_quad = (lane_id % Detail::kLanesInQuad); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Detail::kPointerCount; ++i) { + AccessType *ptr = reinterpret_cast(ref.data()) + quad_id * stride_; + int column_idx = lane_in_quad ^ (i * 2); + + ptr += column_idx; + + if (i == 0) { + pointers_[0] = ptr; + } + else if (i == 1) { + pointers_[1] = ptr; + } + } + } + + /// Adds a pointer offset + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] += pointer_offset / AccessType::kElements; + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + 
CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) { + + int ptr_offset = tile_offset.row() * Shape::kRow * stride_ + + tile_offset.column() * Shape::kColumn / AccessType::kElements; + + pointers_[0] += ptr_offset; + pointers_[1] += ptr_offset; + + if (tile_offset.column() % 2) { + auto tmp = pointers_[0]; + pointers_[0] = pointers_[1]; + pointers_[1] = tmp; + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) { + return add_tile_offset(tile_offset); + } + + /// Store + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) { + + AccessType const *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) { + + int ptr_idx = (n / 4); + + AccessType *ptr; + if (ptr_idx == 0) { + ptr = pointers_[0]; + } + else if (ptr_idx == 1) { + ptr = pointers_[1]; + } + + int offset = (n / 4) * 16 + pointer_offset / AccessType::kElements + (n % 4) * 4; + +#if 0 + // + // Using inline PTX to avoid generic memory + // + AccessType *smem_ptr = pointers_[ptr_idx]; + smem_ptr[offset] = frag_ptr[n]; +#else + uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr); + uint32_t const *data = reinterpret_cast(frag_ptr + n); + uint32_t offset_in_bytes = offset * sizeof(AccessType); + + asm volatile( + "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n" + : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1]) + ); +#endif + } + } + + /// Store + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) { + store_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace epilogue +} // namespace cutlass + 
+///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h index a9ca2315d..8ffb5ec12 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h +++ b/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h index e8299f9d2..6017b5c7e 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h +++ b/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/volta_tensor_op_policy.h b/include/cutlass/epilogue/warp/volta_tensor_op_policy.h index 631d423e5..b0ecc5eb6 100644 --- a/include/cutlass/epilogue/warp/volta_tensor_op_policy.h +++ b/include/cutlass/epilogue/warp/volta_tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h b/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h index fc312c7a6..7b938d371 100644 --- a/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h +++ b/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/fast_math.h b/include/cutlass/fast_math.h index ebc821ed6..036b08e23 100644 --- a/include/cutlass/fast_math.h +++ b/include/cutlass/fast_math.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h index f712e04a7..13ee7f542 100644 --- a/include/cutlass/functional.h +++ b/include/cutlass/functional.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -96,6 +96,16 @@ struct multiply_add { } }; +/// Fused multiply-add +template +struct and_add { + CUTLASS_HOST_DEVICE + T operator()(T const &a, T const &b, T const &c) const { + return ((a & b) + c); + } +}; + + /// Fused multiply-add template struct xor_add { @@ -1207,6 +1217,212 @@ struct multiply_add, Array, Array> { ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Fused multiply-add +template +struct multiply_add, Array, Array> { + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + unsigned const *a_ptr = reinterpret_cast(&a); + unsigned const *b_ptr = reinterpret_cast(&b); + unsigned const *c_ptr = reinterpret_cast(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_ptr[i]) + ); + } + + if (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b[i], c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + bfloat16_t const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = 
reinterpret_cast(&result); + + unsigned const *b_ptr = reinterpret_cast(&b); + unsigned const *c_ptr = reinterpret_cast(&c); + + unsigned a_packed = static_cast(a.raw()); + a_packed = (a_packed | (a_packed << 16)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_packed), "r"(b_ptr[i]), "r"(c_ptr[i]) + ); + } + + if (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[0]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a, b[i], c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + bfloat16_t const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + + unsigned const *a_ptr = reinterpret_cast(&a); + unsigned const *c_ptr = reinterpret_cast(&c); + + unsigned b_packed = static_cast(b.raw()); + b_packed = (b_packed | (b_packed << 16)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_packed), "r"(c_ptr[i]) + ); + } + + if (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[0]), "h"(c_residual_ptr[N - 1]) + ); + } + + #else + + multiply_add 
op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b, c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + bfloat16_t const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + + unsigned const *a_ptr = reinterpret_cast(&a); + unsigned const *b_ptr = reinterpret_cast(&b); + + unsigned c_packed = static_cast(c.raw()); + c_packed = (c_packed | (c_packed << 16)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_packed) + ); + } + + if (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[0]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b[i], c); + } + #endif + + return result; + } +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass diff --git a/include/cutlass/gemm/device/default_gemm_configuration.h b/include/cutlass/gemm/device/default_gemm_configuration.h index fff34dc4d..c65b3f006 100644 --- a/include/cutlass/gemm/device/default_gemm_configuration.h +++ b/include/cutlass/gemm/device/default_gemm_configuration.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -422,6 +422,342 @@ struct DefaultGemmConfiguration< using Operator = arch::OpMultiplyAddSaturate; }; +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm75, + uint1b_t, + uint1b_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 512>; + using WarpShape = GemmShape<64, 64, 512>; + using InstructionShape = GemmShape<8, 8, 128>; + static int const kStages = 2; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpXorPopc; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template +struct DefaultGemmConfiguration { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 64>; + using WarpShape = GemmShape<64, 64, 64>; + using InstructionShape = GemmShape<16, 8, 16>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombination< + ElementC, 128 / sizeof_bits::value, ElementAccumulator, + ElementAccumulator>; + + using Operator = typename platform::conditional< + (platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value), + arch::OpMultiplyAddSaturate, arch::OpMultiplyAdd>::type; +}; + +//////////////////////////////////////////////////////////////////////////////// +template +struct DefaultGemmConfiguration { + + static int const kAlignmentA = 1; + static int const kAlignmentB = 1; + + using 
ThreadblockShape = GemmShape<128, 256, 64>; + using WarpShape = GemmShape<64, 64, 64>; + using InstructionShape = GemmShape<16, 8, 16>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombination< + ElementC, 128 / sizeof_bits::value, ElementAccumulator, + ElementAccumulator>; + + using Operator = arch::OpMultiplyAdd; +}; + + +template <> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + complex, + complex, + complex, + complex + > { + + static int const kAlignmentA = 1; + static int const kAlignmentB = 1; + + using ThreadblockShape = GemmShape<64, 64, 16>; + using WarpShape = GemmShape<32, 32, 16>; + using InstructionShape = GemmShape<8, 8, 4>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombination< + complex, 1, complex, + complex>; + + using Operator = arch::OpMultiplyAddComplex; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + int8_t, + int8_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 64>; + using WarpShape = GemmShape<64, 64, 64>; + using InstructionShape = GemmShape<16, 8, 32>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + int8_t, + uint8_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using 
ThreadblockShape = GemmShape<128, 256, 64>; + using WarpShape = GemmShape<64, 64, 64>; + using InstructionShape = GemmShape<16, 8, 32>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + uint8_t, + int8_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 64>; + using WarpShape = GemmShape<64, 64, 64>; + using InstructionShape = GemmShape<16, 8, 32>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + uint8_t, + uint8_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 64>; + using WarpShape = GemmShape<64, 64, 64>; + using InstructionShape = GemmShape<16, 8, 32>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + int4b_t, + int4b_t, + 
ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 128>; + using WarpShape = GemmShape<64, 64, 128>; + using InstructionShape = GemmShape<16, 8, 64>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + int4b_t, + uint4b_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 128>; + using WarpShape = GemmShape<64, 64, 128>; + using InstructionShape = GemmShape<16, 8, 64>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + uint4b_t, + int4b_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 128>; + using WarpShape = GemmShape<64, 64, 128>; + using InstructionShape = GemmShape<16, 8, 64>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + 
+//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + uint4b_t, + uint4b_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 128>; + using WarpShape = GemmShape<64, 64, 128>; + using InstructionShape = GemmShape<16, 8, 64>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + uint1b_t, + uint1b_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 512>; + using WarpShape = GemmShape<64, 64, 512>; + using InstructionShape = GemmShape<16, 8, 256>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAdd; +}; + +//////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// } // namespace device } // namespace gemm diff --git a/include/cutlass/gemm/device/gemm.h b/include/cutlass/gemm/device/gemm.h index c91aac204..70383e15e 100644 --- a/include/cutlass/gemm/device/gemm.h +++ b/include/cutlass/gemm/device/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA 
CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -193,7 +193,7 @@ template < ElementAccumulator_>::EpilogueOutputOp, /// Threadblock-level swizzling operator typename ThreadblockSwizzle_ = - typename threadblock::GemmCohortThreadblockSwizzle, + typename threadblock::GemmIdentityThreadblockSwizzle<>, /// Number of stages used in the pipelined mainloop int Stages = DefaultGemmConfiguration::EpilogueOutputOp, /// Threadblock-level swizzling operator - typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle, + typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>, /// Number of stages used in the pipelined mainloop int Stages = DefaultGemmConfiguration; diff --git a/include/cutlass/gemm/device/gemm_splitk_parallel.h b/include/cutlass/gemm/device/gemm_splitk_parallel.h index df11ba5b5..73f1c240b 100644 --- a/include/cutlass/gemm/device/gemm_splitk_parallel.h +++ b/include/cutlass/gemm/device/gemm_splitk_parallel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/device/gemm_universal.h b/include/cutlass/gemm/device/gemm_universal.h index 4b57fa0d9..091290901 100644 --- a/include/cutlass/gemm/device/gemm_universal.h +++ b/include/cutlass/gemm/device/gemm_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -89,7 +89,7 @@ template < OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, /// Threadblock-level swizzling operator - typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle, + typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>, /// Number of stages used in the pipelined mainloop int Stages = DefaultGemmConfiguration + struct MapArguments { + using ElementA = ElementA_; + using LayoutA = LayoutA_; + static ComplexTransform const kTransformA = TransformA; + static int const kAlignmentA = AlignmentA; + using ElementB = ElementB_; + using LayoutB = LayoutB_; + static ComplexTransform const kTransformB = TransformB; + static int const kAlignmentB = AlignmentB; + using LayoutC = LayoutC_; + }; + + template < + typename ElementA_, + typename LayoutA_, + ComplexTransform TransformA, + int AlignmentA, + typename ElementB_, + typename LayoutB_, + ComplexTransform TransformB, + int AlignmentB, + typename LayoutC_ + > + struct MapArguments< + ElementA_, + LayoutA_, + TransformA, + AlignmentA, + ElementB_, + LayoutB_, + TransformB, + AlignmentB, + LayoutC_, + true + > { + using ElementA = ElementB_; + using LayoutA = typename layout::LayoutTranspose::type; + static ComplexTransform const kTransformA = TransformB; + static int const kAlignmentA = AlignmentB; + using ElementB = ElementA_; + using LayoutB = typename layout::LayoutTranspose::type; + static ComplexTransform const kTransformB = TransformA; + static int const kAlignmentB = AlignmentA; + using LayoutC = typename layout::LayoutTranspose::type; + }; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + template class 
GemmUniversalAdapter { public: using GemmKernel = GemmKernel_; - static_assert(std::is_same::value, - "Universal adapter expects the kernel to be row-major and transposes its arguments."); + static bool const kInternalTranspose = + std::is_same::value; using ThreadblockShape = typename GemmKernel::Mma::Shape; using WarpShape = typename GemmKernel::WarpShape; @@ -56,26 +120,39 @@ public: using OperatorClass = typename GemmKernel::OperatorClass; using ArchTag = typename GemmKernel::ArchTag; - + // Type, layout, and complex transform deliberately exchanged with B - using ElementA = typename GemmKernel::ElementB; - using LayoutA = typename layout::LayoutTranspose::type; - using TensorRefA = TensorRef; - static ComplexTransform const kTransformA = GemmKernel::kTransformB; + using MapArguments = detail::MapArguments< + typename GemmKernel::ElementA, + typename GemmKernel::LayoutA, + GemmKernel::kTransformA, + GemmKernel::kAlignmentA, + typename GemmKernel::ElementB, + typename GemmKernel::LayoutB, + GemmKernel::kTransformB, + GemmKernel::kAlignmentB, + typename GemmKernel::LayoutC, + kInternalTranspose + >; + + using ElementA = typename MapArguments::ElementA; + using LayoutA = typename MapArguments::LayoutA; + static ComplexTransform const kTransformA = MapArguments::kTransformA; static int const kAlignmentA = GemmKernel::kAlignmentA; - // Type, layout, and complex transform deliberately exchanged with A - using ElementB = typename GemmKernel::ElementA; - using LayoutB = typename layout::LayoutTranspose::type; - using TensorRefB = TensorRef; - static ComplexTransform const kTransformB = GemmKernel::kTransformA; + using ElementB = typename MapArguments::ElementB; + using LayoutB = typename MapArguments::LayoutB; + static ComplexTransform const kTransformB = MapArguments::kTransformB; static int const kAlignmentB = GemmKernel::kAlignmentB; - + using ElementC = typename GemmKernel::ElementC; - using LayoutC = cutlass::layout::ColumnMajor; + using LayoutC = typename 
MapArguments::LayoutC; + static int const kAlignmentC = GemmKernel::kAlignmentC; + + using TensorRefA = TensorRef; + using TensorRefB = TensorRef; using TensorRefC = TensorRef; using TensorRefD = TensorRef; - static int const kAlignmentC = GemmKernel::kAlignmentC; using ElementAccumulator = typename GemmKernel::Mma::Policy::Operator::ElementC; @@ -99,7 +176,12 @@ public: /// Helper to construct a transposed equivalent for the underying GEMM operator static Arguments to_underlying_arguments(Arguments const &args) { - return args.transposed_problem(); + if (kInternalTranspose) { + return args.transposed_problem(); + } + else { + return args; + } } /// Determines whether the GEMM can execute the given problem. diff --git a/include/cutlass/gemm/device/gemm_universal_base.h b/include/cutlass/gemm/device/gemm_universal_base.h index de0ee183a..18ccb3469 100644 --- a/include/cutlass/gemm/device/gemm_universal_base.h +++ b/include/cutlass/gemm/device/gemm_universal_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/gemm.h b/include/cutlass/gemm/gemm.h index 011e03c95..78d0a6da6 100644 --- a/include/cutlass/gemm/gemm.h +++ b/include/cutlass/gemm/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -400,7 +400,8 @@ enum class GemmUniversalMode { kGemm, kGemmSplitKParallel, kBatched, - kArray + kArray, + kInvalid }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/default_gemm.h b/include/cutlass/gemm/kernel/default_gemm.h index f3f6a1495..0aba2d3a7 100644 --- a/include/cutlass/gemm/kernel/default_gemm.h +++ b/include/cutlass/gemm/kernel/default_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -49,6 +49,7 @@ #include "cutlass/gemm/kernel/gemm_pipelined.h" #include "cutlass/gemm/threadblock/default_mma_core_sm75.h" #include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" #include "cutlass/gemm/threadblock/default_mma.h" #include "cutlass/gemm/threadblock/default_mma_core_simt.h" #include "cutlass/gemm/threadblock/threadblock_swizzle.h" @@ -116,6 +117,68 @@ template < struct DefaultGemm; //////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Ampere Architecture +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix 
operand + typename LayoutB, + /// Access granularity of A matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// If true, kernel is configured to support serial reduction in the + /// epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator> +struct DefaultGemm { + /// Define the threadblock-scoped matrix multiply-accumulate + using Mma = typename cutlass::gemm::threadblock::DefaultMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, + ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, + ThreadblockShape, WarpShape, InstructionShape, Stages, + Operator>::ThreadblockMma; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + /// Define the epilogue + using Epilogue = + typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp, + EpilogueOutputOp::kCount>::Epilogue; + + /// Define the kernel-level GEMM operator. 
+ using GemmKernel = kernel::Gemm; +}; +//////////////////////////////////////////////////////////////////////////////// + /// Partial specialization for Turing Architecture template < /// Element type for A matrix operand @@ -201,6 +264,75 @@ struct DefaultGemm< }; //////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Ampere Integer Matrix Multiply Interleaved layout +template < + /// Element type for A matrix operand + typename ElementA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Number of Interleaved k + int InterleavedK, + /// If true, kernel is configured to support serial reduction in the + /// epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator, + /// Is Beta zero or not + bool IsBetaZero> +struct DefaultGemm< + ElementA, layout::ColumnMajorInterleaved, kAlignmentA, + ElementB, layout::RowMajorInterleaved, kAlignmentB, ElementC, + layout::ColumnMajorInterleaved, int32_t, + arch::OpClassTensorOp, arch::Sm80, ThreadblockShape, WarpShape, + InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, + SplitKSerial, Operator, IsBetaZero> { + using LayoutA = layout::ColumnMajorInterleaved; + using LayoutB = layout::RowMajorInterleaved; + using LayoutC = layout::ColumnMajorInterleaved; + + using 
ElementAccumulator = int32_t; + + /// Define the threadblock-scoped matrix multiply-accumulate + using Mma = typename cutlass::gemm::threadblock::DefaultMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80, + ThreadblockShape, WarpShape, InstructionShape, Stages, Operator, + true>::ThreadblockMma; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + /// Define the epilogue + using Epilogue = typename cutlass::epilogue::threadblock:: + DefaultInterleavedEpilogueTensorOp< + ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp, + 64 / sizeof_bits::value, InterleavedK, + IsBetaZero>::Epilogue; + + /// Define the kernel-level GEMM operator. + using GemmKernel = kernel::Gemm; +}; + +//////////////////////////////////////////////////////////////////////////////// + /// Partial specialization for Turing Integer Matrix Multiply Interleaved layout template < /// Element type for A matrix operand @@ -439,6 +571,80 @@ struct DefaultGemm< //////////////////////////////////////////////////////////////////////////////// +/// Partial specialization for Ampere +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of A matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + 
typename ThreadblockSwizzle, + /// Number of stages + int Stages, + /// If true, kernel is configured to support serial reduction in the epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator> +struct DefaultGemm, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + SplitKSerial, + Operator> { + + /// Define the threadblock-scoped matrix multiply-accumulate + using Mma = typename cutlass::gemm::threadblock::DefaultMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, + ElementAccumulator, layout::RowMajor, arch::OpClassSimt, arch::Sm80, + ThreadblockShape, WarpShape, GemmShape<1, 1, 1>, Stages, + Operator>::ThreadblockMma; + + static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount; + static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars"); + + /// Define the epilogue + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + typename Mma::Operator, + EpilogueOutputOp, + kEpilogueElementsPerAccess + >::Epilogue; + + /// Define the kernel-level GEMM operator. + using GemmKernel = kernel::Gemm; +}; + //////////////////////////////////////////////////////////////////////////////// /// Partial specialization for SIMT DP4A @@ -516,7 +722,6 @@ struct DefaultGemm; }; - #if defined(CUTLASS_ARCH_WMMA_ENABLED) //////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Wmma Gemm Kernel diff --git a/include/cutlass/gemm/kernel/default_gemm_complex.h b/include/cutlass/gemm/kernel/default_gemm_complex.h index a9ef4e316..15b1430c7 100644 --- a/include/cutlass/gemm/kernel/default_gemm_complex.h +++ b/include/cutlass/gemm/kernel/default_gemm_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -49,7 +49,9 @@ #include "cutlass/gemm/kernel/gemm_pipelined.h" #include "cutlass/gemm/threadblock/default_mma_core_sm75.h" #include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h" #include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h" #include "cutlass/gemm/threadblock/default_mma_core_simt.h" #include "cutlass/gemm/threadblock/threadblock_swizzle.h" #include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h" @@ -101,6 +103,7 @@ template < /// Complex elementwise transformation on B operand ComplexTransform TransformB, /// Multiply-add operator + // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) typename Operator, /// If true, kernel is configured to support serial reduction in the epilogue bool SplitKSerial @@ -109,6 +112,64 @@ struct DefaultGemmComplex; //////////////////////////////////////////////////////////////////////////////// +/// Partial specialization for Ampere Architecture +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename 
ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Multiply-add operator + // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator, + /// If true, kernel is configured to support serial reduction in the epilogue + bool SplitKSerial + > +struct DefaultGemmComplex< + ElementA, LayoutA, ElementB, LayoutB, ElementC, + layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp, + arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, + EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> { + + /// Define the threadblock-scoped matrix multiply-accumulate + using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex< + ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator, + layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, ThreadblockShape, + WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma; + + /// Define the epilogue + using Epilogue = + typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp< + ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp, + EpilogueOutputOp::kCount, Operator>::Epilogue; + + /// Define the kernel-level GEMM operator. 
+ using GemmKernel = kernel::Gemm; +}; + //////////////////////////////////////////////////////////////////////////////// } // namespace kernel diff --git a/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h b/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h index 3664fecee..870084834 100644 --- a/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h +++ b/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -49,6 +49,7 @@ #include "cutlass/epilogue/threadblock/default_epilogue_planar_complex.h" #include "cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h" +#include "cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -222,6 +223,122 @@ struct DefaultGemmPlanarComplexUniversal< ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Partial specialization for multiple pipeline stages. 
+template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Layout type for C and D matrix operands + typename LayoutC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator + > +struct DefaultGemmPlanarComplexUniversal< + ElementA, + LayoutA, + TransformA, + kAlignmentA, + ElementB, + LayoutB, + TransformB, + kAlignmentB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + Operator, + typename std::enable_if<(Stages > 2)>::type +> { + + /// Define planar complex valued variants instead + using Mma = typename gemm::threadblock::DefaultMmaPlanarComplexMultistage< + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + 
ElementAccumulator, + LayoutC, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + Stages, + TransformA, + TransformB, + Operator + >::ThreadblockMma; + + /// Planar complex epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpiloguePlanarComplex< + ThreadblockShape, + typename Mma::Policy::Operator, + OperatorClass, + ArchTag, + ThreadblockShape::kK / WarpShape::kK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + /// Define the kernel in terms of the default kernel + using GemmKernel = kernel::GemmPlanarComplex< + Mma, + Epilogue, + ThreadblockSwizzle + >; + + // Array variant + using GemmArrayKernel = kernel::GemmPlanarComplexArray< + Mma, + Epilogue, + ThreadblockSwizzle + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace kernel } // namespace gemm } // namespace cutlass diff --git a/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h b/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h index f50ead046..e23965d33 100644 --- a/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h +++ b/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/default_gemm_universal.h b/include/cutlass/gemm/kernel/default_gemm_universal.h index 23db577ce..579005cb4 100644 --- a/include/cutlass/gemm/kernel/default_gemm_universal.h +++ b/include/cutlass/gemm/kernel/default_gemm_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/default_gemv.h b/include/cutlass/gemm/kernel/default_gemv.h index 08a307903..36ae339c4 100755 --- a/include/cutlass/gemm/kernel/default_gemv.h +++ b/include/cutlass/gemm/kernel/default_gemv.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm.h b/include/cutlass/gemm/kernel/gemm.h index 36cf67311..6700659a1 100644 --- a/include/cutlass/gemm/kernel/gemm.h +++ b/include/cutlass/gemm/kernel/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_array.h b/include/cutlass/gemm/kernel/gemm_array.h index 30ff1d301..f63571b02 100644 --- a/include/cutlass/gemm/kernel/gemm_array.h +++ b/include/cutlass/gemm/kernel/gemm_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_batched.h b/include/cutlass/gemm/kernel/gemm_batched.h index 8bf4354a8..eb638375c 100644 --- a/include/cutlass/gemm/kernel/gemm_batched.h +++ b/include/cutlass/gemm/kernel/gemm_batched.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_pipelined.h b/include/cutlass/gemm/kernel/gemm_pipelined.h index 293592e74..6caa0eae3 100644 --- a/include/cutlass/gemm/kernel/gemm_pipelined.h +++ b/include/cutlass/gemm/kernel/gemm_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex.h b/include/cutlass/gemm/kernel/gemm_planar_complex.h index 3d975bb2a..e05112569 100644 --- a/include/cutlass/gemm/kernel/gemm_planar_complex.h +++ b/include/cutlass/gemm/kernel/gemm_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -421,6 +421,13 @@ public: cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(); + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() || + params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) { + + return; + } + int offset_k = 0; int problem_size_k = params.problem_size.k(); diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h index efb500b28..00841d469 100644 --- a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h +++ b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -377,6 +377,14 @@ public: ThreadblockSwizzle threadblock_swizzle; cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() || + params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) { + + return; + } + int batch_idx = threadblock_tile_offset.k(); int problem_size_m = params.problem_size.m(); diff --git a/include/cutlass/gemm/kernel/gemm_splitk_parallel.h b/include/cutlass/gemm/kernel/gemm_splitk_parallel.h index 2c5978aa8..973897521 100644 --- a/include/cutlass/gemm/kernel/gemm_splitk_parallel.h +++ b/include/cutlass/gemm/kernel/gemm_splitk_parallel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_universal.h b/include/cutlass/gemm/kernel/gemm_universal.h index 11831d8d6..6efd50a7f 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.h +++ b/include/cutlass/gemm/kernel/gemm_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ public: using OperatorClass = typename Mma::Operator::OperatorClass; using ThreadblockShape = typename Mma::Shape; using WarpShape = typename Mma::Operator::Shape; - using InstructionShape = typename Mma::Policy::Operator::Shape; + using InstructionShape = typename Mma::Policy::Operator::InstructionShape; using ArchTag = typename Mma::ArchTag; static int const kStages = Mma::kStages; @@ -259,9 +259,9 @@ public: Arguments const &args, void *workspace = nullptr) { - ptr_A = args.ptr_A; - ptr_B = args.ptr_B; - ptr_C = args.ptr_C; + ptr_A = const_cast(args.ptr_A); + ptr_B = const_cast(args.ptr_B); + ptr_C = const_cast(args.ptr_C); ptr_D = args.ptr_D; output_op = args.epilogue; @@ -303,6 +303,10 @@ public: return Status::kSuccess; } + static Status can_implement(Arguments const &args) { + return can_implement(args.problem_size); + } + /// Executes one GEMM CUTLASS_DEVICE void operator()(Params const ¶ms, SharedStorage &shared_storage) { diff --git a/include/cutlass/gemm/kernel/gemv_batched_strided.h b/include/cutlass/gemm/kernel/gemv_batched_strided.h index 852edde29..ea8d9bdf8 100755 --- a/include/cutlass/gemm/kernel/gemv_batched_strided.h +++ b/include/cutlass/gemm/kernel/gemv_batched_strided.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma.h b/include/cutlass/gemm/thread/mma.h index 41ea8b49c..15dfe4338 100644 --- a/include/cutlass/gemm/thread/mma.h +++ b/include/cutlass/gemm/thread/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma_sm50.h b/include/cutlass/gemm/thread/mma_sm50.h index 78c77bef2..04658f7bc 100644 --- a/include/cutlass/gemm/thread/mma_sm50.h +++ b/include/cutlass/gemm/thread/mma_sm50.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma_sm60.h b/include/cutlass/gemm/thread/mma_sm60.h index 66fed7e17..16d0d61c2 100644 --- a/include/cutlass/gemm/thread/mma_sm60.h +++ b/include/cutlass/gemm/thread/mma_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma_sm61.h b/include/cutlass/gemm/thread/mma_sm61.h index 13bbb5429..83e31b237 100644 --- a/include/cutlass/gemm/thread/mma_sm61.h +++ b/include/cutlass/gemm/thread/mma_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_gemv_core.h b/include/cutlass/gemm/threadblock/default_gemv_core.h index de234b851..9d692d6db 100755 --- a/include/cutlass/gemm/threadblock/default_gemv_core.h +++ b/include/cutlass/gemm/threadblock/default_gemv_core.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma.h b/include/cutlass/gemm/threadblock/default_mma.h index 11af1de48..3ebe14e6b 100644 --- a/include/cutlass/gemm/threadblock/default_mma.h +++ b/include/cutlass/gemm/threadblock/default_mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -38,6 +38,8 @@ #include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h" #include "cutlass/gemm/threadblock/default_mma_core_sm70.h" #include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" + #if defined(CUTLASS_ARCH_WMMA_ENABLED) #include "cutlass/gemm/threadblock/default_mma_core_wmma.h" #endif //CUTLASS_ARCH_WMMA_ENABLED @@ -203,6 +205,58 @@ struct DefaultMma +struct DefaultMma { + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, float, LayoutA, float, + LayoutB, float, layout::RowMajor, arch::OpClassTensorOp, 2, + arch::OpMultiplyAddFastF16>; + + // Define iterators over tiles from the A operand + using IteratorA = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + float, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>; + + // Define iterators over tiles from the B operand + using IteratorB = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + float, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + IteratorB, typename MmaCore::SmemIteratorB, float, + layout::RowMajor, typename MmaCore::MmaPolicy>; +}; + +//////////////////////////////////////////////////////////////////////////////// + /// Specialization for column-major-interleaved output template < /// Element type for A matrix operand @@ -271,6 +325,214 @@ struct DefaultMma +struct DefaultMma { + // Define the MmaCore components + using MmaCore = typename 
cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, Operator>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB, + MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, + typename MmaCore::MmaPolicy, Stages>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row-major output (OperatorClass TensorOp) +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// 
Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the multistage mainloop + int Stages, + /// Operation performed by GEMM + typename Operator + > +struct DefaultMma { + static cutlass::arch::CacheOperation::Kind const CacheOpA = + ((sizeof_bits::value * kAlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * kAlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, Operator, false, CacheOpA, CacheOpB>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB, + MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, + typename MmaCore::MmaPolicy, Stages>; + 
+//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for column-major-interleaved output +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the multistage mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// Number of Interleaved K + int InterleavedK> +struct DefaultMma, OperatorClass, + ArchTag, ThreadblockShape, WarpShape, InstructionShape, + Stages, Operator, true> { + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, + layout::ColumnMajorInterleaved, OperatorClass, Stages, + Operator, true>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename 
MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB, + MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, + typename MmaCore::MmaPolicy, Stages>; +}; + //////////////////////////////////////////////////////////////////////////////// /// Specialization for SIMT IDP4A Kernels diff --git a/include/cutlass/gemm/threadblock/default_mma_core.h b/include/cutlass/gemm/threadblock/default_mma_core.h index f346709e6..a7ac7c44b 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core.h +++ b/include/cutlass/gemm/threadblock/default_mma_core.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -40,6 +40,8 @@ #include "cutlass/gemm/warp/mma.h" #include "cutlass/gemm/threadblock/mma_pipelined.h" #include "cutlass/gemm/threadblock/mma_singlestage.h" +#include "cutlass/arch/cache_operation.h" + ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { @@ -86,6 +88,17 @@ template < /// Store the accumulators in row major or column major. Row major is used /// when output layout is interleaved. 
bool AccumulatorsInRowMajor = false + /// Cache operation of operand A + , cutlass::arch::CacheOperation::Kind CacheOpA = + cutlass::arch::CacheOperation::Global, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB = + cutlass::arch::CacheOperation::Global, + /// per-element transformation for elements of A + ComplexTransform TransformA = ComplexTransform::kNone, + /// per-element transformation for elements of B + ComplexTransform TransformB = ComplexTransform::kNone, + bool IsComplex = false // (is_complex::value || is_complex::value) > struct DefaultMmaCore; diff --git a/include/cutlass/gemm/threadblock/default_mma_core_simt.h b/include/cutlass/gemm/threadblock/default_mma_core_simt.h index 9eaa6a7a5..be5014937 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_simt.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm50.h b/include/cutlass/gemm/threadblock/default_mma_core_sm50.h index 37aee4762..782cd7aea 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sm50.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm50.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm70.h b/include/cutlass/gemm/threadblock/default_mma_core_sm70.h index a9ec80fd1..30b3b3c0a 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sm70.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm75.h b/include/cutlass/gemm/threadblock/default_mma_core_sm75.h index 51b5878f5..e7a2adcb1 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sm75.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -598,6 +598,523 @@ struct DefaultMmaCore +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = float; + using LayoutA = layout::ColumnMajor; + using ElementB = float; + using LayoutB = layout::RowMajor; + using ElementC = float; + using LayoutC = LayoutC_; + using OperatorClass = arch::OpClassTensorOp; + + /// Number of warps present + using WarpCount = GemmShape< + Shape::kM / WarpShape::kM, + Shape::kN / WarpShape::kN, + Shape::kK / WarpShape::kK + >; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && + !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size." + ); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 256; + + /// Default Operator + using Operator = arch::OpMultiplyAdd; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(half_t))>; + + // Shared memory layout + using SmemLayoutB = + layout::RowMajorTensorOpMultiplicandCongruous::value, + int(128 / sizeof(half_t))>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, + kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value + >; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileIterator< + MatrixShape, + half_t, + SmemLayoutA, + 1, + IteratorThreadMapA + >; + + /// 
ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, + kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileIterator< + MatrixShape, + half_t, + SmemLayoutB, + 0, + IteratorThreadMapB + >; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaTensorOp, + MatrixShape<0, 0>, + MatrixShape<0, 0>, + WarpCount::kK + >; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = float; + using LayoutA = layout::RowMajor; + using ElementB = float; + using LayoutB = layout::ColumnMajor; + using ElementC = float; + using LayoutC = LayoutC_; + using OperatorClass = arch::OpClassTensorOp; + + /// Number of warps present + using WarpCount = GemmShape< + Shape::kM / WarpShape::kM, + Shape::kN / WarpShape::kN, + Shape::kK / WarpShape::kK + >; + + // Divisility 
requirements + static_assert( + !(Shape::kM % WarpShape::kM) && + !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size." + ); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 256; + + /// Default Operator + using Operator = arch::OpMultiplyAdd; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousA = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedA = + kWarpSize / kWarpThreadArrangementContiguousA; + + static int const kWarpThreadArrangementContiguousB = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedB = + kWarpSize / kWarpThreadArrangementContiguousB; + + // + // Shared memory layouts + // + + using SmemLayoutA = + layout::RowMajorTensorOpMultiplicandCrosswise::value, + Shape::kK>; + + // Shared memory layout + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileIterator< + MatrixShape, + half_t, + SmemLayoutA, + 0, + IteratorThreadMapA + >; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = 
transform::threadblock::RegularTileIterator< + MatrixShape, + half_t, + SmemLayoutB, + 1, + IteratorThreadMapB + >; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaTensorOp, + MatrixShape<0, 0>, + MatrixShape<0, 0>, + WarpCount::kK + >; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: row-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = float; + using LayoutA = layout::RowMajor; + using ElementB = float; + using LayoutB = layout::RowMajor; + using ElementC = float; + using LayoutC = LayoutC_; + using OperatorClass = arch::OpClassTensorOp; + + /// Number of warps present + using WarpCount = GemmShape< + Shape::kM / WarpShape::kM, + Shape::kN / WarpShape::kN, + Shape::kK / WarpShape::kK + >; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && + !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size." 
+ ); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 256; + + /// Default Operator + using Operator = arch::OpMultiplyAdd; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousA = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedA = + kWarpSize / kWarpThreadArrangementContiguousA; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // Shared memory layout + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(half_t))>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileIterator< + MatrixShape, + half_t, + SmemLayoutA, + 0, + IteratorThreadMapA + >; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, + kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileIterator< + MatrixShape, + half_t, + SmemLayoutB, + 0, + IteratorThreadMapB + >; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB, + ElementC, LayoutC, Operator, 
WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaTensorOp, + MatrixShape<0, 0>, + MatrixShape<0, 0>, + WarpCount::kK + >; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: column-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = float; + using LayoutA = layout::ColumnMajor; + using ElementB = float; + using LayoutB = layout::ColumnMajor; + using ElementC = float; + using LayoutC = LayoutC_; + using OperatorClass = arch::OpClassTensorOp; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 256; + + /// Default Operator + using Operator = arch::OpMultiplyAdd; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousB = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedB = + kWarpSize / kWarpThreadArrangementContiguousB; + + // + // 
Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(half_t))>; + + // Shared memory layout + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileIterator< + MatrixShape, half_t, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileIterator< + MatrixShape, half_t, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, MatrixShape<0, 0>, + WarpCount::kK>; +}; + //////////////////////////////////////////////////////////////////////////////// /// Partial specialization: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm80.h b/include/cutlass/gemm/threadblock/default_mma_core_sm80.h new file mode 100644 index 000000000..d9b3d9a0c --- /dev/null +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm80.h @@ -0,0 +1,2130 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Defines basic properties needed by CTA-level GEMMs assuming + expectations about data layout of the global memory fragments, data types, + and internal tile sizes. 
+ + Partial specializations for threadblock::Mma operations targeting TensorOp + instructions. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" + +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + +#include "cutlass/gemm/warp/mma_simt_policy.h" +#include "cutlass/gemm/warp/mma_simt.h" +#include "cutlass/gemm/warp/default_mma_tensor_op.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + +#include "cutlass/gemm/threadblock/default_mma_core.h" +#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core.h" +#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h" +#include "cutlass/gemm/threadblock/mma_multistage.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for double-precision +/// +/// A: column-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation 
performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = double; + using LayoutA = layout::ColumnMajor; + using ElementB = double; + using LayoutB = layout::ColumnMajor; + using ElementC = double; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 64; + + /// Default Operator + using Operator = Operator_; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b; + + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = 
transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +/// Partial specialization for double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = double; + using LayoutA = layout::ColumnMajor; + using 
ElementB = double; + using LayoutB = layout::RowMajor; + using ElementC = double; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 64; + + /// Default Operator + using Operator = Operator_; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b; + + // Shared memory layout + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = 
transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for double-precision +/// +/// A: row-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = double; + using LayoutA = layout::RowMajor; + using ElementB = double; + using LayoutB = layout::ColumnMajor; + using ElementC = double; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps 
present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 64; + + /// Default Operator + using Operator = Operator_; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise; + + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, 
WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Partial specialization for double-precision +/// +/// A: row-major +/// B: row-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = double; + using LayoutA = layout::RowMajor; + using ElementB = double; + using LayoutB = layout::RowMajor; + using ElementC = double; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + 
/// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 64; + + /// Default Operator + using Operator = Operator_; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise; + + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b; + + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for float-precision +/// +/// ElementA: complex +/// ElementB: complex +/// ElementC: complex +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// 
GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout for A operand + typename LayoutA_, + /// Layout for B operand + typename LayoutB_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// per-element transformation for elements of A + ComplexTransform TransformA_, + /// per-element transformation for elements of B + ComplexTransform TransformB_ + > +struct DefaultMmaCore< + Shape_, WarpShape_, GemmShape<16, 8, 8>, + complex, LayoutA_, + complex, LayoutB_, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + Operator_, + false, + CacheOpA, + CacheOpB, + TransformA_, TransformB_, true> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<16, 8, 8>; + using ElementA = complex; + using LayoutA = LayoutA_; + using ElementB = complex; + using LayoutB = LayoutB_; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + static const ComplexTransform TransformA = TransformA_; + static const ComplexTransform TransformB = TransformB_; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = 
warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + static_assert( + platform::is_same::value || + platform::is_same::value, + "The operator tag must indicate complex multiplication."); + + // + // Underlying template + // + + using MmaComplexCore = DefaultMultistageMmaComplexCore< + Shape, WarpShape, InstructionShape, + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + arch::OpClassTensorOp, + kStages, + TransformA, + TransformB, + Operator, + kCacheOpA, + kCacheOpB + >; + + // + // Shared memory layouts + // + + using SmemLayoutA = typename MmaComplexCore::SmemLayoutA; + + // Shared memory layout + using SmemLayoutB = typename MmaComplexCore::SmemLayoutB; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = typename MmaComplexCore::IteratorThreadMapA; + + /// Shared memory iterator to A operand + using SmemIteratorA = typename MmaComplexCore::SmemIteratorA; + + /// ThreadMap of iterator B + using IteratorThreadMapB = typename MmaComplexCore::IteratorThreadMapB; + + /// Shared memory iterator to B operand + using SmemIteratorB = typename MmaComplexCore::SmemIteratorB; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename MmaComplexCore::MmaTensorOp; + + /// Policy used to define MmaPipelined + using MmaPolicy = typename MmaComplexCore::MmaPolicy; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for double-precision +/// +/// ElementA: complex +/// ElementB: complex +/// ElementC: complex +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply 
operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout for A operand + typename LayoutA_, + /// Layout for B operand + typename LayoutB_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// per-element transformation for elements of A + ComplexTransform TransformA_, + /// per-element transformation for elements of B + ComplexTransform TransformB_ + > +struct DefaultMmaCore< + Shape_, WarpShape_, GemmShape<8, 8, 4>, + complex, LayoutA_, + complex, LayoutB_, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + Operator_, + false, + CacheOpA, + CacheOpB, + TransformA_, TransformB_, true> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<8, 8, 4>; + using ElementA = complex; + using LayoutA = LayoutA_; + using ElementB = complex; + using LayoutB = LayoutB_; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + static const ComplexTransform TransformA = TransformA_; + static const ComplexTransform TransformB = TransformB_; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const 
kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 64; + + /// Default Operator + using Operator = Operator_; + + static_assert( + platform::is_same::value || + platform::is_same::value, + "The operator tag must indicate complex multiplication."); + + // + // Underlying template + // + + using MmaComplexCore = DefaultMultistageMmaComplexCore< + Shape, WarpShape, InstructionShape, + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + arch::OpClassTensorOp, + kStages, + TransformA, + TransformB, + Operator, + kCacheOpA, + kCacheOpB + >; + + // + // Shared memory layouts + // + + using SmemLayoutA = typename MmaComplexCore::SmemLayoutA; + + // Shared memory layout + using SmemLayoutB = typename MmaComplexCore::SmemLayoutB; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = typename MmaComplexCore::IteratorThreadMapA; + + /// Shared memory iterator to A operand + using SmemIteratorA = typename MmaComplexCore::SmemIteratorA; + + /// ThreadMap of iterator B + using IteratorThreadMapB = typename MmaComplexCore::IteratorThreadMapB; + + /// Shared memory iterator to B operand + using SmemIteratorB = typename MmaComplexCore::SmemIteratorB; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename MmaComplexCore::MmaTensorOp; + + /// Policy used to define MmaPipelined + using MmaPolicy = typename MmaComplexCore::MmaPolicy; +}; + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: column-major +/// B: row-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// 
Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::ColumnMajor; + using ElementB = ElementB_; + using LayoutB = layout::RowMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(ElementA))>; + 
+ // Shared memory layout + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(ElementB))>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of 
A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = layout::ColumnMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousA = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedA = + kWarpSize / kWarpThreadArrangementContiguousA; + + static int const kWarpThreadArrangementContiguousB = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedB = + kWarpSize / kWarpThreadArrangementContiguousB; + + // + // 
Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // Shared memory layout + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: column-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// 
Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + + using LayoutA = layout::ColumnMajor; + using ElementB = ElementB_; + using LayoutB = layout::ColumnMajor; + + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousB = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedB = + kWarpSize / kWarpThreadArrangementContiguousB; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous< + 
sizeof_bits::value, int(128 / sizeof(ElementA))>; + + // Shared memory layout + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: row-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename 
InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = layout::RowMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousA = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedA = + kWarpSize / kWarpThreadArrangementContiguousA; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // Shared memory layout + using SmemLayoutB = 
layout::RowMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(ElementB))>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: column-major-interleaved +/// B: row-major-interleaved +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename 
ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. + bool AccumulatorsInRowMajor, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Number of interleaved K + int InterleavedK> +struct DefaultMmaCore, ElementB_, + layout::RowMajorInterleaved, ElementC_, + LayoutC_, arch::OpClassTensorOp, Stages, Operator_, + AccumulatorsInRowMajor, CacheOpA, CacheOpB> { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::ColumnMajorInterleaved; + using ElementB = ElementB_; + using LayoutB = layout::RowMajorInterleaved; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + static int const kInterleavedK = InterleavedK; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kElementsPerAccess = + 
kAccessSizeInBits / sizeof_bits::value; + + static int const kWarpThreadArrangementContiguous = + kInterleavedK / kElementsPerAccess; + + static int const kWarpThreadArrangementStrided = + kWarpSize / kWarpThreadArrangementContiguous; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, kInterleavedK>; + + // Shared memory layout + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, kInterleavedK>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, + kThreads, layout::PitchLinearShape<32, 1>, kElementsPerAccess>; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapA = transform::TransposePitchLinearThreadMap< + IteratorThreadMapA, + layout::PitchLinearShape>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + SmemThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, + kThreads, layout::PitchLinearShape<32, 1>, kElementsPerAccess>; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapB = transform::TransposePitchLinearThreadMap< + IteratorThreadMapB, + layout::PitchLinearShape>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + SmemThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK, AccumulatorsInRowMajor>::Type; + + /// Policy used to 
define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for SIMT GEMMs using multistage pipeline. +/// +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by Simt + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::ColumnMajor; + using ElementB = ElementB_; + using LayoutB = layout::ColumnMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// 
Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + // Shared memory layout + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + SmemThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 
2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape<0, 0>, + MatrixShape<0, Shape::kK / 32>, + WarpCount::kK>; +}; + +/// Partial specialization for SIMT GEMMs using multistage pipeline. 
+/// +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by Simt + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::ColumnMajor; + using ElementB = ElementB_; + using LayoutB = layout::RowMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kElementsPerAccess = 1; + + // + // Shared memory 
layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + // Shared memory layout + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 
2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape<0, 0>, + MatrixShape<0, 0>, + WarpCount::kK>; +}; + +/// Partial specialization for SIMT GEMMs using multistage pipeline. 
+/// +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by Simt + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = layout::ColumnMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kElementsPerAccess = 1; + + // + // Shared memory 
layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + // Shared memory layout + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + SmemThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + SmemThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 
2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape, + MatrixShape<0, Shape::kK / 32>, + WarpCount::kK>; +}; + +/// Partial specialization for SIMT GEMMs using multistage pipeline. 
+/// +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by Simt + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = layout::RowMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kElementsPerAccess = 1; + + // + // Shared memory 
layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + // Shared memory layout + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + SmemThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 
2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape, + MatrixShape<0, 0>, + WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/include/cutlass/gemm/threadblock/default_mma_core_wmma.h b/include/cutlass/gemm/threadblock/default_mma_core_wmma.h index ef51be23a..821449432 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_wmma.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h b/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h new file mode 100644 index 000000000..2f4a07961 --- /dev/null +++ b/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h @@ -0,0 +1,130 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Template for a multistage GEMM kernel. Does not compute batching or support split-K. +*/ + +#pragma once + +#include "cutlass/arch/arch.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/gemm/threadblock/mma_planar_complex_multistage.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Operator class tag + typename OperatorClass_, + /// Tag indicating architecture to tune for + typename 
ArchTag_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape_, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transformation on operand B + ComplexTransform TransformB = ComplexTransform::kNone, + /// Math operator tag (e.g. arch::OpMultiplyAdd) + typename Operator = arch::OpMultiplyAdd +> +struct DefaultMmaPlanarComplexMultistage { + + // Construct a planar complex variant from the real-valued variant + using RealMmaMultistage = typename DefaultMma< + ElementA_, + LayoutA_, + kAlignmentA, + ElementB_, + LayoutB_, + kAlignmentB, + ElementAccumulator_, + LayoutC_, + OperatorClass_, + ArchTag_, + ThreadblockShape_, + WarpShape_, + InstructionShape_, + Stages, + Operator + >::ThreadblockMma; + + using ThreadblockMma = MmaPlanarComplexMultistage< + ThreadblockShape_, + typename RealMmaMultistage::IteratorA, + typename RealMmaMultistage::SmemIteratorA, + cutlass::arch::CacheOperation::Global, + typename RealMmaMultistage::IteratorB, + typename RealMmaMultistage::SmemIteratorB, + cutlass::arch::CacheOperation::Global, + ElementAccumulator_, + LayoutC_, + typename RealMmaMultistage::Policy, + Stages, + TransformA, + TransformB + >; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h new file mode 100644 index 000000000..7f3d534a1 --- /dev/null +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h @@ 
-0,0 +1,154 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Template for a multistage GEMM kernel. Does not compute batching or support split-K. 
+*/ + +#pragma once + +#include "cutlass/arch/arch.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass/numeric_types.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Element type for internal accumulation + typename ElementAccumulator_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Operator class tag + typename OperatorClass_, + /// Tag indicating architecture to tune for + typename ArchTag_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape_, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transformation on operand B + ComplexTransform TransformB = ComplexTransform::kNone, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator = arch::OpMultiplyAddComplex, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor = false> +struct DefaultMultistageMmaComplex; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row-major output +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the multistage mainloop + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator> +struct DefaultMultistageMmaComplex { + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, TransformA, TransformB, Operator>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename 
MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB, + MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, + typename MmaCore::MmaPolicy, Stages>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h new file mode 100644 index 000000000..613c88e3e --- /dev/null +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h @@ -0,0 +1,113 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines basic properties needed by CTA-level GEMMs assuming + expectations about data layout of the global memory fragments, data types, + and internal tile sizes. + + Partial specializations for threadblock::Mma operations targeting TensorOp + instructions. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/complex.h" + +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + +#include "cutlass/gemm/warp/mma_simt_policy.h" +#include "cutlass/gemm/warp/mma_simt.h" +#include "cutlass/gemm/warp/default_mma_tensor_op.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + +#include "cutlass/gemm/threadblock/default_mma_core.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/transform/pitch_linear_thread_map.h" + +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Template defining default matrix multiply operators inferred from +/// threadblock tile size, global memory data layout, and target math +/// instruction. 
+template < + /// Shape of threadblock-scoped matrix multiply operator + typename Shape, + /// Shape of warp-level matrix multiply operator + typename WarpShape, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape, + /// Element data type of A operand + typename ElementA, + /// Layout of operand A + typename LayoutA, + /// Element data type of B operand + typename ElementB, + /// Layout of operand B + typename LayoutB, + /// Data type of accumulator + typename ElementC, + /// Layout of accumulator + typename LayoutC, + /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp) + typename OperatorClass, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator = arch::OpMultiplyAddComplex, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA = + cutlass::arch::CacheOperation::Global, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB = + cutlass::arch::CacheOperation::Global> +struct DefaultMultistageMmaComplexCore; + + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h new file mode 100644 index 000000000..230e8d768 --- /dev/null +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h @@ -0,0 +1,1113 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, 
NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines basic properties needed by CTA-level GEMMs assuming + expectations about data layout of the global memory fragments, data types, + and internal tile sizes. + + Partial specializations for threadblock::Mma operations targeting TensorOp + instructions. 
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" + +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + +#include "cutlass/gemm/warp/mma_simt_policy.h" +#include "cutlass/gemm/warp/mma_simt.h" +#include "cutlass/gemm/warp/default_mma_complex_tensor_op.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + +#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h" +#include "cutlass/gemm/threadblock/mma_multistage.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of 
operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<8, 8, 4>, + complex, layout::ColumnMajor, + complex, layout::RowMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<8, 8, 4>; + using ElementA = complex; + using LayoutA = layout::ColumnMajor; + using ElementB = complex; + using LayoutB = layout::RowMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped 128 + static int const kAccessSizeInBits = 128; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous128b; + + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous128b; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using 
IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + + +/// Partial specialization for complex double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename 
Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<8, 8, 4>, + complex, layout::ColumnMajor, + complex, layout::ColumnMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<8, 8, 4>; + using ElementA = complex; + using LayoutA = layout::ColumnMajor; + using ElementB = complex; + using LayoutB = layout::ColumnMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + using Operator = Operator_; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped 128 + static int const kAccessSizeInBits = 128; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + + // + // Iterators to write to shared memory + // + + /// 
ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex double-precision +/// +/// A: row-major +/// B: column-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform 
TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<8, 8, 4>, + complex, layout::RowMajor, + complex, layout::ColumnMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<8, 8, 4>; + using ElementA = complex; + using LayoutA = layout::RowMajor; + using ElementB = complex; + using LayoutB = layout::ColumnMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped 128 + static int const kAccessSizeInBits = 128; + + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise128x4; + using SmemLayoutB = 
layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + + +/// Partial specialization for complex double-precision +/// +/// A: row-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on 
operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<8, 8, 4>, + complex, layout::RowMajor, + complex, layout::RowMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<8, 8, 4>; + using ElementA = complex; + using LayoutA = layout::RowMajor; + using ElementB = complex; + using LayoutB = layout::RowMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped 128 + static int const kAccessSizeInBits = 128; + + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise128x4; + using 
SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous128b; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +/// Partial specialization for complex floating-point +/// +/// A: column-major +/// B: column-major +/// Operator: arch::OpMultiplyAddComplex +/// Math Instruction: MMA.1688.F32.TF32 +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + 
int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<16, 8, 8>, + complex, layout::ColumnMajor, + complex, layout::ColumnMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<16, 8, 8>; + using ElementA = complex; + using LayoutA = layout::ColumnMajor; + using ElementB = complex; + using LayoutB = layout::ColumnMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped + static int const kAccessSizeInBits = 64; + + // + // Shared memory layouts + 
// + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b; + + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + + +/// Partial specialization for complex floating-point +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex +/// Math Instruction: MMA.1688.F32.TF32 +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int 
Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<16, 8, 8>, + complex, layout::ColumnMajor, + complex, layout::RowMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<16, 8, 8>; + using ElementA = complex; + using LayoutA = layout::ColumnMajor; + using ElementB = complex; + using LayoutB = layout::RowMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped + static int const kAccessSizeInBits = 64; + + // + // Shared memory layouts + // + + 
using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b; + + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex floating-point +/// +/// A: row-major +/// B: column-major +/// Operator: arch::OpMultiplyAddComplex +/// Math Instruction: MMA.1688.F32.TF32 +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// 
Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<16, 8, 8>, + complex, layout::RowMajor, + complex, layout::ColumnMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<16, 8, 8>; + using ElementA = complex; + using LayoutA = layout::RowMajor; + using ElementB = complex; + using LayoutB = layout::ColumnMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisibility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const 
kAccessSizeInBits = 64; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise; + + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex floating-point +/// +/// A: row-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex +/// Math Instruction: MMA.1688.F32.TF32 +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply 
operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<16, 8, 8>, + complex, layout::RowMajor, + complex, layout::RowMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<16, 8, 8>; + using ElementA = complex; + using LayoutA = layout::RowMajor; + using ElementB = complex; + using LayoutB = layout::RowMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisibility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + 
+ /// Size of a threadblock-scoped + static int const kAccessSizeInBits = 64; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise; + + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/gemv.h b/include/cutlass/gemm/threadblock/gemv.h deleted file mode 100755 index 54da93a98..000000000 --- a/include/cutlass/gemm/threadblock/gemv.h 
+++ /dev/null @@ -1,140 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, are permitted - * provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, this list of - * conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright notice, this list of - * conditions and the following disclaimer in the documentation and/or other materials - * provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used - * to endorse or promote products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Template for a threadblock-scoped GEMV kernel. 
-*/ - -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/array.h" -#include "cutlass/numeric_types.h" -#include "cutlass/matrix_shape.h" - -#include "cutlass/gemm/gemm.h" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace gemm { -namespace threadblock { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Structure to compute the matrix-vector product using SIMT math instructions. -template < - class Core_ //< GemvCore -> -class Gemv { -public: - using Shape = typename Core_::Shape; - - /// The MMA operator that computes GEMV - using Operator = typename Core_::Operator; - - /// Iterates over A in global memory - using IteratorA = typename Core_::IteratorA; - - /// Iterates over B in global memory - using IteratorB = typename Core_::IteratorB; - - /// Fragment of operand C loaded from global memory - using IteratorC = typename Core_::IteratorC; - - /// Fragment of operand A loaded from global memory - using FragmentA = typename IteratorA::Fragment; - - /// Fragment of operand B loaded from global memory - using FragmentB = typename IteratorB::Fragment; - - /// Fragment of operand accumulator loaded/stored to global memory - using FragmentC = typename Operator::FragmentC; - - /// Shape of the per-thread GEMV operation - using ThreadShape = typename Core_::ThreadShape; - -public: - CUTLASS_DEVICE - Gemv() { } - - CUTLASS_DEVICE - void operator()( - GemmCoord const &problem_size, ///< problem size of batched GEMV - FragmentC &accum, ///< destination accumulator tile - IteratorA iterator_A, ///< iterator over A operand in global memory - IteratorB iterator_B, ///< iterator over B operand in global memory - FragmentC const &src_accum) { ///< source accumualtor tile - - // - // Prologue - // - - FragmentA frag_A; - FragmentB frag_B; - frag_A.clear(); - frag_B.clear(); - - iterator_A.load(frag_A); - 
iterator_B.load(frag_B); - ++iterator_A; - ++iterator_B; - - // - // Mainloop - // - Operator thread_mma; - int gemm_k = problem_size.k(); - - if (gemm_k < Shape::kK) - { - iterator_A.clear_mask(); - iterator_B.clear_mask(); - } - - // iterate over K to accumulate result - CUTLASS_GEMM_LOOP - for (; gemm_k > 0; gemm_k -= Shape::kK) { - thread_mma(accum, frag_A, frag_B, accum); - - iterator_A.load(frag_A); - iterator_B.load(frag_B); - ++iterator_A; - ++iterator_B; - - if (gemm_k < Shape::kK) - { - iterator_A.clear_mask(); - iterator_B.clear_mask(); - } - } - - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace threadblock -} // namespace gemm -} // namespace cutlass diff --git a/include/cutlass/gemm/threadblock/mma_base.h b/include/cutlass/gemm/threadblock/mma_base.h index 7e6d4fe64..dbf3d31f5 100644 --- a/include/cutlass/gemm/threadblock/mma_base.h +++ b/include/cutlass/gemm/threadblock/mma_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_multistage.h b/include/cutlass/gemm/threadblock/mma_multistage.h new file mode 100644 index 000000000..0431c3060 --- /dev/null +++ b/include/cutlass/gemm/threadblock/mma_multistage.h @@ -0,0 +1,526 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. 
+*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/threadblock/mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class MmaMultistage : + public MmaBase { +public: + ///< Base class + using Base = MmaBase; + ///< Size of the Gemm problem - concept: 
gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy = Policy_; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + /// Internal structure exposed for introspection. 
+ struct Detail { + + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load one group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load one group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + MmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + 
smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B, + int group_start_A = 0, int group_start_B = 0) { + iterator_A.set_iteration_index(group_start_A * + IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_A.get(); + + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index(group_start_B * + IteratorB::kAccessesPerVector); + 
this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B.get(); + + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + + ++iterator_B; + } + ++this->smem_iterator_B_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< initial value of accumulator + FragmentC const &src_accum) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations) { + + if (gemm_k_iterations == 0) { + iterator_A.clear_mask(); + iterator_B.clear_mask(); + } + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? 
kSrcBytes : 0); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + + ++this->smem_iterator_B_; + } + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // Waits until kStages-2 stages have committed. 
+ cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (gemm_k_iterations == 0) { + iterator_A.clear_mask(); + iterator_B.clear_mask(); + } + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0], + warp_loaded_frag_A[0], warp_loaded_frag_B[0]); + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
+ + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B[warp_mma_k % 2]); + + warp_mma( + accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + accum + ); + + // Issue global->shared copies for this stage + if (warp_mma_k < Base::kWarpGemmIterations - 1) { + int group_start_iteration_A, group_start_iteration_B; + + group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA; + group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, + group_start_iteration_B); + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + int group_start_iteration_A, group_start_iteration_B; + group_start_iteration_A = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, + group_start_iteration_B); + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages have committed. 
+ arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + if (gemm_k_iterations == 0) { + iterator_A.clear_mask(); + iterator_B.clear_mask(); + } + } + + // Do any conversions feeding the first stage at the end of the loop so + // we can start right away on mma instructions + if (warp_mma_k + 1 == Base::kWarpGemmIterations) + warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + } + + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/mma_pipelined.h b/include/cutlass/gemm/threadblock/mma_pipelined.h index 735950cf7..80954f6c4 100644 --- a/include/cutlass/gemm/threadblock/mma_pipelined.h +++ 
b/include/cutlass/gemm/threadblock/mma_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -75,7 +75,7 @@ template < typename IteratorA_::Element, IteratorA_::Fragment::kElements>, /// - /// Transformation applied to A operand + /// Transformation applied to B operand typename TransformB_ = NumericArrayConverter< typename SmemIteratorB_::Element, typename IteratorB_::Element, diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_base.h b/include/cutlass/gemm/threadblock/mma_planar_complex_base.h index 9491f56fd..b37b41846 100644 --- a/include/cutlass/gemm/threadblock/mma_planar_complex_base.h +++ b/include/cutlass/gemm/threadblock/mma_planar_complex_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h b/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h new file mode 100644 index 000000000..18e63b580 --- /dev/null +++ b/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h @@ -0,0 +1,642 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/array_planar_complex.h" +#include "cutlass/functional.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/mma_planar_complex_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Transformation applied to A + ComplexTransform TransformA = ComplexTransform::kNone, + /// Transformation 
applied to B + ComplexTransform TransformB = ComplexTransform::kNone +> +class MmaPlanarComplexMultistage : + public MmaPlanarComplexBase {public: + ///< Base class + using Base = MmaPlanarComplexBase; + + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + + ///< Data type of accumulator matrix + using ElementC = ElementC_; + + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + + ///< Policy describing tuning details + using Policy = Policy_; + + ///< Architecture tag + using ArchTag = arch::Sm80; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + /// Transformation applied to A + static ComplexTransform const kTransformA = TransformA; + + /// Transformation applied to B + static ComplexTransform const kTransformB = TransformB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = ArrayPlanarComplex< + typename Policy::Operator::FragmentC::Element, + Policy::Operator::FragmentC::kElements + >; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Internal structure exposed for introspection. 
+ struct Detail { + + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of LDGSTS instructions to load one stage of operand A + static int const TBLDGSTSIterationsA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of LDGSTS instructions to load one stage of operand B + static int const TBLDGSTSIterationsB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of LDGSTS instructions to load on group of operand A + static int const kAccessesPerGroupA = + (TBLDGSTSIterationsA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of LDGSTS instructions to load on group of operand B + static int const kAccessesPerGroupB = + (TBLDGSTSIterationsB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + MmaPlanarComplexMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the 
threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + +private: + + CUTLASS_DEVICE + void copy_tiles_and_advance( + IteratorA &iterator_A_real, + IteratorA &iterator_A_imag, + + IteratorB &iterator_B_real, + IteratorB &iterator_B_imag, + + int group_start_A = 0, + int group_start_B = 0) { + + iterator_A_real.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + iterator_A_imag.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // LDGSTS for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + + auto gmem_ptr_real = iterator_A_real.get(); + auto gmem_ptr_imag = iterator_A_imag.get(); + + bool pred_guard = iterator_A_real.valid(); + cutlass::arch::cp_async( + dst_ptr + v, + gmem_ptr_real, + pred_guard); + cutlass::arch::cp_async( + dst_ptr + v + (Base::SharedStorage::kImaginaryStrideA / IteratorA::ThreadMap::kElementsPerAccess), + reinterpret_cast(gmem_ptr_imag), + pred_guard); + + 
++iterator_A_real; + ++iterator_A_imag; + } + + ++this->smem_iterator_A_; + } + + iterator_B_real.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector); + iterator_B_imag.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // LDGSTS for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_B_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + auto gmem_ptr_real = iterator_B_real.get(); + auto gmem_ptr_imag = iterator_B_imag.get(); + + bool pred_guard = iterator_B_real.valid(); + cutlass::arch::cp_async( + dst_ptr + v, + gmem_ptr_real, + pred_guard); + cutlass::arch::cp_async( + dst_ptr + v + (Base::SharedStorage::kImaginaryStrideB / IteratorB::ThreadMap::kElementsPerAccess), + reinterpret_cast(gmem_ptr_imag), + pred_guard); + + ++iterator_B_real; + ++iterator_B_imag; + } + ++this->smem_iterator_B_; + } + } + + CUTLASS_DEVICE + void warp_mma_planar_complex( + Operator & warp_mma, + FragmentC &accum, + WarpFragmentA const & real_A, + WarpFragmentA const & imag_A, + WarpFragmentB const & real_B, + WarpFragmentB const & imag_B) { + + cutlass::negate> neg_op_B; + + WarpFragmentB neg_real_B = neg_op_B(real_B); + WarpFragmentB neg_imag_B = neg_op_B(imag_B); + + warp_mma(accum.real, real_A, real_B, accum.real); + + if (kTransformB == ComplexTransform::kNone) { + warp_mma(accum.imag, real_A, imag_B, accum.imag); + } + else { + warp_mma(accum.imag, real_A, neg_imag_B, accum.imag); + } + + if (kTransformA == ComplexTransform::kNone) { + warp_mma(accum.imag, imag_A, real_B, accum.imag); + } + else { + warp_mma(accum.imag, imag_A, neg_real_B, accum.imag); + } + + if (kTransformA == 
ComplexTransform::kNone ^ kTransformB == ComplexTransform::kNone) { + warp_mma(accum.real, imag_A, imag_B, accum.real); + } + else { + warp_mma(accum.real, imag_A, neg_imag_B, accum.real); + } + } + +public: + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA iterator_A_real, + ///< iterator over A operand in global memory + IteratorA iterator_A_imag, + ///< iterator over B operand in global memory + IteratorB iterator_B_real, + ///< iterator over B operand in global memory + IteratorB iterator_B_imag, + ///< initial value of accumulator + FragmentC const &src_accum) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations) { + + if (gemm_k_iterations == 0) { + iterator_A_real.clear_mask(); + iterator_A_imag.clear_mask(); + iterator_B_real.clear_mask(); + iterator_B_imag.clear_mask(); + } + + iterator_A_real.set_iteration_index(0); + iterator_A_imag.set_iteration_index(0); + + this->smem_iterator_A_.set_iteration_index(0); + + // LDGSTS for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::TBLDGSTSIterationsA; ++j) { + + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; + + bool pred_guard = iterator_A_real.valid(); + + auto src_ptr_real = iterator_A_real.get(); + auto src_ptr_imag = iterator_A_imag.get(); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, src_ptr_real, pred_guard); + + cutlass::arch::cp_async_zfill( + dst_ptr + v + + Base::SharedStorage::kImaginaryStrideA / + 
IteratorA::ThreadMap::kElementsPerAccess, + reinterpret_cast(src_ptr_imag), + pred_guard); + + ++iterator_A_real; + ++iterator_A_imag; + } + + ++this->smem_iterator_A_; + } + + iterator_B_real.set_iteration_index(0); + iterator_B_imag.set_iteration_index(0); + + this->smem_iterator_B_.set_iteration_index(0); + + // LDGSTS for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::TBLDGSTSIterationsB; ++j) { + + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8; + + bool pred_guard = iterator_B_real.valid(); + + auto src_ptr_real = iterator_B_real.get(); + auto src_ptr_imag = iterator_B_imag.get(); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, src_ptr_real, pred_guard); + + cutlass::arch::cp_async_zfill( + dst_ptr + v + + Base::SharedStorage::kImaginaryStrideB / + IteratorB::ThreadMap::kElementsPerAccess, + reinterpret_cast(src_ptr_imag), + pred_guard); + + ++iterator_B_real; + ++iterator_B_imag; + } + + ++this->smem_iterator_B_; + } + + // Move to the next stage + iterator_A_real.add_tile_offset({0, 1}); + iterator_A_imag.add_tile_offset({0, 1}); + + iterator_B_real.add_tile_offset({1, 0}); + iterator_B_imag.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Inserts a memory fence between stages of cp.async instructions + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // Blocks until all but kStages-2 cp.async stages have committed. 
+ cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + + WarpFragmentA warp_frag_real_A[2]; + WarpFragmentA warp_frag_imag_A[2]; + + WarpFragmentB warp_frag_real_B[2]; + WarpFragmentB warp_frag_imag_B[2]; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_real_A[0]); + this->warp_tile_iterator_A_.load_with_pointer_offset(warp_frag_imag_A[0], Base::SharedStorage::kImaginaryStrideA); + + this->warp_tile_iterator_B_.load(warp_frag_real_B[0]); + this->warp_tile_iterator_B_.load_with_pointer_offset(warp_frag_imag_B[0], Base::SharedStorage::kImaginaryStrideB); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (gemm_k_iterations == 0) { + iterator_A_real.clear_mask(); + iterator_A_imag.clear_mask(); + iterator_B_real.clear_mask(); + iterator_B_imag.clear_mask(); + } + + // Start issuing the first group of the next stage outside of the mainloop + copy_tiles_and_advance(iterator_A_real, iterator_A_imag, iterator_B_real, iterator_B_imag); + + Operator warp_mma; + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
+ + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_real_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_A_.load_with_pointer_offset(warp_frag_imag_A[(warp_mma_k + 1) % 2], Base::SharedStorage::kImaginaryStrideA); + + this->warp_tile_iterator_B_.load(warp_frag_real_B[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load_with_pointer_offset(warp_frag_imag_B[(warp_mma_k + 1) % 2], Base::SharedStorage::kImaginaryStrideB); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + // Issue global->shared copies for the next stage + int group_start_iteration_A, group_start_iteration_B; + + if (warp_mma_k + 1 == Base::kWarpGemmIterations) { + group_start_iteration_A = 0; + group_start_iteration_B = 0; + } + else { + group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + } + + copy_tiles_and_advance( + iterator_A_real, + iterator_A_imag, + iterator_B_real, + iterator_B_imag, + group_start_iteration_A, + group_start_iteration_B); + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + // Inserts a memory fence between stages of cp.async instructions + cutlass::arch::cp_async_fence(); + + // Blocks until all but kStages-2 cp.async stages have committed. 
+ arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A_real.add_tile_offset({0, 1}); + iterator_A_imag.add_tile_offset({0, 1}); + + iterator_B_real.add_tile_offset({1, 0}); + iterator_B_imag.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + if (gemm_k_iterations == 0) { + iterator_A_real.clear_mask(); + iterator_A_imag.clear_mask(); + iterator_B_real.clear_mask(); + iterator_B_imag.clear_mask(); + } + } + + warp_mma_planar_complex( + warp_mma, + accum, + warp_frag_real_A[warp_mma_k % 2], + warp_frag_imag_A[warp_mma_k % 2], + warp_frag_real_B[warp_mma_k % 2], + warp_frag_imag_B[warp_mma_k % 2]); + } + + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/mma_singlestage.h b/include/cutlass/gemm/threadblock/mma_singlestage.h index fd9890a41..32d4d4ee6 100644 --- a/include/cutlass/gemm/threadblock/mma_singlestage.h +++ 
b/include/cutlass/gemm/threadblock/mma_singlestage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle.h b/include/cutlass/gemm/threadblock/threadblock_swizzle.h index 1beec2c2a..03d71d319 100644 --- a/include/cutlass/gemm/threadblock/threadblock_swizzle.h +++ b/include/cutlass/gemm/threadblock/threadblock_swizzle.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -99,61 +99,13 @@ int RematerializeBlockDimZ() { ///////////////////////////////////////////////////////////////////////////////////////////////// /// Threadblock swizzling function for GEMMs +template struct GemmIdentityThreadblockSwizzle { CUTLASS_HOST_DEVICE GemmIdentityThreadblockSwizzle() { } - int const kTile = 1; - - /// Returns the shape of the problem in units of logical tiles - CUTLASS_HOST_DEVICE - GemmCoord get_tiled_shape( - GemmCoord problem_size, - GemmCoord tile_size, - int split_k_slices) const { - - return GemmCoord( - (problem_size.m() + tile_size.m() - 1) / tile_size.m(), - (problem_size.n() + tile_size.n() - 1) / tile_size.n(), - split_k_slices); - } - - /// Computes CUDA grid dimensions given a size in units of logical tiles - CUTLASS_HOST_DEVICE - dim3 get_grid_shape(GemmCoord tiled_shape) const { - return 
dim3(tiled_shape.m() * kTile, (tiled_shape.n() + kTile - 1) / kTile, tiled_shape.k()); - } - - /// Obtains the threadblock offset (in units of threadblock-scoped tiles) - CUTLASS_DEVICE - GemmCoord get_tile_offset() const { - - int block_idx_x = RematerializeBlockIdxX(); - int block_idx_y = RematerializeBlockIdxY(); - - return GemmCoord{ - (block_idx_x / kTile), - (block_idx_y * kTile) + (block_idx_x % kTile), - RematerializeBlockIdxZ() - }; - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// A special version of GemmIdentityThreadblockSwizzle. See the choice of kTile below. -template -struct GemmCohortThreadblockSwizzle -{ - const int kTile = - (platform::is_same::value || - platform::is_same::value) - ? 4 - : 1; - - CUTLASS_HOST_DEVICE - GemmCohortThreadblockSwizzle() { } + int const kTile = N; /// Returns the shape of the problem in units of logical tiles CUTLASS_HOST_DEVICE @@ -271,8 +223,11 @@ struct GemmBatchedIdentityThreadblockSwizzle { ///////////////////////////////////////////////////////////////////////////////////////////////// /// Threadblock swizzling function for split-K GEMMs +template struct GemmSplitKIdentityThreadblockSwizzle { + int const kTile = N; + /// Returns the shape of the problem in units of logical tiles CUTLASS_HOST_DEVICE GemmCoord get_tiled_shape( @@ -289,16 +244,20 @@ struct GemmSplitKIdentityThreadblockSwizzle { /// Computes CUDA grid dimensions given a size in units of logical tiles CUTLASS_HOST_DEVICE dim3 get_grid_shape(GemmCoord tiled_shape) const { - return dim3(tiled_shape.m(), tiled_shape.n(), tiled_shape.k()); + return dim3(tiled_shape.m() * kTile, (tiled_shape.n() + kTile - 1) / kTile, tiled_shape.k()); } /// Obtains the threadblock offset (in units of threadblock-scoped tiles) CUTLASS_DEVICE GemmCoord get_tile_offset() const { + + int block_idx_x = RematerializeBlockIdxX(); + int block_idx_y = RematerializeBlockIdxY(); + return GemmCoord{ - 
RematerializeBlockIdxX(), - RematerializeBlockIdxY(), + (block_idx_x / kTile), + (block_idx_y * kTile) + (block_idx_x % kTile), RematerializeBlockIdxZ() }; } diff --git a/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h b/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h new file mode 100644 index 000000000..3c6772aff --- /dev/null +++ b/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h @@ -0,0 +1,401 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/warp/mma_complex_tensor_op.h" +#include "cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A elements + typename ElementA_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename ElementB_, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename ElementC_, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Complex transform on A operand + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transform on B operand + ComplexTransform TransformB = ComplexTransform::kNone, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_ = 
arch::OpMultiplyAddComplex> +struct DefaultMmaComplexTensorOp; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex*complex case +// 4 real-valued mma operations +// A = (ar + j ai), B (br +j bi), D = AB +// D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Real-valued underlying type of complex-valued A operand + typename RealElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Real-valued underlying type of complex-valued B operand + typename RealElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Real-valued underlying type of complex-valued C operand + typename RealElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB> +struct DefaultMmaComplexTensorOp< + WarpShape_, + InstructionShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + TransformA, + TransformB, + arch::OpMultiplyAddComplex> { + + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + InstructionShape_, + 32, + RealElementA, + cutlass::layout::RowMajor, + RealElementB, + cutlass::layout::ColumnMajor, + RealElementC, + cutlass::layout::RowMajor, + arch::OpMultiplyAdd>, + cutlass::MatrixShape<1, 1> + >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaComplexTensorOp< + WarpShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + Policy, + TransformA, + TransformB>; +}; + 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex*complex case using GaussianComplex operation +// 3 real-valued mma operations +// A = (ar + j ai), B = (br +j bi), D = AB +// P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi) +// D = dr + j di = (P1 - P3) + j (P1 + P2) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Real-valued underlying type of complex-valued A operand + typename RealElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Real-valued underlying type of complex-valued B operand + typename RealElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Real-valued underlying type of complex-valued C operand + typename RealElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB> +struct DefaultMmaComplexTensorOp< + WarpShape_, + InstructionShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + TransformA, + TransformB, + arch::OpMultiplyAddGaussianComplex> { + + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + InstructionShape_, + 32, + RealElementA, + cutlass::layout::RowMajor, + RealElementB, + cutlass::layout::ColumnMajor, + RealElementC, + cutlass::layout::RowMajor, + arch::OpMultiplyAdd>, + cutlass::MatrixShape<1, 1> + >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaGaussianComplexTensorOp< + WarpShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + Policy, + TransformA, + 
TransformB>; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Partial specialization - input and output types are complex*complex +// Use TF32 tensor operation internally +// 4 real-valued MMA.1688.F32.TF32 operations on TF32 +// A = (ar + j ai), B (br +j bi), D = AB +// D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB> +struct DefaultMmaComplexTensorOp< + WarpShape_, + InstructionShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + TransformA, + TransformB, + arch::OpMultiplyAddComplex> { + + // Complex floating point tensor operation use MMA.1688.F32.TF32 mma instruction + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + InstructionShape_, + 32, + tfloat32_t, + cutlass::layout::RowMajor, + tfloat32_t, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + arch::OpMultiplyAdd>, + cutlass::MatrixShape<1, 1> + >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaComplexTensorOp< + WarpShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + Policy, + TransformA, + TransformB>; +}; + 
+///////////////////////////////////////////////////////////////////////////////////////////////// +/// Partial specialization - input and output types are complex*complex +// Use BF16 tensor operation internally +// 4 real-valued MMA.1688.F32.BF16 operations on BF16 +// A = (ar + j ai), B (br +j bi), D = AB +// D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB> +struct DefaultMmaComplexTensorOp< + WarpShape_, + InstructionShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + TransformA, + TransformB, + arch::OpMultiplyAddFastBF16> { + + // Complex floating point tensor operation use MMA.1688.F32.BF16 mma instruction + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + InstructionShape_, + 32, + bfloat16_t, + cutlass::layout::RowMajor, + bfloat16_t, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + arch::OpMultiplyAdd>, + cutlass::MatrixShape<1, 1> + >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaComplexTensorOp< + WarpShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + Policy, + TransformA, + TransformB>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Partial specialization - input and output types are complex*complex +// Use F16 tensor operation 
internally +// 4 real-valued MMA.1688.F32.F16 operations on F16 +// A = (ar + j ai), B (br +j bi), D = AB +// D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB> +struct DefaultMmaComplexTensorOp< + WarpShape_, + InstructionShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + TransformA, + TransformB, + arch::OpMultiplyAddFastF16> { + + // Complex floating point tensor operation use MMA.1688.F32.F16 mma instruction + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + InstructionShape_, + 32, + half_t, + cutlass::layout::RowMajor, + half_t, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + arch::OpMultiplyAdd>, + cutlass::MatrixShape<1, 1> + >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaComplexTensorOp< + WarpShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + Policy, + TransformA, + TransformB>; +}; + +} // namespace warp +} // namespace gemm +} // namespace cutlass diff --git a/include/cutlass/gemm/warp/default_mma_tensor_op.h b/include/cutlass/gemm/warp/default_mma_tensor_op.h index f64f46f9f..ea9ab5c93 100644 --- a/include/cutlass/gemm/warp/default_mma_tensor_op.h +++ b/include/cutlass/gemm/warp/default_mma_tensor_op.h @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -60,10 +60,7 @@ template < int PartitionsK = 1, /// Store the accumulators in row major or column major. Row major is used /// when output layout is interleaved. - bool AccumulatorsInRowMajor = false, - /// Number of partitions along N dimension per warp - int PartitionsN = 1 -> + bool AccumulatorsInRowMajor = false> struct DefaultMmaTensorOp; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -92,9 +89,7 @@ template < int PartitionsK, /// Store the accumulators in row major or column major. Row major is used /// when output layout is interleaved. - bool AccumulatorsInRowMajor, - /// Number of partitions along N dimension per warp - int PartitionsN> + bool AccumulatorsInRowMajor> struct DefaultMmaTensorOp { using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< cutlass::arch::Mma; + Policy, PartitionsK, AccumulatorsInRowMajor>; }; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -117,3 +112,6 @@ struct DefaultMmaTensorOp { ///////////////////////////////////////////////////////////////////////////////////////////////// +#include "default_mma_tensor_op_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h b/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h new file mode 100644 index 000000000..06d3afa59 --- /dev/null +++ b/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h @@ -0,0 +1,186 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/mma.h" +#include "cutlass/gemm/warp/mma_tensor_op.h" +#include "cutlass/gemm/warp/default_mma_tensor_op.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial Specialization - inputs and output types are float - uses BF16 internally +template < + /// Shape of one matrix production operation (concept: GemmShape) + typename WarpShape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Number of partitions along K dimension + int PartitionsK, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor> +struct DefaultMmaTensorOp< + WarpShape_, + GemmShape<16, 8, 8>, + float, LayoutA, + float, LayoutB, + float, LayoutC, + arch::OpMultiplyAddFastBF16, + PartitionsK, AccumulatorsInRowMajor> { + + // Uses BF16 internally + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + GemmShape<16, 8, 8>, + 32, + bfloat16_t, cutlass::layout::RowMajor, + bfloat16_t, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + arch::OpMultiplyAdd + >, + cutlass::MatrixShape<1, 1> >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaTensorOp< + WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC, + Policy, PartitionsK, AccumulatorsInRowMajor>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial Specialization - inputs and output types are float - uses F16 internally +template < + /// Shape of one matrix production operation (concept: GemmShape) + typename WarpShape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Number of partitions along K dimension + int PartitionsK, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor> +struct DefaultMmaTensorOp< + WarpShape_, + GemmShape<16, 8, 8>, + float, LayoutA, + float, LayoutB, + float, LayoutC, + arch::OpMultiplyAddFastF16, + PartitionsK, AccumulatorsInRowMajor> { + + // Uses F16 internally + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + GemmShape<16, 8, 8>, + 32, + half_t, cutlass::layout::RowMajor, + half_t, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + arch::OpMultiplyAdd + >, + cutlass::MatrixShape<1, 1> >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaTensorOp< + WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC, + Policy, PartitionsK, AccumulatorsInRowMajor>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial Specialization - inputs and output types are float - uses TF32 internally +template < + /// Shape of one matrix production operation (concept: GemmShape) + typename WarpShape_, + /// Shape of target matrix multiply instruction (concept: GemmShape) + typename InstructionShape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Number of partitions along K dimension + int PartitionsK, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor> +struct DefaultMmaTensorOp< + WarpShape_, + InstructionShape_, + float, LayoutA, + float, LayoutB, + float, LayoutC, + arch::OpMultiplyAdd, PartitionsK, AccumulatorsInRowMajor> { + + // Uses TF32 internally + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + InstructionShape_, + 32, + tfloat32_t, cutlass::layout::RowMajor, + tfloat32_t, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + arch::OpMultiplyAdd + >, + cutlass::MatrixShape<1, 1> >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaTensorOp< + WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC, + Policy, PartitionsK, AccumulatorsInRowMajor>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h b/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h index 11964944f..582fb472e 100644 --- a/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h +++ b/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -61,9 +61,7 @@ template < /// Operator describing the tensor operation typename Operator_ = arch::OpMultiplyAdd, /// Number of partitions along K dimension - int PartitionsK = 1, - /// Number of partitions along N dimension per warp - int PartitionsN = 1 + int PartitionsK = 1 > struct DefaultMmaTensorOpWmma; @@ -90,9 +88,7 @@ template < /// Operator describing the tensor operation typename Operator_, /// Number of partitions along K dimension - int PartitionsK, - /// Number of partitions along N dimension per warp - int PartitionsN> + int PartitionsK> struct DefaultMmaTensorOpWmma { using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< cutlass::arch::Wmma< @@ -116,8 +112,7 @@ struct DefaultMmaTensorOpWmma { ElementC, LayoutC, Policy, - PartitionsK, - PartitionsN>; + PartitionsK>; }; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -127,4 +122,3 @@ struct DefaultMmaTensorOpWmma { } // namespace cutlass #endif - diff --git a/include/cutlass/gemm/warp/mma.h b/include/cutlass/gemm/warp/mma.h index 5fb96d9f2..16c736e2b 100644 --- a/include/cutlass/gemm/warp/mma.h +++ b/include/cutlass/gemm/warp/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_complex_tensor_op.h new file mode 100644 index 000000000..2dc72fd33 --- /dev/null +++ b/include/cutlass/gemm/warp/mma_complex_tensor_op.h @@ -0,0 +1,843 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing warp-level matrix multiply-accumulate operations targeting + Tensor Cores. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/functional.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma.h" + +#include "cutlass/gemm/warp/mma_tensor_op_policy.h" +#include "cutlass/gemm/warp/mma_tensor_op.h" + +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" +#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + /// Data type of real & imag members of complex numbers in the SourceFragment + typename RealElement, + /// Destination fragment required by the mma operation + typename DestinationFragment, + /// Source fragment holding complex elements + typename 
SourceFragment, + /// Number of mma operations performed + typename MmaIterations, + /// Shape of operand elements + typename MmaOperandShape, + /// Complex transform on A operand + ComplexTransform Transform_, + /// Operand A or Operand B + Operand Operand_, + /// Floating-point rounding style + FloatRoundStyle Round_> +struct UnpackComplexConvertAndPackForMma; + +// Partial specialization for OperandA and Congruous smem layout +template < + typename RealElement, + typename DestinationFragment, + typename SourceFragment, + typename MmaIterations, + typename MmaOperandShape, + ComplexTransform Transform_, + FloatRoundStyle Round_> +struct UnpackComplexConvertAndPackForMma < + RealElement, + DestinationFragment, + SourceFragment, + MmaIterations, + MmaOperandShape, + Transform_, + Operand::kA, + Round_> { + + // + // Type definitions + // + static Operand const kOperand = Operand::kA; + static ComplexTransform const kTransform = Transform_; + static FloatRoundStyle const kRound = Round_; + + // Data type of elements in the destination fragment + using MmaElement = typename DestinationFragment::Element; + + // Numeric convertor MmaElement <= RealElement + using Converter = NumericConverter; + + // Operand layout parameters + using SourceFragmentLayout = layout::ColumnMajor; + static int const kLdm = MmaIterations::kRow * MmaOperandShape::kRow; + + /// Ctor + CUTLASS_DEVICE + UnpackComplexConvertAndPackForMma() {} + + CUTLASS_DEVICE + void operator()(DestinationFragment *dest, SourceFragment const &source) { + + Converter convert_op; + SourceFragmentLayout layout(kLdm); + + CUTLASS_PRAGMA_UNROLL + for(int i=0; i and apply rounding on real and imag parts + MmaElement a = convert_op(source[layout(MatrixCoord{row,col})].real()); + MmaElement b = convert_op(source[layout(MatrixCoord{row,col})].imag()); + + // Unpack rounded complex and pack into DestinationFragment for mma operation + dest[i][pos] = a; + dest[i+MmaIterations::kRow][pos++] = (kTransform == 
ComplexTransform::kConjugate ? -b : b); + + } + } + } + } +}; + +// Partial specialization for OperandB and Congruous smem layout +template < + typename RealElement, + typename DestinationFragment, + typename SourceFragment, + typename MmaIterations, + typename MmaOperandShape, + ComplexTransform Transform_, + FloatRoundStyle Round_> +struct UnpackComplexConvertAndPackForMma < + RealElement, + DestinationFragment, + SourceFragment, + MmaIterations, + MmaOperandShape, + Transform_, + Operand::kB, + Round_> { + + // + // Type definitions + // + static Operand const kOperand = Operand::kB; + static ComplexTransform const kTransform = Transform_; + static FloatRoundStyle const kRound = Round_; + + // Data type of elements in the destination fragment + using MmaElement = typename DestinationFragment::Element; + + // Numeric convertor MmaElement <= RealElement + using Converter = NumericConverter; + + // Operand layout parameters + using SourceFragmentLayout = layout::RowMajor; + static int const kLdm = MmaIterations::kColumn * MmaOperandShape::kColumn; + + /// Ctor + CUTLASS_DEVICE + UnpackComplexConvertAndPackForMma() {} + + CUTLASS_HOST_DEVICE + void operator()(DestinationFragment *dest, SourceFragment const &source) { + + Converter convert_op; + SourceFragmentLayout layout(kLdm); + + CUTLASS_PRAGMA_UNROLL + for(int i=0; i apply rounding on real and imag parts + MmaElement a = convert_op(source[layout(MatrixCoord{row,col})].real()); + MmaElement b = convert_op(source[layout(MatrixCoord{row,col})].imag()); + + // Unpack rounded complex and pack into DestinationFragment for mma operation + dest[i][pos] = a; + dest[i+MmaIterations::kColumn][pos++] = (kTransform == ComplexTransform::kConjugate ? 
-b : b); + } + } + } + } +}; +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename RealElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename RealElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename RealElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Complex transform on A operand + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transform on B operand + ComplexTransform TransformB = ComplexTransform::kNone, + /// Used for partial specialization + typename Enable = bool +> +class MmaComplexTensorOp; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex*complex+complex => complex using real-valued TensorOps +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename RealElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename RealElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename RealElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB, + /// Used for partial specialization + typename Enable +> +class MmaComplexTensorOp< + Shape_, + 
complex, + LayoutA_, + complex, + LayoutB_, + complex, + LayoutC_, + Policy_, + TransformA, + TransformB, + Enable> { +public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Data type of multiplicand A + using ElementA = complex; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of multiplicand B + using ElementB = complex; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of accumulator matrix C + using ElementC = complex; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicyTensorOp) + using Policy = Policy_; + + /// Shape of underlying instruction + using InstructionShape = typename Policy::Operator::Shape; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = TransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = TransformB; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; + + /// Number of threads participating in warp-level matrix product + static int const kThreadCount = 32; + +public: + + /// Iterates over the A operand in memory + using IteratorA = MmaTensorOpMultiplicandTileIterator< + MatrixShape, + Operand::kA, + ElementA, + LayoutA, + MatrixShape, + Policy::OpDelta::kRow, + 32, + 1 + >; + + /// Storage for A tile + using FragmentA = typename IteratorA::Fragment; + + /// Storage for transformed A tile + using TransformedFragmentA = FragmentA; + + /// Iterates over the B operand in memory + using IteratorB = MmaTensorOpMultiplicandTileIterator< + MatrixShape, + Operand::kB, + ElementB, + LayoutB, + MatrixShape, + Policy::OpDelta::kColumn, + 32, + 1 + >; + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; + + /// Storage for transformed B tile + using TransformedFragmentB = FragmentB; + + static_assert( + !(Shape::kM % 
Policy::Operator::Shape::kM) && + !(Shape::kN % Policy::Operator::Shape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + + /// Number of mma operations performed + using MmaIterations = MatrixShape< + Shape::kM / Policy::Operator::Shape::kM, + Shape::kN / Policy::Operator::Shape::kN + >; + + /// Iterates over the C operand in memory + using IteratorC = MmaTensorOpAccumulatorTileIterator< + MatrixShape, + ElementC, + LayoutC, + typename Policy::Operator::Shape, + typename Policy::OpDelta>; + + /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this + /// storage arrangement is to be considered 'planar complex' in the sense that all real-valued + /// parts are stored consecutively followed by all imaginary parts. This matches the structure + /// of Tensor Cores which are always real-valued matrix multiplies. + using FragmentC = typename IteratorC::Fragment; + + static_assert( + FragmentC::kElements == 2 * MmaIterations::kCount * Policy::Operator::FragmentC::kElements, + "Unexpected planar complex fragment length."); + +private: + + // + // Data members + // + + /// Underlying real-valued matrix multiply operator (concept: arch::Mma) + typename Policy::Operator mma; + +public: + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + MmaComplexTensorOp() {} + + /// Performs a warp-level matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + FragmentA const &A, + FragmentB const &B, + FragmentC const &C + ) const { + + // Alias types for underlying real-valued matrix multiply operator + using MmaOperandA = typename Policy::Operator::FragmentA; + using MmaOperandB = typename Policy::Operator::FragmentB; + using MmaOperandC = typename Policy::Operator::FragmentC; + + static_assert(MmaOperandA::kElements == 1, + "This implementation only supports math instructions in which exactly one element is needed for the A operand." 
+ "We can geneneralize later."); + + static_assert(MmaOperandB::kElements == 1, + "This implementation only supports math instructions in which exactly one element is needed for the B operand." + "We can geneneralize later."); + + D = C; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + + // mma(accum.real(), a.real(), b.real(), accum.real()); + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_A; + MmaOperandB operand_B; + + operand_A[0] = A[m].real(); + operand_B[0] = B[n].real(); + + // Real-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow); + + mma(*accum, operand_A, operand_B, *accum); + } + + // mma(accum.imag(), a.real(), b.imag(), accum.imag()); + CUTLASS_PRAGMA_UNROLL + for (int n = MmaIterations::kColumn - 1; n >= 0; --n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_A; + MmaOperandB operand_B; + + operand_A[0] = A[m].real(); + operand_B[0] = (kTransformB == ComplexTransform::kConjugate ? -B[n].imag() : B[n].imag()); + + // Complex-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow) + MmaIterations::kCount; + + mma(*accum, operand_A, operand_B, *accum); + } + + // mma(accum.real(), -a.imag(), b.imag(), accum.real()) + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_A; + MmaOperandB operand_B; + + // A imaginary part is intentionally negated + operand_A[0] = (kTransformA == ComplexTransform::kConjugate ? A[m].imag() : -A[m].imag()); + operand_B[0] = (kTransformB == ComplexTransform::kConjugate ? 
-B[n].imag() : B[n].imag()); + + // Real-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow); + + mma(*accum, operand_A, operand_B, *accum); + } + + // mma(accum.imag(), a.imag(), b.real(), accum.imag()) + CUTLASS_PRAGMA_UNROLL + for (int n = MmaIterations::kColumn - 1; n >= 0; --n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_A; + MmaOperandB operand_B; + + operand_A[0] = (kTransformA == ComplexTransform::kConjugate ? -A[m].imag() : A[m].imag()); + operand_B[0] = B[n].real(); + + // Complex-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow) + MmaIterations::kCount; + + mma(*accum, operand_A, operand_B, *accum); + } + } + } + + /// Transform the mma operands to the required types + CUTLASS_DEVICE + void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, + FragmentA const &A, FragmentB const &B) const { + //TODO: Implement this + dst_A = A; + dst_B = B; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex*complex+complex => complex: +// Operands data type: complex +// Rounding: float -> tfloat32_t (round half_ulp_truncate nearest) +// Math instruction: MMA.1688.F32.TF32 +// Output data type: complex +// +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform 
TransformB, + /// Used for partial specialization + typename Enable +> +class MmaComplexTensorOp< + Shape_, + complex, + LayoutA_, + complex, + LayoutB_, + complex, + LayoutC_, + Policy_, + TransformA, + TransformB, + Enable> { +public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Data type of members of complex multiplicand A + using RealElementA = float; + + /// Data type of multiplicand A + using ElementA = complex; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of members of complex multiplicand B + using RealElementB = float; + + /// Data type of multiplicand B + using ElementB = complex; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of members of complex accumulator matrix C + using RealElementC = float; + + /// Data type of accumulator matrix C + using ElementC = complex; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) + using Policy = Policy_; + + /// Shape of underlying instruction + using InstructionShape = typename Policy::Operator::Shape; + + /// Underlying arch tag + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = TransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = TransformB; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; + + /// Number of threads participating in warp-level matrix product + static int const kThreadCount = 32; + +public: + + /// Iterates over the A operand in memory + using IteratorA = MmaTensorOpMultiplicandTileIterator< + MatrixShape, + Operand::kA, + ElementA, + LayoutA, + MatrixShape, + Policy::OpDelta::kRow, + 32, + 1 + >; + + /// Storage for A tile + using FragmentA = typename IteratorA::Fragment; + + /// Storage for transformed A tile + using 
TransformedFragmentA = + Array; + + /// Iterates over the B operand in memory + using IteratorB = MmaTensorOpMultiplicandTileIterator< + MatrixShape, + Operand::kB, + ElementB, + LayoutB, + MatrixShape, + Policy::OpDelta::kColumn, + 32, + 1 + >; + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; + + /// Storage for transformed B tile + using TransformedFragmentB = + Array; + + static_assert( + !(Shape::kM % Policy::Operator::Shape::kM) && + !(Shape::kN % Policy::Operator::Shape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + + /// Number of complex products operations performed (one complex product needs four mma instructions) + using MmaIterations = MatrixShape< + Shape::kM / Policy::Operator::Shape::kM, + Shape::kN / Policy::Operator::Shape::kN + >; + + /// Iterates over the C operand in memory + using IteratorC = MmaTensorOpAccumulatorTileIterator< + MatrixShape, + ElementC, + LayoutC, + typename Policy::Operator::Shape, + typename Policy::OpDelta>; + + /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this + /// storage arrangement is to be considered 'planar complex' in the sense that all real-valued + /// parts are stored consecutively followed by all imaginary parts. This matches the structure + /// of Tensor Cores which are always real-valued matrix multiplies. 
+ using FragmentC = typename IteratorC::Fragment; + +private: + + // + // Data members + // + + /// Underlying real-valued matrix multiply operator (concept: arch::Mma) + typename Policy::Operator mma; + +public: + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + MmaComplexTensorOp() {} + + /// Performs a warp-level matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + TransformedFragmentA const &A, + TransformedFragmentB const &B, + FragmentC const &C + ) const { + + // Alias types for underlying real-valued matrix multiply operator + using InstMmaOperandA = typename Policy::Operator::FragmentA; + using InstMmaOperandB = typename Policy::Operator::FragmentB; + using MmaOperandC = typename Policy::Operator::FragmentC; + + static_assert(platform::is_same, typename Policy::Operator::Shape>::value, + "This implementation only supports MMA.1688 math instructions."); + + static_assert(InstMmaOperandA::kElements == 4, + "This implementation only supports math instructions in which exactly four element is needed for the A operand." + "We can geneneralize later."); + + static_assert(InstMmaOperandB::kElements == 2, + "This implementation only supports math instructions in which exactly two element is needed for the B operand." 
+ "We can geneneralize later."); + + // Instruction Operands A & B holding real part followed by imaginary part for mma operations + InstMmaOperandA const *operand_A = reinterpret_cast(&A); + InstMmaOperandB const *operand_B = reinterpret_cast(&B); + + // + // Accumulate in place + // + D = C; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + + // mma(accum.real(), a.real(), b.real(), accum.real()); + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + // Real-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow); + + mma(*accum, operand_A[m], operand_B[n], *accum); + } + + // mma(accum.imag(), a.real(), b.imag(), accum.imag()); + CUTLASS_PRAGMA_UNROLL + for (int n = MmaIterations::kColumn - 1; n >= 0; --n) { + + // Complex-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow) + MmaIterations::kCount; + + mma(*accum, operand_A[m], operand_B[n+MmaIterations::kColumn], *accum); + } + + // mma(accum.real(), a.imag(), -b.imag(), accum.real()) + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + // negate OperandB to accumulate -(a.imag()*b.imag()) + // negating OperandB emits fewer instructions than negating OperandA as OperandB has fewer elements + negate negate_op; + + // Real-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow); + + mma(*accum, operand_A[m+MmaIterations::kRow], negate_op(operand_B[n+MmaIterations::kColumn]), *accum); + } + + // mma(accum.imag(), a.imag(), b.real(), accum.imag()) + CUTLASS_PRAGMA_UNROLL + for (int n = MmaIterations::kColumn - 1; n >= 0; --n) { + + // Complex-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow) + MmaIterations::kCount; + + mma(*accum, operand_A[m+MmaIterations::kRow], operand_B[n], *accum); + } + } + } + + /// Transform the mma operands to the 
required types + CUTLASS_DEVICE + void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, + FragmentA const &A, FragmentB const &B) const { + // Alias types for underlying real-valued matrix multiply operator + using InstMmaOperandA = typename Policy::Operator::FragmentA; + using InstMmaOperandB = typename Policy::Operator::FragmentB; + + // + // Define conversions from source type to instruction operands' type + // + + FloatRoundStyle const kRoundA = FloatRoundStyle::round_half_ulp_trunc_dntz; + FloatRoundStyle const kRoundB = FloatRoundStyle::round_half_ulp_trunc_dntz; + + detail::UnpackComplexConvertAndPackForMma < + RealElementA, + InstMmaOperandA, + FragmentA, + MmaIterations, + MatrixShape<2, 2>, + kTransformA, + Operand::kA, + kRoundA> convert_A; + + detail::UnpackComplexConvertAndPackForMma < + RealElementB, + InstMmaOperandB, + FragmentB, + MmaIterations, + MatrixShape<2, 1>, + kTransformB, + Operand::kB, + kRoundB> convert_B; + + // Convert Fragment[A|B] holding complex to InstMmaOperand[A|B] holding InstMmaOperand[A|B]::Element + convert_A(reinterpret_cast(&dst_A), A); + convert_B(reinterpret_cast(&dst_B), B); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// TODO - partial specializations of real*complex and complex*real + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h new file mode 100644 index 000000000..b95af0df1 --- /dev/null +++ b/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h @@ -0,0 +1,2448 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + +#include "cutlass/platform/platform.h" +#include "cutlass/fast_math.h" + +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +//////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for loading 128b vectors of 128b elements. +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::TensorOpMultiplicandCongruous128b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + static_assert(!(Shape::kContiguous % 8) && !(Shape::kStrided 
% 4), "Divisibility."); + + static_assert(sizeof_bits::value == 128, "This is specialized for 128b accesses."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicandCongruous128b; + + /// Shape of one matrix product operation (concept: GemmShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Load one element per access + static int const kElementsPerAccess = 1; + + /// Policy defining internal details of tile iterator + struct Policy { + + /// Shape of one access + using Delta = layout::PitchLinearShape<8, 4>; + + /// Number of iterations to load + using Iterations = layout::PitchLinearShape< + Shape::kContiguous / Delta::kContiguous, + InstructionShape::kStrided / Delta::kStrided + >; + }; + +private: + + /// Not working on this feature at the moment. 
+ static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Pointer type used for accesses + using AccessType = AlignedArray; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + +private: + + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { } + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): + stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0) { + + int quad_pair = lane_id / 8; + int quad = lane_id / 4; + int lane = lane_id % 4; + + int row = (quad & 1) * 4 + (lane ^ quad_pair); + + byte_offset_ = (row + quad_pair * stride_) * sizeof(AccessType); + + pointer_= reinterpret_cast(ref.data()); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + pointer_ += offset; + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + int offset = + (tile_offset.contiguous() * Shape::kContiguous) + + (tile_offset.strided() * InstructionShape::kStrided * stride_); + + add_pointer_offset(offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + pointer_ += stride_ * InstructionShape::kStrided; + + return *this; + } + + ///< advances in units of whole tiles along the logical 
coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_byte_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset in units of bytes + Index byte_offset) const { + + AccessType *fetch_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < Policy::Iterations::kStrided; ++s) { + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < Policy::Iterations::kContiguous; ++c) { + + int access_idx = c + s * Policy::Iterations::kContiguous; + + AccessType const *source_ptr = pointer_ + + Policy::Delta::kContiguous * c + + Policy::Delta::kStrided * s * stride_; + + char const *source_byte_ptr = reinterpret_cast(source_ptr) + byte_offset + byte_offset_; + + AccessType const *source = reinterpret_cast(source_byte_ptr); + + fetch_ptr[access_idx] = *source; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + + load_with_byte_offset(frag, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + load_with_byte_offset(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. 
+ CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + Index pointer_offset = + tile_offset.contiguous() * Shape::kContiguous + + tile_offset.strided() * InstructionShape::kStrided * stride_; + + byte_offset += sizeof(AccessType) * pointer_offset; + + load_with_byte_offset(frag, byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + + } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename 
TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicandCongruous128b, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(tile_offset.column(), tile_offset.row())); + return *this; + } + + 
///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(-tile_offset.column(), -tile_offset.row())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. 
+ CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.strided(), tile_offset.contiguous()}, + byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + 
+ /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicandCongruous128b, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator 
&add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(tile_offset.row(), tile_offset.column())); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(-tile_offset.row(), -tile_offset.column())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. 
+ CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.contiguous(), tile_offset.strided()}, + byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// +/// Partial specialization for complex +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of underlying field of reals. 
+ typename RealElement, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions, concept: MatrixShape) + typename OpDelta_> +class MmaTensorOpAccumulatorTileIterator< + Shape_, complex, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> { + public: + + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand::kC; + + /// Element type + using Element = complex; + + /// Layout of source tile + using Layout = cutlass::layout::RowMajor; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + using OpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Internal structure of iterator - made public to enable introspection + struct Policy { + static_assert( + !(Shape::kRow % InstructionShape::kM) && + !(Shape::kColumn % InstructionShape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + + static_assert(platform::is_same::value, + "Layouts must be defined for logical MatrixCoord coordinate space."); + + /// Number of mma operations performed + using MmaIterations = MatrixShape; + }; + +private: + + // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire + // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements + // of that row. 
The accumulators within one row are assumed to be consecutive. + static int const kElementsPerAccess = InstructionShape::kN / 4; + static int const kRowsPerTile = 8; + static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile. It is assumed that the accumulators + /// are stored in a planar complex arrangement with the real parts as entirely contiguous + /// followed by the imaginary parts. + using Fragment = Array; + + static int const kRealIndex = 0; + static int const kImaginaryIndex = Shape::kCount / kThreads; + +private: + + /// Reference to output tensor + TensorRef ref_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator( + TensorRef const &ref, + int lane_id + ): + ref_(ref) { + + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + + MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess); + + ref_.add_coord_offset(lane_offset); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn)); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator & operator++() { + // deliberate no-op + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator & 
operator--() { + // deliberate no-op + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + Fragment &frag, ///< fragment to load from the tensor + Index pointer_offset) const { ///< loads a tile with a linear offset + + TensorRef offset_ref(ref_); + offset_ref.add_pointer_offset(pointer_offset); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + + int mma_accum_start = kAccumulatorRows * kElementsPerAccess * + (mma_n * Policy::MmaIterations::kRow + mma_m); + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < kAccumulatorRows; ++row) { + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < kElementsPerAccess; ++col) { + int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow + + row * kRowsPerTile; + int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col; + + Element z = offset_ref.at({accum_m, accum_n}); + + frag[mma_accum_start + row * kElementsPerAccess + col + kRealIndex] = z.real(); + frag[mma_accum_start + row * kElementsPerAccess + col + kImaginaryIndex] = z.imag(); + } + } + } + } + } + + /// Loads a fragment from memory with additional logical offset + 
CUTLASS_DEVICE + void load_with_byte_offset( + Fragment &frag, ///< fragment to load from the tensor + Index byte_offset) const { ///< loads a tile with a linear offset + + load_with_pointer_offset(frag, byte_offset / sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + Fragment &frag, ///< fragment to load from the tensor + TensorCoord const &tile_offset) const { ///< loads a tile with a logical offset in units of whole tiles + + load(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + Fragment &frag, ///< fragment to load from the tensor + TensorCoord const &tile_offset, ///< loads a tile with a logical offset in units of whole tiles + Index pointer_offset) const { ///< loads a tile with a logical offset AND a pointer offset + + load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset); + } + + /// Stores a fragment to memory + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) const { + store_with_pointer_offset(frag, 0); + } + + /// Stores a fragment to memory with additional pointer offset + CUTLASS_DEVICE + void store_with_pointer_offset( + Fragment const &frag, ///< fragment to store from the tensor + Index pointer_offset) const { ///< store a tile with a linear offset + + TensorRef offset_ref(ref_); + offset_ref.add_pointer_offset(pointer_offset); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + + int mma_accum_start = kAccumulatorRows * kElementsPerAccess * + (mma_n * Policy::MmaIterations::kRow + mma_m); + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < kAccumulatorRows; ++row) { + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < kElementsPerAccess; ++col) { + int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow + + row * 
kRowsPerTile; + int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col; + int idx = mma_accum_start + row * kElementsPerAccess + col; + + Element z(frag[kRealIndex + idx], frag[kImaginaryIndex + idx]); + + offset_ref.at({accum_m, accum_n}) = z; + } + } + } + } + + /// Stores a fragment to memory with additional pointer offset + CUTLASS_DEVICE + void store_with_byte_offset( + Fragment const &frag, ///< fragment to store from the tensor + Index byte_offset) const { ///< store a tile with a linear offset + + store_with_pointer_offset(frag, byte_offset / sizeof(Element)); + } + + /// Stores a fragment to memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void store( + Fragment &frag, ///< fragment to store to the tensor + TensorCoord const &tile_offset) const { ///< stores a tile with a logical offset in units of whole tiles + + store(frag, tile_offset, 0); + } + + /// Stores a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void store( + /// fragment to store to the tensor + Fragment const &frag, + /// stores a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// stores a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + + +//////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for loading 128b vectors of 128b elements. 
+/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::TensorOpMultiplicandCrosswise128x4, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + static_assert(!(Shape::kContiguous % 4) && !(Shape::kStrided % 8), "Divisibility."); + + static_assert(sizeof_bits::value == 128, "This is specialized for 128b accesses."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicandCrosswise128x4; + + /// Shape of one matrix product operation (concept: GemmShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename 
TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Load two elements per access + static int const kElementsPerAccess = 1; + + /// Policy defining internal details of tile iterator + struct Policy { + + /// Shape of one access + using Delta = layout::PitchLinearShape<4, 8>; + + /// Number of iterations to load + using Iterations = layout::PitchLinearShape< + InstructionShape::kContiguous / Delta::kContiguous, + Shape::kStrided / Delta::kStrided + >; + }; + +private: + + /// Not working on this feature at the moment. + static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Pointer type used for accesses + using AccessType = AlignedArray; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + +private: + + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { } + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): + stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0) { + + int quad = lane_id / 4; + int liq = lane_id % 4; + + int c = liq + (quad & 1) * 4; + int s = (quad / 2); + + byte_offset_ = (c + s * stride_) * sizeof(AccessType); + + pointer_= reinterpret_cast(ref.data()); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + pointer_ += offset; + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole 
tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + // Compute the offset in units of elements. Note, the external coordinate system is + // approximately transposed with respect to the tiled internal structure + int offset = + (tile_offset.contiguous() * InstructionShape::kContiguous) * stride_ + + (tile_offset.strided() * Shape::kStrided); + + add_pointer_offset(offset); + + byte_offset_ ^= (tile_offset.contiguous() & 1) * 4 * sizeof(AccessType); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + pointer_ += stride_ * InstructionShape::kContiguous; + + byte_offset_ ^= 4 * sizeof(AccessType); + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
+ CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_byte_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset in units of bytes + Index byte_offset) const { + + AccessType *fetch_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < Policy::Iterations::kContiguous; ++c) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < Policy::Iterations::kStrided; ++s) { + + int access_idx = s + c * Policy::Iterations::kStrided; + + AccessType const *source_ptr = pointer_ + + Policy::Delta::kContiguous * c * stride_ + + Policy::Delta::kStrided * s; + + char const *source_byte_ptr = reinterpret_cast(source_ptr) + byte_offset + byte_offset_; + + AccessType const *source = reinterpret_cast(source_byte_ptr); + + fetch_ptr[access_idx] = *source; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + + load_with_byte_offset(frag, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + load_with_byte_offset(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. 
+ CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + Index pointer_offset = + tile_offset.contiguous() * InstructionShape::kContiguous * stride_ + + tile_offset.strided() * Shape::kStrided; + + byte_offset += sizeof(AccessType) * pointer_offset; + + load_with_byte_offset(frag, byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + + } +}; + + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename 
TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicandCrosswise128x4, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(tile_offset.column(), tile_offset.row())); + return *this; + } + + 
///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(-tile_offset.column(), -tile_offset.row())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. 
+ CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.strided(), tile_offset.contiguous()}, + byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level 
Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicandCrosswise128x4, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator 
&add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(tile_offset.row(), tile_offset.column())); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(-tile_offset.row(), -tile_offset.column())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. 
+ CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.contiguous(), tile_offset.strided()}, + byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Congruous shared memory layout +// Warp-level iterators for complex*complex + complex => complex +// The underlying iterators are similar to that for MMA f64*f64 + f64 = f64 +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for loading 128b vectors of 64b elements. +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, cutlass::complex, + cutlass::layout::TensorOpMultiplicandCongruous64b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + static_assert(!(Shape::kContiguous % 16) && !(Shape::kStrided % 8), "Divisibility."); + + /// Element type + using Element = cutlass::complex; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicandCongruous64b; + + /// Shape of one matrix product operation (concept: GemmShape) + using 
InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Load two elements per access + static int const kElementsPerAccess = 2; + + /// Policy defining internal details of tile iterator + struct Policy { + + /// Shape of one access + using Delta = layout::PitchLinearShape<8, 4>; + + /// Number of iterations to load + using Iterations = layout::PitchLinearShape< + Shape::kContiguous / kElementsPerAccess / Delta::kContiguous, + InstructionShape::kStrided / Delta::kStrided + >; + + }; + +private: + + /// Not working on this feature at the moment. 
+ static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Pointer type used for accesses + using AccessType = AlignedArray; + + /// Internal counter used to jump to next K partition + int k_group_idx_; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + +private: + + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { } + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): + stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0), + k_group_idx_(0) { + + int access_strided = lane_id / Policy::Delta::kContiguous; + int access_contiguous = (lane_id % Policy::Delta::kContiguous) ^ access_strided; + + pointer_= reinterpret_cast(ref.data()) + + access_contiguous + access_strided * stride_; + + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + byte_offset_ += offset * sizeof(Element); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + int offset = + (tile_offset.strided() * InstructionShape::kStrided) * stride_ * kElementsPerAccess + + tile_offset.contiguous() * Shape::kContiguous; + + add_pointer_offset(offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + 
add_tile_offset({0, 1}); + + return *this; + } + + /// Advances the iterator along the opposite of the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + add_tile_offset({0, -1}); + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_byte_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset in units of bytes + Index byte_offset) const { + + AccessType *fetch_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < Policy::Iterations::kStrided; ++s) { + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < Policy::Iterations::kContiguous; ++c) { + + int access_idx = c + s * Policy::Iterations::kContiguous; + + AccessType const *source_ptr = pointer_ + + Policy::Delta::kContiguous * c + + Policy::Delta::kStrided * s * stride_; + + char const *source_byte_ptr = reinterpret_cast(source_ptr) + byte_offset + byte_offset_; + + AccessType const *source = reinterpret_cast(source_byte_ptr); + + fetch_ptr[access_idx] = *source; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a 
linear offset + Index pointer_offset) const { + + load_with_byte_offset(frag, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + load_with_byte_offset(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + + Index pointer_offset = + tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess + + tile_offset.strided() * InstructionShape::kStrided * stride_; + + byte_offset += sizeof(AccessType) * pointer_offset; + + load_with_byte_offset(frag, byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Crosswise shared memory layout +// Warp-level iterators for complex*complex + complex => complex +// The underlying iterators are similar to that for f64*f64 + f64 = f64 +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for loading 128b vectors of 64b elements. +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, complex, + cutlass::layout::TensorOpMultiplicand64bCrosswise, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + static_assert(!(Shape::kContiguous % 4) && !(Shape::kStrided % 16), "Divisibility."); + + static_assert(sizeof_bits>::value == 64, "This is specialized for 64b accesses."); + + /// Element type + using Element = complex; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicand64bCrosswise; + + /// Shape of one matrix product operation (concept: 
GemmShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Load two elements per access + static int const kElementsPerAccess = 2; + + /// Policy defining internal details of tile iterator + struct Policy { + + /// Shape of one access + using Delta = layout::PitchLinearShape<4, 16>; + + /// Number of iterations to load + using Iterations = layout::PitchLinearShape< + InstructionShape::kContiguous / Delta::kContiguous, + Shape::kStrided / Delta::kStrided + >; + + }; + +private: + + /// Not working on this feature at the moment. 
+ static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Pointer type used for accesses + using AccessType = AlignedArray; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + +private: + + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + + /// Internal counter for tracking K-group + Index k_group_idx_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { } + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): + stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0), + k_group_idx_(0) { + + int access_strided = lane_id / 8; + int access_contiguous = (lane_id % 8); + + byte_offset_ = (access_contiguous + access_strided * stride_) * sizeof(AccessType); + + pointer_= reinterpret_cast(ref.data()); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + pointer_ += offset / kElementsPerAccess; + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + int offset = (tile_offset.contiguous() * InstructionShape::kContiguous) * + stride_ * kElementsPerAccess + + tile_offset.strided() * Shape::kStrided; + + add_pointer_offset(offset); + + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + pointer_ += stride_ * 
InstructionShape::kContiguous; + + // xor ptr + byte_offset_ ^= 0x40; + + ++k_group_idx_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_byte_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset in units of bytes + Index byte_offset) const { + + AccessType *fetch_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < Policy::Iterations::kContiguous; ++c) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < Policy::Iterations::kStrided; ++s) { + + int access_idx = c * Policy::Iterations::kStrided + s; + + AccessType const *source_ptr = pointer_ + + Policy::Delta::kContiguous * c * stride_ + + Policy::Delta::kStrided * s / kElementsPerAccess; + + char const *source_byte_ptr = reinterpret_cast(source_ptr) + byte_offset + byte_offset_; + + AccessType const *source = reinterpret_cast(source_byte_ptr); + + fetch_ptr[access_idx] = *source; + } + } + + Element *exchange_ptr = reinterpret_cast(&frag); + + // exchange on 64b granularity only for fragments held in k=8/2 to k=8 + CUTLASS_PRAGMA_UNROLL + for (int i = Fragment::kElements/2; i < Fragment::kElements; i += 2) { + Element tmp = exchange_ptr[i]; + exchange_ptr[i] = exchange_ptr[i + 1]; + exchange_ptr[i + 1] = tmp; + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index 
pointer_offset) const { + + load_with_byte_offset(frag, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + load_with_byte_offset(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + Index pointer_offset = tile_offset.contiguous() * + InstructionShape::kContiguous / + Layout::kElementsPerAccess + + tile_offset.strided() * Shape::kStrided * stride_; + + byte_offset += sizeof(AccessType) * pointer_offset; + + load_with_byte_offset(frag, byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + k_group_idx_ = k_group; + } +}; + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h new file mode 100644 index 000000000..bf3d98dfb --- /dev/null +++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h @@ -0,0 +1,357 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing warp-level matrix multiply-accumulate operations targeting + Tensor Cores. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma.h" + +#include "cutlass/gemm/warp/mma_tensor_op_policy.h" +#include "cutlass/gemm/warp/mma_tensor_op.h" + +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h" +#include "cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename RealElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename RealElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename 
RealElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Complex transform on A operand + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transform on B operand + ComplexTransform TransformB = ComplexTransform::kNone, + /// Used for partial specialization + typename Enable = bool +> +class MmaGaussianComplexTensorOp; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex*complex+complex => complex using real-valued TensorOps +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename RealElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename RealElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename RealElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB, + /// Used for partial specialization + typename Enable +> +class MmaGaussianComplexTensorOp< + Shape_, + complex, + LayoutA_, + complex, + LayoutB_, + complex, + LayoutC_, + Policy_, + TransformA, + TransformB, + Enable> { +public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Data type of multiplicand A + using ElementA = complex; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of multiplicand B + using ElementB = complex; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of accumulator matrix C + using ElementC = 
complex; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) + using Policy = Policy_; + + /// Shape of underlying instruction + using InstructionShape = typename Policy::Operator::Shape; + + /// Underlying architecture tag + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = TransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = TransformB; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; + + /// Number of threads participating in warp-level matrix product + static int const kThreadCount = 32; + +public: + + /// Iterates over the A operand in memory + using IteratorA = MmaTensorOpMultiplicandTileIterator< + MatrixShape, + Operand::kA, + ElementA, + LayoutA, + MatrixShape, + Policy::OpDelta::kRow, + 32, + 1 + >; + + /// Storage for A tile + using FragmentA = typename IteratorA::Fragment; + + /// Storage for transformed A tile + using TransformedFragmentA = FragmentA; + + /// Iterates over the B operand in memory + using IteratorB = MmaTensorOpMultiplicandTileIterator< + MatrixShape, + Operand::kB, + ElementB, + LayoutB, + MatrixShape, + Policy::OpDelta::kColumn, + 32, + 1 + >; + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; + + /// Storage for transformed B tile + using TransformedFragmentB = FragmentB; + + static_assert( + !(Shape::kM % Policy::Operator::Shape::kM) && + !(Shape::kN % Policy::Operator::Shape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + + /// Number of mma operations performed + using MmaIterations = MatrixShape< + Shape::kM / Policy::Operator::Shape::kM, + Shape::kN / Policy::Operator::Shape::kN + >; + + /// Iterates over the C operand in memory + using IteratorC = MmaTensorOpGaussianComplexAccumulatorTileIterator< + MatrixShape, + 
ElementC, + LayoutC, + typename Policy::Operator::Shape, + typename Policy::OpDelta>; + + /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this + /// storage arrangement is to be considered 'gaussian complex' in the sense that the accumulation is + /// done in three parts namely part1, part2, and part3. The parts 1, 2, and 3 are stored consecutively + /// in IteratorC::Fragment. This matches the structure of Tensor Cores which are always real-valued matrix multiplies. + using FragmentC = typename IteratorC::Fragment; + + static_assert( + FragmentC::kElements == 3 * MmaIterations::kCount * Policy::Operator::FragmentC::kElements, + "Unexpected gaussian complex fragment length."); + +private: + + // + // Data members + // + + /// Underlying real-valued matrix multiply operator (concept: arch::Mma) + typename Policy::Operator mma; + +public: + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + MmaGaussianComplexTensorOp() {} + + /// Performs a warp-level matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + FragmentA const &A, + FragmentB const &B, + FragmentC const &C + ) const { + + // Alias types for underlying real-valued matrix multiply operator + using MmaOperandA = typename Policy::Operator::FragmentA; + using MmaOperandB = typename Policy::Operator::FragmentB; + using MmaOperandC = typename Policy::Operator::FragmentC; + + static_assert(MmaOperandA::kElements == 1, + "This implementation only supports math instructions in which exactly one element is needed for the A operand." + "We can geneneralize later."); + + static_assert(MmaOperandB::kElements == 1, + "This implementation only supports math instructions in which exactly one element is needed for the B operand." 
+ "We can geneneralize later."); + + D = C; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + + // mma(accum.part1(), (a.real() + a.imag()), b.real(), accum.part1()); + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_Asum; + MmaOperandB operand_Br; + + operand_Asum[0] = A[m].real() + ((kTransformA == ComplexTransform::kConjugate) ? -A[m].imag() : +A[m].imag()); + operand_Br[0] = B[n].real(); + + // accumulator part1 + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow); + + mma(*accum, operand_Asum, operand_Br, *accum); + } + + // mma(accum.part2(), -a.real(), (b.real() - b.imag()), accum.part2()); + CUTLASS_PRAGMA_UNROLL + for (int n = MmaIterations::kColumn - 1; n >= 0; --n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_Ar; + MmaOperandB operand_Bdiff; + + operand_Ar[0] = -A[m].real(); + operand_Bdiff[0] = B[n].real() - ((kTransformB == ComplexTransform::kConjugate) ? -B[n].imag() : +B[n].imag()); + + // accumulator part2 + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow) + MmaIterations::kCount; + + mma(*accum, operand_Ar, operand_Bdiff, *accum); + } + + // mma(accum.part3(), a.imag(), (b.real() + b.imag()), accum.part3()) + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_Ai; + MmaOperandB operand_Bsum; + + operand_Ai[0] = (kTransformA == ComplexTransform::kConjugate) ? -A[m].imag() : +A[m].imag(); + operand_Bsum[0] = B[n].real() + ((kTransformB == ComplexTransform::kConjugate) ? 
-B[n].imag() : +B[n].imag()); + + // accumulator part3 + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow) + 2 * MmaIterations::kCount; + + mma(*accum, operand_Ai, operand_Bsum, *accum); + } + } + } + + /// Transform the mma operands to the required types + CUTLASS_DEVICE + void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, + FragmentA const &A, FragmentB const &B) const { + //TODO: Implement this + dst_A = A; + dst_B = B; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// TODO - partial specializations of real*complex and complex*real + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h new file mode 100644 index 000000000..8d9417b0f --- /dev/null +++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h @@ -0,0 +1,384 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" +#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h" + +#include "cutlass/platform/platform.h" +#include "cutlass/fast_math.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Element type + typename Element_, + /// Layout of operand in memory + typename Layout_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions, concept: MatrixShape) + typename OpDelta_> +class MmaTensorOpGaussianComplexAccumulatorTileIterator; + +//////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// +/// Partial specialization for complex +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of underlying field of reals. 
+ typename RealElement, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions, concept: MatrixShape) + typename OpDelta_> +class MmaTensorOpGaussianComplexAccumulatorTileIterator< + Shape_, complex, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> { + public: + + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand::kC; + + /// Element type + using Element = complex; + + /// Layout of source tile + using Layout = cutlass::layout::RowMajor; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + using OpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Internal structure of iterator - made public to enable introspection + struct Policy { + static_assert( + !(Shape::kRow % InstructionShape::kM) && + !(Shape::kColumn % InstructionShape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + + static_assert(platform::is_same::value, + "Layouts must be defined for logical MatrixCoord coordinate space."); + + /// Number of mma operations performed + using MmaIterations = MatrixShape; + }; + +private: + + // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire + // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements + // of that row. 
The accumulators within one row are assumed to be consecutive. + static int const kElementsPerAccess = InstructionShape::kN / 4; + static int const kRowsPerTile = 8; + static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile. It is assumed that the accumulators + /// are stored in a gaussian complex arrangement with parts 1, 2, and 3 as entirely contiguous + /// arranged as [part1, part2, part3] + using Fragment = Array; + + static int const kPart1Index = (Shape::kCount / kThreads) * 0; + static int const kPart2Index = (Shape::kCount / kThreads) * 1; + static int const kPart3Index = (Shape::kCount / kThreads) * 2; + +private: + + /// Reference to output tensor + TensorRef ref_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator( + TensorRef const &ref, + int lane_id + ): + ref_(ref) { + + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + + MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess); + + ref_.add_coord_offset(lane_offset); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn)); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator & 
operator++() { + // deliberate no-op + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator & operator--() { + // deliberate no-op + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + Fragment &frag, ///< fragment to load from the tensor + Index pointer_offset) const { ///< loads a tile with a linear offset + + TensorRef offset_ref(ref_); + offset_ref.add_pointer_offset(pointer_offset); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + + int mma_accum_start = kAccumulatorRows * kElementsPerAccess * + (mma_n * Policy::MmaIterations::kRow + mma_m); + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < kAccumulatorRows; ++row) { + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < kElementsPerAccess; ++col) { + int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow + + row * kRowsPerTile; + int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col; + + Element z = offset_ref.at({accum_m, accum_n}); + + frag[mma_accum_start + row * kElementsPerAccess + 
col + kPart1Index] = z.real() + z.imag(); + frag[mma_accum_start + row * kElementsPerAccess + col + kPart2Index] = -z.real(); + frag[mma_accum_start + row * kElementsPerAccess + col + kPart3Index] = z.imag(); + } + } + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + Fragment &frag, ///< fragment to load from the tensor + Index byte_offset) const { ///< loads a tile with a linear offset + + load_with_pointer_offset(byte_offset / sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + Fragment &frag, ///< fragment to load from the tensor + TensorCoord const &tile_offset) const { ///< loads a tile with a logical offset in units of whole tiles + + load(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + Fragment &frag, ///< fragment to load from the tensor + TensorCoord const &tile_offset, ///< loads a tile with a logical offset in units of whole tiles + Index pointer_offset) const { ///< loads a tile with a logical offset AND a pointer offset + + load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset); + } + + /// Stores a fragment to memory + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) const { + store_with_pointer_offset(frag, 0); + } + + /// Stores a fragment to memory with additional pointer offset + CUTLASS_DEVICE + void store_with_pointer_offset( + Fragment const &frag, ///< fragment to store from the tensor + Index pointer_offset) const { ///< store a tile with a linear offset + + TensorRef offset_ref(ref_); + offset_ref.add_pointer_offset(pointer_offset); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + + int mma_accum_start = kAccumulatorRows * 
kElementsPerAccess * + (mma_n * Policy::MmaIterations::kRow + mma_m); + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < kAccumulatorRows; ++row) { + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < kElementsPerAccess; ++col) { + int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow + + row * kRowsPerTile; + int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col; + int idx = mma_accum_start + row * kElementsPerAccess + col; + + Element z(frag[kPart1Index + idx] - frag[kPart3Index + idx], + frag[kPart1Index + idx] + frag[kPart2Index + idx]); + + offset_ref.at({accum_m, accum_n}) = z; + } + } + } + } + } + + /// Stores a fragment to memory with additional pointer offset + CUTLASS_DEVICE + void store_with_byte_offset( + Fragment const &frag, ///< fragment to store from the tensor + Index byte_offset) const { ///< store a tile with a linear offset + + store_with_pointer_offset(byte_offset / sizeof(Element)); + } + + /// Stores a fragment to memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void store( + Fragment &frag, ///< fragment to store to the tensor + TensorCoord const &tile_offset) const { ///< stores a tile with a logical offset in units of whole tiles + + store(frag, tile_offset, 0); + } + + /// Stores a fragment from memory with logical offset in units of whole tiles. 
+ CUTLASS_DEVICE + void store( + /// fragment to store to the tensor + Fragment const &frag, + /// stores a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// stores a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/mma_simt.h b/include/cutlass/gemm/warp/mma_simt.h index 9166fe7ce..1bf23c743 100644 --- a/include/cutlass/gemm/warp/mma_simt.h +++ b/include/cutlass/gemm/warp/mma_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -147,6 +147,9 @@ public: dp4a_type >; + /// Shape of the underlying instruction + using InstructionShape = GemmShape<1,1,use_dp4a ? 4 : 1>; + public: /// Iterates over the A operand in memory diff --git a/include/cutlass/gemm/warp/mma_simt_policy.h b/include/cutlass/gemm/warp/mma_simt_policy.h index 782474337..6abd0bf6a 100644 --- a/include/cutlass/gemm/warp/mma_simt_policy.h +++ b/include/cutlass/gemm/warp/mma_simt_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_simt_tile_iterator.h b/include/cutlass/gemm/warp/mma_simt_tile_iterator.h index 1d47e8f1a..ed1e59870 100644 --- a/include/cutlass/gemm/warp/mma_simt_tile_iterator.h +++ b/include/cutlass/gemm/warp/mma_simt_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op.h b/include/cutlass/gemm/warp/mma_tensor_op.h index 4e082db13..3eff7b905 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,12 +39,16 @@ #include "cutlass/arch/memory_sm75.h" #include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + #include "cutlass/gemm/gemm.h" #include "cutlass/gemm/warp/mma.h" #include "cutlass/gemm/warp/mma_tensor_op_policy.h" #include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { @@ -77,6 +81,27 @@ struct ConvertAndPack { } }; +template +struct ConvertAndPack { + + using Converter = NumericArrayConverter; + + CUTLASS_HOST_DEVICE + Array operator()(Array const &source) { + Converter converter; + + Array tmp; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + int idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc)); + tmp[i] = source[idx]; + } + + return converter(tmp); + } +}; + template struct ConvertAndPack { @@ -130,8 +155,6 @@ template < /// Store the accumulators in row major or column major. Row major is used /// when output layout is interleaved. 
bool AccumulatorsInRowMajor = false, - /// PartitionsN indicating how many PartitionsN for multiplicand B - int PartitionsN_ = 1, /// Used for partial specialization typename Enable = bool > @@ -167,6 +190,9 @@ public: /// Indicates class of matrix operator using OperatorClass = arch::OpClassTensorOp; + /// Shape of underlying instruction + using InstructionShape = typename Policy::Operator::Shape; + /// Complex transform on A operand static ComplexTransform const kTransformA = ComplexTransform::kNone; @@ -179,9 +205,6 @@ public: /// Number of partitions along K dimension static int const kPartitionsK = PartitionsK_; - /// PartitionsN indicating how many PartitionsN for multiplicand B - static int const kPartitionsN = PartitionsN_; - public: /// Iterates over the A operand in memory @@ -228,9 +251,7 @@ private: /// Number of mma operations performed using MmaIterations = MatrixShape< Shape::kM / Policy::Operator::Shape::kM, - (Shape::kN / Policy::Operator::Shape::kN / kPartitionsN > 0) ? 
- Shape::kN / Policy::Operator::Shape::kN / kPartitionsN : - 1 + Shape::kN / Policy::Operator::Shape::kN >; public: @@ -254,8 +275,8 @@ public: FragmentC &D, TransformedFragmentA const &A, TransformedFragmentB const &B, - FragmentC const &C, - int const &partitionN_idx = 0) const { + FragmentC const &C + ) const { using MmaOperandA = typename Policy::Operator::FragmentA; using MmaOperandB = typename Policy::Operator::FragmentB; @@ -267,8 +288,7 @@ public: MmaOperandB const *ptr_B = reinterpret_cast(&B); MmaOperandC *ptr_D = reinterpret_cast(&D); - // The offset of multilicand B for current partition - const int n_off = partitionN_idx * FragmentB::kElements / MmaOperandB::kElements / kPartitionsN; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) // Serpentine visitation order maximizing reuse of Rb CUTLASS_PRAGMA_UNROLL for (int n = 0; n < MmaIterations::kColumn; ++n) { @@ -286,24 +306,46 @@ public: ptr_D[n + m_serpentine * MmaIterations::kColumn]); } else { mma( - ptr_D[m_serpentine + (n + n_off) * MmaIterations::kRow], + ptr_D[m_serpentine + n * MmaIterations::kRow], ptr_A[m_serpentine], - ptr_B[n + n_off], - ptr_D[m_serpentine + (n + n_off) * MmaIterations::kRow]); + ptr_B[n], + ptr_D[m_serpentine + n * MmaIterations::kRow]); } } } + #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + // Serpentine visitation order maximizing reuse of Ra + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + int n_serpentine = ((m % 2) ? 
(MmaIterations::kColumn - 1 - n) : n); + + if (AccumulatorsInRowMajor) { // matrix B is reordered + mma( + ptr_D[n_serpentine + m * MmaIterations::kColumn], + ptr_A[m], + ptr_B[n_serpentine], + ptr_D[n_serpentine + m * MmaIterations::kColumn]); + } else { + mma(ptr_D[m + n_serpentine * MmaIterations::kRow], + ptr_A[m], + ptr_B[n_serpentine], + ptr_D[m + n_serpentine * MmaIterations::kRow]); + } + } + } + #else + assert(0); + #endif } /// Transform the mma operands to the required types CUTLASS_DEVICE void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, FragmentA const &A, FragmentB const &B) const { - bool midway_depstage = - !(platform::is_same::value && - platform::is_same::value); // // Define conversions from source type to instruction type @@ -314,6 +356,7 @@ public: FloatRoundStyle const kRoundB = PreferredRoundingMode::kRound; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) detail::ConvertAndPack convert_A; @@ -331,6 +374,26 @@ public: ptr_dst_B[0] = convert_B(ptr_B[0]); ptr_dst_B[1] = convert_B(ptr_B[1]); + #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + detail::ConvertAndPack + convert_A; + NumericArrayConverter + convert_B; + Array const *ptr_A = + reinterpret_cast const *>(&A); + Array * + ptr_dst_A = reinterpret_cast *>(&dst_A); + + dst_B = convert_B(B); + + ptr_dst_A[0] = convert_A(ptr_A[0]); + ptr_dst_A[1] = convert_A(ptr_A[1]); + #else + assert(0); + #endif } }; diff --git a/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h new file mode 100644 index 000000000..85f5009d8 --- /dev/null +++ b/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h @@ -0,0 +1,428 @@ +/*! \file + \brief This defines a "fragment" iterator for visiting the fragments of a warp tile + that participate in one warp-level mma operation. + + Typically, this is used to access the accumulator tile/fragement of a warp-level mma operation. 
+ The accumulator tile is then partitioned into smaller tiles/fragments that can be fed into + next warp-level mma operation. + + This iterator is necessary to accomplish warp-level mma fusion where the accumulator tile is + reused as multiplicand tile for the next mma. + +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/numeric_conversion.h" + +namespace cutlass { +namespace gemm { +namespace warp { + + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Size of the accumulation tile shape (concept: MatrixShape) + typename AccumulatorShape_, + /// KBlocks columns to compute residual + int KBlocksColumn_, + /// Accumulator Element type + typename ElementAccumulator_, + /// Element type + typename Element_, + /// Layout of operand in memory + typename Layout_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Output operation on the fragment + typename OutputOp_, + /// Whether beta is zero + bool IsBetaZero_ > +class MmaTensorOpFragmentIterator; + + +// Partial specialization for col-major accumulator tile +// And Element type is the same as Accumulator Element type + +template < + /// Shape of warp tile to load (concept: MatrixShape) + typename Shape_, + /// Shape of the warp accumulation tile (concept: MatrixShape) + typename AccumulatorShape_, + /// KBlocks columns to compute residual + int KBlocksColumn_, + /// Element type + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Output operation on fragment + typename OutputOp_> +class MmaTensorOpFragmentIterator { + public: + + /// Shape of warp tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// 
Shape of the warp accumulation tile (concept: MatrixShape) + using AccumulatorShape = AccumulatorShape_; + + /// KBlocks columns to compute residual + static int const kKBlockColumn = KBlocksColumn_; + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::ColumnMajor; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Output operation on fragment + using OutputOp = OutputOp_; + + /// Whether beta is zero + static bool const IsBetaZero = true; + + /// Number of participating threads + static int const kThreads = 32; + + /// Internal structure of iterator - made public to enable introspection + struct Policy { + static_assert( + !(Shape::kRow % InstructionShape::kM) && + !(Shape::kColumn % InstructionShape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + static_assert( + !(AccumulatorShape::kRow % Shape::kRow) && + !(AccumulatorShape::kColumn % Shape::kColumn), + "Shape of Warp Accumulator must be divisible by warp shape."); + static_assert( + !(kKBlockColumn % Shape::kColumn), + "KBlock size must be divisible by warp shape."); + + /// Number of times this iterator can be incremented + static int const kIterations = AccumulatorShape::kCount / Shape::kCount; + }; + +private: + + static int const kElementsPerAccess = InstructionShape::kM * InstructionShape::kN / kThreads; + + /// Number of mma operations performed by a warp + using MmaIterations = MatrixShape; + /// Number of mma operations performed by the entire accumulator + using AccumulatorIterations = MatrixShape; + + /// Number of K iterations + static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn; + static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn; + static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn + * (AccumulatorShape::kRow / 
Shape::kRow); + static int const kResidualIndex = kResidualColumn / Shape::kColumn + * (AccumulatorShape::kRow / Shape::kRow); + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + /// This is the fragment size produced by one access of the iterator. + using Fragment = Array; + + /// Accumulator Fragment object + using AccumulatorFragment = Array; + + +private: + + /// Internal access type + using AccessType = Array; + +private: + // + // Data members + // + + /// Accumulator tile + AccessType const *accumulators_; + + /// Internal index + int index_; + + /// Used to access residual tile first + bool is_residual_tile_; + +public: + /// Constructs an iterator + CUTLASS_HOST_DEVICE + MmaTensorOpFragmentIterator(AccumulatorFragment const &accum) + : accumulators_(reinterpret_cast(&accum)), + index_(0), is_residual_tile_(true) {} + + /// Add offset + CUTLASS_HOST_DEVICE + void add_offset(int index_offset) { + index_ += index_offset; + if(is_residual_tile_ && index_ >= kKBlockColumnIterations) { + index_ = index_ - kKBlockColumnIterations + kResidualIndex; + is_residual_tile_ = false; + } + } + + /// Increments + CUTLASS_HOST_DEVICE + MmaTensorOpFragmentIterator &operator++() { + add_offset(1); + return *this; + } + + /// Decrements + CUTLASS_HOST_DEVICE + MmaTensorOpFragmentIterator &operator--() { + add_offset(-1); + return *this; + } + + /// Loads a fragment from the referenced part of the accumulator tile + CUTLASS_HOST_DEVICE + void load(Fragment &frag, OutputOp output_op) const { + + if (output_op.is_source_needed()) //beta must be zero + assert(0); + + AccessType src_fragment; + src_fragment.clear(); + + + AccessType *frag_ptr = reinterpret_cast(&frag); + + int index_m = (index_ * MmaIterations::kRow) % AccumulatorIterations::kRow; + int index_n = (index_ * MmaIterations::kRow) / AccumulatorIterations::kRow + * MmaIterations::kColumn; + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; n++) 
{ + for (int m = 0; m < MmaIterations::kRow; m++) { + int accumulator_access_offset = + (n + index_n) * AccumulatorIterations::kRow + m + index_m; + + frag_ptr[n * MmaIterations::kRow + m].clear(); + if(!(is_residual_tile_ && index_ >= kResidualIndex)) + //frag_ptr[n * MmaIterations::kRow + m] = accumulators_[accumulator_access_offset]; + frag_ptr[n * MmaIterations::kRow + m] = output_op(accumulators_[accumulator_access_offset], src_fragment); + } + } + } + +}; + +// Partial specialization for row-major accumulator tile + +template < + /// Shape of warp tile to load (concept: MatrixShape) + typename Shape_, + /// Shape of the warp accumulation tile (concept: MatrixShape) + typename AccumulatorShape_, + /// KBlocks columns to compute residual + int KBlocksColumn_, + /// Accumulator Element type + typename ElementAccumulator_, + /// Element type + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Output operation on fragment + typename OutputOp_> +class MmaTensorOpFragmentIterator { + public: + + /// Shape of warp tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Shape of the warp accumulation tile (concept: MatrixShape) + using AccumulatorShape = AccumulatorShape_; + + /// KBlocks columns to compute residual + static int const kKBlockColumn = KBlocksColumn_; + + /// Accumulator Element type + using ElementAccumulator = ElementAccumulator_; + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::RowMajor; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Output operation on fragment + using OutputOp = OutputOp_; + + /// Whether beta is zero + static bool const IsBetaZero = true; + + /// Number of participating threads + static int const kThreads = 32; + + /// Internal structure of iterator - made public to enable introspection + struct Policy { + 
static_assert( + !(Shape::kRow % InstructionShape::kM) && + !(Shape::kColumn % InstructionShape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + static_assert( + !(AccumulatorShape::kRow % Shape::kRow) && + !(AccumulatorShape::kColumn % Shape::kColumn), + "Shape of Warp Accumulator must be divisible by warp shape."); + static_assert( + !(kKBlockColumn % Shape::kColumn), + "KBlock size must be divisible by warp shape."); + + /// Number of times this iterator can be incremented + static int const kIterations = AccumulatorShape::kCount / Shape::kCount; + }; + +private: + + static int const kElementsPerAccess = InstructionShape::kM * InstructionShape::kN / kThreads; + + /// Number of mma operations performed by a warp + using MmaIterations = MatrixShape; + /// Number of mma operations performed by the entire accumulator + using AccumulatorIterations = MatrixShape; + + /// Number of K iterations + static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn; + static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn; + static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn + * (AccumulatorShape::kRow / Shape::kRow); + static int const kResidualIndex = kResidualColumn / Shape::kColumn + * (AccumulatorShape::kRow / Shape::kRow); + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + /// This is the fragment size produced by one access of the iterator. 
+ using Fragment = Array; + + /// Accumulator Fragment object + using AccumulatorFragment = Array; + + +private: + + /// Internal access type + using AccessType = Array; + using FragmentAccessType = Array; + +private: + // + // Data members + // + + /// Accumulator tile + AccessType const *accumulators_; + + /// Internal index + int index_; + + /// Used to access residual tile first + bool is_residual_tile_; + +public: + /// Constructs an iterator + CUTLASS_HOST_DEVICE + MmaTensorOpFragmentIterator(AccumulatorFragment const &accum) + : accumulators_(reinterpret_cast(&accum)), + index_(0), is_residual_tile_(true) {} + + /// Add offset + CUTLASS_HOST_DEVICE + void add_offset(int index_offset) { + index_ += index_offset; + if(is_residual_tile_ && index_ >= kKBlockColumnIterations) { + index_ = index_ - kKBlockColumnIterations + kResidualIndex; + is_residual_tile_ = false; + } + } + + /// Increments + CUTLASS_HOST_DEVICE + MmaTensorOpFragmentIterator &operator++() { + add_offset(1); + return *this; + } + + /// Decrements + CUTLASS_HOST_DEVICE + MmaTensorOpFragmentIterator &operator--() { + add_offset(-1); + return *this; + } + + /// Loads a fragment from the referenced part of the accumulator tile + CUTLASS_HOST_DEVICE + void load(Fragment &frag, OutputOp output_op) const { + + if (output_op.is_source_needed()) //beta must be zero + assert(0); + + FragmentAccessType src_fragment; + src_fragment.clear(); + + FragmentAccessType *frag_ptr = reinterpret_cast(&frag); +// NumericArrayConverter fragmentConverter; + + int index_m = (index_ * MmaIterations::kRow) % AccumulatorIterations::kRow; + int index_n = (index_ * MmaIterations::kRow) / AccumulatorIterations::kRow + * MmaIterations::kColumn; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; m++) { + for (int n = 0; n < MmaIterations::kColumn; n++) { + int accumulator_access_offset = + (m + index_m) * AccumulatorIterations::kColumn + n + index_n; + + frag_ptr[m * MmaIterations::kColumn + n].clear(); + 
if(!(is_residual_tile_ && index_ >= kResidualIndex)) +// frag_ptr[m * MmaIterations::kColumn + n] = fragmentConverter(accumulators_[accumulator_access_offset]); + frag_ptr[m * MmaIterations::kColumn + n] = output_op(accumulators_[accumulator_access_offset], src_fragment); + } + } + } + +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/mma_tensor_op_policy.h b/include/cutlass/gemm/warp/mma_tensor_op_policy.h index 823860111..68b28bfff 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_policy.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_sm70.h index 59515b5bf..063c77f9c 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_sm70.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -106,6 +106,9 @@ public: /// Architecture tag using ArchTag = arch::Sm70; + /// Underlying instruction shape + using InstructionShape = typename Policy::Operator::Shape; + /// Complex transform on A operand static ComplexTransform const kTransformA = ComplexTransform::kNone; @@ -210,8 +213,7 @@ public: FragmentC &D, FragmentA const &A, FragmentB const &B, - FragmentC const &C, - int const &partitionN_idx = 0) { + FragmentC const &C) { using MmaOperandA = typename Policy::Operator::FragmentA; using MmaOperandB = typename Policy::Operator::FragmentB; diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h index 45048d389..1a8fa4f91 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -229,8 +229,11 @@ public: k_group_idx_(0) { int quad_pair = (lane_id >> 3); + int quad_quad = (lane_id >> 4); int lane_in_quad = (lane_id & 3); int lane_in_quad_pair = (lane_id & 7); + int lane_in_quad_quad = (lane_id & 15); + CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kPointerCount; ++i) { int partition_contiguous_idx = -1; @@ -242,6 +245,24 @@ public: access_contiguous_idx = (quad_pair ^ lane_in_quad); access_strided_idx = lane_in_quad_pair; } + else if (Policy::LdsmShape::kContiguous == 2 && + kOperand == Operand::kA) { + // Matrix multiply 16816 A + // Q0 Q2 + // Q1 Q3 + partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ (i >> 1)); + access_contiguous_idx = + (((quad_pair & 1) + ((i & 1) << 1)) ^ lane_in_quad); + access_strided_idx = lane_in_quad_pair + (lane_id >> 4 << 3); + } else if (Policy::LdsmShape::kContiguous == 2 && + kOperand == Operand::kB) { + // Matrix multiply 16816 B + // Q0 Q1 + // Q2 Q3 + partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ (i >> 1)); + access_contiguous_idx = ((quad_quad + ((i & 1) << 1)) ^ lane_in_quad); + access_strided_idx = lane_in_quad_quad; + } int access_contiguous = partition_contiguous_idx * Layout::PartitionShape::kContiguous + access_contiguous_idx; @@ -436,6 +457,364 @@ public: }; //////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for 32-thread MMA.TF32 NT TensorOps. It +/// uses LDS.32 to load from shared memory and therefore must be initialized +/// with a TensorRef to shared memory. 
+/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::TensorOpMultiplicandCongruous<32, 32>, InstructionShape_, + OpDelta_, 32, PartitionsK_> { + public: + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand == Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for " + "A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicandCongruous<32, 32>; + + /// Shape of one matrix product operation (concept: GemmShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: + /// MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Internal structure of iterator 
- made public to enable introspection + struct Policy { + static_assert( + !(Shape::kContiguous % InstructionShape::kContiguous), + "Shape of warp-level Mma must be divisible by operator shape."); + + // Determine number of elements along outer dimension per individual LDS.32 + // op. Every one warp of LDS.32 loads 8x4 elements + static int const kLdsOpInner = Layout::TileShape::kStrided; + static int const kLdsOpOuter = kThreads / kLdsOpInner; + + static_assert(!(Shape::kContiguous % kLdsOpOuter), + "Shape of warp-level mma must be divisible by LDS.32's " + "fundamental tile size."); + + static_assert(!(Shape::kStrided % kLdsOpInner), + "Shape of warp-level mma must be divisible by LDS.32's " + "fundamental tile size."); + + /// Number of LDS.32 instructions needed by one MMA instruction + /// 1684 A 2x1 + /// 1684 B 1x1 + /// 1688 A 2x2 + /// 1688 B 1x2 + static int const LdsShapeContiguous = + InstructionShape::kContiguous / kLdsOpOuter; + static int const LdsShapeStrided = InstructionShape::kStrided / kLdsOpInner; + using LdsShape = + layout::PitchLinearShape; + + /// Number and arrangement of LDS instructions + using LdsIterations = layout::PitchLinearShape< + Shape::kContiguous / LdsShapeContiguous / kLdsOpOuter, 1>; + + /// Number of groups for each tile + static int const kGroupsPerTile = + Shape::kStrided / InstructionShape::kStrided; + }; + + private: + /// Not working on this feature at the moment. 
+ static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Number of internal pointers needed to reference shared memory + static int const kPointerCount = Layout::TileShape::kContiguous * + Layout::kElementsPerAccess / + Policy::kLdsOpOuter; + + /// Vectorized access is not used + static int const kElementsPerAccess = 1; + + /// Pointer type used for accesses + using AccessType = Element; + + /// Internal counter used to jump to next K partition + int k_group_idx_; + + public: + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + + private: + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_[kPointerCount]; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + + public: + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() : stride_(0), byte_offset_(0) {} + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id) + : stride_(ref.stride(0)), byte_offset_(0), k_group_idx_(0) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kPointerCount; ++i) { + int access_strided = lane_id % Policy::kLdsOpInner; + int access_contiguous = (lane_id / Policy::kLdsOpInner) + + (access_strided ^ i) * Policy::kLdsOpOuter; + + pointer_[i] = reinterpret_cast(ref.data()) + + access_contiguous + access_strided * stride_; + } + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + byte_offset_ += offset * sizeof(Element); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset( + TensorCoord const 
&tile_offset) { + int contiguous_offset = tile_offset.contiguous(); + if (Shape::kContiguous == + Layout::TileShape::kContiguous * Layout::kElementsPerAccess / 2) { + if (tile_offset.contiguous() % 2) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kPointerCount / 2; ++i) { + AccessType const *tmp_pointer = pointer_[i]; + pointer_[i] = pointer_[i + kPointerCount / 2]; + pointer_[i + kPointerCount / 2] = tmp_pointer; + } + } + contiguous_offset = (tile_offset.contiguous() >> 1) << 1; + } + + int offset = (tile_offset.strided() * InstructionShape::kStrided) * stride_ + + contiguous_offset * Shape::kContiguous; + + add_pointer_offset(offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &operator++() { + add_tile_offset({0, 1}); + + if (kPartitionsK > 1) { + ++k_group_idx_; + // Jump to next stage + if (k_group_idx_ == Policy::kGroupsPerTile) { + k_group_idx_ = 0; + add_tile_offset( + {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)}); + } + } + + return *this; + } + + /// Advances the iterator along the opposite of the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &operator--() { + byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) * + kElementsPerAccess; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of + ///< the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &operator+=( + TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of + ///< the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &operator-=( + TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
+ CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { load_with_byte_offset(frag, 0); } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset in units of bytes + Index byte_offset) const { + Element *fetch_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int ss = 0; ss < Policy::LdsShape::kStrided; ++ss) { + CUTLASS_PRAGMA_UNROLL + for (int cc = 0; cc < Policy::LdsShape::kContiguous; ++cc) { + int access_idx = + cc + (ss + (c + s * Policy::LdsIterations::kContiguous) * + Policy::LdsShape::kStrided) * + Policy::LdsShape::kContiguous; + int access_idx_contiguous = cc + c * Policy::LdsShape::kContiguous; + int access_idx_strided = + (ss + s * Policy::LdsShape::kStrided) * Policy::kLdsOpInner; + + AccessType const *source_ptr = + pointer_[access_idx_contiguous % kPointerCount] + + Layout::TileShape::kContiguous * Layout::kElementsPerAccess * + (access_idx_contiguous / kPointerCount) + + access_idx_strided * stride_; + + char const *source_byte_ptr = + reinterpret_cast(source_ptr) + byte_offset + + byte_offset_; + + fetch_ptr[access_idx] = + *reinterpret_cast(source_byte_ptr); + } + } + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + load_with_byte_offset(frag, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. 
+ CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + load_with_byte_offset(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + Index pointer_offset = + tile_offset.contiguous() * Shape::kContiguous / + Layout::kElementsPerAccess + + tile_offset.strided() * InstructionShape::kStrided * stride_; + + byte_offset += sizeof(AccessType) * pointer_offset; + + load_with_byte_offset(frag, byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no op + } +}; + +//////////////////////////////////////////////////////////////////////////////// + /// This tile iterator is specialized for 32-thread TensorOps. 
It uses LDSM to load from shared /// memory and therefore must be initialized with a TensorRef to shared memory. /// @@ -1069,7 +1448,6 @@ class MmaTensorOpMultiplicandTileIterator< k_group_idx_(0) { // Warp level iterator at most use double buffer to hide latency. If there // are more than 2 sections, every stage should have more than 1 section. - // TODO: refactor code after every case is implemented // Turing silicon requires all 32 threads in a warp provide valid addresses // even for LDSM.1 and LDSM.2 @@ -1077,6 +1455,8 @@ class MmaTensorOpMultiplicandTileIterator< lane_id = lane_id % (Policy::LdsmShape::kCount * Policy::kLdsmOpInner); #endif + int quad_quad = (lane_id >> 4); + int quad_pair = (lane_id >> 3); int lane_in_pair = (lane_id & 1); int lane_in_quad = (lane_id & 3); int lane_in_quad_pair = (lane_id & 7); @@ -1100,6 +1480,26 @@ class MmaTensorOpMultiplicandTileIterator< (lane_in_quad_quad / Layout::kFactor)); access_strided_idx = lane_id / Layout::kFactor; } + else if (Policy::LdsmShape::kStrided == + (Policy::LdsmShape::kCount / 2) && + kOperand == Operand::kA) { + // Integer matrix multiply 16832 A + partition_contiguous_idx = lane_in_quad / factor_in_partition; + access_strided_idx = lane_in_quad_quad / Layout::kFactor; + access_contiguous_idx = + ((lane_in_pair * factor_in_partition + quad_quad) ^ + access_strided_idx); + } + else if (Policy::LdsmShape::kStrided == + (Policy::LdsmShape::kCount / 2) && + kOperand == Operand::kB) { + // Integer matrix multiply 16832 B + partition_contiguous_idx = lane_in_quad / factor_in_partition; + access_strided_idx = lane_in_quad_pair / Layout::kFactor + quad_quad * 2; + access_contiguous_idx = + ((lane_in_pair * factor_in_partition + ((lane_id & 8) >> 3)) ^ + access_strided_idx); + } } else if (Layout::kFactor == 2) { // Super Matrix multiply kBlock = 32 if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) { @@ -1113,6 +1513,28 @@ class MmaTensorOpMultiplicandTileIterator< access_contiguous_idx = 
(lane_in_quad_pair / Layout::kFactor); access_strided_idx = lane_id / Layout::kFactor; } + else if (Policy::LdsmShape::kStrided == + (Policy::LdsmShape::kCount / 2) && + kOperand == Operand::kA) { + // Matrix multiply 16816|1688.TF32 A + // Q0 Q2 + // Q1 Q3 + partition_contiguous_idx = (lane_id % Layout::kFactor); + access_contiguous_idx = + (quad_quad ^ (lane_in_quad_pair / Layout::kFactor)); + access_strided_idx = (lane_in_quad_quad / Layout::kFactor); + } else if (Policy::LdsmShape::kStrided == + (Policy::LdsmShape::kCount / 2) && + kOperand == Operand::kB) { + // Matrix multiply 16816|1688.TF32 B + // Q0 Q1 + // Q2 Q3 + partition_contiguous_idx = (lane_id % Layout::kFactor); + access_contiguous_idx = + ((quad_pair & 1) ^ (lane_in_quad_pair / Layout::kFactor)); + access_strided_idx = + (lane_in_quad_pair + (lane_id >> 4 << 3)) / Layout::kFactor; + } } else if (Layout::kFactor == 1) { // Super Matrix multiply kBlock = 64 if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) { @@ -1124,6 +1546,25 @@ class MmaTensorOpMultiplicandTileIterator< access_contiguous_idx = lane_in_quad; access_strided_idx = lane_id; } + else if (Policy::LdsmShape::kStrided == + (Policy::LdsmShape::kCount / 2) && + kOperand == Operand::kA) { + // Matrix multiply 16816|1688.TF32 A + // Q0 Q2 + // Q1 Q3 + partition_contiguous_idx = (lane_in_quad_pair >> 2); + access_contiguous_idx = (quad_quad ^ lane_in_quad); + access_strided_idx = lane_in_quad_quad; + } else if (Policy::LdsmShape::kStrided == + (Policy::LdsmShape::kCount / 2) && + kOperand == Operand::kB) { + // Matrix multiply 16816|1688.TF32 B + // Q0 Q1 + // Q2 Q3 + partition_contiguous_idx = (lane_in_quad_pair >> 2); + access_contiguous_idx = ((quad_pair & 1) ^ lane_in_quad); + access_strided_idx = lane_in_quad_pair + (lane_id >> 4 << 3); + } } int access_contiguous = @@ -1161,16 +1602,68 @@ class MmaTensorOpMultiplicandTileIterator< return *this; } + /// Advances an iterator along logical dimensions of matrix in units of whole 
+ /// tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative( + TensorCoord const &tile_offset) { + + int whole_tiles = tile_offset.contiguous() / Policy::kGroupsPerTile; + int k_groups_delta = tile_offset.contiguous() % Policy::kGroupsPerTile; + if (k_groups_delta < 0) { + whole_tiles -= 1; + k_groups_delta += Policy::kGroupsPerTile; + } + + if ((Policy::kGroupsPerTile / kPartitionsK) >= 2) { + byte_offset_ ^= (k_groups_delta & 1) * Policy::LdsmShape::kContiguous * + sizeof_bits::value * + Layout::kElementsPerAccess / 8; + } + if ((Policy::kGroupsPerTile / kPartitionsK) >= 4) { + byte_offset_ ^= ((k_groups_delta + (k_group_idx_ & 1)) & 2) * + Policy::LdsmShape::kContiguous * + sizeof_bits::value * + Layout::kElementsPerAccess / 8; + } + if ((Policy::kGroupsPerTile / kPartitionsK) == 8) { + byte_offset_ ^= ((k_groups_delta + (k_group_idx_ & 3)) & 4) * + Policy::LdsmShape::kContiguous * + sizeof_bits::value * + Layout::kElementsPerAccess / 8; + } + + k_group_idx_ += k_groups_delta; + whole_tiles += k_group_idx_ / (Policy::kGroupsPerTile / kPartitionsK); + k_group_idx_ = k_group_idx_ % (Policy::kGroupsPerTile / kPartitionsK); + + pointer_ += + tile_offset.strided() * stride_ * Shape::kStrided / Layout::kFactor + + whole_tiles * stride_ / sections_; + return *this; + } + /// Advances the iterator along the advance dimension CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &operator++() { + // Integer matrix multiply 16832 Interleaved-32 + // NONE + // Integer matrix multiply 16816 Interleaved-32 || Integer matrix multiply 16816 kblock=32 + // Integer matrix multiply 8816 Interleaved-32 // ^1 ^1 + // Matrix multiply 1684.TF32 kblock=16 || Integer matrix multiply 16816 kblock=64 // Matrix multiply 1688 kblock=32 || Integer matrix multiply 8816 kblock=64 // ^1 ^3 ^1 ^3 // Matrix multiply 1688 kblock=64 // ^1 ^3 ^1 ^7 ^1 ^3 ^1 ^7 + + // Matrix multiply 16816 kblock=32 | 1688.TF32 kblock=16 || Integer matrix multiply 16832 kblock=64 + // 
^2 ^2 + // Matrix multiply 16816 kblock=64 | 1688.TF32 kblock=32 || Integer matrix multiply 16832 kblock=128 + // ^2 ^6 ^2 ^6 + if ((Policy::kGroupsPerTile / kPartitionsK) > 1) { int mask = ((Policy::kGroupsPerTile / kPartitionsK) == 8) ? 3 @@ -1443,6 +1936,16 @@ class MmaTensorOpMultiplicandTileIterator< return *this; } + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative( + TensorCoord const &tile_offset) { + iterator_.add_tile_offset_negative({tile_offset.row(), tile_offset.column()}); + + return *this; + } + /// Advances the iterator along the advance dimension CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator &operator++() { @@ -1673,6 +2176,16 @@ class MmaTensorOpMultiplicandTileIterator< return *this; } + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative( + TensorCoord const &tile_offset) { + iterator_.add_tile_offset_negative({tile_offset.column(), tile_offset.row()}); + + return *this; + } + /// Advances the iterator along the advance dimension CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator &operator++() { @@ -1782,6 +2295,7 @@ class MmaTensorOpMultiplicandTileIterator< }; //////////////////////////////////////////////////////////////////////////////// + template < /// Size of the matrix to load (concept: MatrixShape) typename Shape_, @@ -2682,6 +3196,7 @@ public: }; //////////////////////////////////////////////////////////////////////////////// + } // namespace warp } // namespace gemm } // namespace cutlass diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h index 51c5ce269..ed6384f05 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h +++ 
b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h new file mode 100644 index 000000000..e43373b64 --- /dev/null +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h @@ -0,0 +1,1579 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + +#include "cutlass/platform/platform.h" +#include "cutlass/fast_math.h" + +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +//////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for loading 128b vectors of 64b elements. 
+/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::TensorOpMultiplicandCongruous64b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + static_assert(!(Shape::kContiguous % 16) && !(Shape::kStrided % 4), "Divisibility."); + + static_assert(sizeof_bits::value == 64, "This is specialized for 64b accesses."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicandCongruous64b; + + /// Shape of one matrix product operation (concept: GemmShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename 
TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Load two elements per access + static int const kElementsPerAccess = 2; + + /// Policy defining internal details of tile iterator + struct Policy { + + /// Shape of one access + using Delta = layout::PitchLinearShape<8, 4>; + + /// Number of iterations to load + using Iterations = layout::PitchLinearShape< + Shape::kContiguous / kElementsPerAccess / Delta::kContiguous, + InstructionShape::kStrided / Delta::kStrided + >; + + }; + +private: + + /// Not working on this feature at the moment. + static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Pointer type used for accesses + using AccessType = AlignedArray; + + /// Internal counter used to jump to next K partition + int k_group_idx_; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + +private: + + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { } + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): + stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0), + k_group_idx_(0) { + + int access_strided = lane_id / Policy::Delta::kContiguous; + int access_contiguous = (lane_id % Policy::Delta::kContiguous) ^ access_strided; + + pointer_= reinterpret_cast(ref.data()) + + access_contiguous + access_strided * stride_; + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator 
&add_pointer_offset(LongIndex offset) { + + byte_offset_ += offset * sizeof(Element); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + int offset = + (tile_offset.strided() * InstructionShape::kStrided) * stride_ * kElementsPerAccess + + tile_offset.contiguous() * Shape::kContiguous; + + add_pointer_offset(offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + add_tile_offset({0, 1}); + + return *this; + } + + /// Advances the iterator along the opposite of the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + add_tile_offset({0, -1}); + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
+ CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_byte_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset in units of bytes + Index byte_offset) const { + + AccessType *fetch_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < Policy::Iterations::kStrided; ++s) { + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < Policy::Iterations::kContiguous; ++c) { + + int access_idx = c + s * Policy::Iterations::kContiguous; + + AccessType const *source_ptr = pointer_ + + Policy::Delta::kContiguous * c + + Policy::Delta::kStrided * s * stride_; + + char const *source_byte_ptr = reinterpret_cast(source_ptr) + byte_offset + byte_offset_; + + AccessType const *source = reinterpret_cast(source_byte_ptr); + + fetch_ptr[access_idx] = *source; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + + load_with_byte_offset(frag, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + load_with_byte_offset(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. 
+ CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + + Index pointer_offset = + tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess + + tile_offset.strided() * InstructionShape::kStrided * stride_; + + byte_offset += sizeof(AccessType) * pointer_offset; + + load_with_byte_offset(frag, byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename 
TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicandCongruous64b, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row())); + return *this; + } + + ///< 
advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. 
+ CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.strided(), tile_offset.contiguous()}, + byte_offset); + } + + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared +/// memory and therefore must be initialized with a TensorRef to shared memory. 
///
/// Satisfies:
///   ReadableRandomAccessContiguousTileIteratorConcept
///
template <
    /// Size of the matrix to load (concept: MatrixShape)
    typename Shape_,
    /// Identifies A or B multiplicand
    Operand Operand_,
    /// Data type of elements
    typename Element_,
    /// Shape of one matrix product operation (concept: MatrixShape)
    typename InstructionShape_,
    /// Interval between adjacent *MMA instructions (in units of MMA
    /// instructions)
    int OpDelta_,
    /// Number of partitions along K dimension
    int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:

  /// Shape of tile to load (concept: PitchLinearShape)
  using Shape = Shape_;

  /// Operand tag
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");

  /// Element type
  using Element = Element_;

  /// Layout of source tile
  using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b;

  /// Shape of one matrix product operation (concept: MatrixShape)
  using InstructionShape = InstructionShape_;

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// TensorRef type for loading element from a tensor
  using TensorRef = TensorRef<Element, Layout>;

  /// Index type
  using Index = typename TensorRef::Index;

  /// Long Index type
  using LongIndex = typename TensorRef::LongIndex;

  /// Coordinate for an element in the tensor
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Underlying pitch-linear tile iterator: column-major maps rows onto the
  /// contiguous dimension and columns onto the strided dimension.
  using Base = MmaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
      layout::TensorOpMultiplicandCongruous64b,
      layout::PitchLinearShape<InstructionShape::kRow, InstructionShape::kColumn>,
      kOpDelta, kThreads, PartitionsK_>;

 public:

  //
  // Derived quantities
  //

  /// Fragment object holding a thread's part of a tile
  using Fragment = typename Base::Fragment;

 private:

  /// Underlying tile iterator
  Base iterator_;

 public:

  /// Default ctor constructs null iterator
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator() { }

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ): iterator_({ref.data(), ref.stride()}, lane_id) {
  }

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

    iterator_.add_pointer_offset(offset);

    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    // Swap (row, column) into (contiguous, strided) for the pitch-linear base.
    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});

    return *this;
  }

  /// Advances the iterator along the advance dimension
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator & operator++() {

    ++iterator_;

    return *this;
  }

  /// Moves the iterator backwards along the advance dimension
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator & operator--() {

    --iterator_;

    return *this;
  }

  ///< advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  ///< advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column()));
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {

    iterator_.load(frag);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a linear offset in units of elements
      Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a linear offset in units of bytes
      Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles.
  CUTLASS_DEVICE
  void load(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a logical offset in units of whole tiles
      TensorCoord const &tile_offset) const {
    // TODO: not implemented in this specialization
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles.
  CUTLASS_DEVICE
  void load(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a logical offset in units of whole tiles
      TensorCoord const &tile_offset,
      /// loads a tile with a logical offset AND a pointer offset
      Index pointer_offset) const {
    // TODO: not implemented in this specialization
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles.
  CUTLASS_DEVICE
  void load_with_byte_offset(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a logical offset in units of whole tiles
      TensorCoord const &tile_offset,
      /// loads a tile with a logical offset AND a byte offset
      Index byte_offset) const {
    // Column-major: rows are contiguous, so forward coordinates unswapped.
    iterator_.load_with_byte_offset(
        frag,
        {tile_offset.contiguous(), tile_offset.strided()},
        byte_offset);
  }

  /// Notify the iterator which k-group it is currently pointing to.
  ///
  /// This does not advance the iterator. Rather, it overrides its internal
  /// tracking with constant-valued k-group index to enable the compiler to
  /// fold constants and achieve more efficient code.
  ///
  /// This is used by some nontrivial permuted layouts.
  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    iterator_.set_kgroup_index(k_group);
  }
};

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

/// This tile iterator is specialized for loading 128b vectors of 64b elements.
///
/// Satisfies:
///   ReadableRandomAccessContiguousTileIteratorConcept
///
template <
    /// Size of the matrix to load (concept: PitchLinearShape)
    typename Shape_,
    /// Identifies A or B multiplicand
    Operand Operand_,
    /// Data type of elements
    typename Element_,
    /// Shape of one matrix product operation (concept: PitchLinearShape)
    typename InstructionShape_,
    /// Interval between adjacent *MMA instructions (in units of MMA
    /// instructions)
    int OpDelta_,
    /// Number of partitions along K dimension
    int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::TensorOpMultiplicand64bCrosswise,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:

  /// Shape of tile to load (concept: PitchLinearShape)
  using Shape = Shape_;

  /// Operand tag
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");

  static_assert(!(Shape::kContiguous % 4) && !(Shape::kStrided % 16),
    "Divisibility.");

  static_assert(sizeof_bits<Element_>::value == 64,
    "This is specialized for 64b accesses.");

  /// Element type
  using Element = Element_;

  /// Layout of source tile
  using Layout = cutlass::layout::TensorOpMultiplicand64bCrosswise;

  /// Shape of one matrix product operation (concept: GemmShape)
  using InstructionShape = InstructionShape_;

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Number of partitions along K dimension
  static int const kPartitionsK = PartitionsK_;

  /// TensorRef type for loading element from a tensor
  using TensorRef = TensorRef<Element, Layout>;

  /// Index type
  using Index = typename TensorRef::Index;

  /// Long Index type
  using LongIndex = typename TensorRef::LongIndex;

  /// Coordinate for an element in the tensor
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Load two elements per access
  static int const kElementsPerAccess = 2;

  /// Policy defining internal details of tile iterator
  struct Policy {

    /// Shape of one access
    using Delta = layout::PitchLinearShape<4, 16>;

    /// Number of iterations to load
    using Iterations = layout::PitchLinearShape<
        InstructionShape::kContiguous / Delta::kContiguous,
        Shape::kStrided / Delta::kStrided
    >;
  };

 private:

  /// Not working on this feature at the moment.
  static_assert(kOpDelta == 1,
    "Alternative arrangements not supported at present.");

  /// Pointer type used for accesses (two 64b elements = 128b per access)
  using AccessType = AlignedArray<Element, kElementsPerAccess, 16>;

 public:

  //
  // Derived quantities
  //

  /// Fragment object holding a thread's part of a tile.
  /// Element count: Iterations::kCount accesses of kElementsPerAccess elements
  ///   = (InstructionShape::kContiguous / 4) * (Shape::kStrided / 16) * 2
  ///   = InstructionShape::kContiguous * Shape::kStrided / kThreads.
  using Fragment =
      Array<Element, InstructionShape::kContiguous * Shape::kStrided / kThreads>;

 private:

  /// Stride in units of AccessType
  Index stride_;

  /// Shared memory base pointer - not advanced
  AccessType const *pointer_;

  /// Byte offset incremented as iterator advances
  Index byte_offset_;

  /// Internal counter for tracking K-group
  Index k_group_idx_;

 public:

  /// Default ctor constructs null iterator
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { }

  /// Constructor from TensorRef
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ):
    stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0),
    k_group_idx_(0) {

    // Lanes 0..7 share a strided slice; groups of 8 lanes advance the stride.
    int access_strided = lane_id / 8;
    int access_contiguous = (lane_id % 8);

    byte_offset_ = (access_contiguous + access_strided * stride_) * sizeof(AccessType);

    pointer_ = reinterpret_cast<AccessType const *>(ref.data());
  }

  /// Adds a pointer offset (in units of elements) to internal pointer(s)
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

    pointer_ += offset / kElementsPerAccess;

    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    int offset = (tile_offset.contiguous() * InstructionShape::kContiguous) *
                     stride_ * kElementsPerAccess +
                 tile_offset.strided() * Shape::kStrided;

    add_pointer_offset(offset);

    int old_k_group_idx = k_group_idx_;

    k_group_idx_ += tile_offset.contiguous();

    // Toggle the 64B swizzle whenever bit 1 of the k-group index flips.
    if ((k_group_idx_ & 2) ^ (old_k_group_idx & 2)) {
      byte_offset_ ^= 0x40;
    }

    return *this;
  }

  /// Advances the iterator along the advance dimension
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator & operator++() {

    pointer_ += stride_ * InstructionShape::kContiguous;

    if (k_group_idx_ & 0x1) {
      // xor ptr
      byte_offset_ ^= 0x40;
    }

    ++k_group_idx_;

    return *this;
  }

  ///< advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {

    load_with_byte_offset(frag, 0);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a linear offset in units of bytes
      Index byte_offset) const {

    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);

    CUTLASS_PRAGMA_UNROLL
    for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {

      CUTLASS_PRAGMA_UNROLL
      for (int s = 0; s < Policy::Iterations::kStrided; ++s) {

        int access_idx = c + s * Policy::Iterations::kContiguous;

        AccessType const *source_ptr = pointer_ +
            Policy::Delta::kContiguous * c * stride_ +
            Policy::Delta::kStrided * s / kElementsPerAccess;

        char const *source_byte_ptr =
            reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;

        AccessType const *source =
            reinterpret_cast<AccessType const *>(source_byte_ptr);

        fetch_ptr[access_idx] = *source;
      }
    }

    Element *exchange_ptr = reinterpret_cast<Element *>(&frag);

    // Odd k-groups swap each adjacent pair of 64b elements within the fragment.
    if (k_group_idx_ & 1) {
      // exchange on 64b granularity
      CUTLASS_PRAGMA_UNROLL
      for (int i = 0; i < Fragment::kElements; i += 2) {
        Element tmp = exchange_ptr[i];
        exchange_ptr[i] = exchange_ptr[i + 1];
        exchange_ptr[i + 1] = tmp;
      }
    }
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a linear offset in units of elements
      Index pointer_offset) const {

    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles.
  CUTLASS_DEVICE
  void load(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a logical offset in units of whole tiles
      TensorCoord const &tile_offset) const {

    load_with_byte_offset(frag, tile_offset, 0);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles.
  CUTLASS_DEVICE
  void load(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a logical offset in units of whole tiles
      TensorCoord const &tile_offset,
      /// loads a tile with a logical offset AND a pointer offset
      Index pointer_offset) const {

    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles.
  CUTLASS_DEVICE
  void load_with_byte_offset(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a logical offset in units of whole tiles
      TensorCoord const &tile_offset,
      /// loads a tile with a logical offset AND a byte offset
      Index byte_offset) const {

    Index pointer_offset =
        tile_offset.contiguous() * InstructionShape::kContiguous /
            Layout::kElementsPerAccess +
        tile_offset.strided() * Shape::kStrided * stride_;

    byte_offset += sizeof(AccessType) * pointer_offset;

    load_with_byte_offset(frag, byte_offset);
  }

  /// Notify the iterator which k-group it is currently pointing to.
  ///
  /// This does not advance the iterator. Rather, it overrides its internal
  /// tracking with constant-valued k-group index to enable the compiler to
  /// fold constants and achieve more efficient code.
  ///
  /// This is used by some nontrivial permuted layouts.
  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    k_group_idx_ = k_group;
  }
};

////////////////////////////////////////////////////////////////////////////////

/// Row-major adapter over the pitch-linear TensorOpMultiplicand64bCrosswise
/// tile iterator.
///
/// Satisfies:
///   ReadableRandomAccessContiguousTileIteratorConcept
///
template <
    /// Size of the matrix to load (concept: MatrixShape)
    typename Shape_,
    /// Identifies A or B multiplicand
    Operand Operand_,
    /// Data type of elements
    typename Element_,
    /// Shape of one matrix product operation (concept: MatrixShape)
    typename InstructionShape_,
    /// Interval between adjacent *MMA instructions (in units of MMA
    /// instructions)
    int OpDelta_,
    /// Number of partitions along K dimension
    int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:

  /// Shape of tile to load (concept: PitchLinearShape)
  using Shape = Shape_;

  /// Operand tag
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");

  /// Element type
  using Element = Element_;

  /// Layout of source tile
  using Layout = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise;

  /// Shape of one matrix product operation (concept: MatrixShape)
  using InstructionShape = InstructionShape_;

  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// TensorRef type for loading element from a tensor
  using TensorRef = TensorRef<Element, Layout>;

  /// Index type
  using Index = typename TensorRef::Index;

  /// Long Index type
  using LongIndex = typename TensorRef::LongIndex;

  /// Coordinate for an element in the tensor
  using TensorCoord = typename TensorRef::TensorCoord;

  /// Underlying pitch-linear tile iterator: row-major maps columns onto the
  /// contiguous dimension and rows onto the strided dimension.
  using Base = MmaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
      layout::TensorOpMultiplicand64bCrosswise,
      layout::PitchLinearShape<InstructionShape::kColumn, InstructionShape::kRow>,
      kOpDelta, kThreads, PartitionsK_>;

 public:

  //
  // Derived quantities
  //

  /// Fragment object holding a thread's part of a tile
  using Fragment = typename Base::Fragment;

 private:

  /// Underlying tile iterator
  Base iterator_;

 public:

  /// Default ctor constructs null iterator
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator() { }

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator(
    TensorRef const &ref,
    int lane_id
  ): iterator_({ref.data(), ref.stride()}, lane_id) {
  }

  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {

    iterator_.add_pointer_offset(offset);

    return *this;
  }

  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    // Swap (row, column) into (contiguous, strided) for the pitch-linear base.
    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});

    return *this;
  }

  /// Advances the iterator along the advance dimension
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator & operator++() {

    ++iterator_;

    return *this;
  }

  /// Moves the iterator backwards along the advance dimension
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator & operator--() {

    --iterator_;

    return *this;
  }

  ///< advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  ///< advances in units of whole tiles along the logical coordinate space of the tensor
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row()));
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator.
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {

    iterator_.load(frag);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a linear offset in units of elements
      Index pointer_offset) const {
    iterator_.load_with_pointer_offset(frag, pointer_offset);
  }

  /// Loads a fragment from memory with additional logical offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a linear offset in units of bytes
      Index byte_offset) const {
    iterator_.load_with_byte_offset(frag, byte_offset);
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles.
  CUTLASS_DEVICE
  void load(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a logical offset in units of whole tiles
      TensorCoord const &tile_offset) const {
    // TODO: not implemented in this specialization
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles.
  CUTLASS_DEVICE
  void load(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a logical offset in units of whole tiles
      TensorCoord const &tile_offset,
      /// loads a tile with a logical offset AND a pointer offset
      Index pointer_offset) const {
    // TODO: not implemented in this specialization
  }

  /// Loads a fragment from memory with logical offset in units of whole tiles.
  CUTLASS_DEVICE
  void load_with_byte_offset(
      /// fragment to load from the tensor
      Fragment &frag,
      /// loads a tile with a logical offset in units of whole tiles
      TensorCoord const &tile_offset,
      /// loads a tile with a logical offset AND a byte offset
      Index byte_offset) const {
    // Transpose (row, column) into the base iterator's (contiguous, strided).
    iterator_.load_with_byte_offset(
        frag,
        {tile_offset.strided(), tile_offset.contiguous()},
        byte_offset);
  }

  /// Notify the iterator which k-group it is currently pointing to.
  ///
  /// This does not advance the iterator. Rather, it overrides its internal
  /// tracking with constant-valued k-group index to enable the compiler to
  /// fold constants and achieve more efficient code.
  ///
  /// This is used by some nontrivial permuted layouts.
  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    iterator_.set_kgroup_index(k_group);
  }
};

////////////////////////////////////////////////////////////////////////////////
///
/// Satisfies:
///   ReadableRandomAccessContiguousTileIteratorConcept
///
template <
    /// Size of the matrix to load (concept: MatrixShape)
    typename Shape_,
    /// Identifies A or B multiplicand
    Operand Operand_,
    /// Data type of elements
    typename Element_,
    /// Shape of one matrix product operation (concept: MatrixShape)
    typename InstructionShape_,
    /// Interval between adjacent *MMA instructions (in units of MMA
    /// instructions)
    int OpDelta_,
    /// Number of partitions along K dimension
    int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:

  /// Shape of tile to load (concept: PitchLinearShape)
  using Shape = Shape_;

  /// Operand tag
  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");

+ /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicand64bCrosswise, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator 
&add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column())); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. 
+ CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.contiguous(), tile_offset.strided()}, + byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h index 0caf6247d..64be65568 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_wmma.h b/include/cutlass/gemm/warp/mma_tensor_op_wmma.h index fe69867ec..824e207d7 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_wmma.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -40,6 +40,8 @@ #include "cutlass/arch/memory_sm75.h" #include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + #include "cutlass/gemm/gemm.h" #include "cutlass/gemm/warp/mma.h" @@ -75,8 +77,6 @@ template < typename Policy_, ///< Number of partitions along K dimension int PartitionsK_ = 1, - ///< Number of partitions along N dimension - int PartitionsN_ = 1, ///< Used for partial specialization typename Enable = bool > @@ -106,6 +106,9 @@ public: /// Shape of the warp in units of thread (concept: MmaTensorOpPolicy) using Policy = Policy_; + /// Underlying instruction shape + using InstructionShape = typename Policy::Operator::Shape; + /// Underlying architecture tag using ArchTag = typename Policy::Operator::ArchTag; @@ -116,7 +119,7 @@ public: static ComplexTransform const kTransformB = ComplexTransform::kNone; /// Indicates class of matrix operator - using OperatorClass = arch::OpClassTensorOp; + using OperatorClass = arch::OpClassWmmaTensorOp; /// Number of threads participating in warp-level matrix product static int const kThreadCount = 32; @@ -124,9 +127,6 @@ public: /// Number of partitions along K dimension static int const kPartitionsK = PartitionsK_; - /// PartitionsN indicating how many PartitionsN for multiplicand B - static int const kPartitionsN = PartitionsN_; - public: /// Iterates over the A operand in memory @@ -163,9 +163,7 @@ private: /// Number of wmma operations performed using WmmaIterations = MatrixShape< Shape::kM / Policy::Operator::Shape::kM, - (Shape::kN / Policy::Operator::Shape::kN / kPartitionsN > 0) ? 
- Shape::kN / Policy::Operator::Shape::kN / kPartitionsN : - 1 + Shape::kN / Policy::Operator::Shape::kN >; public: @@ -189,8 +187,7 @@ public: FragmentC &D, FragmentA const &A, FragmentB const &B, - FragmentC const &C, - int const &partitionN_idx = 0) const { + FragmentC const &C) const { CUTLASS_PRAGMA_UNROLL for (int n = 0; n < WmmaIterations::kColumn; ++n) { diff --git a/include/cutlass/half.h b/include/cutlass/half.h index 8ac08722a..10d00de1c 100644 --- a/include/cutlass/half.h +++ b/include/cutlass/half.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/integer_subbyte.h b/include/cutlass/integer_subbyte.h index f69517699..6b97f8222 100644 --- a/include/cutlass/integer_subbyte.h +++ b/include/cutlass/integer_subbyte.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/kernel_launch.h b/include/cutlass/kernel_launch.h index b48fd7d0b..bd84a3578 100644 --- a/include/cutlass/kernel_launch.h +++ b/include/cutlass/kernel_launch.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/layout.h b/include/cutlass/layout/layout.h index ba540e77b..775357d12 100644 --- a/include/cutlass/layout/layout.h +++ b/include/cutlass/layout/layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/matrix.h b/include/cutlass/layout/matrix.h index 2ab907a55..7c02f8f2c 100644 --- a/include/cutlass/layout/matrix.h +++ b/include/cutlass/layout/matrix.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/pitch_linear.h b/include/cutlass/layout/pitch_linear.h index 987c2bb8a..a6158b32a 100644 --- a/include/cutlass/layout/pitch_linear.h +++ b/include/cutlass/layout/pitch_linear.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor.h b/include/cutlass/layout/tensor.h index 2ef4e9d20..20d5bad77 100644 --- a/include/cutlass/layout/tensor.h +++ b/include/cutlass/layout/tensor.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm70.h b/include/cutlass/layout/tensor_op_multiplicand_sm70.h index 26bd427e6..03f87db39 100644 --- a/include/cutlass/layout/tensor_op_multiplicand_sm70.h +++ b/include/cutlass/layout/tensor_op_multiplicand_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm75.h b/include/cutlass/layout/tensor_op_multiplicand_sm75.h index b4b35667e..00870fb50 100644 --- a/include/cutlass/layout/tensor_op_multiplicand_sm75.h +++ b/include/cutlass/layout/tensor_op_multiplicand_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm80.h b/include/cutlass/layout/tensor_op_multiplicand_sm80.h new file mode 100644 index 000000000..e5963a2a8 --- /dev/null +++ b/include/cutlass/layout/tensor_op_multiplicand_sm80.h @@ -0,0 +1,1133 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace layout { + +//////////////////////////////////////////////////////////////////////////////// + +/// Template based on element size (in bits) - defined in terms of pitch-linear +/// memory and Crosswise size (in elements). +struct TensorOpMultiplicandCongruous64b { + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = PitchLinearCoord; + + /// Stride vector + using Stride = Coord; + + // + // Static constants + // + + static int const kElementSize = 64; + static int const kElementsPerAccess = 1; + + private: + + // + // Data members + // + + /// Stride data member. 
+ Stride stride_; + + public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicandCongruous64b(Index ldm = 0) : stride_(ldm) {} + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicandCongruous64b(Stride stride) : stride_(stride) {} + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static TensorOpMultiplicandCongruous64b packed(TensorCoord const &extent) { + return TensorOpMultiplicandCongruous64b(extent[0]); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + + int tc = coord.contiguous() / 16; + int ts = coord.strided() / 4; + + int c = coord.contiguous() % 16; + int s = coord.strided() % 4; + + + int bank = ((((c & 1) * 4 + (c & 6) / 2)) ^ (s & 1)) * 2 + (c / 8); + int row = (c & 6) / 2; + + bank ^= ((s & 2) * 2); + + LongIndex offset = tc * 16 + bank + (ts * 4 + row) * stride_[0]; + + return offset; + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { return stride_; } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride &stride() { return stride_; } + + /// Compute the number of contiguous elements needed to store a tensor with + /// the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return extent[1] * stride_[0]; + } + + CUTLASS_HOST_DEVICE + TensorCoord inverse(LongIndex offset) const { + return TensorCoord(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template mapping a column-major view of pitch-linear memory to +/// TensorOpMultiplicand +struct ColumnMajorTensorOpMultiplicandCongruous64b { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// 
Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicandCongruous64b; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicandCongruous64b(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicandCongruous64b(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static ColumnMajorTensorOpMultiplicandCongruous64b packed(TensorCoord const &extent) { + return ColumnMajorTensorOpMultiplicandCongruous64b(extent.row()); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.row(), coord.column())); + } + + /// Inverse of layout function, mapping linear offset to logical coordinate + CUTLASS_HOST_DEVICE + TensorCoord inverse(LongIndex offset) const { + PitchLinearCoord coord = layout_.inverse(offset); + return MatrixCoord(coord.contiguous(), coord.strided()); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.row(), extent.column())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template mapping a row-major view of pitch-linear memory to 
+/// TensorOpMultiplicand +struct RowMajorTensorOpMultiplicandCongruous64b { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicandCongruous64b; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicandCongruous64b(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicandCongruous64b(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static RowMajorTensorOpMultiplicandCongruous64b packed(TensorCoord const &extent) { + return RowMajorTensorOpMultiplicandCongruous64b(extent.column()); + } + + /// Returns the offset of a coordinate in linear memory. 
+ /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.column(), coord.row())); + } + + /// Inverse of layout function, mapping linear offset to logical coordinate + CUTLASS_HOST_DEVICE + TensorCoord inverse(LongIndex offset) const { + PitchLinearCoord coord = layout_.inverse(offset); + return MatrixCoord(coord.strided(), coord.contiguous()); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.column(), extent.row())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template based on element size (in bits) - defined in terms of pitch-linear +/// memory and Crosswise size (in elements). +struct TensorOpMultiplicand64bCrosswise { + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = PitchLinearCoord; + + /// Stride vector + using Stride = Coord; + + // + // Static constants + // + + static int const kElementSize = 64; + static int const kElementsPerAccess = 1; + + private: + + // + // Data members + // + + /// Stride data member. 
+ Stride stride_; + + public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicand64bCrosswise(Index ldm = 0) : stride_(ldm) {} + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicand64bCrosswise(Stride stride) : stride_(stride) {} + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static TensorOpMultiplicand64bCrosswise packed(TensorCoord const &extent) { + return TensorOpMultiplicand64bCrosswise(extent[0]); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + + int tc = coord.contiguous() / 16; + int ts = coord.strided() / 16; + + int c = coord.contiguous() % 16; + int s = coord.strided() % 16; + + int k_group = c / 4; + int access_s = s / 2; + + int row = access_s % 4; + int bank = ((k_group & 2) << 2) ^ ((s % 2) << 3) + (c % 4) * 2 + (access_s / 4) ^ (k_group & 1); + + int smem_row = (k_group * 4 + row) + tc * 16; + int smem_col = ts * 16 + bank; + + LongIndex offset = smem_row * stride_[0] + smem_col; + + return offset; + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { return stride_; } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride &stride() { return stride_; } + + /// Compute the number of contiguous elements needed to store a tensor with + /// the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return extent[1] * stride_[0]; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template based on element size (in bits) - defined in terms of pitch-linear +/// memory and Crosswise size (in elements). 
+struct ColumnMajorTensorOpMultiplicand64bCrosswise { + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicand64bCrosswise; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicand64bCrosswise(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicand64bCrosswise(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static ColumnMajorTensorOpMultiplicand64bCrosswise packed(TensorCoord const &extent) { + return ColumnMajorTensorOpMultiplicand64bCrosswise(extent.column()); + } + + /// Returns the offset of a coordinate in linear memory. 
+ /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.row(), coord.column())); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.row(), extent.column())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template based on element size (in bits) - defined in terms of pitch-linear +/// memory and Crosswise size (in elements). +struct RowMajorTensorOpMultiplicand64bCrosswise { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicand64bCrosswise; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicand64bCrosswise(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicand64bCrosswise(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static RowMajorTensorOpMultiplicand64bCrosswise packed(TensorCoord const &extent) { + return RowMajorTensorOpMultiplicand64bCrosswise(extent.row()); + } + + /// Returns the offset 
of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.column(), coord.row())); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.column(), extent.row())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template based on element size (in bits) - defined in terms of pitch-linear +/// memory and Crosswise size (in elements). +struct TensorOpMultiplicandCongruous128b { + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = PitchLinearCoord; + + /// Stride vector + using Stride = Coord; + + // + // Static constants + // + + static int const kElementSize = 128; + static int const kElementsPerAccess = 1; + + private: + + // + // Data members + // + + /// Stride data member. 
+ Stride stride_; + + public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicandCongruous128b(Index ldm = 0) : stride_(ldm) {} + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicandCongruous128b(Stride stride) : stride_(stride) {} + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static TensorOpMultiplicandCongruous128b packed(TensorCoord const &extent) { + return TensorOpMultiplicandCongruous128b(extent[0]); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + + Index tc = coord.contiguous() / 8; + Index ts = coord.strided() / 4; + + Index c = coord.contiguous() % 8; + Index s = coord.strided() % 4; + + Index k_index = (c / 2); + + Index bank = (((c & 1) * 4) | (s ^ k_index)); + + LongIndex offset = tc * 8 + bank + (ts * 4 + k_index) * stride_[0]; + + return offset; + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { return stride_; } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride &stride() { return stride_; } + + /// Compute the number of contiguous elements needed to store a tensor with + /// the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return extent[1] * stride_[0]; + } + + /// Inverse of layout function, mapping linear offset to logical coordinate + CUTLASS_HOST_DEVICE + TensorCoord inverse(LongIndex offset) const { + return TensorCoord(); + } +}; + + +//////////////////////////////////////////////////////////////////////////////// + +/// Template mapping a column-major view of pitch-linear memory to +/// TensorOpMultiplicand +struct ColumnMajorTensorOpMultiplicandCongruous128b { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used 
for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicandCongruous128b; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicandCongruous128b(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicandCongruous128b(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static ColumnMajorTensorOpMultiplicandCongruous128b packed(TensorCoord const &extent) { + return ColumnMajorTensorOpMultiplicandCongruous128b(extent.row()); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.row(), coord.column())); + } + + /// Inverse of layout function, mapping linear offset to logical coordinate + CUTLASS_HOST_DEVICE + TensorCoord inverse(LongIndex offset) const { + PitchLinearCoord coord = layout_.inverse(offset); + return MatrixCoord(coord.contiguous(), coord.strided()); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.row(), extent.column())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template 
mapping a row-major view of pitch-linear memory to +/// TensorOpMultiplicand +struct RowMajorTensorOpMultiplicandCongruous128b { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicandCongruous128b; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicandCongruous128b(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicandCongruous128b(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static RowMajorTensorOpMultiplicandCongruous128b packed(TensorCoord const &extent) { + return RowMajorTensorOpMultiplicandCongruous128b(extent.column()); + } + + /// Returns the offset of a coordinate in linear memory. 
+ /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.column(), coord.row())); + } + + /// Inverse of layout function, mapping linear offset to logical coordinate + CUTLASS_HOST_DEVICE + TensorCoord inverse(LongIndex offset) const { + PitchLinearCoord coord = layout_.inverse(offset); + return MatrixCoord(coord.strided(), coord.contiguous()); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.column(), extent.row())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template based on element size (in bits) - defined in terms of pitch-linear +/// memory and Crosswise size (in elements). +struct TensorOpMultiplicandCrosswise128x4 { + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = PitchLinearCoord; + + /// Stride vector + using Stride = Coord; + + // + // Static constants + // + + static int const kElementSize = 128; + static int const kElementsPerAccess = 1; + + private: + + // + // Data members + // + + /// Stride data member. 
+ Stride stride_; + + public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicandCrosswise128x4(Index ldm = 0) : stride_(ldm) {} + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicandCrosswise128x4(Stride stride) : stride_(stride) {} + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static TensorOpMultiplicandCrosswise128x4 packed(TensorCoord const &extent) { + return TensorOpMultiplicandCrosswise128x4(extent[0]); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + + Index tc = coord.contiguous() / 8; + Index ts = coord.strided() / 8; + + Index c = coord.contiguous() % 8; + Index s = coord.strided() % 8; + + Index liq = c % 4; + + Index bank = liq + ((s & 1) * 4) ^ (c & 4); + + Index k_index = (c & 4) + (s / 4) * 2 + ((s & 2) / 2); + + LongIndex offset = (tc * 8 + k_index) * stride_[0] + ts * 8 + bank; + + return offset; + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { return stride_; } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride &stride() { return stride_; } + + /// Compute the number of contiguous elements needed to store a tensor with + /// the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return extent[1] * stride_[0]; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template mapping a column-major view of pitch-linear memory to +/// TensorOpMultiplicand +struct ColumnMajorTensorOpMultiplicandCrosswise128x4 { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// 
Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicandCrosswise128x4; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicandCrosswise128x4(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicandCrosswise128x4(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static ColumnMajorTensorOpMultiplicandCrosswise128x4 packed(TensorCoord const &extent) { + return ColumnMajorTensorOpMultiplicandCrosswise128x4(extent.column()); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.row(), coord.column())); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.row(), extent.column())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template mapping a row-major view of pitch-linear memory to +/// TensorOpMultiplicand +struct RowMajorTensorOpMultiplicandCrosswise128x4 { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using 
LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicandCrosswise128x4; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicandCrosswise128x4(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicandCrosswise128x4(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static RowMajorTensorOpMultiplicandCrosswise128x4 packed(TensorCoord const &extent) { + return RowMajorTensorOpMultiplicandCrosswise128x4(extent.row()); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.column(), coord.row())); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.column(), extent.row())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace layout +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/layout/vector.h b/include/cutlass/layout/vector.h index 0700e5872..b54b6b3b1 100644 --- a/include/cutlass/layout/vector.h +++ b/include/cutlass/layout/vector.h @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/matrix_coord.h b/include/cutlass/matrix_coord.h index 8ba61a5ec..b432665e8 100644 --- a/include/cutlass/matrix_coord.h +++ b/include/cutlass/matrix_coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/matrix_shape.h b/include/cutlass/matrix_shape.h index 1d0b4820f..cb3118c2d 100644 --- a/include/cutlass/matrix_shape.h +++ b/include/cutlass/matrix_shape.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/matrix_traits.h b/include/cutlass/matrix_traits.h index 8e7fe3305..cf7002a42 100644 --- a/include/cutlass/matrix_traits.h +++ b/include/cutlass/matrix_traits.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/numeric_conversion.h b/include/cutlass/numeric_conversion.h index ef4604cb8..78181ce79 100644 --- a/include/cutlass/numeric_conversion.h +++ b/include/cutlass/numeric_conversion.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -45,7 +45,9 @@ enum class FloatRoundStyle { round_toward_zero, ///< round toward zero round_to_nearest, ///< round to nearest even round_toward_infinity, ///< round toward infinity - round_toward_neg_infinity ///< round toward negative infinity + round_toward_neg_infinity, ///< round toward negative infinity + round_half_ulp_truncate, ///< add 0.5ulp to integer representation then round toward zero + round_half_ulp_trunc_dntz ///< like round_half_ulp_truncate, except denorms are rounded *toward* zero }; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -240,6 +242,232 @@ struct NumericConverter { } }; +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Partial specializations for float <=> bfloat16_t +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for float <= bfloat16_t +template +struct NumericConverter { + + using result_type = float; + using source_type = bfloat16_t; + static FloatRoundStyle const round_style = Round; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + + return 
static_cast(s); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +template <> +struct NumericConverter { + using result_type = bfloat16_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + return static_cast(s); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +template <> +struct NumericConverter { + using result_type = bfloat16_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_truncate; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + uint32_t x32 = reinterpret_cast(s); + + #if defined(__CUDA_ARCH__) + if (::isfinite(s)) { + x32 += 0x8000; + } + #else + if (std::isfinite(s)) { + x32 += 0x8000; + } + #endif + + uint16_t x16 = uint16_t((x32 >> 16) & 0xffff); + return bfloat16_t::bitcast(x16); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +template <> +struct NumericConverter { + using result_type = bfloat16_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + + uint32_t x32 = reinterpret_cast(s); + uint16_t x16 = uint16_t(x32 >> 16); + + return bfloat16_t::bitcast(x16); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Partial specializations for float <=> tfloat32_t +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for float <= tfloat32_t +template +struct NumericConverter { + + 
using result_type = float; + using source_type = tfloat32_t; + static FloatRoundStyle const round_style = Round; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + + return static_cast(s); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +template <> +struct NumericConverter { + using result_type = tfloat32_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + + unsigned storage = reinterpret_cast(s); + + if ((storage & 0x7f800000) != 0x7f800000) { + + bool mantissa_bit = ((storage & (1 << 13)) != 0); + bool round_bit = ((storage & (1 << 12)) != 0); + bool sticky_bit = ((storage & ((1 << 12) - 1)) != 0); + + if ((round_bit && sticky_bit) || (round_bit && mantissa_bit)) { + storage += uint32_t(1 << 13); + } + + // Note, the following is intentionally commented out. TF32 + // does not define the low order bits, so they may be left in + // an undefined state. + // + // By not truncating these bit explicitly, we avoid an extra logical + // operation. + // + // TF32 may be implicitly converted to float by performing this + // operation as needed. 
+ // + // storage = (storage & ~0x1fff); + } + else if (storage & ~0xff800000) { + storage = 0x7fffffff; + } + + return tfloat32_t::bitcast(storage); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +template <> +struct NumericConverter { + using result_type = tfloat32_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_truncate; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + return tfloat32_t::round_half_ulp_truncate(s); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +/// This rounding operation is similar to half_ulp_truncate except it rounds denorms toward zero. +/// It avoids predicated code, though it requires a temporary register. +template <> +struct NumericConverter { + using result_type = tfloat32_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_trunc_dntz; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + + unsigned y = reinterpret_cast(s); + y = y & 0xff800000; + float d = reinterpret_cast(y); + float z = d / float(1 << 11) + s; + + return reinterpret_cast(z); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +template <> +struct NumericConverter { + using result_type = tfloat32_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + uint32_t x = reinterpret_cast(s); + return tfloat32_t::bitcast(x & 0xffffe000); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// // // Conversion and Clamp operator for Integers @@ 
-518,6 +746,77 @@ struct NumericArrayConverter { ///////////////////////////////////////////////////////////////////////////////////////////////// +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Array <= Array, round to nearest +template <> +struct NumericArrayConverter { + + using result_type = Array; + using source_type = Array; + static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & source) { + + unsigned d; + + asm("cvt.rn.bf16x2.f32 %0, %1, %2;\n" : "=r"(d) : "f"(source[1]), "f"(source[0]) ); + + return reinterpret_cast(d); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Array <= Array +template < + int N, + FloatRoundStyle Round +> +struct NumericArrayConverter { + + using result_type = Array; + using source_type = Array; + static FloatRoundStyle const round_style = Round; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & source) { + + NumericArrayConverter convert_vector_; + NumericConverter convert_element_; + + result_type result; + + Array *result_ptr = reinterpret_cast *>(&result); + Array const *source_ptr = reinterpret_cast const *>(&source); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = convert_vector_(source_ptr[i]); + } + + if (N % 2) { + result[N - 1] = convert_element_(source[N - 1]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +#endif // if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + 
///////////////////////////////////////////////////////////////////////////////////////////////// // Conditional guards to enable partial specialization for packed integers @@ -843,6 +1142,12 @@ struct PreferredRoundingMode { static FloatRoundStyle const kRound = FloatRoundStyle::round_to_nearest; }; +/// Defines preferred rounding mode for a pair of types +template <> +struct PreferredRoundingMode { + static FloatRoundStyle const kRound = FloatRoundStyle::round_half_ulp_truncate; +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass diff --git a/include/cutlass/numeric_types.h b/include/cutlass/numeric_types.h index 2282e43e6..9479ccb08 100644 --- a/include/cutlass/numeric_types.h +++ b/include/cutlass/numeric_types.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -69,6 +69,10 @@ struct sizeof_bits { ///////////////////////////////////////////////////////////////////////////////////////////////// #include "cutlass/integer_subbyte.h" + #include "cutlass/half.h" +#include "cutlass/bfloat16.h" +#include "cutlass/tfloat32.h" ///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/platform/platform.h b/include/cutlass/platform/platform.h index 36d290bb9..826b3977f 100644 --- a/include/cutlass/platform/platform.h +++ b/include/cutlass/platform/platform.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/predicate_vector.h b/include/cutlass/predicate_vector.h index ac4f02780..929369622 100644 --- a/include/cutlass/predicate_vector.h +++ b/include/cutlass/predicate_vector.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/real.h b/include/cutlass/real.h index 8fa4d710d..45ab1864e 100644 --- a/include/cutlass/real.h +++ b/include/cutlass/real.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -31,6 +31,7 @@ template struct RealType { using Type = T; +CUTLASS_HOST_DEVICE static T from_real(double x) { return static_cast(x); } diff --git a/include/cutlass/reduction/batched_reduction.h b/include/cutlass/reduction/batched_reduction.h index 83324ec01..16132a021 100644 --- a/include/cutlass/reduction/batched_reduction.h +++ b/include/cutlass/reduction/batched_reduction.h @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/batched_reduction_traits.h b/include/cutlass/reduction/batched_reduction_traits.h index c44238e1e..46157dc70 100644 --- a/include/cutlass/reduction/batched_reduction_traits.h +++ b/include/cutlass/reduction/batched_reduction_traits.h @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/device/reduce_split_k.h b/include/cutlass/reduction/device/reduce_split_k.h new file mode 100644 index 000000000..e3626f88c --- /dev/null +++ b/include/cutlass/reduction/device/reduce_split_k.h @@ -0,0 +1,215 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Kernel performing a reduction over densely packed tensors in global memory +*/ + +#pragma once + +#include "cutlass/device_kernel.h" +#include "cutlass/reduction/kernel/reduce_split_k.h" +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace reduction { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ReductionKernel_ +> +class ReduceSplitK { +public: + using ReductionKernel = ReductionKernel_; + + using Shape = typename ReductionKernel::Shape; + using ReductionOp = typename ReductionKernel::ReductionOp; + using OutputOp = typename ReductionKernel::OutputOp; + + using ElementWorkspace = typename ReductionKernel::ElementWorkspace; + using ElementAccumulator = typename ReductionKernel::ElementAccumulator; + using ElementOutput = typename ReductionKernel::ElementOutput; + + using WorkspaceTensorRef = typename ReductionKernel::WorkspaceTensorRef; + using OutputTensorRef = typename ReductionKernel::OutputTensorRef; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + MatrixCoord problem_size; + int partitions; + size_t partition_stride; + WorkspaceTensorRef workspace; + OutputTensorRef destination; + OutputTensorRef source; + typename OutputOp::Params output; + typename ReductionOp::Params reduction; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() : + problem_size(0, 0), + partitions(1), + partition_stride(0) { } + + CUTLASS_HOST_DEVICE + Arguments( + MatrixCoord const & problem_size + ): + problem_size(problem_size) { } + + CUTLASS_HOST_DEVICE + Arguments( + MatrixCoord problem_size_, + int partitions_, + size_t partition_stride_, + WorkspaceTensorRef workspace_, + OutputTensorRef destination_, + OutputTensorRef source_, + typename OutputOp::Params output_ = typename OutputOp::Params(), + typename ReductionOp::Params 
reduction_ = typename ReductionOp::Params() + ): + problem_size(problem_size_), + partitions(partitions_), + partition_stride(partition_stride_), + workspace(workspace_), + destination(destination_), + source(source_), + output(output_), + reduction(reduction_) + { + + } + + }; + +private: + /// Kernel parameters object + typename ReductionKernel::Params params_; + +public: + /// Constructs Reduction SplitK + ReduceSplitK() { } + + /// Determines whether the ReduceSplitK can execute the given problem. + static Status can_implement(Arguments const &args) { + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + // needs no additional workspace + return 0; + } + + /// Initializes Reduction state from arguments. + Status initialize( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + // initialize the params structure from the arguments + params_ = typename ReductionKernel::Params( + args.problem_size, + args.partitions, + args.partition_stride, + args.workspace, + args.destination, + args.source, + args.output, + args.reduction + ); + + return Status::kSuccess; + + } + + /// Initializes Reduction kernel state from arguments. + Status update(Arguments const &args, void *workspace = nullptr) { + + // update the params structure from the arguments + params_.workspace.reset(args.workspace.non_const_ref().data()); + params_.destination.reset(args.destination.non_const_ref().data()); + params_.source.reset(args.source.non_const_ref().data()); + params_.output = args.output; + params_.reduction = args.reduction; + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. 
+ Status run(cudaStream_t stream = nullptr) { + + // + // Launch reduction kernel + // + dim3 block = ReductionKernel::block_shape(); + dim3 grid = ReductionKernel::grid_shape(params_.problem_size); + + Kernel<ReductionKernel><<< grid, block, 0, stream >>>(params_); + + cudaError_t result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reduction +} // namespace cutlass diff --git a/include/cutlass/reduction/kernel/reduce_split_k.h b/include/cutlass/reduction/kernel/reduce_split_k.h index 1869102f1..586c90d86 100644 --- a/include/cutlass/reduction/kernel/reduce_split_k.h +++ b/include/cutlass/reduction/kernel/reduce_split_k.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -129,8 +129,8 @@ public: cutlass::MatrixCoord problem_size) { return dim3( - (problem_size.column() + Shape::kColumn - 1) / Shape::kColumn, - (problem_size.row() + Shape::kRow -1) / Shape::kRow); + (problem_size.row() + Shape::kRow - 1) / Shape::kRow, + (problem_size.column() + Shape::kColumn - 1) / Shape::kColumn); } /// Determines the threadblock shape @@ -145,8 +145,8 @@ public: // Determine CTA position MatrixCoord thread_offset( - int(blockIdx.y) * Shape::kRow + threadIdx.y, - int(blockIdx.x) * Shape::kColumn + threadIdx.x * kElementsPerAccess + int(blockIdx.x) * Shape::kRow + threadIdx.y, + int(blockIdx.y) * Shape::kColumn + threadIdx.x * kElementsPerAccess ); // One guard conditional diff --git a/include/cutlass/reduction/thread/reduce.h b/include/cutlass/reduction/thread/reduce.h index ae03c8214..698b174f9 100644 --- a/include/cutlass/reduction/thread/reduce.h +++ b/include/cutlass/reduction/thread/reduce.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/thread/reduction_operators.h b/include/cutlass/reduction/thread/reduction_operators.h index 3eed62097..6f9aeb6f3 100644 --- a/include/cutlass/reduction/thread/reduction_operators.h +++ b/include/cutlass/reduction/thread/reduction_operators.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/threadblock_swizzle.h b/include/cutlass/reduction/threadblock_swizzle.h index 6e42cadab..2419cdf6f 100644 --- a/include/cutlass/reduction/threadblock_swizzle.h +++ b/include/cutlass/reduction/threadblock_swizzle.h @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/relatively_equal.h b/include/cutlass/relatively_equal.h index cb6d68ca5..5714fbd2f 100644 --- a/include/cutlass/relatively_equal.h +++ b/include/cutlass/relatively_equal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -145,6 +145,28 @@ bool relatively_equal(half_t a, half_t b, half_t epsilon, half_t nonzero return detail::relatively_equal_float(a, b, epsilon, nonzero_floor); } +template <> +CUTLASS_HOST_DEVICE +bool relatively_equal( + bfloat16_t a, + bfloat16_t b, + bfloat16_t epsilon, + bfloat16_t nonzero_floor) { + + return detail::relatively_equal_float(a, b, epsilon, nonzero_floor); +} + +template <> +CUTLASS_HOST_DEVICE +bool relatively_equal( + tfloat32_t a, + tfloat32_t b, + tfloat32_t epsilon, + tfloat32_t nonzero_floor) { + + return detail::relatively_equal_float(a, b, epsilon, nonzero_floor); +} + template <> CUTLASS_HOST_DEVICE bool relatively_equal(float a, float b, float epsilon, float nonzero_floor) { diff --git a/include/cutlass/semaphore.h b/include/cutlass/semaphore.h index 94b2eaced..dc5523dca 100644 --- a/include/cutlass/semaphore.h +++ b/include/cutlass/semaphore.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,11 @@ public: CUTLASS_DEVICE void fetch() { if (wait_thread) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile ("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock)); + #else asm volatile ("ld.global.cg.b32 %0, [%1];\n" : "=r"(state) : "l"(lock)); + #endif } } @@ -94,7 +98,11 @@ public: __syncthreads(); if (wait_thread) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile ("st.global.release.gpu.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); + #else asm volatile ("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); + #endif } } }; diff --git a/include/cutlass/subbyte_reference.h b/include/cutlass/subbyte_reference.h index 9ce529015..6f7aab2c6 100644 --- a/include/cutlass/subbyte_reference.h +++ b/include/cutlass/subbyte_reference.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/tensor_coord.h b/include/cutlass/tensor_coord.h index 043f7a569..d7a6d0df6 100644 --- a/include/cutlass/tensor_coord.h +++ b/include/cutlass/tensor_coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/tensor_ref.h b/include/cutlass/tensor_ref.h index 6567fe81b..a805107c3 100644 --- a/include/cutlass/tensor_ref.h +++ b/include/cutlass/tensor_ref.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/tensor_view.h b/include/cutlass/tensor_view.h index 3efb16a5a..a9cf569de 100644 --- a/include/cutlass/tensor_view.h +++ b/include/cutlass/tensor_view.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -151,14 +151,20 @@ class TensorView : public TensorRef { /// Updates the pointer and layout object CUTLASS_HOST_DEVICE - void reset(Element* ptr, Layout const &layout, TensorCoord size) { + void reset(Element* ptr, Layout const &layout, TensorCoord const &extent) { Base::reset(ptr, layout); - this->resize(extent_); + this->resize(extent); + } + + /// Updates the pointer + CUTLASS_HOST_DEVICE + void reset(Element* ptr) { + Base::reset(ptr); } /// Changes the size of the view without affecting pointer or layout CUTLASS_HOST_DEVICE - void resize(TensorCoord extent) { + void resize(TensorCoord const &extent) { this->extent_ = extent; } diff --git a/include/cutlass/tfloat32.h b/include/cutlass/tfloat32.h new file mode 100644 index 000000000..64dc39149 --- /dev/null +++ b/include/cutlass/tfloat32.h @@ -0,0 +1,453 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Defines a proxy class for storing Tensor Float 32 data type. +*/ +#pragma once + +#if !defined(__CUDACC_RTC__) +#include <cmath> +#include <limits> +#include <cstdint> +#endif + +#include "cutlass/cutlass.h" + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tensor Float 32 data type +struct alignas(4) tfloat32_t { + + // + // Data members + // + + /// Storage type + uint32_t storage; + + // + // Methods + // + + /// Constructs from an unsigned int + CUTLASS_HOST_DEVICE + static tfloat32_t bitcast(uint32_t x) { + tfloat32_t h; + h.storage = x; + return h; + } + + /// Emulated rounding is fast in device code + CUTLASS_HOST_DEVICE + static tfloat32_t round_half_ulp_truncate(float const &s) { + uint32_t x = reinterpret_cast<uint32_t const &>(s); + + #if defined(__CUDA_ARCH__) + if (::isfinite(s)) { + x += 0x1000u; + } + #else + if (std::isfinite(s)) { + x += 0x1000u; + } + #endif + + return tfloat32_t::bitcast(x); + } + + /// Default constructor + CUTLASS_HOST_DEVICE + tfloat32_t() { } + + /// Floating-point conversion - round toward nearest even + 
CUTLASS_HOST_DEVICE + explicit tfloat32_t(float x): storage(round_half_ulp_truncate(x).storage) { } + + /// Floating-point conversion - round toward nearest even + CUTLASS_HOST_DEVICE + explicit tfloat32_t(double x): tfloat32_t(float(x)) { + + } + + /// Integer conversion - round toward zero + CUTLASS_HOST_DEVICE + explicit tfloat32_t(int x) { + float flt = static_cast(x); + storage = reinterpret_cast(flt); + } + + /// Converts to float + CUTLASS_HOST_DEVICE + operator float() const { + + // Conversions to IEEE single-precision requires clearing dont-care bits + // of the mantissa. + unsigned bits = (storage & ~0x1fffu); + + return reinterpret_cast(bits); + } + + /// Converts to float + CUTLASS_HOST_DEVICE + operator double() const { + return double(float(*this)); + } + + /// Converts to int + CUTLASS_HOST_DEVICE + explicit operator int() const { + return int(float(*this)); + } + + /// Casts to bool + CUTLASS_HOST_DEVICE + operator bool() const { + return (float(*this) != 0.0f); + } + + /// Obtains raw bits + CUTLASS_HOST_DEVICE + uint32_t raw() const { + return storage; + } + + /// Returns the sign bit + CUTLASS_HOST_DEVICE + bool signbit() const { + return ((raw() & 0x80000000) != 0); + } + + /// Returns the biased exponent + CUTLASS_HOST_DEVICE + int exponent_biased() const { + return int((raw() >> 23) & 0x0ff); + } + + /// Returns the unbiased exponent + CUTLASS_HOST_DEVICE + int exponent() const { + return exponent_biased() - 127; + } + + /// Returns the mantissa + CUTLASS_HOST_DEVICE + int mantissa() const { + return int(raw() & 0x7fffff); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +CUTLASS_HOST_DEVICE +bool signbit(cutlass::tfloat32_t const& h) { + return h.signbit(); +} + +CUTLASS_HOST_DEVICE +cutlass::tfloat32_t abs(cutlass::tfloat32_t const& h) { + return cutlass::tfloat32_t::bitcast(h.raw() & 0x7fffffff); +} + +CUTLASS_HOST_DEVICE +bool isnan(cutlass::tfloat32_t const& h) { + return 
(h.exponent_biased() == 0x0ff) && h.mantissa(); +} + +CUTLASS_HOST_DEVICE +bool isfinite(cutlass::tfloat32_t const& h) { + return (h.exponent_biased() != 0x0ff); +} + +CUTLASS_HOST_DEVICE +cutlass::tfloat32_t nan_tf32(const char*) { + // NVIDIA canonical NaN + return cutlass::tfloat32_t::bitcast(0x7fffffff); +} + +CUTLASS_HOST_DEVICE +bool isinf(cutlass::tfloat32_t const& h) { + return (h.exponent_biased() == 0x0ff) && !h.mantissa(); +} + +CUTLASS_HOST_DEVICE +bool isnormal(cutlass::tfloat32_t const& h) { + return h.exponent_biased() && h.exponent_biased() != 0x0ff; +} + +CUTLASS_HOST_DEVICE +int fpclassify(cutlass::tfloat32_t const& h) { + int exp = h.exponent_biased(); + int mantissa = h.mantissa(); + if (exp == 0x0ff) { + if (mantissa) { + return FP_NAN; + } + else { + return FP_INFINITE; + } + } + else if (!exp) { + if (mantissa) { + return FP_SUBNORMAL; + } + else { + return FP_ZERO; + } + } + return FP_NORMAL; +} + +CUTLASS_HOST_DEVICE +cutlass::tfloat32_t sqrt(cutlass::tfloat32_t const& h) { +#if defined(__CUDACC_RTC__) + return cutlass::tfloat32_t(sqrtf(float(h))); +#else + return cutlass::tfloat32_t(std::sqrt(float(h))); +#endif +} + +CUTLASS_HOST_DEVICE +tfloat32_t copysign(tfloat32_t const& a, tfloat32_t const& b) { + + uint32_t a_mag = (reinterpret_cast(a) & 0x7fffffff); + uint32_t b_sign = (reinterpret_cast(b) & 0x80000000); + uint32_t result = (a_mag | b_sign); + + return reinterpret_cast(result); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Standard Library operations and definitions +// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace std { + +#if !defined(__CUDACC_RTC__) +/// Numeric limits +template <> +struct numeric_limits { + static bool const is_specialized = true; + static bool const 
is_signed = true; + static bool const is_integer = false; + static bool const is_exact = false; + static bool const has_infinity = true; + static bool const has_quiet_NaN = true; + static bool const has_signaling_NaN = false; + static std::float_denorm_style const has_denorm = std::denorm_present; + static bool const has_denorm_loss = true; + static std::float_round_style const round_style = std::round_to_nearest; + static bool const is_iec559 = false; + static bool const is_bounded = true; + static bool const is_modulo = false; + static int const digits = 19; + + /// Least positive value + static cutlass::tfloat32_t min() { return cutlass::tfloat32_t::bitcast(0x01); } + + /// Minimum finite value + static cutlass::tfloat32_t lowest() { return cutlass::tfloat32_t::bitcast(0xff7fffff); } + + /// Maximum finite value + static cutlass::tfloat32_t max() { return cutlass::tfloat32_t::bitcast(0x7f7fffff); } + + /// Returns smallest finite value + static cutlass::tfloat32_t epsilon() { return cutlass::tfloat32_t::bitcast(0x1000); } + + /// Returns smallest finite value + static cutlass::tfloat32_t round_error() { return cutlass::tfloat32_t(0.5f); } + + /// Returns smallest finite value + static cutlass::tfloat32_t infinity() { return cutlass::tfloat32_t::bitcast(0x7f800000); } + + /// Returns smallest finite value + static cutlass::tfloat32_t quiet_NaN() { return cutlass::tfloat32_t::bitcast(0x7fffffff); } + + /// Returns smallest finite value + static cutlass::tfloat32_t signaling_NaN() { return cutlass::tfloat32_t::bitcast(0x7fffffff); } + + /// Returns smallest finite value + static cutlass::tfloat32_t denorm_min() { return cutlass::tfloat32_t::bitcast(0x1); } +}; +#endif + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace std + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Arithmetic operators +// 
+/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +CUTLASS_HOST_DEVICE +bool operator==(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return float(lhs) == float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator!=(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return float(lhs) != float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator<(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return float(lhs) < float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator<=(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return float(lhs) <= float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator>(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return float(lhs) > float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator>=(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return float(lhs) >= float(rhs); +} + +CUTLASS_HOST_DEVICE +tfloat32_t operator+(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return tfloat32_t(float(lhs) + float(rhs)); +} + + +CUTLASS_HOST_DEVICE +tfloat32_t operator-(tfloat32_t const& lhs) { + float x = -reinterpret_cast(lhs); + return reinterpret_cast(x); +} + +CUTLASS_HOST_DEVICE +tfloat32_t operator-(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return tfloat32_t(float(lhs) - float(rhs)); +} + +CUTLASS_HOST_DEVICE +tfloat32_t operator*(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return tfloat32_t(float(lhs) * float(rhs)); +} + +CUTLASS_HOST_DEVICE +tfloat32_t operator/(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return tfloat32_t(float(lhs) / float(rhs)); +} + +CUTLASS_HOST_DEVICE +tfloat32_t& operator+=(tfloat32_t & lhs, tfloat32_t const& rhs) { + lhs = tfloat32_t(float(lhs) + float(rhs)); + return lhs; +} + +CUTLASS_HOST_DEVICE +tfloat32_t& operator-=(tfloat32_t & lhs, tfloat32_t const& rhs) { + lhs = tfloat32_t(float(lhs) - float(rhs)); + return lhs; +} + 
+CUTLASS_HOST_DEVICE +tfloat32_t& operator*=(tfloat32_t & lhs, tfloat32_t const& rhs) { + lhs = tfloat32_t(float(lhs) * float(rhs)); + return lhs; +} + +CUTLASS_HOST_DEVICE +tfloat32_t& operator/=(tfloat32_t & lhs, tfloat32_t const& rhs) { + lhs = tfloat32_t(float(lhs) / float(rhs)); + return lhs; +} + +CUTLASS_HOST_DEVICE +tfloat32_t& operator++(tfloat32_t & lhs) { + float tmp(lhs); + ++tmp; + lhs = tfloat32_t(tmp); + return lhs; +} + +CUTLASS_HOST_DEVICE +tfloat32_t& operator--(tfloat32_t & lhs) { + float tmp(lhs); + --tmp; + lhs = tfloat32_t(tmp); + return lhs; +} + +CUTLASS_HOST_DEVICE +tfloat32_t operator++(tfloat32_t & lhs, int) { + tfloat32_t ret(lhs); + float tmp(lhs); + tmp++; + lhs = tfloat32_t(tmp); + return ret; +} + +CUTLASS_HOST_DEVICE +tfloat32_t operator--(tfloat32_t & lhs, int) { + tfloat32_t ret(lhs); + float tmp(lhs); + tmp--; + lhs = tfloat32_t(tmp); + return ret; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// User-defined literals +// + +CUTLASS_HOST_DEVICE +cutlass::tfloat32_t operator "" _tf32(long double x) { + return cutlass::tfloat32_t(float(x)); +} + +CUTLASS_HOST_DEVICE +cutlass::tfloat32_t operator "" _tf32(unsigned long long int x) { + return cutlass::tfloat32_t(int(x)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/thread/matrix.h b/include/cutlass/thread/matrix.h index 1e1f3eebd..a54b34715 100644 --- a/include/cutlass/thread/matrix.h +++ b/include/cutlass/thread/matrix.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/pitch_linear_thread_map.h b/include/cutlass/transform/pitch_linear_thread_map.h index 71edb936f..812dbd772 100644 --- a/include/cutlass/transform/pitch_linear_thread_map.h +++ b/include/cutlass/transform/pitch_linear_thread_map.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/thread/transpose.h b/include/cutlass/transform/thread/transpose.h index 552295d84..268e64813 100644 --- a/include/cutlass/transform/thread/transpose.h +++ b/include/cutlass/transform/thread/transpose.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/thread/unaryOp.h b/include/cutlass/transform/thread/unaryOp.h new file mode 100644 index 000000000..de4f79b97 --- /dev/null +++ b/include/cutlass/transform/thread/unaryOp.h @@ -0,0 +1,101 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/complex.h" + +namespace cutlass { +namespace transform { +namespace thread { + +namespace UnaryTransform { + struct Identity; ///< None (i.e., identity) + struct Conjugate; ///< Complex conjugate +} + +/// Element-wise unary operator that transforms one element of a fragment at a time +template< + typename FragmentIn, ///< Input Fragment + typename FragmentOut,///< Output Fragment + typename Transform> ///< Unary transform operator +class UnaryOp +{ + public: + CUTLASS_DEVICE + static FragmentOut execute(FragmentIn &in) + { + static_assert(FragmentIn::kElements == FragmentOut::kElements, "Number of elements must match."); + static_assert(std::is_same::value || + std::is_same::value, + "Unary Operator not supported."); + + FragmentOut out; + if( std::is_same::value ) + { + CUTLASS_PRAGMA_UNROLL + for(int i=0; i < FragmentIn::kElements; ++i){ + out[i] = static_cast(in[i]); + } + } + else if( std::is_same::value ) + { + for(int i=0; i < FragmentIn::kElements; ++i){ + out[i] = conj(static_cast(in[i])); + } + } + return out; + } +}; + +template +class UnaryOp +{ + public: + CUTLASS_DEVICE + static FragmentIn execute(FragmentIn &in) + { + static_assert(std::is_same::value || + std::is_same::value, + "Unary Operator not supported."); + + if( std::is_same::value ) + { + return in; + } + else if( std::is_same::value ) + { + for(int i=0; i < FragmentIn::kElements; ++i){ + in[i] = conj(in[i]); + } + } + return in; + } +}; +} +} +} + + diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h index 2ab40adda..c77a09ffb 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h +++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: @@ -216,6 +216,7 @@ class PredicatedTileAccessIterator(byte_ptr); - if (address_iterator_.valid()) { - frag_ptr[idx] = *access_ptr; - } + cutlass::arch::global_load( + frag_ptr[idx], access_ptr, address_iterator_.valid()); + ++address_iterator_; } } diff --git a/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h b/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h index 58d2f7e3a..0342a4346 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h +++ b/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator.h index 2047723dc..0d775dffb 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h index 73174b57a..31f529e00 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h index 6230b7a7b..6eef4b522 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -798,6 +798,269 @@ class RegularTileAccessIterator +class RegularTileAccessIterator< + Shape_, Element_, + layout::TensorOpMultiplicandRowMajorInterleaved::value, + InterleavedK>, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = + layout::TensorOpMultiplicandRowMajorInterleaved::value, + InterleavedK>; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Internal details made public to facilitate introspection + struct Detail { + /// This iterator is specialized for an access size that is 128 bits in + /// length. 
+ static int const kAccessSizeInBits = 128; + + static_assert(sizeof_bits::value * ThreadMap::kElementsPerAccess == + kAccessSizeInBits, + "This iterator requires a policy whose access size is 128bs"); + }; + + private: + + /// Element type per access + using AccessType = Array; + + private: + // + // Data members + // + + /// Internal pointer to first access of tile + AccessType *pointer_; + + /// Internal byte offset + Index byte_offset_; + + /// Iteration in the contiguous dimension + int iteration_contiguous_; + + /// Iteration in the strided dimension + int iteration_strided_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : byte_offset_(0) { + layout::PitchLinearCoord thread_offset_base = + ThreadMap::initial_offset(thread_id); + + // initialize pointer + pointer_ = reinterpret_cast( + ref.data() + ref.offset(thread_offset_base)); + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + byte_offset_ += pointer_offset * sizeof(Element); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + AccessType *access_ptr = pointer_; + + int access_offset = + (iteration_strided_ * ThreadMap::Delta::kStrided * Layout::kInterleavedK + + iteration_contiguous_ * ThreadMap::Delta::kContiguous) / ThreadMap::kElementsPerAccess; + + char *access_byte_ptr = + reinterpret_cast(access_ptr + access_offset); + + return reinterpret_cast(access_byte_ptr + byte_offset_); + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iteration_contiguous_; + + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) + return *this; + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + iteration_contiguous_ = 0; + ++iteration_strided_; + + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + + // Enter here only if (iteration_strided_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + iteration_strided_ = 0; + + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + this->operator++(); + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + add_pointer_offset(coord.contiguous() * Shape::kCount); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for k interleaved arrangements for TensorOps +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// + +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::TensorOpMultiplicandColumnMajorInterleaved::value, + InterleavedK>, + AdvanceRank, ThreadMap_, Alignment> { + + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = + layout::TensorOpMultiplicandColumnMajorInterleaved::value, + InterleavedK>; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using 
TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + cutlass::MatrixShape, + Element, + layout::TensorOpMultiplicandRowMajorInterleaved::value, InterleavedK>, + (kAdvanceRank == 1 ? 0 : 1), + ThreadMap + >; + + private: + + /// Element type per access + using AccessType = Array; + + private: + + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return iterator_.get(); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.strided(), coord.contiguous()}); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + } // namespace threadblock } // namespace transform } // namespace cutlass diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h new file mode 100644 index 000000000..5a0c74fdc --- /dev/null +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h @@ -0,0 +1,1522 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing computing the addresses of storing of tiles + from pitch-linear rank=2 tensors. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace transform { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for congruous arrangements for TensorOps +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::TensorOpMultiplicandCongruous64b, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using 
Element = Element_; + using Layout = layout::TensorOpMultiplicandCongruous64b; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + static_assert(ThreadMap::kThreads / 32 > 1, + "This tile iterator requires at least two warps."); + + /// Internal details made public to facilitate introspection + struct Detail { + /// This iterator is specialized for an access size that is 128 bits in + /// length. + static int const kAccessSizeInBits = 64; + + static_assert(sizeof_bits::value * + ThreadMap::kElementsPerAccess == + kAccessSizeInBits, + "This iterator requires a policy whose access size is 64b"); + + ///< Number of pointers + static int const kPointerCount = 1; + }; + + /// Element type per access + using AccessType = Array; + + private: + // + // Data members + // + + /// Stride value + Index stride_; + + /// Internal pointer to first access of tile + AccessType *pointer_; + + /// Internal byte offset + Index byte_offset_; + + /// Iteration in the contiguous dimension + int iteration_contiguous_; + + /// Iteration in the strided dimension + int iteration_strided_; + + public: + + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator( + TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ): + stride_(ref.stride(0) / Layout::kElementsPerAccess), + byte_offset_(0) { + + layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id); + + // This is the offset of a thread within a threadblock tile for a specific + // pointer (units of elements) + layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base; + + // initialize pointer + pointer_ = reinterpret_cast(ref.data() + 
ref.offset(thread_offset_in_threadblock_tile)); + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + byte_offset_ += pointer_offset * sizeof(Element); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + + AccessType *access_ptr = pointer_; + + int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ + + iteration_contiguous_ * ThreadMap::Delta::kContiguous / + ThreadMap::kElementsPerAccess; + + char *access_byte_ptr = + reinterpret_cast(access_ptr + access_offset); + + return reinterpret_cast(access_byte_ptr + byte_offset_); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iteration_contiguous_; + + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) + return *this; + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + iteration_contiguous_ = 0; + ++iteration_strided_; + + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + iteration_strided_ = 0; + + return *this; + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + + RegularTileAccessIterator prev(*this); + + this->operator++(); + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + + add_pointer_offset( + coord.contiguous() * Shape::kContiguous + + coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for column-major congruous TensorOp formats. +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::ColumnMajorTensorOpMultiplicandCongruous64b, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for column-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous64b; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicandCongruous64b, + (kAdvanceRank == 0 ? 
0 : 1), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.row(), coord.column()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for row-major congruous TensorOp formats. 
+/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for row-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajorTensorOpMultiplicandCongruous64b; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicandCongruous64b, + (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + 
iterator_.add_tile_offset({coord.column(), coord.row()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for crosswise arrangements for TensorOps +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::TensorOpMultiplicand64bCrosswise, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorOpMultiplicand64bCrosswise; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + static_assert(ThreadMap::kThreads / 32 > 1, + "This tile iterator requires at least two warps."); + + /// Internal details made public to facilitate introspection + struct Detail { + /// This iterator is specialized for an access size that is 128 bits in + /// length. 
+ static int const kAccessSizeInBits = 64; + + static_assert(sizeof_bits::value * + ThreadMap::kElementsPerAccess == + kAccessSizeInBits, + "This iterator requires a policy whose access size is 64b"); + + ///< Number of pointers - two pointers are needed if making more than 4 iterations along + ///< strided dimension + static int const kPointerCount = (ThreadMap::Iterations::kStrided > 4 ? 2 : 1); + }; + + /// Element type per access + using AccessType = Array; + + private: + // + // Data members + // + + /// Stride value + Index stride_; + + /// Internal pointer to first access of tile + AccessType *pointer_; + + /// Internal byte offset + Index byte_offset_[Detail::kPointerCount]; + + /// Iteration in the contiguous dimension + int iteration_contiguous_; + + /// Iteration in the strided dimension + int iteration_strided_; + + public: + + /// Construct a TileIterator with zero threadblock offset + CUTLASS_DEVICE + RegularTileAccessIterator( + TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ): + stride_(ref.stride(0) / ThreadMap::kElementsPerAccess) { + + layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id); + + // This is the offset of a thread within a threadblock tile for a specific + // pointer (units of elements) + layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base; + + // initialize pointer + pointer_ = reinterpret_cast(ref.data()); + + byte_offset_[0] = ref.offset(thread_offset_in_threadblock_tile) * sizeof(Element); + + if (Detail::kPointerCount == 2) { + byte_offset_[1] = byte_offset_[0] ^ 8; + } + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE 
+ void add_pointer_offset(LongIndex pointer_offset) { + + pointer_ += pointer_offset / ThreadMap::kElementsPerAccess; + } + + /// Returns a pointer + CUTLASS_DEVICE + AccessType *get() const { + + // Map the logical contiguous and strided access to the internal swizzled structure. + int uniform_offset = (iteration_strided_ & 0x3) * stride_ + (iteration_strided_ >> 3) * 16; + + char *access_byte_ptr = reinterpret_cast(pointer_ + uniform_offset); + + int byte_offset; + + // This iterator may require two byte offsets if it must load more than 8 rows (or 2 iterations) + // in the strided dimension + if (Detail::kPointerCount == 2 && (iteration_strided_ & 0x4)) { + byte_offset = byte_offset_[1]; + } + else { + byte_offset = byte_offset_[0]; + } + + return reinterpret_cast(access_byte_ptr + byte_offset); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iteration_contiguous_; + + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) + return *this; + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + iteration_contiguous_ = 0; + ++iteration_strided_; + + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + iteration_strided_ = 0; + + return *this; + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + + RegularTileAccessIterator prev(*this); + + this->operator++(); + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + + add_pointer_offset(coord.strided() * Shape::kStrided + coord.contiguous() * Shape::kContiguous * stride_); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for column-major crosswise TensorOp formats. +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::ColumnMajorTensorOpMultiplicand64bCrosswise, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for column-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicand64bCrosswise, + (kAdvanceRank == 0 ? 
0 : 1), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.row(), coord.column()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for row-major crosswise TensorOp formats. 
+/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for row-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajorTensorOpMultiplicand64bCrosswise; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicand64bCrosswise, + (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + 
iterator_.add_tile_offset({coord.column(), coord.row()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for congruous arrangements for TensorOps +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::TensorOpMultiplicandCongruous128b, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorOpMultiplicandCongruous128b; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + static_assert(ThreadMap::kThreads / 32 > 1, + "This tile iterator requires at least two warps."); + + /// Internal details made public to facilitate introspection + struct Detail { + /// This iterator is specialized for an access size that is 128 bits in + /// length. 
+ static int const kAccessSizeInBits = 128; + + static_assert(sizeof_bits::value * + ThreadMap::kElementsPerAccess == + kAccessSizeInBits, + "This iterator requires a policy whose access size is 128b"); + + ///< Number of pointers + static int const kPointerCount = 1; + }; + + /// Element type per access + using AccessType = Array; + + private: + // + // Data members + // + + /// Stride value + Index stride_; + + /// Internal pointer to first access of tile + AccessType *pointer_; + + /// Internal byte offset + Index byte_offset_; + + /// Iteration in the contiguous dimension + int iteration_contiguous_; + + /// Iteration in the strided dimension + int iteration_strided_; + + public: + + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator( + TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ): + stride_(ref.stride(0) / Layout::kElementsPerAccess), + byte_offset_(0) { + + layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id); + + // This is the offset of a thread within a threadblock tile for a specific + // pointer (units of elements) + layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base; + + // initialize pointer + pointer_ = reinterpret_cast(ref.data() + ref.offset(thread_offset_in_threadblock_tile)); + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + byte_offset_ += pointer_offset * sizeof(Element); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + + AccessType *access_ptr = pointer_; + + int access_offset = 
iteration_strided_ * ThreadMap::Delta::kStrided * stride_ + + iteration_contiguous_ * ThreadMap::Delta::kContiguous / + ThreadMap::kElementsPerAccess; + + char *access_byte_ptr = + reinterpret_cast(access_ptr + access_offset); + + return reinterpret_cast(access_byte_ptr + byte_offset_); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iteration_contiguous_; + + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) + return *this; + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + iteration_contiguous_ = 0; + ++iteration_strided_; + + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + iteration_strided_ = 0; + + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + + RegularTileAccessIterator prev(*this); + + this->operator++(); + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + + add_pointer_offset( + coord.contiguous() * Shape::kContiguous + + coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for column-major congruous TensorOp formats. 
+/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::ColumnMajorTensorOpMultiplicandCongruous128b, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for column-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous128b; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicandCongruous128b, + (kAdvanceRank == 0 ? 
0 : 1), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.row(), coord.column()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for row-major congruous TensorOp formats. 
+/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for row-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajorTensorOpMultiplicandCongruous128b; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicandCongruous128b, + (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator( + TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ): + iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + 
iterator_.add_tile_offset({coord.column(), coord.row()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for congruous arrangements for TensorOps +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::TensorOpMultiplicandCrosswise128x4, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorOpMultiplicandCrosswise128x4; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + static_assert(ThreadMap::kThreads / 32 > 1, + "This tile iterator requires at least two warps."); + + /// Internal details made public to facilitate introspection + struct Detail { + /// This iterator is specialized for an access size that is 128 bits in + /// length. 
+ static int const kAccessSizeInBits = 128; + + static_assert(sizeof_bits::value * + ThreadMap::kElementsPerAccess == + kAccessSizeInBits, + "This iterator requires a policy whose access size is 128b"); + + ///< Number of pointers + static int const kPointerCount = 1; + }; + + + static_assert(!(ThreadMap::Iterations::kStrided % 2), "This iterator requires at least two iterations along the strided dimension"); + + /// Element type per access + using AccessType = Array; + + private: + // + // Data members + // + + /// Stride value + Index stride_; + + /// Internal pointer to first access of tile + AccessType *pointer_; + + /// Internal byte offset + Index byte_offset_; + + /// Iteration in the contiguous dimension + int iteration_contiguous_; + + /// Iteration in the strided dimension + int iteration_strided_; + + public: + + /// Construct a TileIterator with zero threadblock offset + CUTLASS_DEVICE + RegularTileAccessIterator( + TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ): + stride_(ref.stride(0) / Layout::kElementsPerAccess), + byte_offset_(0) { + + layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id); + + // This is the offset of a thread within a threadblock tile for a specific + // pointer (units of elements) + layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base; + + // initialize pointer + pointer_ = reinterpret_cast(ref.data() + ref.offset(thread_offset_in_threadblock_tile)); + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + byte_offset_ += pointer_offset * sizeof(Element); + } + + /// 
Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + + AccessType *access_ptr = pointer_; + + int offset_c = (iteration_contiguous_ * ThreadMap::Delta::kContiguous + (iteration_strided_ & 1) * 2); + int offset_s = (iteration_strided_ / 2) * 8; + + int access_offset = offset_c * stride_ + offset_s; + + char *access_byte_ptr = + reinterpret_cast(access_ptr + access_offset); + + return reinterpret_cast(access_byte_ptr + byte_offset_); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iteration_contiguous_; + + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) + return *this; + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + iteration_contiguous_ = 0; + ++iteration_strided_; + + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + iteration_strided_ = 0; + + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + + RegularTileAccessIterator prev(*this); + + this->operator++(); + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + + add_pointer_offset( + coord.contiguous() * Shape::kContiguous * stride_ + + coord.strided() * Shape::kStrided * Layout::kElementsPerAccess); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for column-major congruous TensorOp formats. 
+/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::ColumnMajorTensorOpMultiplicandCrosswise128x4, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for column-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicandCrosswise128x4, + (kAdvanceRank == 0 ? 
0 : 1), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.row(), coord.column()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for row-major congruous TensorOp formats. 
+/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for row-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajorTensorOpMultiplicandCrosswise128x4; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicandCrosswise128x4, + (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator( + TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ): + iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + 
iterator_.add_tile_offset({coord.column(), coord.row()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace transform +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator.h b/include/cutlass/transform/threadblock/regular_tile_iterator.h index 8445b8366..d7928ac00 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h index 93849c658..c3f0b5249 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h index 4ea472938..85d702fec 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h index 21176880c..c7f069077 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -831,6 +831,269 @@ class RegularTileIterator +class RegularTileIterator< + Shape_, Element_, + layout::TensorOpMultiplicandRowMajorInterleaved::value, + InterleavedK>, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = + layout::TensorOpMultiplicandRowMajorInterleaved::value, + InterleavedK>; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Internal details made public to facilitate introspection + struct Detail { + /// This iterator is specialized for an access size that is 128 bits in + /// length. 
+ static int const kAccessSizeInBits = 128; + + static_assert(sizeof_bits::value * ThreadMap::kElementsPerAccess == + kAccessSizeInBits, + "This iterator requires a policy whose access size is 128bs"); + }; + + private: + + /// Element type per access + using AccessType = Array; + + public: + /// Fragment object to be loaded or stored + using Fragment = + Array; + + /// Underlying iterator to compute the addresses + using TileAccessIterator = RegularTileAccessIterator; + + private: + // + // Data members + // + + /// Data member to the tile access iterator + TileAccessIterator address_iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : address_iterator_(ref, thread_id) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + address_iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileIterator &operator++() { + address_iterator_.add_pointer_offset(Shape::kCount); + return *this; + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + RegularTileIterator operator++(int) { + RegularTileIterator prev(*this); + this->operator++(); + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + address_iterator_.add_pointer_offset(coord.contiguous() * Shape::kCount); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + address_iterator_.set_iteration_index(0); + AccessType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + int access_idx = c + s * ThreadMap::Iterations::kContiguous; + frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset); + ++address_iterator_; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment &frag) { load_with_pointer_offset(frag, 0); } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) { + AccessType const *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + int access_idx = c + s * ThreadMap::Iterations::kContiguous; + *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx]; + ++address_iterator_; + } + } + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for k interleaved arrangements for TensorOps +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// + +template +class 
RegularTileIterator< + Shape_, Element_, + layout::TensorOpMultiplicandColumnMajorInterleaved::value, + InterleavedK>, + AdvanceRank, ThreadMap_, Alignment> { + + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = + layout::TensorOpMultiplicandColumnMajorInterleaved::value, + InterleavedK>; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileIterator< + cutlass::MatrixShape, + Element, + layout::TensorOpMultiplicandRowMajorInterleaved::value, InterleavedK>, + (kAdvanceRank == 1 ? 0 : 1), + ThreadMap + >; + + public: + /// Fragment object to be loaded or stored + using Fragment = Array; + + private: + + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + RegularTileIterator operator++(int) { + RegularTileIterator prev(*this); + ++iterator_; + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.strided(), coord.contiguous()}); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment &frag) { load_with_pointer_offset(frag, 0); } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace threadblock } // namespace transform } // namespace cutlass diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h index ff5f0b456..82c8842ec 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/util/debug.h b/include/cutlass/util/debug.h deleted file mode 100644 index 9941b41a1..000000000 --- a/include/cutlass/util/debug.h +++ /dev/null @@ -1,122 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, are permitted - * provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, this list of - * conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright notice, this list of - * conditions and the following disclaimer in the documentation and/or other materials - * provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used - * to endorse or promote products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ - -#pragma once - -/** - * \file - * \brief Debugging and logging functionality - */ - -#include - -namespace cutlass { - -/****************************************************************************** - * Debug and logging macros - ******************************************************************************/ - -/** - * Formats and prints the given message to stdout - */ -#if !defined(CUDA_LOG) -#if !defined(__CUDA_ARCH__) -#define CUDA_LOG(format, ...) printf(format, __VA_ARGS__) -#else -#define CUDA_LOG(format, ...) \ - printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ - blockIdx.x, \ - blockIdx.y, \ - blockIdx.z, \ - threadIdx.x, \ - threadIdx.y, \ - threadIdx.z, \ - __VA_ARGS__); -#endif -#endif - -/** - * Formats and prints the given message to stdout only if DEBUG is defined - */ -#if !defined(CUDA_LOG_DEBUG) -#ifdef DEBUG -#define CUDA_LOG_DEBUG(format, ...) CUDA_LOG(format, __VA_ARGS__) -#else -#define CUDA_LOG_DEBUG(format, ...) -#endif -#endif - -/** - * \brief The corresponding error message is printed to \p stderr (or \p stdout in device code) - * along with the supplied source context. - * - * \return The CUDA error. 
- */ -__host__ CUTLASS_DEVICE cudaError_t cuda_perror_impl(cudaError_t error, - const char* filename, - int line) { - (void)filename; - (void)line; - if (error) { -#if !defined(__CUDA_ARCH__) - fprintf( - stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); - fflush(stderr); -#else - printf("CUDA error %d [%s, %d]\n", error, filename, line); -#endif - } - return error; -} - -/** - * \brief Perror macro - */ -#ifndef CUDA_PERROR -#define CUDA_PERROR(e) cuda_perror_impl((cudaError_t)(e), __FILE__, __LINE__) -#endif - -/** - * \brief Perror macro with exit - */ -#ifndef CUDA_PERROR_EXIT -#define CUDA_PERROR_EXIT(e) \ - if (cuda_perror_impl((cudaError_t)(e), __FILE__, __LINE__)) { \ - exit(1); \ - } -#endif - -/** - * \brief Perror macro only if DEBUG is defined - */ -#ifndef CUDA_PERROR_DEBUG -#ifdef DEBUG -#define CUDA_PERROR_DEBUG(e) CUDA_PERROR(e) -#else -#define CUDA_PERROR_DEBUG(e) (e) -#endif -#endif - -} // namespace cutlass diff --git a/include/cutlass/wmma_array.h b/include/cutlass/wmma_array.h index 7758309ec..e80961394 100644 --- a/include/cutlass/wmma_array.h +++ b/include/cutlass/wmma_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
 * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/media/docs/code_organization.md b/media/docs/code_organization.md index ffab354ec..9a00d3056 100644 --- a/media/docs/code_organization.md +++ b/media/docs/code_organization.md @@ -88,6 +88,7 @@ tools/ cutlass/ library/ # header files for CUTLASS Deliverables Library (in cutlass::library:: namespace) + handle.h # implements a host-side API for launching kernels, similar to cuBLAS library.h # defines enums and structs to describe the tiled structure of operator instances manifest.h # collection of all instances @@ -175,6 +176,14 @@ examples/ 07_volta_tensorop_gemm/ # example demonstrating mixed precision GEMM using Volta Tensor Cores 08_turing_tensorop_gemm/ # example demonstrating integer GEMM using Turing Tensor Cores + + 10_planar_complex/ # example demonstrating planar complex GEMM kernels + + 11_planar_complex_array/ # example demonstrating planar complex kernels with batch-specific problem sizes + + 12_gemm_bias_relu/ # example demonstrating GEMM fused with bias and relu + + 13_fused_two_gemms/ # example demonstrating two GEMMs fused in one kernel ``` ## Media @@ -211,7 +220,7 @@ of tests run may vary over time as more are added. # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/doxygen_mainpage.md b/media/docs/doxygen_mainpage.md index 6b8e09dd4..15656d25e 100644 --- a/media/docs/doxygen_mainpage.md +++ b/media/docs/doxygen_mainpage.md @@ -120,7 +120,7 @@ cudaError_t cutlass_sgemm_nn( # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/efficient_gemm.md b/media/docs/efficient_gemm.md index d601ff5a6..7a1a6ae7f 100644 --- a/media/docs/efficient_gemm.md +++ b/media/docs/efficient_gemm.md @@ -216,6 +216,7 @@ participating warps - since each warp now owns a partial sum (since they compute The following additional resources describe design and implementation details of GEMMs targeting NVIDIA GPUs. +- [Developing CUDA Kernels to Push Tensor Cores to the Absolute Limit on NVIDIA A100.](https://www.nvidia.com/en-us/gtc) (SR 21745) - [CUTLASS: Fast Linear Algebra in CUDA C++](https://devblogs.nvidia.com/cutlass-linear-algebra-cuda/) - [CUTLASS: SOFTWARE PRIMITIVES FOR DENSE LINEAR ALGEBRA AT ALL LEVELS AND SCALES WITHIN CUDA](https://on-demand-gtc.gputechconf.com/gtcnew/sessionview.php?sessionName=s8854-cutlass%3a+software+primitives+for+dense+linear+algebra+at+all+levels+and+scales+within+cuda) - [Programming Tensor Cores: NATIVE VOLTA TENSOR CORES WITH CUTLASS](https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9593-cutensor-high-performance-tensor-operations-in-cuda-v2.pdf) @@ -224,7 +225,7 @@ targeting NVIDIA GPUs. # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/functionality.md b/media/docs/functionality.md index de8da82db..465fae7d1 100644 --- a/media/docs/functionality.md +++ b/media/docs/functionality.md @@ -27,7 +27,16 @@ Hyperlinks to relevant unit tests demonstrate how specific template instances ma | **TensorOp** | 75 | 10.2+ | `s8 * s8 + s32 => {s32, s8}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu) | | **TensorOp** | 75 | 10.2+ | `s4 * s4 + s32 => {s32, s4}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu) | | **TensorOp** | 75 | 10.2+ | `b1 ^ b1 + s32 => {s32, b1}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu) | - +| **TensorOp** | 80 | 11.0+ | `f16 * f16 + f16 => f16` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `f16 * f16 + f32 => {f16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `bf16 * bf16 + f32 => {bf16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_bf16n_bf16t_bf16t_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `tf32 * tf32 + f32 => f32`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `s8 * s8 + s32 => {s32, s8}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `s4 * s4 + s32 => {s32, s4}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `b1 ^ b1 + s32 => {s32, b1}` | { T } x { N } => {N,T} | 
[example](/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `f64 * f64 + f64 => f64` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `cf32 * cf32 + cf32 => cf32` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `cf64 * cf64 + cf64 => cf64` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu), [Gaussian 3m](/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu) | ## Warp-level Matrix Multiply with Tensor Cores @@ -37,9 +46,13 @@ The following table summarizes supported warp level shapes for each TensorOp ins |-----------------|-----------------------|--------------------------------------------| | **TensorOp** | 8-by-8-by-4 | 32x32x4, 32x64x4, 64x32x4, 64x64x4 | | **TensorOp** | 16-by-8-by-8 | 32x32x8, 32x64x8, 64x32x8, 64x64x8 | +| **TensorOp** | 16-by-8-by-16 | 32x32x16, 32x64x16, 64x32x16, 64x64x16 | | **TensorOp** | 8-by-8-by-16 | 32x32x16, 32x64x16, 64x32x16, 64x64x16 | | **TensorOp** | 8-by-8-by-32 | 32x32x32, 32x64x32, 64x32x32, 64x64x32 | +| **TensorOp** | 16-by-8-by-32 | 32x32x32, 32x64x32, 64x32x32, 64x64x32 | +| **TensorOp** | 16-by-8-by-64 | 32x32x64, 32x64x64, 64x32x64, 64x64x64 | | **TensorOp** | 8-by-8-by-128 | 32x32x128, 32x64x128, 64x32x128, 64x64x128 | +| **TensorOp** | 16-by-8-by-256 | 32x32x256, 32x64x256, 64x32x256, 64x64x256 | TensorOp instructions depend on a permuted shared memory layout that can be efficiently loaded from. The following tables summarize the destination shared memory layout that @@ -68,6 +81,38 @@ from global memory with layout specified in the column "GMEM Layout." 
| **C** | `half_t` | `RowMajor` | `RowMajor` | | **C** | `float` | `RowMajor` | `RowMajor` | +**TensorOp 16-by-8-by-8.** + +|**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | +|-----------|--------------|-----------------|------------------------------------| +| **A** | `tfloat32_t` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<32>` | +| **A** | `tfloat32_t` | `RowMajor` | `RowMajorTensorOpCrosswise<32>` | +| **B** | `tfloat32_t` | `ColumnMajor` | `ColumnMajorTensorOpCrosswise<32>` | +| **B** | `tfloat32_t` | `RowMajor` | `RowMajorTensorOpCongruous<32>` | +| **C** | `float` | `RowMajor` | `RowMajor` | + + +**TensorOp 16-by-8-by-16.** + +|**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | +|-----------|--------------|-----------------|------------------------------------| +| **A** | `half_t`, `bfloat16_t` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<16>` | +| **A** | `half_t`, `bfloat16_t` | `RowMajor` | `RowMajorTensorOpCrosswise<16>` | +| **B** | `half_t`, `bfloat16_t` | `ColumnMajor` | `ColumnMajorTensorOpCrosswise<16>` | +| **B** | `half_t`, `bfloat16_t` | `RowMajor` | `RowMajorTensorOpCongruous<16>` | +| **C** | `half_t` | `RowMajor` | `RowMajor` | +| **C** | `float` | `RowMajor` | `RowMajor` | + +**TensorOp 8-by-8-by-4.** + +|**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | +|-----------|--------------|-----------------|------------------------------------| +| **A** | `double` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<64>` | +| **A** | `double` | `RowMajor` | `RowMajorTensorOpCrosswise<64>` | +| **B** | `double` | `ColumnMajor` | `ColumnMajorTensorOpCrosswise<64>` | +| **B** | `double` | `RowMajor` | `RowMajorTensorOpCongruous<64>` | +| **C** | `double` | `RowMajor` | `RowMajor` | + **TensorOp 8-by-8-by-16.** |**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | @@ -76,6 +121,14 @@ from global memory with layout specified in the column "GMEM Layout." 
| **B** | `int8_t` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<8>` | | **C** | `int32_t` | `RowMajor` | `RowMajor` | +**TensorOp 16-by-8-by-32.** + +|**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | +|-----------|--------------|-----------------|------------------------------------| +| **A** | `int8_t` | `RowMajor` | `RowMajorTensorOpCrosswise<8>` | +| **B** | `int8_t` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<8>` | +| **C** | `int32_t` | `RowMajor` | `RowMajor` | + **TensorOp 8-by-8-by-32.** |**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | @@ -84,6 +137,14 @@ from global memory with layout specified in the column "GMEM Layout." | **B** | `int4b_t` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<4>` | | **C** | `int32_t` | `RowMajor` | `RowMajor` | +**TensorOp 16-by-8-by-64.** + +|**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | +|-----------|--------------|-----------------|------------------------------------| +| **A** | `int4b_t` | `RowMajor` | `RowMajorTensorOpCrosswise<4>` | +| **B** | `int4b_t` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<4>` | +| **C** | `int32_t` | `RowMajor` | `RowMajor` | + **TensorOp 8-by-8-by-128.** |**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | @@ -119,7 +180,7 @@ CUDA exposes warp-level matrix operations in the CUDA C++ WMMA API. The CUDA C++ # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/fundamental_types.md b/media/docs/fundamental_types.md index 983740227..7556cd45d 100644 --- a/media/docs/fundamental_types.md +++ b/media/docs/fundamental_types.md @@ -16,6 +16,8 @@ Most types in CUTLASS are usable in both host code and device code. Moreover, th CUTLASS defines classes for the following numeric data types. 
* `half_t`: IEEE half-precision floating point (exponent: 5b, mantissa: 10b; literal suffix `_hf`) +* `bfloat16_t`: BFloat16 data type (exponent: 8b, mantissa: 7b; literal suffix `_bf16`) +* `tfloat32_t`: Tensor Float 32 data type (exponent: 8b, mantissa: 10b; literal suffix `_tf32`) * `int4_t`, `uint4_t`: 4b signed and unsigned integer (literal suffx `_s4`, `_u4`) * `bin1_t`: 1b binary numeric type (literal suffix `_b1`) * `complex`: defines complex-valued data type based on the supplied real-valued numeric type @@ -182,6 +184,39 @@ AlignedArray *ptr = reinterpret_cast *>(smem_ AlignedArray x = ptr[threadIdx.x]; // 128b shared memory load ``` +### Numeric Conversion + +CUTLASS defines procedures for performing numeric conversion between data types in `cutlass/numeric_conversion.h`. +Where possible, these target hardware acceleration on the target architecture and support multiple rounding modes. + +```c++ +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" + +NumericConverter convert_f32_to_f16; +NumericConverter convert_f32_to_tf32; + +half_t x = convert_f32_to_f16(3.14159f); +tfloat32_t y = convert_f32_to_tf32(3.14159f); +``` + +Recent GPU architectures such as NVIDIA Turing and Ampere combine numeric conversion with efficient packing +into bit vectors. Consequently, CUTLASS defines conversion on both scalars and `Array<>` objects to implement +the optimal code sequence on all architectures. + +```c++ +// +// Example: convert and pack 32b signed integers to a vector of packed signed 8-bit integers. +// +int const kN = 16; +Array destination; +Array source; + +NumericConverter convert; + +destination = convert(source); +``` + ### Coord ```c++ @@ -311,7 +346,7 @@ support on current and future NVIDIA GPUs. # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/gemm_api.md b/media/docs/gemm_api.md index 0d58cd36f..759b1cd41 100644 --- a/media/docs/gemm_api.md +++ b/media/docs/gemm_api.md @@ -514,7 +514,7 @@ to inline PTX. # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/layout.md b/media/docs/layout.md index fc36a2761..bacec0e44 100644 --- a/media/docs/layout.md +++ b/media/docs/layout.md @@ -267,7 +267,7 @@ Permuted Shared Memory Layouts: # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/profiler.md b/media/docs/profiler.md index 34051651d..ad4c58abb 100644 --- a/media/docs/profiler.md +++ b/media/docs/profiler.md @@ -15,7 +15,7 @@ $ make cutlass_profiler -j To limit compilation time, only one tile size (128x128) is instantiated for each data type, math instruction, and layout. To instantiate all sizes, set the following environment variable when running CMake from an empty `build/` directory. ```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=all +$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" -DCUTLASS_LIBRARY_KERNELS=all ... $ make cutlass_profiler -j ``` @@ -102,7 +102,7 @@ Report: --verbose= If true (default), prints human-readable text to stdout. About: - --version CUTLASS 2.0.0 built on Nov 19 2019 at 13:01:00 + --version CUTLASS 2.2.0 built on Jun 8 2020 at 07:59:33 Operations: --operation= Specifies a particular operation to run or print the usage statement. 
@@ -191,29 +191,34 @@ Test your changes to gemm kernels with a quick functional test and save results Example command line for profiling SGEMM kernels is as follows: ```bash -$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=4352 --n=4096 --k=4096 +$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 + + ============================= Problem ID: 1 - Provider: CUTLASS - Operation: cutlass_simt_sgemm_128x128_nn + Provider: CUTLASS + OperationKind: gemm + Operation: cutlass_simt_sgemm_128x128_8x2_nn_align1 - Disposition: Passed - Status: Success + Status: Success + Verification: ON + Disposition: Passed - Arguments: --m=4352 --n=4096 --k=4096 --A=f32:column --B=f32:column --C=f32:column --alpha=1 --beta=0 \ - --split_k_slices=1 --batch_count=1 --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 \ - --stages=2 --warps_m=2 --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 \ - --max_cc=1024 + cuBLAS: Passed - Bytes: 52428800 bytes - FLOPs: 146064539648 flops + Arguments: --m=3456 --n=4096 --k=4096 --A=f32:column --B=f32:column --C=f32:column --alpha=1 --beta=0 --split_k_slices=1 \ + --batch_count=1 --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \ + --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024 - Runtime: 10.5424 ms - Memory: 4.63158 GiB/s + Bytes: 180355072 bytes + FLOPs: 115992428544 flops - Math: 13854.9 GFLOP/s + Runtime: 6.73655 ms + Memory: 24.934 GiB/s + + Math: 17218.4 GFLOP/s ``` Note, the arguments which appear in the output may be used as command line parameters for subsequent invocations. @@ -224,31 +229,34 @@ Note, the arguments which appear in the output may be used as command line param To execute kernels targeting Tensor Core operations, supply the flag `--op_class=tensorop` in the command line. 
```bash -$ ./tools/profiler/cutlass_profiler --op_class=tensorop +$ ./tools/profiler/cutlass_profiler --op_class=tensorop --m=3456 --n=4096 --k=8192 + + ============================= Problem ID: 1 - Provider: CUTLASS - Operation: cutlass_turing_h1688gemm_128x128_nt + Provider: CUTLASS + OperationKind: gemm + Operation: cutlass_tensorop_s16816gemm_f16_256x128_32x3_nn_align8 - Disposition: Passed - Status: Success + Status: Success + Verification: ON + Disposition: Passed - Arguments: --m=4352 --n=4096 --k=4096 --A=f16:column --B=f16:row --C=f16:column --alpha=1 --beta=0 \ - --op_class=tensorop --accum=f16 --cta_m=128 --cta_n=128 --cta_k=32 --stages=2 \ - --warps_m=2 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=8 \ - --min_cc=75 --max_cc=1024 + cuBLAS: Passed + Arguments: --m=3456 --n=4096 --k=8192 --A=f16:column --B=f16:column --C=f32:column --alpha=1 --beta=0 --split_k_slices=1 \ + --batch_count=1 --op_class=tensorop --accum=f32 --cta_m=256 --cta_n=128 --cta_k=32 --stages=3 --warps_m=4 \ + --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=16 --min_cc=80 --max_cc=1024 - Bytes: 52428800 bytes - FLOPs: 146064539648 flops + Bytes: 180355072 bytes + FLOPs: 231956545536 flops - Runtime: 1.51255 ms - Memory: 32.2821 GiB/s - - Math: 96568.7 GFLOP/s + Runtime: 0.98647 ms + Memory: 170.272 GiB/s + Math: 235138 GFLOP/s ``` ## Covering the problem space @@ -271,7 +279,7 @@ with the `--output=` command line option as shown: ```bash $ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sgemm_128x128_nn \ - --m=4352 --n=4096 --k=8:4096:8 --output=report.csv + --m=3456 --n=4096 --k=8:4096:8 --output=report.csv ``` To faclitate generation of pivot tables and charts, additional columns may be prepended with the @@ -279,13 +287,13 @@ To faclitate generation of pivot tables and charts, additional columns may be pr ```bash $ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sgemm_128x128_nn \ - --m=4352 --n=4096 --k=8:4096:8 --output=report.csv \ - 
--tags=cutlass:2.0,date:2019-11-19 + --m=3456 --n=4096 --k=8:4096:8 --output=report.csv \ + --tags=cutlass:2.2,date:2020-06-08 ``` # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/programming_guidelines.md b/media/docs/programming_guidelines.md index 5ce16af1d..0cf7ea257 100644 --- a/media/docs/programming_guidelines.md +++ b/media/docs/programming_guidelines.md @@ -104,6 +104,14 @@ for (int idx = 0; idx < kN; ++idx) { // Loop has constant number of iterati ## Style +### C++ Style + +CUTLASS source code follows the +[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) with exceptions and extensions. + +Design choices should be consistent with the +[CppCoreGuidelines](https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md) recommendations by Stroustrup and Sutter. + ### CUDA Built-in Variables Avoid direct access to CUDA built-in variables `threadIdx`, `blockIdx`, `blockDim`, and `gridDim` within @@ -132,14 +140,6 @@ In particular, be sure to use: Avoid defining alternative implementations of the same functionality. Instead, prefer to enhance or extend additional components where it makes sense. -### C++ Style - -CUTLASS source code follows the -[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) with exceptions and extensions. - -Design choices should be consistent with the -[CppCoreGuidelines](https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md) recommendations by Stroustrup and Sutter. - ### Classes and Structs Type names use `CapitalLetters` except when implementations are a _perfect_ drop-in replacement for @@ -178,9 +178,10 @@ Members within classes and structures should be organized as follows: 3. Constructors 4. 
Other methods -This convention follows the [CUB library](https://nvlabs.github.io/cub/), -and it also approximates the usual order of Systems and Controls textbooks. That is, they start by -(1.) identifying relevant constants, (2.) define a state-space representation of the dynamical system +This convention follows the [CUB library](https://nvlabs.github.io/cub/) and is also described by +[Howard Hinnant](https://howardhinnant.github.io/classdecl.html). Unsurprisingly, it approximates +the usual ordering of chapters in a typical Systems and Controls textbook. That is, +(1.) identify relevant constants, (2.) define a state-space representation of the dynamical system under study (i.e. the data members), and (3.) devote subsequent chapters to definining dynamical behavior of the system (i.e. the methods). @@ -291,7 +292,7 @@ Github's pretty printer. # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/quickstart.md b/media/docs/quickstart.md index 5f4592230..4587b7d21 100644 --- a/media/docs/quickstart.md +++ b/media/docs/quickstart.md @@ -7,7 +7,7 @@ ## Prerequisites CUTLASS requires: -- NVIDIA CUDA Toolkit (9.2 or later required, 10.2 recommended) +- NVIDIA CUDA Toolkit (9.2 or later required, [11.0](https://developer.nvidia.com/cuda-toolkit) recommended) - CMake 3.12+ - host compiler supporting C++11 or greater (g++ 7.3.0 or Microsoft Visual Studio 2015 recommended) - Python 3.6+ @@ -20,23 +20,7 @@ $ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc $ mkdir build && cd build -$ cmake .. 
-DCUTLASS_NVCC_ARCHS=75 # compiles for NVIDIA's Turing GPU architecture -``` - -## Clang - -For experimental purposes, CUTLASS may be compiled with -[clang 8.0](https://github.com/llvm/llvm-project/releases/download/llvmorg-8.0.1/clang+llvm-8.0.1-amd64-unknown-freebsd11.tar.xz) using the -[CUDA 10.0 Toolkit](https://developer.nvidia.com/cuda-10.0-download-archive). -At this time, compiling with clang enables the CUTLASS SIMT GEMM kernels (sgemm, dgemm, hgemm, igemm) -but does not enable TensorCores. - -```bash -$ mkdir build && cd build - -$ cmake -DCUDA_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ .. - -$ make test_unit -j +$ cmake .. -DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA Ampere GPU architecture ``` ## Build and run the CUTLASS Profiler @@ -120,6 +104,53 @@ $ make test_unit_gemm_warp -j [100%] Built target test_unit_gemm_warp ``` +## Building for Multiple Architectures + +To minimize compilation time, specific GPU architectures can be enabled via the CMake command, +selected by [CUDA Compute Capability.](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities) + +**NVIDIA Ampere Architecture.** +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA Ampere GPU architecture +``` + +**NVIDIA Turing Architecture.** +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS=75 # compiles for NVIDIA Turing GPU architecture +``` + +**NVIDIA Volta Architecture.** +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS=70 # compiles for NVIDIA Volta GPU architecture +``` + +**NVIDIA Pascal Architecture.** +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS="60;61" # compiles for NVIDIA Pascal GPU architecture +``` + +**NVIDIA Maxwell Architecture.** +```bash +$ cmake .. 
-DCUTLASS_NVCC_ARCHS="50;53" # compiles for NVIDIA Maxwell GPU architecture +``` + +## Clang + +For experimental purposes, CUTLASS may be compiled with +[clang 8.0](https://github.com/llvm/llvm-project/releases/download/llvmorg-8.0.1/clang+llvm-8.0.1-amd64-unknown-freebsd11.tar.xz) using the +[CUDA 10.0 Toolkit](https://developer.nvidia.com/cuda-10.0-download-archive). +At this time, compiling with clang enables the CUTLASS SIMT GEMM kernels (sgemm, dgemm, hgemm, igemm) +but does not enable TensorCores. + +```bash +$ mkdir build && cd build + +$ cmake -DCUDA_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ .. + +$ make test_unit -j +``` + + ## Using CUTLASS within other applications Applications should list [`/include`](/include) within their include paths. They must be @@ -143,10 +174,10 @@ int main() { ## Launching a GEMM kernel in CUDA -**Example:** launch a mixed-precision GEMM targeting Volta Tensor Cores. +**Example:** launch a mixed-precision GEMM targeting Turing Tensor Cores. ```c++ #include -#include +#include #include int main() { @@ -161,7 +192,7 @@ int main() { cutlass::layout::ColumnMajor, // LayoutOutput float, // ElementAccumulator cutlass::arch::OpClassTensorOp, // tag indicating Tensor Cores - cutlass::arch::Sm70 // tag indicating target GPU compute architecture + cutlass::arch::Sm75 // tag indicating target GPU compute architecture >; Gemm gemm_op; @@ -193,7 +224,7 @@ int main() { int lda = A.device_ref().stride(0); int ldb = B.device_ref().stride(0); int ldc = C.device_ref().stride(0); - int ldd = D.device_ref().stride(0); + int ldd = C.device_ref().stride(0); // // Launch GEMM on the device // @@ -372,9 +403,14 @@ To instantiate kernels of all tile sizes, data types, and alignment constraints, Several recipes are defined below for convenience. They may be combined as a comma-delimited list. -**Example.** All kernels for Volta and Turing architectures. +**Example.** All GEMM kernels targeting NVIDIA Ampere Tensor Cores. ```bash -$ cmake .. 
-DCUTLASS_NVCC_ARCHS="70;75" -DCUTLASS_LIBRARY_KERNELS=all +$ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_LIBRARY_KERNELS=tensorop*gemm +``` + +**Example.** All kernels for NVIDIA Volta, Turing, and Ampere architectures. +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" -DCUTLASS_LIBRARY_KERNELS=all ``` **Example.** All GEMM kernels targeting Turing Tensor Cores. @@ -384,17 +420,17 @@ $ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=tensorop*gemm **Example.** All GEMM kernels with single-precision accumulation. ```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" -DCUTLASS_LIBRARY_KERNELS=s*gemm +$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" -DCUTLASS_LIBRARY_KERNELS=s*gemm ``` **Example.** All kernels which expect A and B to be column-major. ```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" -DCUTLASS_LIBRARY_KERNELS=gemm*nn +$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" -DCUTLASS_LIBRARY_KERNELS=gemm*nn ``` **Example.** All planar complex GEMM variants. ```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" -DCUTLASS_LIBRARY_KERNELS=planar_complex +$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" -DCUTLASS_LIBRARY_KERNELS=planar_complex ``` diff --git a/media/docs/terminology.md b/media/docs/terminology.md index 1ef0b3839..07464143c 100644 --- a/media/docs/terminology.md +++ b/media/docs/terminology.md @@ -74,7 +74,7 @@ contiguous and strided dimensions of a tile. # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/tile_iterator_concept.md b/media/docs/tile_iterator_concept.md index 4fd068f89..061ff9073 100644 --- a/media/docs/tile_iterator_concept.md +++ b/media/docs/tile_iterator_concept.md @@ -466,7 +466,7 @@ struct WriteableReadableRandomAccessContiguousTileIteratorConcept { # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/utilities.md b/media/docs/utilities.md index e3d2a52c0..b9ddc79a7 100644 --- a/media/docs/utilities.md +++ b/media/docs/utilities.md @@ -111,8 +111,8 @@ std::cout << tensor.host_view() << std::endl; ## Device Allocations -To strictly allocate memory on the device using the smart pointers to manage allocation and deallocation, -use `cutlass::device_memory::allocation<>`. +To strictly allocate memory on the device using the smart pointer pattern to manage allocation and deallocation, +use `cutlass::DeviceAllocation<>`. **Example:** allocating an array in device memory. ```c++ @@ -128,7 +128,7 @@ int main() { size_t N = 1024; - cutlass::device_memory::allocation device_alloc(N); + cutlass::DeviceAllocation device_alloc(N); // Call a CUDA kernel passing device memory as a pointer argument kernel<<< grid, block >>>(alloc.get()); @@ -340,8 +340,9 @@ used throughout the unit tests. ```c++ #include #include -#include + #include +#include int main() { @@ -378,7 +379,7 @@ int main() { # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/images/cutlass-performance-plot.png b/media/images/cutlass-performance-plot.png index 1d76a7e64ba3c0c1aac16286fd4282f62b0649ed..9caf0223492535a717ab4a2a1e4bed04a5bc74f4 100644 GIT binary patch literal 69902 zcmd?R2UJsCw>F9uUKOz;-3kg)6d`n8ML|VCI#LBxga{ark^li;6r`(2k*Xpslz<4K z2SGq-K_NtG5dw)OLWB??K;W({7kRm)bWqf^6Oup>omB&@nOI)5m`%+owcfvN;m!S`=s~OI^*9b zVab2oS^xX&&E>G&zfYduSfBCxgin6=*Nwl=Zaw3Z`F-+d=l|fV2JC4ULO8F!wk<(> z-Uc@J?c?0H+Dy%WeL?d-E+&M*ldzEu0i2QFO5DOFU+LaRUBH5%cZ9$L?bSun_nlnA zhcEC(+V&=NZM9$Q#|n#4$j|)-^7?@WA77K~9!t_@5kFe5fMUbSZ0C1VQ*llI6U;Wq z?c$b5A>n+JY~uuKMac5J3$ZceyQvAYvW66|8Lbz&J=dW7+rscTGXY-EV&>ZrKp!1{ z%P(~~p{=jKItMXbLy3Oy;KAHC{XYAI#I$x_kFypTvD7Cy)CC>8=_SwfEFAr9IDd>o zo&R1tjF*wYwHu-f0drNJL(v=fx4$x|z?fBC!4YoSy;X*iN(B>FN2a_30O84+dpQ$s z6;=IZKV)M{?eYENNpuOtUaE))FHQJd}H6y+}@>vHqevWD+P4;|RqvJDi zVF7yRmTGifn}k8Eh^yk>r`sms`US7u8G*(|i18>HGw;)|xD;GSZXmTV-wZ9Yes7IH z!5`kQMPqmOZo%m+|A@XSM7h|gnowt@H`3-HHcp>*UvvD4(FA05KITYD(-z^eht(e; z(?XDqjo$PBZwFS+M7m8N1~=TfEDHyXNo44WoFs$9EZ z=L&p!X<8+4t$esJns6y_8n2$<;e>L2d)-C5tLRJV%^BNw8ds|>uI+1r)(m&kGUW5I zBY+b&I=5eJ=i@8;tJSCt`SxLMm2ekES%_UV7Asa{Y2@`nCps_-pN>lOlCY&PM5UX{ z!<$1BGuEE7rWeF#(GDKzG!8?s1lhlI;SXAZS1+*}{cQnDJl_-1hS~|pq1cGY`c1bQ zEm!M3`2{`2C<|n*(mz=AlB-_Yl9Zhkb5S=tvU_8|(OE>7qXe|&SZL((kOUuJ*rf3$ zpnqX|Wl>GrY7eR%s}~vfol$|Am-YAWBNtu0--LpM6`=!CuhaU1m)oizYkiaUXVsi~ zSY@H|DX%TlB342-$LW?*JOzD24L*CFs^Th%bC*0i)3wo{aQ&WpE!7H}Lp;q%@p7ea14CT8&fp~t z*{Ge&BI;@GG$loz8c#!6>Dw7QBEIML)5>+47yYLOlMvdZ)t(JjS&MeELXsxCqHlNj zuy`CX;3(s@P0^kLzJ`_~;_n2T$rj!h+PjH?CG_lE9w1XC=r>sKKlYm-WoS+k^ z%(|y5tc2I9=MQKTkqsUh3vR27B6JA0rqGeb@|A2oMoC6PEJ>x#eOg$vdICw?sUusj zQ70)GQ5sIZ^GQWxg7Qp^nYOJZ%eT=nB`<_dhC(eZisH6lNhk*URQ$zIZ{ww9g?^Sj z19~k3>N^O9eP--`MTDySfKTW`pJ0WGd`IR*buMS3ClC0Qe(6iQDfj^!(9gU*ywGt3 zs)vP(&xa(Io*~&7ht_QXGPhE|6}@H)%)-{32%B3b%*}U(i%WV;8~2jTO|+8`hMh&4 zZE+IE5u=D}+lcC}w4KNGcMb&MG9-TZ9yW6S4;`z-zJL})K#Jv!0+YhN1IduP% 
z32QhaGvL0)ICOj^?_>u<>{amgo%T__B6Hf|J<9PyX)UuXK}6fdOk-THy;!1S~tms9i{Yuwjn!xsim)#47nX_Bou~B;{+V3hk0U2hbH^N)9s-J0I5-4=a%@){iwCftm157h>>)|eqF0BMge_n+fF3%D7 zoFX~)6nU>D_uQ2mPkQw>dg}ji%6Qj-?+Yu^a4#6L&hx9o2hgs}*8{UD%MQ++S6pZaZjzq35bd+rgWBjzI zHSUuUoJrW_KP4lpY!$-lbTV!j>&Z}0L};^!in&+2&3B`33_<6WjEWC3q_NUcfnV?S zr)N&i^DKP=&(eQFFZG_bt2e{e%bH-*G4Q7G4>H-88*-Uqa%T_UDDW^diW_<*>=<~E ziFj%IshFcz$3s`sRXr~?&<@>t5@|lp-ZAvT%FaHt2BxRlo*?D4$1C<^k#3HLZn@DN zxK>qK2{i!u(PswvA=_>x7D-G;uXE0Ler-R6CAf=dF8F2Abz+<~c`tuOeSXMmVd6t# zA;AebwGGj|94CQpQJbZRGaT3=tl?PEN6|{2k~Xo7l6VPuY-3NRW(or0!C;%^3&>3B zh8B9Jw$y78YSFC%2y3k3TF4Mx^!tKePRE)lPVG$j3X>kNy&@yO)~U0K@WVOHMvrW? z@hF23{CTi9VseqklNcUPo>*6>r+D&zk9vMi<%-I&Ka==R%zDo7N2L$5+hWtlOa(nY zrN0GydAj1aJzVxdiqGH+rdh;~jDB=SqQ)$Hp$ zaJe|~q?O}IM$^3wg^-4V6Dg5qhpsDsb9?98v%xrL^Cm+-9qg$$PZM^w&k~jK6cMPw zs_&uVC;YMFLCIU5BsmjLUH(E(6FBL=THES}>Hi%56sNvsuw$lz(PV@iA-jyu(sbLV zg@>cEg|p?Ha*$^zz4Obt3OmnsQjYZ$5uv7O`4o*9Xt^;g#FAK1ZX9dIIu*)h5FjTA z(AU{li@zAUy9hpml&emltyj}}&e{|31Z2(1Kl`4)jA}EFbiXf}*|z29WB8}=P+ z-W2YokOB#pDcpkZuQvK--R)g?=ZYA4CS!0H!Ar8^Atftv9fNvzr_KkgwD5~(9{Z0A z4ad4%;VUv``NlNquj=4heG||{qWkvza2uou)0cCbvo!sGTxe(=ALe%;%?vY`6D&%$ zNrPCc@54Vol@U(7qa{Z+CyUl)JLA-|>?G|SJmPx7wQiJo9KRtThuwUs0)dS^gp}Im z0T+-jee6v5rln1&Lbpm!*qJ4~jzc)!QxYnA0B@9TSLsx#%a$H^boCoX`e{k6>N}O4 z6rCSS?kzs6JXfd3b9GmZhmMr$vji7&*9XJDxjqd$f81-MqMxs>m#m48a&;k$){piW zC~n!xi8!KY2Z{S2e~GXdbn7vXr(qWFTP~0`-W=CT+y3y~N_$Yw9$aYuPp@DHN$M>I ztF=a9@AYl$EJJazd2xCzt+bz}9=aM#HBv;YL3%#tXi*+vOT&@+I%_2Rq3|v$koQFi z$ffGrHM#`%($>5Qe1gFrM&z;D;O$n;Dp%7c(?(2oL*LiOP{j7CQ{MWl@dk2+8$jDW zB8K6-XUVfvE0R@iboz2j^U2BXudwD#Msap2^&SIiZ)5VhNb30GRcdjpX@bwpu#kkt zQ%Q#u>s&FnAQsmz;|kUl3cP)DQpgRLWz>+RRebug766MTi{jZ9{1oPH)w6jSob4-`?f!{dC`Er z;Cgb>OxkSoRz(nQNLq(Z!7q;F)zk;XzBDi?YP7GDz^yjJ7_%ur!SS~MgTiL~PDYH#I zhKdbO5nZ;94`jS%w+hapo4{Y-? 
z*O1Ai#nW+vRgPEAs5f*DiET+Hsi?oNFZsf85wffI3|AT+ zMILZt>ev=?-M&&NH&?WJ1e}8-E8)v~@6zosRghmGD6d#cy}p>HM65(V$z zv2w|?=<~U2F*6f(O8V!sPc4>P#B8yu(Yk!HuwdgeqChE0+80o@i$q47-#5EiV+2p9 zoSwEq6&+kG7u}#Q=TM*TVpMFk{*|OB)@eN8T``J=pRu?8RGEsEWXovmhpalN-6wGc{T9rPjU~o52oGSWui3$k~KxIjTGwV zteQgD@&}A#-1F6g{n&mWrF4M}8@HHJ~zM89|-{IKQI~F)e>x`-t9bNecKLx^c zwf?=_RaIud#DOASdU?~3YquUmiT$uNvMl~m5BH@qFC!RA*qxc#Yd+Qs^JP2zyD@$l z`m2b9x}ysRfco!Eibox?(4ze$gV@}s%)KwKpLX@4^L=+h#|n)xV^wRT$9NQazy9Xc z8v){e?9yV}X?lv*b76+&w}44tU?-9C^NHSD*((fO{%GJ(KKhkP@aVt~U|lBBm={YP(%onXAIkiY~S>*Ex4kw>r>HH6Bl5;fa^~y{{TS;^VmPp5fJM1#2LGIFTCppVPlLj6C0Nhkd z^(H!`Q7E;;rLD+u?r{aC*5v4VWI#%$um9hunMO@4PnwdoL6VmZW%MQTcP|4?alBSCri5q=b6rX z<7D6A-lFI&&tQiGEMVbf!@HIMOfr&C1F)A|N0XIUQ@xCnNMzsai%7{!4M_*lRmvy$ zPF@&J?WR-^lbLvRF=;RHgry~j%5?SY^Xmng$3%1wXFG(R)*U)=Y<4Pqxg^ALChNBH_0yM&iEw6fbyZ{( z4PHno-!-ZTzjNoxgIYM8GiG?1*Dy69@x9!+HZ2Rvp}(H(>=nN`TKd7=u@i1v!I?%@ zEY@|_7aa|M_U6;-k?~N&iBP*q-%n4IX8$D|_50d%PYh+(8aM0=c~1i1177St{ML&- zkT8ZA3KkA&4DF5kTFpvLkb3Fy?by0|lZUe0m}Ex5x`$_GJM;`kE+h~jgU`#(+`lxN zzP5Awet#qYa-yph80jo9X2S-=JC$c^vB zn+FFl!P@Z@)nh+L=?#0(xJ$N&4U!_=HnUyt&)U8xCNi!8P^~b)Ap^P^r1TUU9(k23 z%lmx^bd_f*l)<*j&*(vySnItseJsJ<4BmU9rw%a<@AZEKuanM)%TqkNy*va9qswP0 z)~nq*UOV7sleo^cw(fv^w>mDZ@B$d;?#lRnjy!CN7OUwS4vb5m*}2)_((sOK>B~DH zHyL6dNFBI?ZRryc^ka|){wT4+Nu&AdKO7rOHNUegCMi}SBeU6%cq*KdCK@0P)6>Sf zYZvB{50++Y4WIm`{>IOkaqR3ye3;&S+e3U7DZwkS6Mo&3zN+7u>fLIIrHVN#u^L%U ztCIVfgI%GxSRbQU-$qiY-i@EA`uH|DF<~i#K^L1xnAG0f79^hz0?@eBP zZ*&_>+2=|2&bJZOB3eJ%e)yP<)&*jDJqw?F_TqG%);33>W1zQ2 z&n3Nq{S`M!Z+CuMGr5ZVfLV)EqCDSn7v!cx)%!?pPv7FIwjmr`8$@O5MaUqeo3G5s zSE-9qIYRb$N{}g?gdzPM#@&p4qv-s*df$SF0*RFST$ zP4$Zvk;IK+`js$JTfZ2t;B73jqs&QCCn|l-?=}or&NOco4XXH42s0cm+%p5-V? zctyLln{e*F1uQ&4qps6Weh^Kqv>Q^KRg+A|D`c$M#! 
z!}K)KJcLoK$CGUHUqM${{pYg)c>Hxdzrmg2UuXaOBmVz`#fAUst4eAH9xrhic!Xa_ z@m#<@u@B#*pROkgzJ7NX~cHZg49mVG&yMpIuu49N3rB3gAogAHy}u+M*@^ zXO&@J&4Hwc$m5eKmVicn`r|0>#O9bqR+%9n` zA$q)ql10mM#h1Ibo1xX&NH2RCSVJsosdj?6yhutAHi#F$n9wFwCo3g~HI4{rL=hQ| z9j_?bJ?et5K>!4T2y%l~oHQ;co=}L2s(k*PdBZ{uCXvSPS!&7fDD;CpB^(Y-xa>1! zL3oynylDqrJisVZfWR`lse_p*%)MY>R$J`~0D|{t_o3e1&VN;})433=&?xQIlOXGr z*S@f=wor1s4Iz2m=F&z8iN^l`qIaLaO`Ra!Q<0tGh&ay7!}K$}`?vrzp_QiAvfbzZfwEyE4f^cxAaM>&Q25n*Q-A!RC#?b|a5MUur%vXsIwi>r%t~X}DxL z(sF}d-z^vUTP|Jh#qa!XNp|^^q*cy}CeQ*O%|gf78bUWKBO>88DKvYCp;4@QSA7W@ z457?cfMe$Vj2q9JSHpiEC$AhEs~0#nxApk8QytyR#WdJ@y_^i_Zi@PQa(BA9H^P)v zDvL7o9IBGnrg}PP83I^}W8Jb1?TZp4+>EnzG7%LaA%`x1vU*tP^ze2X;f|S6hCw~1 z-2t2Y>78vadFd1iZWHM;N~%fr%xGHBvU7*xkGaT;QA866xpB1mi+!bI74|7KM~L2T z*LW96f;Ax1MxI;3;8Tez`?J!;YUBC&si71}<~hq8B;OP6eB}_EQ07ssR-IBRxdzTiOR~6_F^$q9;mYu6RQQaos)+r9mUx0OzA8 zYa0Q}n=ZXrF9U|wf8y+holoDl>^Z1IP`rIb@an`YHD-p*_WIh6E4I25v*XX1TZX`G?<7||C6Eu%)sWL;?Z*zIFDKFi)a z_5@X}dN_6XDRq8+Caa*Qn?pTSDQZ04s z$+!q`rMAGz^*3X+3s9DyE1{g!V=U<>CvS1Y=?Q5+MX)EcbvYP}1K}OqY!4TUa;(mE z!oR*~fyVBKoNXYbM#@JWw%>Iy5v7LCZSL(Wh`2WmN+09*_$4nSNCyeZSW@5<^g4m9 zadztz3GZyLgTe$+H+)MQ&xuBRM z@k&A5Vy@DLh0!2yDsvkuD~Y%tC--FX&<@j*RP{@N&J1#QA%R$ii&X9LRf#5)+fJcM z@3MMf5B8K-CKKdUA>%642v= z)oC_cixrCqm#!8i`^-*!;Ar#n&hsig$i3KR)S_FyF@AK zC;r5Ix?CH9^We&aOP6U0nJX0J2b`3Suj?LiD7#H?ruw8~Eu%-%+}j=Abc%RN8)acR ziO2fjaSL1Dp7zVCP%V)47&_EtE~s?A7ZqQ2Zll8`(nckea}L$Nvf`=rK+)zP>Xne5 zawG4;K`E8Hr{m!231>70@UxH08^iiy#ek5=sfjH*yuPo{MsXiM(%tu-9iOHq4m;DTQ?Li_H7M zAD{%zY}-OVSN%%#$@`)+5qUj{G+xE7^VV(z<3@5}#t- zQKYG6tD;CLe=0q&o@N+I(!nCJu7~VdKPqr}1}$;Bs!p7L4t?6|cEA15x!R|Ios+zJ z0t1m{Bo-j(w4FBQb1eikTZFL=aHT-`XTP}#NWQSWZPj}RE~P`fjT^F%ByqJn2bhRM z+a5>WI@Etv(7$aU4X$+PG(2nWT=h|*k$Bq&UXQetlsl{{@CPnqJ&G$V8nW8{$Q3jf zv|$c8s%EF;b#=l|7} zAK}eMS%Nbw5v&9Fshcg&FImCWWO|Ov<|Wxtt->{S4tH(%GdB;Hm=!6BIL1W8OX+4I z5{tYDnjHi0@HpI%6_$rvTo$_{Z`q>Du1YEKB8ce=R>wgddt8rmjn>VlXxHMYnYwGe zOOn||5i*R>V5P1s?I5p3{tYwB3PP~$MNg`q44_cdc|4u~R}pplPq-}KiNQ`|{YoP&OT=1vWd}o0z|<`gRqenA9PQB4Wy|)G+uHy&E{;9n 
zNr%_l#w#^s2ADD zp}FMQYn~(INiCa9E(RPUr>}4ni56TQu_~!nyKN8Md-Px56)&#-1fd;XRpQw>))HAa zl&iQmE6T%qM#azbV)w+MhfgET@r${mM6=%`mrX9z58z z?xFglNTdnJIcbx;TGG80gE>{}IxqFx7lWsVYd{D#-(WfNQeAefH5nKFj)KCkqYak^ zaCj;|N}ADp;gp^t+f`vxG+wh1WHf8lPH%%%$PCZ@B#IFRk*~B`>6sW?R|ELOO0`D#49P|rbb(i!p#O7c+mODqq_uPN@{Of2# zP+v|KsXZazv0;{Ry-dp{-Fi%%08r5s1REn=8li%6(lkN6O@tLaB9F7IS}bN(ORR=y z1QdPdO35BbA+Uy3D&_lZoV@l}zF^R9jCRv(RlYNm(3Wr|FBOrh2=oB+j(Qo_-reO! z^Uu$imcJG~@DOFjzg_%7LUJ;32OZ;ZTNkIh77Xkk$pR;{KCK01)2jN)^yHg8>!Lm0 zzba&0cgOE=>-undNSp`RnrF5I8;(0Cxw;+HZLgcIm19bq8@$hV@@s;1cfw;qAPwwF ztc@bd@`c`gX3nh7=xo#CiOL46R$ux;+Jxt)>?CW2F7%N51YxCmtXH+=RZj6(r>D04 zcAj&V;?a7ryu{BBN)zyw)VO<0TU6n%L`(eC`IIN#leQ8vXsarY1-2W^3>XEy8Tj-C zqYRmr^VbS$yKkEh6;SMU;c}?z5zBO}E|)r)<_oN4PkYWhdT{442jSVOvFqTw$)BR~ z3zf0~T+|ZBt+n@c>iLl(^cYib4A?==yZ%$+*6qyelsgo^d-R0m21Jfk3o1fQ&MisQZP{0l z4$($?Ld#^+KZgw}G0)FPzKYrL_J|~>98KF~8DV*~OK&^;;f+sIrx@tY@PmHYGC5JQ z8Tg;?0BH>QFxMUi9OY^_UMr#QbA*YIM0|Szl66U2;N~gL#UAf(aG1s;EY|JQ^Hra6 zdjjeViV&cNTnVJcNw1Eo0r!iz>)=>&H{;H&jV#oQj;SF`z9So`XbKHcEpFoku#{^& zSpT^`%vDq9@QD*AUX3o!dY+54wCM`ZTzw+K0DFf zpjm|I)$*5+(fbyKou8grsJz;JFva5z<^vz@NC%-&wg)yS3+szWM$zL0YGOrGJ2)~} z1d_Qz<_*bFtlxVAN!hLEm18I=IHZ1{#gTs1#u$oMGBtWFsua6G13^w-PHroq)i2#U zz_BA#_g#?dXZp&cwUODMEkgN@A#L1Is6k<#7KCb~FV=Pq3G%%#IJa2{BqL%ZR@{`p znF;4-uy+e&=Lbm0*x}em{09&1c|wCYW{{qX&BL^WeCCF{Fw+csg{;t37AFYI@Hl(l zfjZbc$L*pKlY?36LJTKWo;sz>i2fMDC~Bu&mi$@=;!|_)yR3zYB$EC~Mas!2>4CrU z22$XGYXiBpo6migcZ@NsH)e@uhq8fX@5`Z$S9YOW;pO53DA)wlS}Q-0z1(g1FgksP zwOIRTjTCFPg8a6iiFIo2*`cc#Ds=u&#nEs-5fFRy(8h`Dmo+6t|7_+ZRxdlOMEduiAC7gB?Mf!zm%^99XgWCebARR*>(}%{ zH7_ui`7_-%8lv77xM!Qh0y%tnXy}DrWG&nI(5=Uj&Sy7zw+Y&8t5uDc&`d_;M3ot+ z^omnX7MNMDYQ#hP2_c`=Eh$>D%Nd5O*k%@M^4PQ?al zy)j^%S*&;eln*<;JEl6BRvV8x_`*Glu|Y`d5klK=6K%!F+aPaI^!v7d)qcsq{!qO$;nCXlT>ZA}(e*?>%++D`cAfuv%wu zQ9}E+V0MaMP5-;vFa5S!1*f38MQl0*5S)6H>v#)3ycx@Oh76L(OJ--%s$WQEOQ}5F zeJ_4-$@$(x5X{A^c#TlhT9Tt0g06+ zQ{UV%(a^yyry$GR0$Y?uR;Qqs)y6VZajt1J&!^X}?1r96q;usK*d16tqOEfMdy}+o;`6!g>?U02R(}EdwjB9k;(8i0=2Z)3f3I< 
zSMDPQ5JL(H%C{ZtkB4G`acJ^!-RcKM7WPM6ZNo{W(E3ntkuU73`8`ep?zQ!%BE_|p zML&g4NR48_1->RNZuriZ+Ei$sZ|1#N^{Od&W|h;W%tyY*O+lJcQTS!hCTax$rivkc za`u8E=UXL39;($gH>nt^&Wv^;lsXjhpWy%XSYB4Zx4*>~r)`~~6H+^r&=CUqPN(MY zu_)}VoF{Z8st`2Hq&OeBN7z1Y>(;Nc|D(H!|EqxC|5-iK|1zp7vaJEtWa{hN@YV{P z&d?P7OGW8haCcAwt}=Sp$a!-;8E$8Ih)lOy^~xtGvC(8xA5l;bJD~t%y$wea`YGn9pD|s z#?Zd@^7FYLKrc5|VsyAMBnT6CSk@ug`GbJg{8j^ffm1rdhxUuC>$KlZdgmJjd)Ycb z%LR?v`J-{;zBex11@)wngukE0$5#^tUN7Hw_^aB*Ycre{F>4+uBQ`ZyDaVWkP0s*P zqESUD+}^d;bzOitKd0_^+}B!;`jhs(FTajmpi}Ed8}RyvUL1hH!jx|hRh8JiIJG`L z@z{n0=fbH0*A)+Dm)5zf1T{)eCJlK4n@0uy?+5b7Ldc74Z`z=zItFHb)73;(0nhj= zog1sR3GbCN+=&ps$w6kU_i!UtJjB?N3Dt-F{Q9wr^vk>^=aO{_EJLT*2p7KTB#OL_O6pU++PDa+5U-j*HWKtQlpq!48&eaJGed_RFS5u zK$HA*xq!zskXK-n(kBPL*ZEITk^fkPu>S#{o3yNZ^1*Htr~e<|CX{VT3&|^EY7)Pq^qg$RCu9?g<$c1J~8cmPTM$PL(*=sV|F5=cPa;@2oO#aqd z706y8Z`L^tjVw8Wg0Kuu$F^90V6Sd)r@<%FHb3(CnrveLbImA0dSTj+K20jm?5I({-^)`;WyBE?*T#5&gz{^*5Ltc*5yl}6adZoBv`H;p#+oIHO) zr8)h@u!i>f$K`m0{5oQ zotl&0F+^pp8A&S`SG$p~SSh8{AyBja3*GyG z@f2#MTx(PXcC<#joWhoF^6gD{s9s z%4r6|sBKigaCP#OMfhXCxQeHr{To&q8-n<~3+p%)M4+P2snp6Mb50Bd4x5WN_kA4( z4Hht9ZtpGw+dcclF4iFI^&7AbrJq&^%{b!{GnZqK;J-YhZTLXP;EzX~Vy>!_2II?k zpUGNB|I`|MMyT&0?{Q@--6xtv{C|9Yj5rTlS)A#w`^asUIL7Tzt^=N6>3Dn0qi>wD z78~V(?`BW{plzaZS)<2)3Oqut)h>m2Y;_6meRml;P@Gx)Kp4BgoHpP+-R6&*qF+Hj z1?466+42INK40WdA01)aKSa~5*Hx6ld`#gL!x|wV9045{1$?!jxZ;H^?!T(xN3~dR z%@`^ess!UNJ|{LXVjFzDQ%1%^m(}QWA&2+$rjC=qQl5RXGrG8{FW)2|H{Tq~qPS|# z4ugn$-vdPk{rLHBKDaFH?;qo+qWOF9%AN9r$W1y@oMex->3pZ^iIsmWi7GZ$E@`?N z{d1CwP_(|R<#;6hEe-=J#>X(m37`zGy>mLY+;``(I1SlbR@|E% zLcT*$v3Js1x#x7*pxrlDx$T4ZiayNTEjD4KLQ&#|cuJ}L37rM};Ss{me3^{M)J|Lg z*_mCN9+QClwxlZ%I3Xux)bKr+YM_&X;44Ga`?2E20~+UxM&$dY5No&Ao+|{5b)>fz zVzq3CefofYg#1kDGi{q0m?UU2enK$2XdyN7qJ1NxaPfz7q$YZ7X?#p*#f4+NGfx^LNs#Dt z4cFiwSs~wVCBXN0=gJAk;6j)e7F1Tg|3NuOo@HdhZLrhi@f@ka^!Nc~erJ`g7N0j| zWsd-`>$)v0iCWU>y$8#vbdpysuB#0E#t!=QamK!S_9l#~n6M8seAjNqS88SQ?o$|; z8fK5yimjXE=*MS~BY;Q+dw7nrPw*MmmF_;Qq4i%?v$9ZWWM&&w;cD=++UERxLFDK>^lZbSS9DG 
z&30~R8dj(^o0MP-==s`DND$6OE?IlHJF(_dJ{5qQl(4+`^;;3qe>m|pUzoFF!=2JV z786UI=Xf>Ct)a}0XIv|j>c$(&l|Jl1?eQ~ z1BDG^@v2O~YISi04RTD!c(*JkAib6RH=*E!yO!~q{=R+t^1MLOb-mQFPQ#BQihN=2 zTo{|L3>7@{z=TV5o|j3(qJjH4&eweVvpJ9H#iKvb0Rzb+VkwUDjf_D^9a{xDU_9LH z)G+(EX-vJ(dt*_LJY>Y`mu*`e%{JlLwx$WNrpSs})UV>jeKY6jHq-wS)c^Z(&THrE z5L47&v&r||i^G_vXlAZ*www-o4t;rKm4In{r2i{=Dv0Dza6&mDcrwQvga6t*35muc znYS_6o85y^c7ZcV=m6bcX0HD{r}NY1qnBd1`jJyVwII{@{uQj3X9Xf5rHnaXbzy?Z%AW4dPMFUJMG=iQ^8cOpBAy zbvFN#0)U*`kvG9SLV0~L*VeOT+3`cXOJVr`=b`fM0{{OvSI36!b7=6tLEU^MA4%)t zDIZsuuGAqRj5bdou^*+dKc4{yE|pfR%^WX0eW>Z~>Nnb2C0#?=YkAfw9@H#u%7Yh~cW=|e?to12?~V30@BQyk@1iGq}Jul zEVa`2DmJW^cwvbMWsiSiWYaewc433|wLUKv=ZoBG*+>Xe-*e24xEuo%Ff56j?1$N_ zSrzQ_a-KvVNzz}p89r0#7H!Zt`>}A)4U0SwSEVtG=+j&-RnK`N6+$N(@kE@&vHBhu zL;SmJ{vz?; zRqhuRI>2_F0=6sc%x}|ifbjskIK>kr_IJ$M5suu%{02AvY+V>d)GSVmg^o<29V-0C z_cgMqUpSWb{Xg!JqSv!VKy;s-QQ*0zyJjp3FsP;UA|592W3*v8q6#!C&+W~Lp?`;1 zt;FU@Xd~vPvKNMZak;$7=hCO`yZdWw8R_w11JJVKh-6JpQx5M}zbuj!OaCr|Y%BZo z)_sLezMEe_mX@kh_eS8M4(Os9Sp{pfV*Udjky|5hNB>W52t59|+|?TcfWSQ5o{?+U zMdE+RG^gWn*6mULoPasOpxXD!L*+Ib8=DIWr9ohkrEn(Y0BGf*BV)5Gfe-tqjP$t& zd0mgkjm`oaPO3UrKmkj*?e~m}TRc!k-Ao?q7BJDjrX29C_r>{xJ;BA){rCg!Op26F zuo}~APqv&~C>ghb@42Tn+TqzRXUWIco$4N@ygt$N%j?qxE;sJ0VLXSDC7H4?vT>Hy zGsJliamIrPywsu241Fvsa6GD?2InkX@P_dzBas_5^fG*P%l($P*h8RXl`%ix&F?0^ z^Zndx0!{ocEU*+P81Z(^)RWjVs&6*`qxxf}Cy zd|~zFGG`^dRjs+xnqtTrj8r|fwqP9!T8+R}o59N)GPs>FFF(I=H>WZKz{~SN$~5=M zu#m7DuV{$Blo@`Tvw(JAVDqls+Jz`ye_b)q8wlZ9`y1cZi*KRsU492zh9RrWz%X#J=R$=Q`G29q@ zoL!|VloRZYZ;KN%_nuiurtDt#(f+FZ>7Ezc67?Sfge+$Sd3&AIP{nU2&9_atkB{>F zz~nKM<*6c~orLcmJiTFUb=e%vTaD51oXnj*e#yY+%O}Nkk@d?b&t`(oLE8N9h?p+| z(7*t#KgJC&>jYTU~T^lzgCDe4}bE82V|=t)BqeNBY?9^L=j&q(a`%Kg+$&g+J>+!4>ChGe;#M z+F2d4X^{x#nn+sJ;1I?!D36l>Wuijn6}Qhx)NNhZsVVN%;m$uA}=~}&n=#ky%hn{(!BtV`!;V-3GA9z>glynBrXS$ z#W+-TKYO5Z++!vQgT&g@-E=5-?NrRoA=MzJF2TlspoO8)m8}WGv&!FZ;i4dGH>gHl z?ImlnpzVWG{^^TSnyIh0wLiW(SE74tprocSFtAgGlN;3d*QmkgKlYp`mP0NkC%nqh zW@rd8PVz-p9*WTO3`1Yx1juflztdl7{kkObsqMw#%>kK^Z4BR|@I?`rAGgjW|T 
z3+ZZ)Y1vPAwNNw}Wu@TM;#s5y`9N_zyt@BcWMb=5B}7cvH9REs{X!(Y0P+*Jb^*Fa7+wG6S$Gbl>x`06 zPEGLCSvSeXE1Z|;?e1v-Z^)qbv_$l11*=I+odh=kYSI3bF^Nog>W})rvmz$c>$uLF zvHVXTLAIP`NEoGdwMgy56xWq3eppyL3o zXH>=zMo%>Qk6FNIMi^n;nMQ0%e7sw)R`3FKYipt~QUz2ob%f;btx)f~Gkwz zU9Rfv6-#lFUtLUZ2U&GIVBvL*!PulFh6f4DNGeVX(*i@m~ zw=AIE{@(lF?ITzCFa8|E$lXh}%QrZ1%InT*GwLjF>FM(hx;hklhxi2m=-Y8dNbc?rN)Z?W%C$}O!EX?}0Ar3Cf zZgY5BV(gueE@_hM6E9bnOR-DbU%H0-_{MS+;r;mgoCpTIx5BF->6J8V@%vAU*^^v5 z@$B@q@*0qLqh8zUZB*}^>e9O}Ym%)qGG)N*M%q2Mwv@Zmwr{%JbLp51ZSFH!+d?kH zO3vF#&irYThMvP-Ir4c6&y}t|4&tedGC;0G>LkU_znkCMx742ytiVJhZ}}r3rxZrL z^9!U%-jcxufwCq~a`gTnN))3m^3d1 zouwBI-*0LA?yzdtvIlyEEzMm|S@IJD@~8OeAqq08ZGCw_es2Sn9Feyq4G zzOs-Xc#a2Lcj2v*u|ZH*%dM9lBzo`^!K1JVPFos^&K|ibDLq5ZB}A{y{J<6gdP->C zz_#dKGzdydjjVOV!qXxl3Jta^(-mutV9<1%wVrWr&*gStFyAnKW)0Dg>&!#8=NX)b zfd;DSWL)3p*EbAjqD(H&69v;|+gh!oiR_BCD}V(aff-pT1!m=xglF5O5Z%+QLC#ue2Q{aa(d;z@7$vdB*?dM> zx3{jRtK}O9by#HUhQ6r+qz(U^25LM{{KGwBp(Xg1hosdw{X4yZ=>r?DA)>(8>bz=l zK9@6>LW`J4kO!X@-m7L{Ix-)`?W*M#(#5*D&71Xf`M&{X2Fi=*XleCr+s@U0R5b)T zrJF!Uf(Zbahy4f+g@g*u;uN`P?v4qYi;zYuhNY(GlDf@qvpBn}q?mcnzN571RQFL3nOpFc*lP0ox2& zOS&ex6g&UMpP7c60REi7ORJi+06NjZK_|k-$U03T&RwVe6yY?pBVwjLmmdZIIcMXV zfSlbj#;?G*K@%PT*43>xo?B-B_y_KML4#^F#Tg&~iR3~Th1Y60Ibfa_7dmQiSshE2$R6Nm?7r0zA9w>5cGTSwX$oeaOO(FNu;&VSqkQ4MaZB=Aq9I&ixbm%plJ*#bPAv zbkt3Qerih@a&jc4uC5IkJ8SL13W}UK*)}nYLS@+-Z8j~udQYR1yNl;_3qQJ z$9Xh@|2ajtO+BYxeL)+&+-5bG{}IUW4m@ zpgxsTS~20G*fx&8FpX;w#vyF%9!|Zba42Ps8pA_llQ;Y-9Hc%bC?L@QJQl8FS3(Dj zhMO@?Ab|j|2iW}TZyqVn$1{o6AmcRLT`3jvcP@BBh6gV5sQ7#PJDcV&6>c%0HeEQF zlc1BvEmQD)`=7y#nqJ=pj?l%~*%>weEhi!GBj2qdfMuI=(R94dGIImK-ffV-;T}Nw z|MvWu{rR1rIxeGPU7m1_m2;cWLG{#p83Ra7B~g@^)GLHz6*LZ zT7faF3*pUrnCS?T(dN>g?3BhqyL8j^L7q9XMK zZ1cXtwoqA$60bS^O)?={<=8tB*STd8PR9l0y?Ph7_krX##p!6^AQ!x&APo*+p%na2 zGOx}GcKN%jYaRXQhQhxCO&J`Z={qP=*0nc-?niUF{!Gnm&0G-2%l|r@0dAF_g^&e4 zbu@@>)F5W5yTI9vWYkZ_BP)BZYy1=aWaDxJ-7N-LeqZY>c8ZOi*aX{kD zgCJ1jUz+>wr7P<_sKLQsc*p+h3RPDWxjUV-Sx3mU05lGeu(Pe$YOHwkDO{!|fo+3q z2;Tb&d5a@rK^JV9v0XW=fiwbJ+8bV 
z;=~~r5S#`Vn&-kV4-9@>_VgLQ8h1AvxoA$9Fd7#xshx&^cvT`f?e%tAqX+Ql!p(YM zGZe70<)2zzuwVm5ZncOPsGw9Ghu%b_cS3J*h!7P5C9A<4C`eeG*!U3-7mZ>>cL2mmgIC>(m9=tdeQN3cSV z${D^*uajI*m7t|{S8A+u?5`^AjmYb0x^7FL|4ECX$#H8KlfiN)Q`+Li`{Mt2R3z!M z&~(o7?ho{W{geY%k_ocGwB>;S<^~LR5Ql(zt;tpT!bE%jCden-Hn<=LWMG3UrNy7h zT6D4c*wzXYEcGekgANA#kaoN_?^Of@G@usCPCB7~;HDlt_U6EC^4AfU4fXWKQ6W#P zwAuC_6ggY2J_xaApFua6xKm@_Ki5Y)zklA_n?-F&Qi%5<2S2p=3;j8L=kjxj7yk$2 zpBzgREirI%B%P#XC`rFMbFoIj(MLW~(d|GU!gH`W?Y+on`j_Ze3tqpQc&Xh?WXu<} z4`sy!JrLxPmI?fbb4bDEC37We#vhDo}rQ z_`6b`eFJ6qjMIjHkq5a7u@+YVl`Qr|4!`I&74AH2A0kjWY7L0tkke*UM|@wnAybF8 zxKQnJ;-HQ`6%M^Jm*KSWbCDxit$}E$_y{yBuS=U6Kn>twWoA4_EXV!8KNZ{M9g`vZ zwQfjLqPL7@u3>eZ7h^Xe2Kg>Ur2$a|Sfd;PfrPu4kmN(hu7JNA$PWMy1~{sRT&+=x zgRwza_%r3Te23^>Gi)<44M2-vXF+WxV|MSR82}R%UE!mqj7BGqF*84F6iQt=-dp(Y z5m}BdG~M~wnArtu)aK7yUjw#LjwV>3>4ksDhfdCyhBSzCDw_7yOh*l83^130O&QnB zW9EVaR+59`JNu0K@Ix8xnskxuNxHK=baY+WAhp74l-$C{S8K-p5Advy+H=BY^`1GL z1Qq9u5`r^md!nS^KLDdxu6NkDmCA91JVtW&h0vjXEYL@HE7&J{(cE(wj%V!0&co%#}4tYbWA^q=g9>+*~}fvcj93*RWSw z{`c@DRXBl(udWL-FdzV1p^#Yvb1%rC)vZ{YA(Kjq{!C$$?pN&1H=oAIw?}!5I3? 
z86oTlpx;1uY>**8=fIDWV}I94^b(?&OBeV z6dO=$^zc^XjyyW^txbE#BS$X69<@;Jc^B(`TE~lbH$!DP5Fpgw7(;J^H!3KilRVM_ zZew}X*`-Ly@U4c@mdAP znYM+~RQ;|>3@JVXlTl7mi^#YpODd)u6L2&66HZ?lFJ#ElV z$<#&UO>dje?PehdjpZfj=46O>3>Vp7O>3$?jARhCUXebQz&rWlQAu>| z)v%nfuD2?(k^Uog$IIPxO~*IyblCPj<_yT2Vf5JKg?sA>x$P2^qJyL_J5oi5D41gs z__cRbZ)}j=VoPRSx6{6bY`d>oK-KA4t|stV;Ab#_U**e={3)Aiq}veeeNcy2iN0|B zPY}gS(MU^GKm-I?(v=z;FR74RD8@)ZJogFJCI)|MCW|PzY`~-fBrg_k# zz<7|?+-YF`qZh1X9{f7gtx+_Xos@b1={tl3mL0iBX&hYXDWF`tP5-2JQXM9%6dYlW zbO-1HZvsWmKx|OJZ{bT4q8!*-f4T&RlaWs;qu-k8hj65#&BTVmo~4L(7FJd$AJoEq zbmCKm^bH^cY(O>+C^f65sIv_x6-B!Pq-|dFa+?0+zudZc0giU6UReuHRANNDee=`{ zai_QX`_=G!y269dgutjBE>c>HfI+F58uP@C4ShBvvw-UT3s_jLW@9X@fw_36mTA(9 zSdDky70z{xS|#S9!D;6@H)8s*wf~dl_5ToZEo{oAfVDWoS!cNZM>6v=bciq$#Y|TEZ_8;+uOQfgke50bniDX#DqKAgmi-`v23>i5M_c#u%NEPW0aED z6{NGlR&;4gF3ng?`!YH8E=W{Az2L;Tj;5?Nww~)lh7nH97PcM$`w%DanP4PDav+SA z;NX%A5~nwCeN^w@fqqzKmYeG2Rq6M4Drnz%mg@KkQ9--8IPv|?mnEB!6{GSckPj4Q zp11lbyn%WJ$&3!URonFQ>MIc+7Qt?Ly1&2m9)!*GW+;!6T_+bH;}uqE?CGlY3Lj{B z)3h?wu}Y2JVZ}OfpL)Ej5Zx^CoSU6brVzF@$6kJIfH9Qh;vOQ`tXfU9rCbO(pRgj6 z#@O^f)|Q=_q#yX}e~i13j;f1uI0t_1fJo1xUy6|k&#C)!+XroElR`H5h$Tk-{ z=a~Ox0{v<368Fu?&b$ehqnCA6=M+9~Na64Q{aa<|#D8au1(*McvaoK?|4SJa>zIEJ z;QMV?!heea1WSrrpCg3;Gt%flkig{$C&;nxM-cY6Y9H=<{jbWSrm`=otRyc*V)85V zO~M(+Lm&|N8Sk{wmH%bUQS2TW2nhNE5gZ*amz%CQ?q|K_w(vB*E8R4EgKJ^q;56Ui z;ZRYKKfXB(LumeUfnVuR09*o`)9NfV>C^ZMrqF71~r| zJL$gF9Q=Fzq2{3&H0TDLiu9(a)Ku;k)zt#tl`J>+<4&;-dd*XfY3V3D4RL{$Tb*^&W$ zI{gy@oIMxQg+tOr%v7<`p>KJ{kkAWO*lAZSA}f;avjAl1fo($u=#zLAS`oCI24~7c z!An>&D96&3yLJ*D>6<+r8(h85s4QIxS8q-<^?+fPfPzm6#2h^-QrLd{Jr{8kcrWh) zbu*4Muo4!8=v;`QIFFsm9`uDQ-Ebj%i2BRtXL9j3<9}s45S)Qs`*c0Kb`hJgx9D>u zfq-frtGhpVPWN?BN&k4-{_>AxfI2XZJ$vix&oApG5Fx8a)kbEGzGGq?+_s$zOA~1V z$(T`IF>m>JOVATMJI)F={yHkku$uJW1?br1rcvc0zo0(}djmV5BXkjC72%PNW}^Gq z+E6Y16R_=JA)Y2cJccHiI<_|v-^6Y}jfL4MyO@J@e3&fp`e@CcSz&uJ&hO7{MMZj^ zxidCss1Wh?8dc%UT#x#sflDdlwIRVY*BGDwu4ptG(dpEBq{+A}b%jVa^b9LIuEe^R zP+!C|#THMLk8}n=tqWEr%!!LSo!%~`71!1$3 
zvlguSed@g{`eK8S&<|tyGgFKMAq)I;XV3th3|X{y5>i~h9e}W+x0H|u&bqmHh*|(Z z5Y*0+23%?(mbSXi2Ai6Ptgj(kue_F-m=<{l5sr~6+?4rT^SxHk%T%W^vX+?} z02x5Uqq9C_x|uPYE-B5q?7*0C?~o6rt;+Yo_K%nOsPmMNi8o$+ud-9j9gq#Ut@MDh z2bY4JBHQX8RDC|G%7a+{f-}=OxF-_qNJ}JCz2SA7-&hbCd_$=!Ial$mQ6d8Ds9+6gO z&iJp_DkZ}Zhp7}C4p`4)UUSfMGc3AE)NqfGoHP0Qj3>$j01;?`;2JDI4K*2Er;D2~ zWK!f+$Ny9D&wzpz>hf}DX;-jP^(?7jYEt^B(&_+H4*G1LD9d=$5IE$>#nzQDz(Q3c>;<8Z4g}~+!Zz8O|K3itRzK%uFZRK z*b~@J_i!g{ns0f>$f=QckBD;MbxC}Gql+X`7EEHw`HnRA^kLr1BV}oh^xq^?N6Eg3 zh^QPZ3}IB~ZuH%MNsK@CWrAgCi+7O~IWEhP%1nEc&Uw6F#DD*PtT;sUg%4bEEIK6} z6~~$lJ#`9xwa%oy!)AteEmYtgvDvQ4K%*oLX%UeBrE7FL6B+HOA;(cR%RWe%MOTBWfp%8!?QeESAc zCUjcx`EGgP(i+<`BFmf$ieoh;XaGO}rC-ZNJK-x6yTJZ9S52NYW;kv9QU zeQNg_a!|9OLIFm@Ij~P#4tkq_ovic1z9MM^W&9S=Z42)hy*aBu&065qmlwXvCHT69?%cy z@G^UKnQ8zQQvdN`aHUO214v^gOU^%Ix>jDQ^S;lyJ+EPK^Su(nAdycXLW*Dj)M3AL zZL0X*5GExXV6q_?mBbeL4oy&8jc)Yq4UlEo%m>+FNGR5yTBI}n9`W8WFIZ6#Bs?=8 zXi|AT0q`mlaJ#KJisrL1W{VoASg@O`Zrxl%(22!{TnM3bN+5$}M|>R#8H{&>GBH4k z5w9@fzgCEAjyh`?tbsExW;(Y0E-h4dZGAdfvEAH^KGyQSQ3Ci@$Mzh05m>XO`^rzDKr=x0^7W9!h%%ZYh>OK=Hy;qW^LI6 z7_lO{^AK*%H(+xTco2>nsMgTBer{$3uv)FpkkQ;)z^2VjMAbCZpXfcl;d!1eQl%?| zD`(tMVunG7{0Gl6mFFJibuoEa$j518W|$*sjf`ArjyQ*0Gx1{Ga%WS+1FC-x49-l| zC@2?=m`?96W4mIIl0pHGPv{Ss~U37(^vdn)uH|r?GRw(n~zs2~+M= zaQcjU;MW1$KD4`)Y|e^vFAjZ*!vBL6i2koM2RvCm!y9SQCRb`3s0XwEXlHb(Z?CJ& zroT+k%qs5lkpDeWC>qI6m@zPXZuC3LhGvo6D(m}ayP9SmO!e($<*wgGF>F$vU$g-5 zNHfuN7_3uRj4Fe${%Oo4MExSFCbQD=l-H;LQdSQb4$GbHwrBVsgyFlvl&(&WW0Dg1K7F!(eydq-x4k&V)Kz}toFwlp1_?UAncV+6E;Tr> zGe<{ed1i2MN_Pt4DyF8oM<#=ACZi&yo6b!@m_U_)n@hM$ehS3Qp3Zsjue3(z7UF1x zF=Yw=!=|?&&B1rMW&caUB)Gg>b%@dsB>r~ne9XbPgo~d6O{5rveM?EyaKFxLAMp+J z^Sdi&(PtzdA(Ov);`9BuDk%+;F^9|VL=EnWWN7X=P4_@@K^##D$!lxM>m+3*#WHJb zlefp9P>f7DN~3>&+A-=_GqP0Q5iYX0mHlL^@&6Y4+JB>|Pc!NNzZBK$%6%T%2!5K9qMrS4^(eKSjxJ@KMc|FwF zVm6wR(dwQ&G2QBpZ?0S6+qxjToxz(|#dk(H$9)d7g0FV`r52~vPBD_?&RU$Af2qYu z-Z^jneX37$Xx9@f1@8AqNuhUu2SP_69nJ*CtrrA5q035;aG>fE74q497+Vf$-PU0| z$~s86%R4#ayW+b@jlt;*e{>u?NPQkw*V|CY^AO3 
z>N^kvS!@eeuF_(6S+gl(?!@1@bH~ixeWC2&&3A<HauP5en+a2M+wn>Z7a?g&aH#Pa zPsjn0UpoEb!6DnD^`Po$|L6h9R<7U=pdg=$MEv}VLmJPz7j{|US z1S0Hq2Y=X{??Y~Ab5KwaZ}fH>Zg2&7o~Te#yzoWu1@9awhj$OH+GnakgWYH;f1hUzOJj*{Wn|`f@nz!qGAdKryL{wW=5|C%+ zupPDi-f6{oVpjo9D0keD+rx2Xs-!3B!7JV&MrFj^y29^kT&8^A(taNi-n5N7+t-2( zh_cceTV1gu85x^TN*#2(%dDDTDJ*u<@#>1=ct%<3i>X1WC-|0R*1w+BvL8&uWGvY6 z`(*8TbFsptW5`Rd(j4p6l@o9#uZ-IGqSSe~THq|LFgugjQi8t3Z{U`iNt6yo=bom` zwtUq&E^z-!Zg568$0b=Ms6Eb6zI#L7`7W-0aJo0C8xg2hzj7^Hjogm|-=8 zJkuyGHJnf7irF7u;%~IzUkyR8UpBbeTKZt_DZ7C1X6R3}wgUF4Dn)NedCARtg;t|I zv*EfMJ_a`jDZIM%oJ{HrF}qH3==P>(x(?Qcl84ScLvvnJdTZA};T1kADdTFm|9h+r zdGQsr7jbX%DLaR7A^XTBesw8Yn`y_;qphoeK1%aqD{?Z4^{6gJSkoo`%S2jT>CocF z%r7}w9VwDsCkhT@8RmM`^V73Sx*G=2#mGtFyTX1m0(LnU!g($#=m+Us;y0-*L_<*5 z%IxUOhJ{{xxlm0$bEpJuS%meP2+lL}WeTL*A6uUdQexGbt|}7|_U(qO0THUq^^ava zkA(xB!jjo0F?k$LzM}={l%-}&kbqjkevQxo8DqnkQLP~8S)F#rP|{QOeR7uL^NZAa z-8#>bOrr56{(%9#Ce1rGl^td^nZ${w?4Ei4zj{rc@&6IVe7@A0s9VR>NFmHji?lJz z8Vybk?0b`F7{nQj#>w^>cMP4~2c-eK{Q2ruHv%w0UqO~)-w>X4R~YAgT|!5&jOeHA zlOgDu(ju7g)nY3+#u7ufewD&oA8$IRVq2=v$|=c+c)G_vgr`P|^4+zdPUBcyekM_u zxGKp@j5pofgjq_>H(_J0+|f9Ut;onE8otd&m!#$r73@uT{8vY3RC1=gr)d$;Au+gz zvpm>w*~IMKQ<_23a9O|bAGMaP<@ABjvi?qSei;iYx$=dy6Ps#OlK<+2M2_yqO>Sbm zrnkEf+r>ua!8d#Zq2{;nA3M9xU~=_}#^30M8m$Jusrk>%&YNg3JA1XZ%}#y354(xmGu9Cjn`f#;Xl-bKa z{`jNOg{sM>6CM@@W_oQ*RT<*WI#sood8?#^=(< z9>xPFTqa}pjXAgj%P*G+Lw9VkuY>9~2P6zDIO^uKY^Bmx@nu|yVyJ^(^n$fJEL(7>TFQUbFdz~-UPbW?O6I*9bkA$9hdwl@!Nf+fCrn}rglV&Dw%98t zX%8nCC)3bIbxTH!<_Q8^6+NG_d^ee&QM>hH-zve%Gz3ON^_y_9|nNI#$1n1Q6 zRV*wVKC`6WXIo}Oo2{n^1X&Dj zR>J_zn7BD<1`XQgj>43Ik8qkntw*sh-`sZFh}%D8Z6de0)}xMC)H3#w=z41c{Sk7| z#30uIJuZuy!Jg;qxNgqq^F)z5Z_|@DqmN=^RuxjvCDYw=0{2=2sHvdtt-CR$ao5wS zdQt8?)}|k?yD5#^Y$g>$7t1q=>R7#p0;ZJ*!jo6LU^tfZ`@tS{&e%~>q}{XGOO*Mm zgq6YM>haBdrB_$QW8Xsg))NXZ*mlOLL<^nEb{fh08z}U{nyf^BL5qmA z#Y88fRY1S*082#5Yt27{;Naq)&$k`=q6Nwss>-#<2g+EyE9SGjFSh5Hva&KSJ(dsc zZewsCb^&~8TmzwlZbG6L4N&EZrtj!OK?3UdtcNzTWBYb+xF#(P5T3u&am#le11~Jk 
z7vz-}y=)vD8TRcB>alaa&71?FuT$Gc{Kl&N*gRVn-ZeeXnvVZ`A6a`7713Fl_WR0_ z<=Jf7QW2n+&M<5~i@IlmDjiT?ZXTQIy7GQ2{T_0dp1+UK^X`4-B(m7uZguDF51t(P z_kQ=&xc++=|7j#(7ys{R@o6yrcTEeGm#}q3D9X*_`Q`SIS66%*3nhEqLsk-H-K-iF zhGjfH>Q(7X_V}rSosFGanhK?j%&6}Ole_if6SgC;nn{bC81`}Fa*?tepR2#O z&(^XElr7T#_?`ZsTo(NCRUXyzYcC#BajpvK4OL!q=5hC&A>2ozNdrk1_UCJCj=4#7 z8%o}sWLojNv|@Cs?b+Gc1K+%PW1D(<&oL_P2{*xHzqiy(DI_#+^kKW!awI0y^b9dd zTp=C!Lu1#|%JWHY%u($~8I@{3*}%ZSw|VC+6x(kOXAU?bPs5}2NQA5)A?a)9#upAb zsT7awqI2T?gv7G^eLCs4Mr?kR+zP#h9-VVqOI8_F$K!gs#RI>wxGx&GAJH{poE}W- zAR$=z3G?*4=Apy)GXr zQSH~-wUjekDM=-K6oEo3(Hi}!wr<>@-KxhHtit5Y1RKyjDGliGUm3z3|)`4;3O*oBGIr1i&g07aC^Qs>&Om^S z{Tqai^b7gteO#B-lGFQiqj!x***$TC-8QT)_FF+-yLMZT>dHEgiMMGI@#sBu`WdKp z^WRV2CcQ}zZ=MMMyYrY$%IYU}r8*Twoo$VJ7p8R4@*5eC0e5%w=XU!$`nc(ktMtzs z?chP_)xHy>xoD5nd4ti|{ZO30G>I;HR?<`TmN;}Zgi}~bsHs4=A0p|)n!0=PQHr<< z7w$vP-TVSRw>o~;j!SIpeH`z>DDFxOK)+*TM}m`syB>MC!7-AIkQ^(m40F=D2|gjh zHmmaZmkE`InPt8xCUR#5?;;OR?$51tP+4rtO8!z6k5}%VV}_zydc{6(vDzqEl(?S@ z>Q162o0HUF+3#zjXMgm(2s36M1xeW2y$)gNN1;Sre+&l)hqTb=wc8VQQ3-vk?x(Tg zXIjq&UW6q>hNMYoY}Ne5$x~xAt9?#Fp3n@w|L~N?XG4kQ&*YRu*Q8La?ijLli1I05e@HlYYjvmM| zO660FyE3)derePUR*vej1j{8If4mWAEU&Hr%|??Wj$6%{i(hg?5TN|-fHF#0mpz{MS80mgrKM)uGYw(eGU(^lpkc%>a-c`RgH^}AJG)_`>I zNqgOD$EcO6YdAj9-FgZJe(CF`Eu@l;g+HO1oB8#5*s@OaXrHfGy>K+(z-z}CQ5<)` z*VJ!`(W4wU*Z+9%V9?`5hx7U$_H|B+F29>Ew{TsX`cd{;bAz%!V>Pn%-Vwhu#Fn#! 
zL`<)^sP}O`rR7I$oS3`P`F%=@SJ(hp4Et*@a^39X*~Il%Hx1f=sf1mN=NtiIqWN^7Z%!7HNZ5a;7Rp zUCt&jrj+^)*>{RI8}^hvCm3Dhb{mVpw|$rRF-Ngp7t+{e?GBucKpO$t z`aajB=!E<{ZBP|jk#dogU+RxIg4F|pV{BUrDVD~=bBQ-JgXZ*tV8cv!gz0s=X%0My z6ryAij9$pOP@S(~vR@-C;8?DHrBuy`4j~EC>$%t%IorToSu4zTQuBJkOtjVEH^Anw zPX?p=HX}+5ToxNcF6|HR!vKacaPxD8_AF8enM5KmBiNun4nr?+V!eJ~T?O=UmVB9i z8ZN7lBq8tw zP-#ZNwz|n=sd)C~@pdJp>M7cXkRij7lSEiG5PMBla^U9RDMQMKtciz41k}5ZTCH>p zLC=Jbut-$BI(HRrqjENKhW{+hSywZtT)NY><2wT(ESE6NxeDl7j_J1ptHGwf!kFWx zy5WmVt?`*^Ikp8tUQs6qC)}x>L-(Gto3eYtKs0?hIa^;U5LDpLOs1IKcBH%?c)Ev> zfazVXFIBn3znPXvTz$&!bTbziPQI$5K%K&7WTF3J;Ov<0`ZWNH^h^6UQ#1eQdAg@S zP4a$63Rd?-GK?yv(4X8mI~Gk)0-#r!!$}O-e1GT?|MX9^*{G6i_Nh~o&aH9Lfyw?W zYj5p*ld4x!fp4(EJ*>IR@1veUJLlVxvIduREvP|XedeA?lV<58vci!P8gv_+krd*b zbZcE#8=2Sjr!ggjdXKnWADwusDevb~m)dUOnBP<5oNg7QgYU1CZI|wvc^ofb?bayV zWf7-VT^@1NHTj<2x5}BMLM!hQ?}gF=zq4c-=b`78b`|nyh0R+HTS#u0!$kk3Ns)O& z@TG`4^AB%zL&rAU#y?SctoaOj>EpdeKd3wxehXgw z*=tyzLC0|QP1(dh>D(j3>DnCR-RUsKp#+t|n`_jz8taqGQ=NLjTb=-3*+C9kVGPpn zpDqACzuHaz@%T+^byF9=qG1$8!PoLb`@*_djTNn}j`2-@pS1G+r`9;>U#Ik@Zu_4`}DhloyT;Sq3-Dc`ihb%C6=;Gq!jlcH%}V<{BxD zW^sS;xPL)rk2gN<2O&A^ezQe*$qQLsTk?2-VZ(IPis*8h;_F;;t^{{i;C zF)wp5KHw43&UY&z1;)M(}rakd%{Le=RH%eet|TE=rb09c`GKsx0%`1 z!91!=-5gq59GDPAdt2U}ouQg8@3qASqf1VB++1HM-#iT+$L7qiD@i|L-#0g0 zN+l9tq5}u($}<7xzYN!{-f^sJvP%V9ft#wFjcpHj4E;3DnfbetoXCc0k8l~guqpGL z*PsSUwJY8K?~sH#BHe!9p@7>fw>bnqF7HWnU7_KJ#0NKfSxAolo5(`Xpjytc_4g|o z^(UFtwTC?8&PJnaoEJ$$@5-4%(4p$CKEsY7Pab>3mPe!Ip(FT9{I2d0i)eL$5!RViq5fXSjqt1d2Fo8KnfsHK=tW2u!DpkzfAj-MSLp7DwIXs?YVD=L z`Yl!)V8{3Wg|m}<%_hdv*b^uY25w1?LKMfCyuJdT`fHNAyBw{Y^24KG5<0lQZ@!K! 
zxWTL2T2yettGCh#h@R~zc#H$QvPtE-k^JaXuI(&Tw;Jy6-wPoK9d=&g!Tll@zla6< zHb>g$B+2*pS>ofWtC5`;GV%9EhR18>lS&Ssjon2>T&o{FDF5)tzGt_dB0M!1aAwZL z&xov~H#fd~ZGDGUtMFqtfN`5T4UuK5D+?0ry8`sa?+@*UV3jZLhe_oMDED!IvDWw~I_4A@nFVid6Q}S!@ zI28F>R-4}{?wJ!qkp=`&S|%PVq2*33-3^m-KTKZ32~wtG(kd|fV_Q59dx!sdRY;|3 zt7A12YED=1KP5V(@U#==_9_LJC+Cdk?Fi|6Vod(x6<26zO5;kSYQN}6(IlnW7<}BK zr(_Ms#gc9q?{{fTFs3V7E(ry%z1V;lW7tioeE*y~M9<@S=vQ%B_Y>_>45p6|04MtL z(zETl$ac|fnLIz-T+H#k*ovMfa0+}+&=p4?z_Y3$XxlRzFu z%}Kf4lU9L&GGh6zYg2N*Xo>kc{M9=zA*O@*nGQQwuk-_!heHh224Qz(us?hn5@3NC z!GkFnz{s{88i#5)VNHyR>AN0zpHJIF$&sp&BG21ew{n>ny^NWRS&XrdpHLKy=?)ir zDcf%IV}*X?q-f;pUABCAicvJbqldT>H8kAchf?Qt4HGfS+!dk2A);LzS}jMji+ruP z`4WXct{~Osy0w2P;iKQ%Cdzs|n*`n3lk2ZFHj;CXPrsU{P(VI$x$)|l(Ug|OcX-v} zvhQ{GQRc@NypmTOO;7m8M@ASgt9|b3lkAgd=N5{hUkf2A< z&vFPddQ@a?z@A%rN;Gl=VrM3f=DjM_DEL%=0xtB{k2Dqv`AI`@X@v_gHhRdmgrE(u(b}kT#n!rWzX_9QTTuQ`U?kAMszkQ6Tw*AwZH5b`KbFy zC>=RnAlW1vE*9{XAbDJJXTL7WZ!$(~p&-vhnw@S7&1G`o=nl!B-k0i$=p#Z#6>Tvj zp?LDvX`)Ga?Y=v7eAoZZ=;^A90_WC3+XZ-rqa!p2p|T;MPy~G0t1Z@5F6m$S%K==x zyejytt7DMv(3;;eqb2h(DYKk6DK}8udY^Ndd6{3-w>x?W9)&|!HM$f)EC@`=d%E9rteSZ4gx(3Z?K%Rc9 zypSO>r1}ioHtNn)hhr@jBygC%Ej2_q>({O=R7ejGtEgJ=RkX)!3naA{Z`=|O6tTE2 z7HlkNco4K5Lu9U`(A#7ys9qO$bU4(EoJ~W4PE`6%GLc9OD?NppRE%eo7SpN7s&I!; zk?bS(+#70kfw?zy9IUGqO>TwSUGNlvQh4dzQyx=2kzg+n(<3#?dZbKz(G1o};!B{! 
z-Yw63KKOpDnHb*@ualL;1$!LDc=RtZt-{JN}%ly}LuDhXTO&CPH=|xyUviy_Jdm$74V-&5+ z5UE`#vCj-*v~53=-d!U2q;H(VLqnhG`}RviBDrX*V-EdQl8B0cjJVZ5Ms1;N^Y;tz zKn`005#z|WpHF;UNCB!zkba9MJ>dYy4+I`dj^&+-P;{aM?&p2$uf0D% z8i&Hd)e*7N0&Z;BvbC7cK>gNe4&;o4*y}29Hbe+KQp^i%`rWVM!K}Dm=|;q-ZZDdN zWS?u4_ZScqaFZj|ZqE*oarb2jKz?^0{ZA>Yg7H4onp9S>?J-{CngBihPvg=)nU;+-js`#IBzD%e(| zHL02IeaAEU)-cPH3y7xjlSZ@Wf?0dH?mSS`4W!52I>6z|VAf1}eqOufH@yFxS zTb~(ydV5eYVy$;;`~BMo`QGqpyvhl`%LNNRLWUDGr6#|{z`IdY5Oux?WW`tTZg<_pPuh+xbSvC`~qXDNnfd(YkWiqmI4KhFp#V9|L9E zQRM7F#8I>*%cMZ$JS^Wh9jg(VJZupG`BHzysd45oUUHZc<|_-II$6G}94!ZPdg6rB z-4Uwdx2yHRvyG)QzvgI#6Brr5Tp+ihew-RNKffBj;eoNfZ^OlkgPW7Dsl?ZHIzz^W ziVp#dTFm>%xJpsrUt%q%s7q_q*wipZ-%?pM5`Xfaz$XrLHEQ9B(>{AV+a~o!w^r;yg)#gOs;;Q%yuw1 z>*FHriE<{3leVRPx=u!aH(gIAaOA9Z-rc)87>l8#=#iM7ChkA$H78eXOqaFK!7 zK2S&<^Q%_)Qw0u|g(V9&7VzVqyowXiS#C+Ivd;I5U;el{K8Mz3*&M#abvJYn%|CIV zJpOojpRX?KUEaBqGSPL29b@Ui^NJ2Lr`y2%NW0bvbeM@E4fk)IOm@|0W1~%aqcx#! zc-6c?A=`K9^~;Qj6-zHL9)5F2sw$m8JlT}a9&1wYVXRJ;CPJJX-Dr!8U(3GG1bjC; zxVOJgi8KIeP0tT&$0lc%Xql~`i+{y`{KNQF z{!pnPw0i)Eqo1*d@HUOt?o#cduqPDlv*LE-L!px9PR(P`&W%?`-@OC#eBlY5pJ+1m zTvkTros3lNk-6~&M9wfqQBS6{#8X-h!&3Z|E-BdLAjlSnc?C{y?$XBr6jhktaAI6H zNSsIZapa@ZcbwL>SL3seM0CMf?eNB}TFm!N^pb+QCES1H5y8&iuytT7NG{t&B zfSOGSE1XC>?7tBht4iuXY_sPkg#!cDI!&!6pLY}Nj2ZX3HZ-?Hn*^hdA%q_<6;$P9 z{;?2H3OKm(TE|d33y+5O&--s;10G^BGB4O82s&W00O@%7>h!GJZd&01VH01)^|c8C z$UEKoX^WTm{gv|6Cob~4*810K2Q_si?Ms&3kcRdLN>`I+@w*Apm<++4fcLx%>;}D% z!dRvEx}hKd~)NA3jQg0qTCrE(>Vf6n|{J**?SHv={Lfkvh%R zuQ5J)DgsQWDs1o)eNDs{*TK`%8fMCtyw;N14ZUq=zJ1EBML4M))Tto1nL4{NgOdOx z9G$!IV0QXuxxX?3f0te}16z|<^bjTHQS7@c_I>jsXE^KU#05Y#OCKnW2N&L|aUOah zP@Qsn3G~1q8-;l{QoE~3SJ#3J0ko(~idtTH9hS?r)2Pb*Cvs0U!g3=ZXJJ)#pr8N(QeScBB<)VGseqJf?S-?_64d1Qfbcp$t<{9&7 z^4rag@^5F5a=7@3h~wFT*G32)pA{Xu?pO8P7LP`%JCoD*p0?KVXbv@)Sm&0z7q$FA zxwaLlRK0UiSDpWrjD@o+f+#hoo>zkU#@6%zX)KshPtzQ;$-ZZFB6^S`+*j#Mjx{qDbP@T_#FkL-R5TY{U-Ie<3P0m%1#>t&KAhUA~~i51*RQwXMy3)Fc#i<2&sP(X9~PQh@m1Wr<0+cKLu3$NKC>4O=@)Mr>7?$1$vsQ7l{)h6 zSrTqY|I8C)wYj& 
z)V2d;G%v}E70ZcFS8IlAj455-$Cgd+%hA4GG-63ExpzPN+(YM+72EPE!&M|k3s&i{ zZ6X`!`kl@8#s0QcQjvAQLZ;n?`Y}-niQgTM@X%u&4=FEtY^eZP(N^s`)%r7rub7gkvbXO3kqnp9Y{uR9x56il)BmTdw!`$s1J}HufYms z;~i(c9*Y#l0tuU^#S8BV2-}nH%IQCY=yTl^!eN4>74+p-DPy$>t*Yj2#WgqkQYFO1 z#BL_bjP0`omDO3Yf!yU6}^3hy2$CjfVf`ozW`6Y4wJE+}hNSs=iW7o7O ztskwZuoE>}I)a`gG(23Y5deq5elssG#+VNu>;kWt#(sMt-qKlopuneBCR;hK^+=5% zzmos@m_MjmDl1!An+Q$$d0bhc4PBH*2~Q$ddEk+_QcAVkoh_QySn zkdm5BrLpaU5yF}(V8_XI`fS4p*&?Q7fiV)1(LIHN!pMMaVy%;Y1;JB9E z{2wpctlV^4tVD78*?}jl4-w-jYrkjrOIozKm5Gc4S8aEgw&=(qW$4am{aK=uHC3Ki zAi?}r^bk^E>)`>x(r@>Km2(3x>atS2T)VG=3~2{NJU3JY; z#EwaND?Zz-x$?^d7Q#G$|cwAKZ{dG9bBFUFJi}Z@{mLSl2WA|N=Px1=d0^?*DZbu?# zb1@R8$}C@shsUC<+4Q6at)=#Px4eO!vEzVnxXGtF4#=qg+hc=_!q)4*l*i`3G@IOM zBK~OxE^6Om#e9GUxI7GgjP4G(z#G}4tT8<2kPG3GN!+Lew(X0sWB zF9e;ZDlH||>B=Z((=e%UK9;BI2?>=Nm*Z8~jHAA2c75S&duG@|9)UxO5)y}jT6XT> zjlV-+Oz7<*~bK9@!XNbUMCzyDie zn&?Z}WKeQS35_fxi3%-t#IjQk@yqPpvxn!4CJ=z9M;dM|g;rP^TX)fS`hn6TQzQQ5 zY~O|h)@=3|o#jkO1j|zN8j&hV1(9YQu}S+r&CgxuIu>_Xf4qhyrFFk)wIu}DvUDs@ zRr+9EH{>}1s0qpPE3XdU!^w|R+dmK8t?q@3ds|i< z5V5Tl)Tl^JNlB5YHfFQHR6ApK$8rjrI2d8%{G$*9#)?6o^IAUTaVT+eZF`j`_+S^6 z70UzbLCc@^oTB2r>0|J0oX7wHwPCiYQv*w%+uWnouAz+YhNp@M{-XDb)$ZU>(0bAKQ9uTkd?e68Cz zM$M|HNzayvT1d8MgHDk1As$pRx5x+W+#?>Ks;n^0!MEfk*jEDuI zVQ%*Bm^IBAP*)wolJ>zeIZ5hu+XrBoT>SjC-7Cdf!<$8qtqv1BS7ctlw_&r3;y(wm zo#2dz1j3Tr(&AkPZ|KUI$_PyV+fS1RnC1$KG^FjV^Nq#4T%}&v>~bjfdaSb? 
zAEUQEa&rJ+W*}9^ZqtbN!rp__nQ~z!h_65wBKX>}m?pH2Ni3;OKx|VXe-*ZYwVyGj zk0Y(E@#Sp7l?JW&vT}5Z_&nuU_H-V5t!Rx&ttjaEjX6qs2~9>j!+NwPUZ%%d%I&z?Q?uGD4`I&heah!e^$reHEGfXFe2$YTK|bn8tx{6K z|5OxW&yKzKz)+1M*e%Gw& zSo^^-_{gV*{mUSyawe#5qSUy^d@iEAz_!(ddqqNlz*f-hnstG+na~2s0lI4h z#A;167iZq`dkMdAA$1-bnUP3WP{P`HLkPJ}Lwj{85Pii@A?8gMmpWUYr8BK`f})f= zThqp<>0o-$+Iv&UDeV0Mm5MVRW-b=owVbyme5!U^>y&B{Ve>hXNx$|lf#p&&lF@d` zZFU*0c2u&Rb#Sqk_3t?rmkK})sx%kIVL9=fyIQ%9xnDpvM@w;GaXG!a-VdD9ppX+) zjlYoZ=UO4E*)f!be6iB?l4j7}C{Q(L?a_po^j8^|gS(vn3`tiI*%1E~RO(ZWl-VpN z6aoboUrP?v76_>4c(YXWp`2I-9W;@;1s{RWo!nB*tLI*TmI?;fL*NsOy{}Ah8v+?y z@>5PC!EaI+RF@x3ONG|!CpS;!Wnai?EHD+K%_1S|Yr6e99oJV005dy{K#UEk7l8OR zI(jQHtCY5~M?lnj24PVb7-YVopC_a83#{IK&H@F2&5Z?9Yq%iCMG%A&k}(!@K+bV0y(H8UqUAdf_XoTrdHNK+?z>Jdc+ z?)~n4YqdXT7}c#hFq+GmQY_JF%@1s~OP=j^Un_ZSX3f50I?lXEv}#IY7dF~(ky)A| zGyx`@mIQXlNNhAfaCaJeD-4i>W)L6fWed6w_>Yxp5elZD>O+;qq=^4dxFtCOB+@-; zGi*qqAlAp9)>cJr6b@|lqrP!~-cAJAY6fKu9?AyADnC<`vF@oD2b5E#bMvCta+x9F zIKnf1M+1n|R-7OraScI#)DHS;WiE2x5F<7MhQoWNN9_{7<3>J)-{-<#bn9~_w?@xs zh1Su5qIC4#cKesO^5aNcc|@8I&R3^9MLc*1FS}*)>TfdCZwwrfb=2^HeIYc4~D?Q*yNZAG}EtVuD9+jD()-4L5*n-xF zB8-G*#N~05zw}#?BeoZ$*H5UNc&@J?<%g`{q%1$lQC7frI3}MjA1JCqvFHXp>I`JK z=AnD7ReqVgQvkpalGc0QXPx+=88BOFU2@e1=3vaDo54}@~tc#+Enq>`igs5)^G(E z3aq;AJ-UJKJ7$gjf>iQ=D=iRem^Wc*HtlV_ac%8F;WcypaQv|)7qU3qxZ(da=d@gG>! zB%v)^*b(Gv!?$z-%c%=oSM-16li!m4CqB7n6Ck&Ik$C$;@gM?E`U~~G_SNxFC~^E< z*s2^2MGyPAs6sOj%0TVGxcyffB`e|vF<<5T1S(?nT2)gmn&IfIGCtVuIyLQJJxur) z=@Wq)k#2WRghyM7H9U;}Gs2hmEh!9@XmdW%tN+fyIFa&hX#ct6tA?NLJIc=s3-Lk; z-*X5J&&FAs^j8p{&l=oAd;}6hQm+$iKs;sgE%ZAwEAh&9HNupRmC>ghOy!6v$dyqw zSYBQpPZa&Oen>84_&kSu70aOC$_4yI*5&Hyx$z97#{)ol;WR(^J>e45aeu=VIX20^ z0DeK%=jx0h6*0!HZ$DIS8ignrz5}^F(Bs>8A?}=re{U*M*zB04J9igmy9$JzbeV6E zB^=zJMns7uV*nyp<#**umaEG5OQPHlp7KRF(0RSf@qYw-Z3_WVT8HjgH%G^GnShbU zfm@>A!!0ftE%Tx1&su$|&>#XK2k)p$T0x&ug8Sb675WPh*l<06n+;3G2&92J1`ni? 
zVkm!7=9AI&fuw$w%1f!dq!yR%!f=Z-PkYcYC7@^Zk1>p|#aAHMxYc$m;oT`H++Paw z6;vGqWxltU*wms|sCrYO$y8!G=wg{zJ>jQ4udqD6zY%+~Xx_Ou2U00+$g`DKVkY_O zV{Ir=3eF}1AD8}`SXtRG>N10f^QmN>n;AYQW|jkzM#4U*0k6_0Ak?SDpq@(s_T4<= z-Za^@6_kx8zsd5oxSbD_gLI3ocw%R6Y3t*J@Jvn}4;PP-(7w5`P$Y5z4%Ul`{wnkr zWj>eQepHw5`3bbq+FeQDY#fZ6`Ug$EDFN!;?`ab_EdrcIJN{nRz z*o00Mwd~!@>;h6hchu90mNmR8VZ}0Ec-1`sLL|xqb7W2?aV92 z39mjXmAK>EvPzj;EqNO;3l*ZbszrPyL256$bi%qX{KT^W%+;>eTE2>cD(zbyX>EZ8 zr4$1I_1UCVgzav+C02!s2j(&5MFot^Tzu?`woc&9gahuhMYlc)qO$RazL!Asb@{5* zB@L8rcb&juecaBHIUd}7usN+W2R_J5c}Mj0jA7MGzAf66sRiR zbU<;MGnP?l6{~tu-;`=_U=+d5Hyxsbl$K8G1Onuxx6|5bza4D?gS7n1z-)mfb1Rr4_K0%_}=xUQAxi!k9Tm!%Um+zaNz`j zCnL|$kdT9K|4EW>QN`y?U8Dq`s{ZC3(Pfqg$$!q$wpRxM@3(g&tHiebL|FURJjuuawngzP46JCdw|FIc(py7oPxx2RFUs;94w5smD?)ujMYFB5 zMGsDc5~z^G3IeP&UqWAUd3yfPWQW8K?6cwMzC$MaJw`slPe8^Xj(YT2yF}{ph1aXe zHx;HnDqUJisLF){XJ42Q;YF&KPr65`Q;VR@lKT=y4swxA80MIu3}i zX~TIhM<0p)_oIk=(U6v)mX;u&maw-@D?#VwU7phI0N1LNM-=uDJ}zjqNoQ!$-nA3s zY}QlX$YaEzybcgKGQ_rg0AE&%+@cYrPuv;6QW9359lmlr;U%Jj{S z7{sssNiC_c+O#^dTH0o>2tf_&b!_oJNTm2|FINQ2O@@Tc9RHmDWH+c49Z`U4Rcy}` zG+2O4$_fN1M}iu}`u^cvx6J=Ib>g84sY(+h7H~M<*Z5H<=fcsWM_XQB(GRndLfR_W zL=(K|G{9bjWo(H0%7s;HLu8>!CO#M`K_?s10kxOMFoOhVcLn<6EfujH=<@j zy>h*Cye51ORd)Dc&uws->c|nh5N9-zXrz`;dz*-~hKSX?d8}T>eBpbJ$}rKV|L3`Q z%iC^VUJ4h$`L7ca7EX}=0=}g;c+&Z&Lg;eA7R`k(=|A*TXoZ6528I2X8ax_&1fDyJ zAl$lw7!wutQkBc42~!UniBh*teve*adt8MT+-?iY*KZs$J*@M6g>L=`T>4u<{<~pf z|ISbRVT`}G#UGYn`Ja-EQx*IK+24yR&_#sToRJUQFMoNmxiHf4v0x;A$BDb$OBjG# zo=p2Tc$V`ZvMI1SN@l@padtGfIq`ssci-Eaxd(dsDgp)jnu;plS%aM1i470(x2r85 zxT5qga3ntB%dyMIN=qUQrTOF(o%gZ$>wWOf7XCo ziL}3m{pvZO!>3>DT$MF&rw)c|VCX@&f#(0L+dz0WXG9Cu{ErX=Fd}HONl#`ALV$bVOI%@>m{@HewmoO@ z>W_8<)T(B>OB)npGh}o^{18$}^w7T~w8fE_E@bB$$#rcN(ozb1O^gTXHnS)4uU~(2 z00LJ#O63*c-7G=czW(5 z`hc!xPwGRoMxXE=#!3>2Ikq_{Vz0kkzy6zX@wvOFXGU_`sw}jaTzazxhLH*l<6ib= zjMh$OAUm;oFjHG-cd_1Sp=U*UO1tMuA!Ih_b}^>@(-@qSuobWk>VeGRMvSh zO+en&a492*|0ma&$^28y_2v3qO3adkius{Cr(>d{_l5mdx95S-KtDbhWys<7!T;!H 
z^D6e(K18$)JjR~+Lg*0^4U&fo&Ez3M(&jYJs=7qYP0_z&TW4k^8Nq~Qoz60+eZoX4 zmd`|NW`fq@PFgnC>S~^!rI#&K2WP}N`!R76#x2Ur$;>i4r`JAnm*2-~rq^|DCN#Iq zGBNY(?VY;#n!D(j{EtcxQwS(+axc?IjNM_BxY#c$k?(x@vQX^i8=ocpd+$_)=H~mK z@bM)F=sPAfwAgAl7jrG265})JVmLuziKHde zrsTR%Y9y3;`>^fYp~UZgn*w|SBevGSEa>pYz-hl_KlnwLr*m3D zcQpk9L$!o@$Ez$#xdXBL^PM*ZP9sB&E_Pi&i*dH7W`?E^$1D3TcVY-&X54PxEX+G3Ib)Cdj1iE(w*ryHi1q-Oov6E-8;fH#*|4 z$_4&=8I_CKmgCQiNS}9fnQOYdWZvvAet;p#V|MtH1c`gf;+n#iN5YuEvO71TnP1h1CYrj*0nyl<|NKiQA8Tpv( zzdSh`jKgHWs(AK)4lM3MmLXEX(Y^ml8vGx5;qfe6z;J*eEGac{>FOdrU{1beF+cNZ z#dyH;VkZ&i`p2cN3(l~xlK#oufhcKfp)&eFK{JYJ5CE^h?Dn;&?mKS+&N?P^dtz0_ zD*Kodm3?WjYHz0GT=|K%1Y88hgT&Wj;5l&$$MjsLBL+(0zQHW!z9Dth@-s9 zE3d}uyT%+72+W*Z3ZHdhek5#$ zJwI8E>$p=vNRjgB|Lj}&*ma19ZTAk&U<2f_4K|U>e6ji`)&$$EX(dhM_+rU-bl}|3bpuo*MTz6m|+Z`XWIN^0C_p>T`|FX$}_wm)iKT9H1!l2 z!{^=AJIf4&)<%FNoFI6b5Y*z(X&YT5mLY_Iu}e?W?z zQ;=n+xNaU+T*bfbPK6Z1beF`Hmrmr$zLcmPoSH$0;TZwT(Y>RyeG?K_s>Z@mGv`gJ zce?TuB2vOyh_u1_$n^3eD<|8<7L@n1kH(rC-d?V>fOnVq_&$$fbuArxM(hJbbaQua#`n{3M?F;u_uI5$8fvxeZJad3CnLXn0YW{TnPzI zSj8oC9uxV-ErFu+G3{im;+5=${mBq$rep>%#_^sl9R)7tiyhd6DYqNmv|bWXi}}fb zSparks%sKGC5BnTLC@vcOx(=r(P6UIReXY3+NMCV+H!_F!+=Ts`Z;*3a3juX+t*s? 
z^?y!)Sz3s3>ZO;{S`&(!u?y3U1c%{i+(u);z|a$;qRXA;FE6cW%FG~Z1uRh=%hccv zE|6E#WRA3#nRUQyO_& z2u$C+c?LX86h=^}aAvST%{Rumd)Wb&l2&2f-EyZQm~a{YSl}OC*ed3#^p`<(BYX~U zYil^Dgo&t%8%HpljYzk+D3e9#n@t<>KQQke^Q?)7K3d|&d;OtBy@o^9 zlA7z<_X?Jgjm@+fDe+zuG%I+LH|PC3psjf`8gqVViEu=MUBTRUGVe)V-XruqG``^; zpAqM#Xj7?x;f9BN(H|z~>LfBZ@B8MMU3+7!Q|=84LAk)_bEbo5e^kmq1Rs+O5y9q+ zH~pJN4PKg#=988y*ww1{{?PQmPJ`7}0eTv|KQ-E!=b$le4W8H@a_LmNml?_Q2cU}~N9dkUA(r)fUN*QuKL|<8 zYa{*+`b=T2cP4bBaN65H!ALl_*#(gZT6l>wRLv>rUVLszu&TbxlhL^SJw-0{#|(>R zhSBw$%^L9#4ec%K64$RQ@tGT|>2Ex1rlg9o{lG{nl%4OA2s&w1+$#SqWpd4Q2fISj z$N_d?u_AnOJ=@;(;=V3Ghm@L1EGP~@5zg5;(RLYF+o9Wdv+cL_HZW_AgNWK-) z-^2dx57HcK;=lca<$`xM8I0xR zTl1tn)0~{I4PyL2q50tS9SiZmSg%|FA7?#%sSu8*Gd_qva|e_P3m_u4jJiNZ(p!Sw)F zi%!VH%(oBuEWBLgJyi?M6ZvrZy%R@Wd zfbPjXS4~ebpc>jU4^@lT(YD{3EwJz>d#s7~b{$fn5dCKw<-u&^k{PY{osu6)yk~3* z7;Ppl9=Kc3ZNUR~TxS@B)ry_I+31=M%+n~+&`4Ild4o=tH}uk&nuLY!angD24GBBM zrjk+#7yt?VxJ0gdK27{0VIQRtTf{A<^%U+$k1h~I#k|HadWAeV<4KRh5Z5=-aeY;7oX`C(o$t2HmP1By# z6(b0zxy%>;c&?tk>%I2f8=H4@7t!X`(k4<054s`Lo5c)|k zXZ2ZdJ&azv4Qq8m%K#-vd}i}p)1v~y${pOW-2D6~psbyBTyrr})tj;tP`6T(KzyBq zPK+Pf%Ts-q7F=HkFfoX9Kqt7Y$anLzv%^JHyzaHNx3>VhH8_Lik@LDmdk8IH!S@@J zyk|#r-p5XO*7r9Sy8D24xB4`OHg8&V#P)XocQXIAcd%Q4@WhkxKn)bOR<@ZdVQr|M z+-by90B?^{E2iNvKJzndzJ10oeCPLmTWxjlw}u@uOZ9W}d38C@y^Il{Y_C7ue}8@J9cNLl2%R5T=Ui ze^%6*{w|xK4i01mNa4X=1e}qCu@v+$Jr1w%T)VlH@r_7&(5m-2wuGx)MSiV~EANP= z_qNf(=pV{lotcN$#6x!V<16-INKw9??0#`RR3jx_aHr_>Z zI`~B3ib)v9X*?Np(w+WZ$;^sHj>9LnXF$~r?`eWR_0S;GOc)lX?sQ5uIplS~X3r{& zuZE1hW5Ext$M1E`mn7MpzkT{~beapWCLjAgoXszUQ<~}!XU5Hv1IOL}jcwJ#9&Spm`TWUJJ}oQ=f&;76txuW35x|W;5NMae6&25b>OZYgP&}R)qwM zj>e?SCDAjPsa`5jNVaFP!Oa>q)m;>k55jJs4r}O`5xDeD5?#?>!_l|{^%N1B`-gKT z0af(adms=WE*fjwI#U5LO*16Nn*o^!qAB#?jJbQFu^P^XLhvblRr%lH;!HuJW<-z= zH~>5*!`jYvA~Xxvfxsc7xuvW>A4SO5Phbo;eEIqVUcG{!G;P*sT@DwU-gxJQ(1Yqj zBg@FdMhmX~q8$|pLtRY2P#IypOa5RO#({78l2I*J;}b%shKi9s`smEaA0Ja$%uub| zbWsgb6_$0E3_W+z;WZ9wTMxwJlMpNJD;t+WaU-6EKbERO?XmHgHxVj~w7phoB;+c` 
z#?b6CSmqi^O2zd&V{%EN*J#lm8x7pDxi1)Nt4FDLqW|fEAPn1XHw|XJ4fT&7;6WcX z{J=y`H}P?Z5Vj%g7^q z2vvn}&rkPE3>A^(TK;MZ|6-=zvnIZ`cc~xReC3rit%H9h;BSo?F6%mj`(l+qoA!elJ#7mlD0$gE6qPcFEL5u zR&1!uzdx&IFstJTtOk+uoW79v-`d-wdp(3@gfc;w;vC3qNJq~uTFOAr6`VTTM zqV@0<%yP$pK$~#EF)_Grma@E74)wW+l{-si}eNu~pT7@nBD>uY5*EMluwZH=D2;T7&TJ zN$~hjBPcbvQl;`&WsQlTF!57F92P13gZhvG5(YRqIz9!-rKfXqbGZP{x0GOcZt@!@ z{qMZnHxKjgEaZPL)tF_8RK5}E7H~uj9qj}2%l2PWs^MTsdsC>O=)blh;;TDvxD5%$ zSDaEtm7t3|juQ=v8dJxt$@}u<-$PthZ0^$Yy462{CFF!Niy=ou2(@HuG|Noxr1y#U zHftWZ#3Okj20NqDXou@3nYwSz?lWqWGWjVuNxa0;2?&`XNv}g6;UOp|2D~JyRe4Vw z_MuOIE!0EAw2zYA2l<9k7`8nOU=%x{EXB*)KT30v$HzK}P@d+86!l zO&EI2d>;OdlM9M-yj??0u6%n9t5glEx=E!Muhmstf8$1y&XS7S9$|xrY&MO2dILdQ z%~>{jOQN+xk}Dn*y+{?(P4?0nEJQipM@>cJnqtm|uInw-py#!lNnl>-#W(zt=^xd- zDNLBouUjHu$-IFMI8%PBP@CI|B{#VH9ZLQ|bgb|Lu6}>BCBF~iHr6-}A9KSgTU};7 zM|rxweM#mK9-yqV>fhv1FgWw@nC>gtGeYqXC^^pP={`F$>zNWo(P0+}OeVFW^>A2= zFK@%6oa+8h8Kz_Y-NWxCh$oz58TezDajR<&XS#)lap-@#q)!?My)X3nfa0}huU3gg z+@{9LI7cofq+&NWo| zF@^>IY}xJrT)PX3G#_wJaLjs5(-=QF&Z4!7Zq~9S+_dN&Z`y1&7WK(m|I<&;HnHuH zDdMZY5#9gD?!&{~u8(*M&Iz$_#O(}O{d~c@TWSIt*3dnAVYai4E|?s^Lpj0#J}#U{7dH5PWcYAI zA8lORTuQxl;8tJJwxO_o4=$eo{xH2Qvt>4LPxa6_I<*}Mr4V>P(i6Q}Izvy09Crj+ z%~e_#cSDYdNU?BKbM2Dg)T+<9bmUC$UEhoRRKcs#rg`n=0i|}Dn9pEj6FT2;zrCIyNt%0IvhD1lB8x))3ae$im=$09ODZc* zZ$gaS{Z=jgV1P?!SU1a=Ixn08dR^A3%~w{*)^q8SuI=VrUaaNPBB{TqWxJJGLODg< zs#5u;{Zab|?V_K2mzED^x+1F}g)a9RV7VwTD5^VTTdIAuj6cke^(?l=ZSbSOam}qm z+oL0kWU9uOcHY#uQ*Ro5JR^P{ERvw4WSYWndpp~|rP=4Dr+)(!tKuUDfN@G@#O*js)w0<=~928uL3 zD-D$E==cJtH?6vhp$9QcfzS!1)EL7?sl0?>e=a(MlFh~wSb36VdPaw^jgS6%<=kFR zi;2~)`BD@1qn+AM3TYkO>5OYPntvlnAxy^RZ*uExFO~RoW{3GY_Yc+^Em~_hZs5xX zZut!0&t#S@0=nyhY4*j9FeR%JQOSAh5_1fC-8)`(h3ls(51V0%DwiZ5-xDp7Y!OhU zb?7EVsfV?jzU0@%UmDpiD){IWx_kgOQ^x>F+wUH#mThPk%@R#nBYo=3$DxRT>{FJQ z`s#399LXnC{JYl$;n=|pw<_sN$Jubw3k29meJum z>Z%)N%#DcS@}a_Scla?Yy$o z7$Sb^zMvtGOrijpF562LB}a#vuFAc$O6$=9+I5FAQ*X?>15K>j5i7xg!}#SxjDhI8 z1cRLkZOH7!(dF*Q>?_%x(%7P>S*bfEL~Hn3V7FRWI_?v5wEAuT?qwg4w4Z0g^U6F{ 
zuzP#x0ryLp!?ApJl`+B*iOGjaI*Qr6gk2&9?I3HmX!e`(?m-{1MBiX~zhdS50X_|p zx_}fc`hZ*_{gLIH-aCc%Gv^a`%xuS-S&QP`#wL?S;?PeHS;~yf&CRS-V3KoTmp^G0 z8YCL_n!BpJC-VmIk!nzxLT%J6@c}sZ+qa{SZblELeICRcwA2{X2#SfISErSGxs;Gw zXQ`_qknOBfzOLZ{OnhSEX-xMd^|~WwT-9(&jPusJBbL zhNE;SLC`Y2iRgmQkUST(J!{YF*g?k05;k$-`0uX!T_xqS*h8pr*=b^=q=QUXrLS5O zuWr3octX$n-F;EX_Q6p3-aW=9#61I)dY&OFwiQ`+c69kHa)RQ6Sf;OIGGA@9dKVN= z!Osq?zjzOQL}It#2~C?p{HIdxm&q}ogz~XVwzRSRrE4bx?xDBuGdoe%sf|k79p*b{ z!mtg05*Aom<##|Hd(koNw)L5+p#r~~DYsqIOJqat=zTopd`~cTr~bWQ>YbdztCPm> zw_FI(gAKc;%9rHd5~B3j=YuO_>(}?S&JT)r8V+eI^^ZQwR|V9g9fkPo%{%*9vWS?v zE(Nv=MN1bIId9X2KS!T#4VzhOxhdGs@tSxIYfA)NUp%6n(In?ge!`;vb7R}G=Sl`Y z(N@{T`f2p)aOZEs`o-pN#Jsyxx-Idjnv77gL)h?};`KNz@`fvkrl*`E-}M+&^{PfN7k@8Y6ly!Z6bFueCH>1_MM-AcbM92Jeen;zyfeDHv38AI>b_i8qNmdrR@{Z; zSrBF2oS`X-S+IaDyK+Ue8Vy*jZaBpm6Dq`#s}a8!o4LfVxQ%&Dw=5>qk5$*+EFwt9 zyrzdZSMdM%2-+^Z^i?N z6AW^1rJ;|A>W6lJ{;m6LLgCvZQ_ux{pK*l-b5O&4+RQChV((8ytoa9eaTiO>IsZ83 z2iP9DM)_e_woEQH&|C>YF838KN|+xlX+j$CU`h96x$)p32s}UWP2Hk2pOoYQyFDH zsc9~-8@pB1wm|gTXI@WMZ*4$uUa!F=ZON3axTvr3zUkHBwJDz<-PmXo?J2KJ*eT$w@O*%`mht zI*5!D+dvR|d?KTY0gdtaGWxim?bh#GQ)Cg*oz?op?dWn$QcyTVaNv^>Nwa2ho~G#4 zPPQe)OC`kAQSus_ZC@-K)1eII@EfbE6i!>eV|B%_U?6PF=E{%T&iVVyu8%W0Zj@H% zAq%kW*)@t6eGuDvXXBBmux1GG_{;B^6-;Gsim>YW*2$C+WleW8(Yl$ zyQ~~tON|5y$I;vC@YZqnCJSEMJ?~F{q4UGO)#Me$4B2LKZ4x~gjpv>2^TcuEzHSt$ zTYv{Nes3HSGZNw_Qv>y*ptgGM!WEE3l6f`2AfR6Bf@55LAKSBXf!V0li)pjfqGW?h z)B`mM@Qn(fSX&hsLCwEuOtn1W4Jdu@47%D6!fdIz) zP(wve%ijxYxzSs23^z#Fhca^7^5cD6IlQ{*E58pMfWggOrj9W!3%zN0;KszuBvL!!>AjREksT0`C1~R zu9%O+xwH!T?|2y zSe>=HK%u6dYb_yOQFc~ckxV$O@e-~FuYW2gSILhW?qGA=#r95D8>^-XnMh3s2Zzz} z>wpv=krH(~F5r14^}rv2wl8Jiq^HLN&^LJsCk3C$i=)dp)rpUVSiiYw9Up2!7(Pjj zS&Oxr*fAd|C}yxhkZ6f^j*qllRK2=aO)#|jXPNhM6Q>R^25_jmt|CbVVWa?wMf5%U zM9=+9U;Z`&i~Dq~Sci&7USTuxf)RwI-4!Hq_C$Rz&M9>cxU}w&$;k!mN3IL8%u-@z zH+VD2yN7O_q`b{yoS~S>%8!tbJ3l8>RUP;F{OPc$WQ{cZE28iR~VQWYUE)I~-|fGY5S+p=F0qgK0f;AuY%Z&m3OLn%5V zTV%Wpp^5+&-*~`6kW#;IC^~Jz7QykRnGF&rw!Y^grC)l0Q`5Man*b+5gQZB(wyIID 
zrfJWEzY37Nt5Tx7%U{X%3X*MQYQ0bmPXhv5zL^HDJc{N~ zc)ffkN+h}d+Tnq<*IKjKNeq?RUF9Y4dLkda#uIL?G@TTtCT=`= zANNw8=$B*1Dt)_~C2NbmMG01{=%*OIuVBio?ZDxRcW+$W^pL}hepkE{CQlgW44nYvTzZ|$Uw{qvPN(Uhi?^Nj}M3}DnA-{?6 z?p7*(DCl@G)t(Qr*Kw*tdJwn723Bd2BG!wCa=pI{D8-4O93&_438MF&%qX|Dk7j|4 z?hcB0h&G-(w1b00ecIGNr6J43kyir&V0D6MQ74e=R^UJXv}ak;9IdQ&sCs?gnvt2A z0!J#Z39EX~rdg}6#$7Snzlp3^K_Gkw<2=94qwxJ-m!JL6cH)r@`~QeP>F;GQeMxvp Uh5bi@keQ#<&^q?=$i?6O7x$C5djJ3c literal 98106 zcmeFa2V7HU`#+5JQ68(1wzeWO?b9kwP(hHHDk`;zs3?eGm?}dyG6DpMtqN6yv}Mau zkS!2Fh!8^5R46kdBVi<{$Vx&2goKsx-Y4L|+UM8)`o5p{{k;F?WA%^|&bjYve6R7n zPI~p|5p#tPRX&uHlT$eK&Hm$Zav$!Mllxctzdr!)T(~M}1OF`ZKW@HHuB26M6#Vcn z*S(hD6L)c|MW_|v=T*Mn*!j!JeKi98x9o?rE7auVq&bK7?>!Oh$Pr&9Z0p_IA0k9J ztE8;Bx6FFw^_@3A{{ErMN{77)Cqho56PjrMR#>A_;%Q&mbTY_^ScEBdWN<@ItV&w_ zh2LI&`&7ou=rzfE_kHrg&Q(ACcY`Qi(9DW3x~d&bv+l3eyOHb4)SeXb;*sUu02@<*Qb4Uj+o8dwPjfs=6SK6#=DHnVX(=gr? zt=QRebg24d#P}~s0n}6LDpEYpS+KTR8`HApotn;Ouzy59F^v%2KRjnYSYO}e6%z8p z@D@bvR}716&pz)|+kpcGUpnP&8)J~MWFx^KYzHL`&Er;Y@#rV5_M{6^BkxGQO6F%J zOSR)BFD6LNu9Ul|83)$%^{!!^uGSLjf0}R@=Qd-cs!%*X-IQhKrT5F^B}coC+~z>` zMXNY;LeJ?~lQA96?qPeio&HBcSvJm{O-G#`U=r%>ljlB-R?yvI`_x!-2K$Zu8A?gN zZfwFARp$oJQLQz~k#?oF>nb+;GYp#$E^>0gkIn;|@+7d=^*U5Wd}J5VwKaOE0MViy z?IJI@nR=@^ut;>%cHM6Jjnhg3vcWV@QE%G$iK&_1u=VJ%rgUWX?6CNPx1wVQ-pJhG zX#L5`!$$_zH$K+wbwCfc>vHB8!91sb2~z7S$Rq20r`3oC2EOi$ahRtK{`rmx{EoeX73S4YpkbhFjUb+ntZ z*NiyO;iDccX*_j1VxE<8cN+uM)~MudIe!7&xQvx>fLbw|wFyDbz0k*8yv5 z1B*)zz2}$hHTzI;p#DxOe7|CoJkSh{j%Rahu~Rj>r9LNlf%NF-S1U`Nm{J@E%zKgT zp01O3F-`UDf-lB6?xzRqNr`zzStJF_(XNpe@{uDqO(#|K#u^`YY>logaN2{OyOzJU z2gSSWAE1*o9Q^1wcP()sI(|7gt#?@R$r_rwUAFBy^}>j!{$gM$a)yvaTT|*|FQvfk zxO1OhNIrLTU5hnV?)16?0?nalQ`mOgXu^T0Lc%Sh&jntG{k=Eck(MrxdYmd$9ts|R z_K=xxr}x6fuG(Ha8nUN24q4myw7gLJ8TY$Q7Wy3pH~kIn=G7)@y_BwdaB{1Xb?LUA z+}d$Nf2IK}8`WQ+8C0*dSw7!(otHJdd0Zc}xVE}-5wy0I(AucSY$-?VLi2Z7ly19` zcjTh;u@l&rJz}(-p;^e`(rtzv_glJJxv&%1tU4>zREd2I&qRhcsDb9HjS(JtU8i<$a0a&l`Rgpt^b8wN;cMJs>XWeWr{%u*3v zVtM!6wd>mH!W*6uXsLDW>UG_d*bsg;YBy)@C$34TKu2Tnsh2JpMeUzF^v(JNIL`+0 
zI9Q8yCH?si1^PzoUFd~(ChYwkf8pz2IN|zRS-vfqyK+#TS`lKbL<;PnIyU7I%RS23 zt0Q)3M9s4Fzi(^49x%Q_&pf5bcHP*~%vzGR=n|pIbTx)LpXE>Th|{wadeb&`u5AtT z-|Erdjcl}$=gs7nG5xoPhP2)r5gac~b4t}IEh#$)$@;>Xrl$(Ptbq&{oOnGHm_$|8 zXIh#y?4WzJSo<832|E&ZT|c3Isxj+Q{N{}Lt6t*0LZ;OPc7l=o%Oat2taI6hf#GM- z9kd5bWtWIX>@L23ZR-_NQV6l#~F z;`DUJ-6lECqk0(a$}-1KH`?9nPJez3GkD05SW?oO)^2NKav~)M+AldMax1E+&Z~s_ zsJZce(@T45H~!(v5X#1c9Fp(hl>%d;V*uK(4#u2lOH-cg{Vhzs#<99Z3(-sSgWa!g z%<^JUK5=~P90MtK*JFQ}AT8ScL}_0)=J5w|540eYe>^a3^B_Bnb7~0KDZfKGd)qY@ zEQw-Xm)dwX)Vb|~K(jIqSK0KG`BbkGEw&u|nV^{bd=sm^ZL@{a78p0-z%jcDbcs5= zQ^}W^Ja@`VjELoUZO?Wn?Kb>AKpKuks&WH7x>D8Gw8swlpz@4eD54m(&Qc=xa~@;n z`FNMry`~aty()aUL)W6DZ!9o_)KgeR* ztYh4m9p&F-M(>>-pDYjQaouxm?jnce>QQ7E)qFlUeeTBjt6t{Ed=5)yYE5lYZGU@t z`AF-Ki|!UgaLHT_$%C>nuOQ-^+RLo1HRU1Inp-)Bsq&uZ(vj{0v&Jl>psu6bGPkFq zNzl+^@4BmD=+=5p4`(hbUG1Kz>zL0;*W($J1;&lQ2|~hv5$_18Ziu}eTh=15qlX6L zE|jn>85Ks^_(O5F=62O)P?S7*#J^5yEI6 zpWX_=^y*IdoKM7LZN5psMtCV~UHD9|sabZ)7h8~_eTITqxbU|538x2)Wt%PZ56}I2 zuBGAUzTH;o>nbxw=FTVzHJ*NS!96a8h)9GR)6SV!*6xGRe`{!Cd)CSs398n6IernR zKo7kaWk&MQH}XHy-Q1F~|7KYLrMWYbl&Ussfb8wYeBW|Rqkn63hR0-G1^a4jg-jxQ z6?Jqo*mE3D#|vvi;#f_4JWrZ548_9PBV!eA^_tow2aJCAOtGt1X(+|ec{H-PG-=i- z!D=uc3ZLLoS@gi#mCiZ9@Q#LG*6)vJwcnd=snY45d5B5^g0b2L^4E)5lCU5TYlE*- zN+86*Iy!*>Ru}aOF|G(BN*p4(R*#|X3F7)&)2w3T$8_%H&zt9Mb8s82A50G|owkW1 z+BiKRJ~AWW(p%V-%#U!H4q;GK@^G$iT? 
zHYFVm7yG228*)LjC`9%L$>ptD_0M=PU1rIQAfhQ34fAUdmcXVL29;faHdExf{{G zZt3&-aMgy>j@s#q>f1?WDrm7mth-V@;G}^)Bu!o>_v}zEC|r(qWFFBNpWxa!R|6NP zGO5Qk-^RF41jFS-)2HcSk;uf>qgX}Lo&557p>krOm3@l@`kv zntp&m=hfEfl}eX4wxq4bj7FC@V*Oi}>y?@A4_VG?p9r2FV+bzmtZvCAA$McTr%UX} z=h0h}a`+u_yl~H2NLNfvxbRDFf0IrWrlZ^F5CJa=u3O1VFRr~#Xiap@$5x}ys!(S{ z>ZPu5y`Ze_ljk8#Dx@;Es|LJTfgzAdYQFygN%4h6 zg%MSRhXs2v_vURqxL&Y_FNMT55yLYRl{>dBVBfaBBFUXFj~9%990d$5x3%WpO2KAk zGtB?RufL{6b;yVD)&;Q)^!rhY^xvwZUq)9kJACal@CvCy?@m!m39YZ%#KLNCaVp-p zY!=egecN@6!s)E?5U=a8_gnbXeR%XQ-l*A$>-sqd%Ew3=dZm2pjEPyJ+9?%o*JclG zX;kCW;Ty?hYmC+4O~~!^I$XW*>;hTs@QIE#j=xY?#6X3`F|4UkxeE7WE*pwZnQ@5+ zj*%u_rHU5u=0YgSwIJv2&gUD~mQ7jDOyV1tWe)I;W8M4h=0=74)wd|R(fQLw5m^q6 zg%%tyItE?HOh2HfjSq#p4&@ub3a0&ZlD!I-*9~#u_@8ymj^&vqqpXv>&FlgR#gGcw zy9wUZr`AV}UO*V^Te_|Auwx`qvFT$%h{SGv{7H1L;erm4FPN4$zIsqLWg~7c9v+Ez z4U@#er@g-N2q6Uw@j9A|ilE&FOH|`|JHe`YDC_%s!;L1K* zpm0)dLZ8`?sk6Q3(bWb`vV2;uPd!A-YIm_u>DN5yuW_Mo1^`Rzngi)`XCmA0qE$C04LTwT?f_RjycrpZ8_~b6_Kcl; z24MXbGrib+=*Qlg>1}F*_UP43I9D|SDYc`3mQvHsCH-a)q9^j+HMt9ruM!TZF za$7LaWqD=WdWs{^J=Ft)t^i;vC-H-c89!VF^-%A_vQOpYyrFDKR5ho=b;!J=ecWmf z06_zIy;TQDm|jJ{5EiTe_GHq}EBpjsbCh_Gx?@ZElnSCXBOgOy#@s>lBN}sh^LHi& zXq?cm+Z}P=37%HG*}FY0YFl{Vu>YbKL#-*JzxD12X-LXZeT6!qhrwD->!U)>@gYA1 zwedYmyKLH6gk326$KEvRK5_dIyJ{4#fU_ zrCJ~jN`P7vx>lqLdAF)Wmd-D*%^=Qil&MpHc2l|dEO$<)zgWSgXzti^et*w0m~uQ+ z7=$R!=+;F1mKBqhKR<#R&JT~=o05g@yT~YrZem6cKU@7Q_MSc`6sLmd;~nu!aT7NA zwwkfl>JBHr-sW5m+Lw}%f_$?@mi0$^=Zoe7wuN(+HR=E-@?x2=6oVJH_~>q{AWMM> zHdgy>z1a!!^EDQF>JuYlyQNhu9SvdV397NH_;eFHM+ckdlF_(ewx&F!v~oK_hM%^W z>21#EPaw{X+U2XWGeO9>lK`ORfq%T!TFltVGD$~;bt2dQLH!MRY>9&mX7YJI-oVDQ zs1?XqFR;H;@a-_Jx}I5{spVKwWA1^dB4+ZX>5(6M2ze1>o^>?`SOt~2Pb~d4j%VI9 zKqliFmZ=)(d>Y7l@yQW$W@Jhnhfz42>QGuGNZw|$qT)-(Gyd_ep!DeNKz6364TGp? 
zy1L#_X0ijjIjt{qIyPf}EKQfi4@aILt=K)p!(q;Tt$@PEoCp_RUR2~5V`j1(DA6o@ zBA~u<8zYaIJd5i&kvp>8Bvnq%cokI6W@Jm;%5-!#1_l+|V_z7%R?e04W;1uuaE+M~ zE15=>S$V10A6c+yBV3^|qT>qpSrUR*$)34K#eWli8fp5dwQ=p#qmoXgv4@xdd%=Xk zO!jQ(FLUm8h2FM0+O!lHK)}fg!nB$?{%1kj?8$xrn%Wt$H41{So(FUkdA;8HWkS3q zBhqrCK^J7nH-IIZkxr)851#^N40Id_Wd!|I+-CBQjqP37=K7Z)e=EBJS`5x4NAyRH zPrX=X9qOX8fXX_W1tB8joCsNpIRWyiAhImL6j2#n*(acmm`gsjqf0-fm^%fvg?2oJ z+CqhhBpyfX=_mU90{e!bzBXO-#d&dxizfD0`yx2~b+_Zj^JkwNh4;=K?JzHZO6MKy zJnu|YZRoU-^XLVEF|z5!%TRiC>oMWTh>+W(7Kk6^q~{@Bc^8EEF~49f&D z(NH=!7pSComKo`NXttZXJ*`H!axs$-To-_f?WqFAN;6K4{Xy#4;A?wW&$b$zbYDqxx==@)d31=^J&_75ZmNRM^VpD(1et~KQzMME_)oX*M6(KSu-Na`^{ z`CvWaQNcDS8mSL?20?N52C635Fk10EDNpcyp_MhLDvz~pl#?sU1L-0DfcWmv8Fu}X zIGhJMybCdVxw?F|fctWJORKdGcR;z*>f={2S6HAQ=%bUI{MF^MfL^XoL@dH!b6hsV zlyW#O&M{F--{4bXYtOk<%_2+FZiFX}yN}e3F*~-ypa)q>n0wJ*y)WFW*?2!6UN)W8 z=;{lwt}E$iL?uzgBc$BYfHw3ELOm<)V>u0JfRk z@=>ngnJ?KEo5-KA(dE8|LT@w36A0X}m^@FmO-nx~v?V1W*YR8E^+5VgMW!M>4=|< zVrCDL<;o?&=wDo2G_uBTkLjF4QX14!Gp((18(qD8^!}a6$vi-M=R2Ud_hpfy`)B-^7CLNIiteW$Bx^`LW%SEPB2D|T^TnP|9h*>uSc5Vk`| zSE7d+628kqR!PnU9n)Fg8R9~dA4C$iU<*y#Oer=xh-BtHfxoEwU-1Xpx-L99Wyk&P zsYB{n+CyGzpxxQ1xnFvTlQ$zuQedN;(Ckf!SWrc9JCfu%#5F8En{{3~SXI5VNRlL2 zNN^PL^@+@kVr#E09=1c7wU@N<2B#f43_s8iRjUEz{BQM@gZ4O9o~oz!bCIP=fKGe` zX7+Rs>j-t`Olz!Fz`UbPc**2I_7-bf7w-JftyE+DwwSyN7gnNEst)P6ss?PO5q%rp zRDv=R>K_UV1jCD(j{rmC^b|AxY_DCwGA;iiAe)OiP=L6&S^$VDg~0Nz?b{F;&!1DG z22lz1ICv7A=g(u3CX*Yp&2f!5SGT^44^M?8u`0=jC*abv)3=?^mu~a&O+f~J%Sbp{ zIln=B^hf|`or@nFNNR7hjFo=Qu?ROwIxrt!fj&$OT)uIgdatE(A~Oq(OF7-@5^ij6 z(GL$gkCpyQPHxYa0C42)@4?KEFm3Kt^mjCd8hIAeDu~RBJQ9l7H< zZQ=OT;jVMr8i@Xa8TD)NNTvjCTbrto*^;mR5nA)-;I{KO&Q2cLO-8{d#3<|ec}+RF z2i;I6`@nf~igV^qs7fZ|e!(5mEm3eLvG?Lt(gF9Rx!B>>J=9|n9@FP53uF6~)-zcu zCS-h~d>W6IM@66qml0Zf@kWfqB=gqtsyzA2jV3W ztH&c9oZ^tGW4Uvadzv(2G~MtaH&~x-EPZI?MMttHhl(O-Y;WuEiR&&Ixt*L{<10hNPg@S&|*kMU1>OE98S>) z$g926k=oJiP**%A{4T7m#n_T~xbk*tvE#1cPl}x+=2+}{NA2yrOo-Z)F~B8<(&hB6 zZ5VMupWO&e_bqx+5*nYG<+psDf%kUP!~yTlL4D5L4{JUPI$`5wB#27+s+olBO@G*M 
zFrx^C-l$4uKR&5?)|YUBH0Xb%T}v>kq4nF`V9oR=#kRHV>8N7X7()Enzd>@7Dcf$G z;kc*G)9NSj2YRok2VSrqO3Sh#+-v!;^Zv5v>v5hMQ96cTm*yCs&yAz!e&`F7vhy8h zr-;^0Y5pqXxjK5BBjHoRT3=QIF@?SQJn#I6k!Md)@gr#&lVVg+sE0Ny95CT$7B}X- zV<0^V_F{Wv;I*Qr?x&HYlshW=XMWD92lbpty33l*de{AQ91Mw+Nb$OdkB(Tp$tuqcJ6RS`XTv3{^2A01qwf>evt>b zU38h@GcT_*^*t9;>dZd=&e6+W`r|U6c0WR)=~1HMd3BMV=4Ql*SIhTj*Q$3S9j%z% z6$%4i5x)(|>#ZcQ`_e=8o3aL3NnhLDv$C5HR>0JpF?4Q=oH%M#5@mpMD6HKVVb<%;MSnCyFtc@f(F!1QW7QZji3FJ^CV%}DZt%0L`7_Yu7 zlYQmXCL6(Lj_by(yG&7(lHqO9TL$I5>Td-dP#{KF8QDCN2PpidOzG+@n|r%AXqbH5 z8R&m+_hW@4eff2G!}7WqHvH~2FW^*xQGPS$ho42JXAcD@x@L5jxl#5|yQ2M@qjX?R zZ39 zn(B^vI?KhTp-&h)cpurRUiUmN9r&m*<)-O;YO3L|9k*kglb$l{ToekI$uG~Hj?qpw z91Nm5ChFyDHYd3;C$E++acJ?-v3nGu`tV|G`>p~uviOrG1 zEdj$2UcEcKQS_FH>(rY4Z=Dd8!Xi;O~Hz&U+*8b(@9R~v&3itZ_u2uO1l`X~_*9Ok_>GJ24(dE|bw%Y=J za?xOEdo`{Tm9-}4j;GAt-ix+NtP(n8X+~hzch0LSt_I=l05k@0G1^8xIZX~`v7eFM z_%y_?=!i|+wXRx#hC6*+l+|hE_E->c3SlSq+#dJ}lOTVhj-~t^D%_FJ9m(1&ncWs?w?OislZ-AKChsMTPx=A)bbf^OQ3ZS!`XnZPQ8p z@zUFOzLLFaIVAlTgbMW?;vzNAqlDE#LDJ=-E&klhJ`?rO6mmf;0mTFmGoJaSn;yV^~JEY6mxsO-<4U2zr z-O0MBs5{B7`7uNt@8G!8?pn*|@w5KReJm z4`Zh9+H@H;%$tRKqDi{I8?_Q;-k4?hAfWMSc-@Z)sV(Ot zGR?`FQesO#d6KPz1UL5gi;pCQd=|s!_0+^E6K=8X4OQDhOXhsXyyyb9 zH@};m$u;Q^G(NAP9L&X3KzIgcxIOZBTr(ay3Id_NYS1J*z@1F5{6){*opfYn+f z5JwXYyGputBRo2(yH@#Z2fIIob&V|X6K@3>1gIar);!SCT^}`_eRozmo7U)~O z2(C3`FcWfmJMvs;8*Jfpk^=D#tv8HCB87eDa)eoA`grzYz!ARWDD}x^3t-(PP^VhX zKJn1SZe*oV-6)kg}Wy>tm@2{f<^cAz#AjZ@x_c8!Tj5hgGP{mgOF?P7jSd#OU5u5m)bhu;tK4jGj=W z-kF~Vx?WVrcsZbQ2fRc$sQd^l` zTt!>Jh;Z*GRG?4AUv4^Ex!Gfzafd*Q!{~UJM$!nBuQK+efY?32&MB|zdHJY!tLGO0 zghFJQ_6ocB9}NxS6BD+4p=ik|-0L6VFrIImy9>Kqc=~j7y-82cEb9h+a_f&c;>qlm z)!Cw(yG#XceLqhQTkqyHkXoZlQmlLCE~MJ|%M-b*jF44W8r774 zrD{_(FML(JU<*e7OQ&l{E4Ll|15l-NbE}GBK$sdeU(kCMwmuGd>gzqlsM1G&dnZPfeH) zwIUKir+NL_2)aw(&)llU|6F7aODUO^*LQ)e2YOkkJ;m4emYP?gXq&6Iv10U3`6xUc z!xbjLeb&458h&<%dAmwW-8nHKbWkeFG?7Z2+?26WCoyA}(!vGGw|me9V>DPO1sfG< z!u*U>u@<*C^nz-@Kz6l)YT|WOW37UiULN5`{=Y&y!6mG2u%AxUVFmpORW`#o$xB#N 
zcvu?@`y7>x;Sg%(q{nj?=BLk8IxAo{)uumJEg072)U?u*HH+u)<}v!HB*b)NfgW;Z z&!O6@YBr-d#Ju@fj4;(nu)(>yQ&}a5aEUA=;$Z8^GY!)aqC&4IyFNU-G%2GUAN85x zAXT+4wUC{9D{4dfquo7KWF_XsH5v|-c<;|QW~PXh|aTGOi&($N)ZVp=0o&e)+61dUQaue2tH9%mv6 zV^w>qU+MCBoPpu*;e;k?F6tGkk%dfMNTHGS+)c%@{p^VNSS~Tj4C~)JzYXR@tg%=)S#% zLX{xbzS}*g?$oPRh}ZdL_`ZFz2b>hOovB$q-C)LliOg1ZE z`Z}KHbc%W7^;gHQo72e6xMyZ?$XqUV=qQ$ov^N@I z9t4j|#YWE6ffK}Eb;uUr34vD=@`CQr7E%2( zCd^Mv>INtp6;1bDX8BxulWl2hA8Y^m1n(reM&2{(s#C8m73wf3GzOkLYF_JBM&s9$ z=R4$y%DS9lGfJy0F9m`3fq4X5jF^#1tibO2C(WF-p50)0Q8akLwP+s1L7ci}EeRXR zu2k_4PA{sDWSAVY>rh?iPG{dlxxQ9zT1KS88emuGI$jDhlrjYz$jKv|p1rYdovEFd zCq1qq>WYfc(2bt*%B~(f<4d1gshX$!sI2RG#&=i&!)ZbK6f!#tVVI6z$C1D4yRPpI zB6SlV2{*Lkmn#^xzf#&9Xyup;p;^?5h?gJ-R(L9T~eV3anTkbTUQbh6S8LywuF#%)u5IjDGae`^bSc|YR1(Z7ebld|Bj zYZT_?BPM6#U~IS&@LUx8Sj62t2!J4ICXqHrpe{=6HM%a^!cIFAaR zEZN!S|0q)Y^1Z*RBK&Nrv=%S_FWHQIJHGxFN*lCR{yF!ylK&r+OGR0vI4@IbKd@=-!#HzF z?RP3cZ@Jimhq@d;H@XEFUzc`gitW%$VtM}_2^_(Xb3PdS$c=7$n4+l5IZSDVYA)c| z$&n_C3YdaUT@H5oE-WxlvI5In?KpL9SY6L?hT=qT8Fv!L`E1vDwD#b8?_~@+3X{ar zq}W|e#RFOWOyWB+ZCT{sP&` zji2jsw&tSbA{grgXahL@xV0ec%b6@&wNZ;sFoByrYtgkIX0$vxu3<23LEX`d%qR$~3X3 zFn8;6cIa}9bv;3_e8+n)YF>_1?>j(ow9FoZz(yhFcQUm4_o8&3;BWo>yG^aSwIFbL z6+PXj_|oJ^^|v6{_=`med`*cwxb)T+iTGUV)@2tv7eNLP%_{Hghx=$wMVJ{O@!y`^ z{DE9r!osD&^`Ct6#-4)Aff4x>pg+U?n6r5~xm+y(&P12zN_mFAxkM

&o^;zdZ7 zXml7s+ts*@&^6V%2Eei`MR>2B$|O>?Nh?FM9xJwz+DvoXc1gO)5e_Lug!k&}HSGE- zbxnojM7A98n=t+&Vt^2lgJB0t#M;4aTQks4W*%{$BgSElM|0VKevei^-a=ues$}oU z#9jV1?4iR5l^c8RL0*pmy= za8jwXXP1Ky03CO_W@T=)s~f$4KC(33{xh9=j6{sAj`E;GV~BrRI>TQk0F@7WkliP7 zq?7?MS;A*fc5@ueD30!yO1Q|DhktFsP{cC5yW7E7MGu($e=&%kEG0J_fmYbNyZa9e zTrfQj6%zBmb_Gnh(E&6pF%ZM^l!&R3Hj4~4lY|*`Jw`Yk@}SteaI)M51~*k33KPxv z*#&HPD`J*4IJJYQ>P*@=Pjs_yHQxduN+$h=mWDzqKvZAzx9~X6gKnnDWPyYcTL_4?j@eQ_q+KU3-n(x7Dz0&l!+6N6#5vq0`3yjnc^ z0z5;P9I1fb{|4a#vLa0aUbJ44hc#vmGZV#NS|TAQ!SGDd0n$Vzr{y}SC?p9q8ghFQ zDB5YS17SB#)CA`Q;r&EntN`5<;Gw%p2==w&;_4+%(lqD=ZYa|Lr@ zt8jfXfth7eH(KqBTWDy6=$Xyzp>)lb{_ticPw?cyPA9`|_UyofCbPE%-xVpA$4+4w z%)qn1j1A+scl(9ePL8?DpB?`TW>w9Pg!G_L5X?@0lJ0kYaK$xQzGuC{M2gMwb; z*Z-iJR#joJkzh#s6Zmy~ciLD;%^^eGp%&HqQNuTk&v{TZ;7CbW`fJ;h#mkj8XD4Y- zmSrElcRViL;av~Hlv2Yb#A_bz(ohQmV{2Am!O12&?B?Kw1DLWC?~cLMR#*{7dR&i<)va;Rp0SiiY*n9{!9y1uh?lH(?6ZD~C_NE>vZSiNc+_ zm!-Er%HWo>(TMmOaTmQSXndl7R^ZKZ^p&o56v@hkscYICml=t?laq@*?S1AS&>y89 zv2})yuUF01$6C$gwxsCKPu)M~`JB=mhp8xye{E;7TIzxOVTp?2nfR+V?q%h_0}**M ziiAtUwwx;+#!X9D)Id;2BSHH1RXe_F6NoHt0SlUmufNvUEg%wcop_@Gg}#crr*Qi2 zPV+t=!~fErp}+>A>3eQq6lj~FJ9_sdtOxWL>`mA69&k5iZZjxFcnG_x;zQGtPA0x7 zfUuG01{E27kq0sHkq$030CQ!D7IsQAQ(5R;M$<8$rk$G}_lf9CiXZj13=&>Ew$YG4 z6-~g`813nvNIC0CcdkY**xRt4Zcnx@XV*ZrNa7kn=p_mx9YCTB9-4K1n){w+crM-JmZb@PTUbHmX-4Ip|3Xq9LOmBvDgU>6Uihq&FV2VKkU zIwkBEvpKl%`wQy>r)I)lW>bSOXh;Gahu@-$E^FPl$aABoA1Y!xi%dC*k)$SH0-7{= zuBgrx5yKr-F`2)W`fmmuPmD?eLSeQgg3w%L!oQXJ0fUKmn!KjHMTdEV!-G<10?ZU6 z>-_19e^Y`!y(q#Z4Z+tE6iQsZKMcT7Bty0l=TrO~n9~nsHzE(7#17RKiJ{3qZn1w2V3E!^e6Yq64coWmx?HW`@uVCOdO)7RthL=&a2B}yO-=YCDl z=kVsPmTs0wgc5UT61s3xKMG2RWNFLNsP97ZU+!rW1csvR&oi zn{p#se}=m!4 zy8|9L7iSU~Mz860gXs^#)Z!;2)@X@IpH?+lpaj3dh{836qP4;%MVSlJe3>v&Lshppf zmOLhLKX6l)j$=a(q5C_!-pJ3XWEo+#OUVLeU;GyEkRsQE;~#(I1R7$+_PUE zhn?xb#a97zJP#EvAop1a9iXTH{inf~+tEWa4}5BhL54)~aHFH#=pnK{;J|7Z=oi=A z{{bo%UKd2!{(79d8_-}*<5U5Gc`beQ^qHCMFH&LC|7C@qpa=r#pI+4#msps1h*%v} zdLJd_!XBSELM%O&L7B}-?JiXT@K(8 
zy&{SQGrK#}9lC?pbjB%~Y~x(UcFCAQDaf3YdS_ArlgU{*6;uhN&qCr5O%kVS^X2KkYUaFZPRcUUA<*<34e?vCH2oO{A=PM?WZeYBy&iZsxdg%i z<=)g^kcHb7h53fXjv?T7Ug5)EyvkecKbg9q9^i$Y+ti)cX{ADG?O)pB+il4O!~w-+ z3XpYNx(P@iTP=U5qs$B?TYPTxqVg77UvFB-ue9~Y3wd3B7RQ47Y{mGg)0Sv;NtgAN zcA(;yxj-=j5R%;lAdyVJ^JGc84-cv^t)NCoHQ@8Cpzxn9D<5QD@w=AdD=-8VXM5^< z9Z4Q8?jZAo+!fR0XOByRnW%vSr8AGbBa@`!VGxWDP)tdkHTOX~@NI*9-_p??cughB zf~)?KbzcHl(|?Jq2UUj*@?<$By zs#h;{lD}p_3trcbY>8%r)+^{T$W-}mTXEr2B-80xs$T4_HWt)W{{xf!D;MNMNgp>6 z)jib^vt*HPinoKT)kA7%)c}!ocq#VDyi+(@JoGqFN!HPknasOA>;>!{pT|xSK_Nc? z8OlRIEcc7
  • mWm7rK@-)V&m@A;f2JmLFb-RPR8gOrlmkf5vnc{pgQCOt$4b!VM)FXp(}u^!>i{P#uh_ zy%r&whSyw3Z!MvI&L4T#f~mQK)DpEM7kL-lUMofDm^m+U1$_2M*ZX1M&l~8guux8( z$4VvM@~ZV%DR_RoUGRP=|8r7UG!FoKDUs@;R=CK9fMx{0%Er=5#jIUt|Gp(GvP%!2 z;H6zS=^&uz;MB0Gu|>QXzJ-dqg#v%SJ^(-dIS^TbsBeeOziv$n;{E3uOQY2Px8?D0 zSTgmd<@UCppPHuTa>W)D=C=!6t}LL){sP{;$OwX?*Gw4HJclBqA$C49%mcj)ltvOV zx%M!j4^4QJ;bp!kmsTbk5#h`5Q8JiElzK zB!gAps&7unP!0_D2&c&r7fv?Ac{4fK_LiF{A9a<5Dgyld4*UHE#+O`a@#?a(GJKaI z!s1=u=580EO}1p$*X08^diz8s6JX?9Z@oh^zuCasw|%u92$?(8;85K52b{#4kNo9j z@B~!|F+>?KPPc$S@MgI(IxSFohr=X;xhKdD{?IIzoZ$FKVf|fWX9i|mrE`Ix7v>J- zQGdWP{(+q>x!^OJ>8u5VNqWW$e;DmFp#)O}Lr24bI-)!T82dU4T0}S~IbIalQlft$ zioab({az_xgdmADF@A8q)k%V=218zeUs@Plc<%s%%$)zbO}}$?>2!hOYnNKG%eUQ3 z_K`PdmyQ_x0SLd!nPq6S6q1$>Ap9ATFHUzXE>%W>zf%Xiv&A>PgZGK&9|8Qne0WF4 zK->RVz$-@bl_>skf!Bff|9uPycp535nHWg&3}zMNHR8}jXvlTG3wm^(dcy?F>?W7@ z)|8EWyOcfpeLQ{>s}_&m#%|C@);9|9vPEbt=jVDqDtqd0`Ry$9&C9UL5J0lyPfzc6 zZhJHO`M<%!^n4U<+5E#x^3KP2fO)1&9_jB0(7`|i#?WLj z>c|rGnCx} zG<7+EDX%Xy5&7*{)eAjO)s0x%P&)kAC=OZfVDR|(bIJta3R=(qfrxkeB>UbI-wzY#7tdRz2MPAtuhfH@tbXk2%$t`sPR!n^QB6bIVcvv)Oo zI%b`_oGs`ylmZJ5B$3)DyME#C`~cC_Fyt>I^) zAs=`VN-B_XakvPUt&gL6ivsa54cYQoXMynsiE#Ze4KdrKuN^bAz&Ae@6lIR6j_rgt z56Hh^+xdf->yhFY;3~`%?BD>IzCbz(KoPMJYz#Co9mS0Sd%gvrv8{r|fAl#7+N1WhZE+PJjS0c`0rrqol1IUVZgRi~w0iuh2#3QzE-O+(Z>?Q)HdJLh* zNMAHHK+yWgxbs-}{8r~WBsApb$j?nm0GzP8)dI7vj*y%LWl?ITq+oHCjpE;V0Wy0u*Zy1H{!Xy&IQ!bov=sdP?OE>V$D1MPiyvQ*H26kOPdk7W9lqk-(4 zeP-%poI90TJ0KG>29_b4T;UH9qyYrzGlMY4Q_w8KjHtQSTUbIuNuQ}Yop>{|py^H@ z7^4Z4j?tX1G77Zkvh_K4Z^Ecg7alDNFrSWYF`aszDpj(A=x)GyEzpk`Aw_>-`r-Bb z2Q_SDIfGfD|CS5t=}Mqciq{+?JUHlnljz?N6aA{!56ok9f$x6pq$J)Yefi;64k(*7 zVa`ANQkUa{X-Rr(93l_lTYd>_(Wmxp{d@geWPYCRT=%MbuBw7vSh;NO$NX$VyDCB_*(^v%FNWFk z#ur78Fy2Bgv|u{2WkPx3hKsrsAC}qTypvP_*5t00aUtx6s<6pXssLP zVp@=VVR9OUC)Q96B3!V$M_J!gzu(0BDR{E$zVaZI23Yq+_CAG2Q4E6z4ZVlh|DH!_ zU2^ITR}X#*It9i7bie@AjH#zv02sZi?XHHtCm?a;?oQc5V4tW6(*jpg+|TY^pz!-z zq3XDNqbOoyNVIbpslfRocW!8a5D+v)|;f=pl zU4aKhGu8f1ru!ZSYmv8jf4lN`RnglI;ryB2+`HN7lBXr!f6pZr_^x;UAqn*di;;&i 
z=QsKC(nkzryL)Gg;O=ib@H=P!dUNt00YKt^UA_FJCi(m9<1hEe|FM97T;LxJ_y+|4 zi30zG!Nt_+-ACj8$pc<9&`Xl9e-feBli0tjj{eC6mpqXF2kG}4$a!`2PbT;WvwH7K zcHikN{Qs^P0=*pZNLzMNvk-71llb9kh>{$z(f69QE9V}HGJqo@?AJN0S~@NM*CrR* zoOg5NVH)vK3>Cmg%D%EIgFP`&55CcHftFO(_pvo4pZhyib$0ZO!;1m0t~g>5`=SHr zun6>Zge5}`5jCh}cUwg=_$H{OW#E#06nMsKr+9Vr%3B~_jjzv{I5KkR8x5a4rb=>@ zu&;oZsR7@`q?#P54T(bM0d8`N6%jr2^^$q;g(;D@M-twU@vn|vp9S4JNo2TGBL3nl ztBBO;^~n(PS(aZ`5%$NA&HgX!b)*?Qoy%pFje!4SDvx#%T_<}C7JLDlds)vJ=sUsY z{~l%A+6Nj!#YE|*=+FVnr?+)sPvG_jz6q>AU|T%=7pxiT!q5 zzx7>^+zt^M12!B`d2)Q%V17W-BBElCL~f8i$gN078r+}sHEBq+>&@#eBn>b1A); z(m@6SClJ>|8VqLcs2P~LfL2%M5|Wu2D@)n)$`YuGEY`0O6$}acCU+(ockypR{22(r zGxN31J#@SXXnkgAY7iZnybl8N%!q_u5R7V}YXAgcfr7Q5y*8VQfjI?qLp(+q%r3OV zDIe@jdYH@pZ3OP1YnYhk@>OR>mA*~5H))uTxYsMD$`*p3rqd1lzDK8}N*EmVm(8We z2~E^YCu6$sS#&PUDm-2xJ}|8oZ%{$tgey$Re3nU3cGf>^hT*`SUM5ghwJkDk6z3I$ zGqN>@Sq~jAK%EDaSV|!trn|a;Qqt8vD8o?%&Vg8+T(~E=9Wb!9!AT3atuQtM@DBC8 z^>m6FAL-*Oi?bvfn0-3br&6C z14pc^5cimX$aJ)!-pm=jKG+PSg#Ku`8v+9N^&+%iZ6VGNLj2m)YJw1c#+#uJfxxFk z?sL>!z)YZ%#0VrgsA0G+14be*k6MAMvC#s=$^=n$ICzji+HRbRh6kbf;0c}76J{K< z2A`_oy73e(1_03D8N<^<5RjF~<(BurK$a~_z7EuVsJ!^|Xa6ocm_hS4+?C0XW;p|O z;fmln*c~m#Y1Ff?Vh9J>eCb*zmYOODNy{J<-}4E~g$_o?)#VT|y7n{IK2&PU3N4GXy9x5HA4G za@+ih%#r}M)C?L6i9gWDG8-Wb2SZ$JJTw|?-A(KJ$0VgOfbY!Dr9M6G3eRzzK&QIk zCbeK&V>m8%Vg?7lN~&C`6UeRw$X;)~ATRnc>2PdlJI4tGo;*?hr%L=^z>0uMY(gB( zup-1V29#ll9gN_VV5K%qRkM?O!g$&p-(l-=*P z`*d8pB@Jk-%I8ob24j+s8HJ-d7*YrwaCjwnpVR76tSdDmZI^XjURbE(Rn&a9A^+)BY-k`?+8++9#Ree3Ab{ z68m;+xj*SZ(ZH@_bpaVme!w)$vRCcy3^+&gHtm}5-Js#MnO&xBQbhCUXeW za1i@&Y!k#&{mecZcO3uNf>l=0x3ndtT17dD6sj;m8MVJdNRfyYf9` z%;Tg#C%^ueuQQ*?IH==bnaT9`dqTzqr0)peP(XIx=UytGd*z(C0-Cr28o3^Q?({x! 
z2R3mBHW3hDA?-SG3kqKhjJ_Rs{<3TIrxO7Iqt$80aIH2G6EmW^JEmjoL{yZe@ao8R zKLG}sS*d?_Q8}>lR#iT)4oif}zcHpw^Y9mNx zm9VnRX2H=MAPfJOA*-XD)^jhn51##MLkeL!0=8rBRi9aAKrp0#VM!v${z<0mHqHd%ghHsY|nH6TgGY z49KYYC#rTaHQD9(xTfPR8&Bbv%&NYhHmemh`8KKCr8~Psb0st5%vJZ9Gdl5Atnp!| z5OYxVOfyIch55{bI?l;~^tJ+|77-HU3s_b)~?B1}>$N zT$0B*DkR-SVF!2FF&xcdw(9ixvXcq#2i7|pR-FE!M{bvvlvIix7o-e^<=m{Kw~0f6 zVG7E#8sT)-zEHl%)mA926aDAfA=_n`-g?$oV8`3&jXGY6c`=qrQqOT$KSle7Q^ z1gx9Um_aI`4ijK>(~oi)QNZyqQbd%)54@QVg;O;l?SxNWXrAEgY*8yVKtHf3L!;oQ|s?J1EBwB@r@>@6- z8pIju2QYf&O1xuta+Y~EtmsUbd6!0MKWaf3w8|)f|9p|5`_ITYlK{OGQlO<5&!r^8KS%JcUKyp29xGyb= zpcp+%A+=VjWIa32qQ+QZjlj(v2(n!3>fLB!x_SypTYdp!Ptfsk^I>M~dHER+=2c(; zvKW&>?~t7FVI2UJ1~9{fRFz5Ax7Nwnz0{u}AKF=a6Oh&ro!}h$&>0w9!o>_CUDQ8e zEJBbyT^YiM%-|UcA@X5`_+eQ%tPm{Vk^Lb}T00(>sX$OnlAebk$AVRJ=$$mZo?)cH zTvcIzA`E`M5Sd?p&NSR=rtj4x02m6FpvnvGW}kuZLZ^MIRw_wBjTp^AcG46L3p0;U z6Dh^bK`1~&9+sYl6%`TH-NXs45BBBbW{2|7fki7?sD~P|-1mPGHG8r7(@rG~*Q{ zys8%vo*AcR@uKAAJzWSStuRmbEfbH?AmHgu^DP!e!sJpdc8~}0iOY)}nOYdi9b_9M z08a(WKS`ip=7GSAHYQFgmnrF#0?lEPnUm@#Bt~`v%N?dNiWkYsS87MG8H0rp(}&>D z2QyJwHwYq17*`yr=|jVWM*W6+MYDl2@t(O(2L7zxM>#1wQ7{YfcW@y@YeWc&7J9|M zV?3H@jid0#a}U$O-tvG3DfYYDtBU`%T)I*+l~#YZsj%WIRv6_S-8+6RBP{(8;Ydju zyi_yWTl~VQjBnYHuHGwSg~di50#wGnMNc@d1LcRXyECxj?rQ;2 zg0$Zf=+WJmy#!3A(p?f!+ZV(Se3gDxgpwkvya5NPo1$%bxda<13l)-!%Jl#L&3~pj zH_r#D6z_4;Yu?d6y&2!SKKYT?w(jhrZ^XB}XNJoJaYOv;6-_M9QOWbhDl;OCX}L02 z4yb!T86s!|f2;^@e)4ODco9|LP(DuHBuJ3AOn_(3|5>yo5dZpLiSYin0ez)3@0njk zWE-YQv)xIG99g}iO$SN#EhjWyQaYJ5KE;wK4(fXdgUlB3MrE$}I|kF|7qD=~Y+4t4>8!tz^ zF*ESpkE|5m2|K$!p82nH3)Wu`mhvbWnX0Mflm0#T*Hb&Pw{2L|wSUR3$P>k{->wOm zv#ex6{(;ZSexvw*o%Jn4ko8u#-K3Uifn9r2wE;yO$4yrDsRi z<;K%mpg&pJl$Du!slwc~<%Eo@mKEA!F zs6Q-ybfItwra~Q-htB*p01p%+pKqVJbkGLNLPs|%;3pb0;P>q_mp;+yiJ6`-?};@% z;m9VuH`v(Bo65wN^+%_GiG3P5-k6UM6Ni&OI`92Idm?kV9Kco%iruLdx_SC_9(a$Mv|&Sp9d0%g4p zdpn|LpsT!TApX>*CJ;gn9iJ_auIAJBU$3)+4`}3;&2jF#T+=pRO$nlN`6+I)jwW&S z7XMQm*~;fk^tltR*~1O*c;?eQ=ND~LE-%=B=4swBa&)b7J|@E}wdiV!FT=0)jC-vx 
zJ)=1E2|(|T;pY3cCD`GqK!I1cs?;uxJaAv{0VpT=njtq^a?Y&1@2QzGi^xBPgl)W z3}h4O$~E030W$B7TQnw$Kr9D|i=Th%`Lj`Z{#9zQAb>%C-4U~r&3&7}!4`RuH3tQG zK;v?31yU|I_`S=xw5V$Ekw*R~i&y<~$S&^?W?qqhM@9BUWw$D4LB_|G4hJ#zs;{@P zEPymt)%Gf4%A!zzVtIQA2kX32n8_6@h@C5(-9Z*x=ZVabrBCA>uNW8zI=|tnov_#cJx}~u{}_K|TR4N=$Ew*d>b{Nj!VD=VX52PuQX6!UeLRr*7Y6$TE|EtnkoAkQz=sDRa2- z?rc?1ts(GxW=iW_8&Y04Jv(8jt1gq|mnfkE%9Zh2Re{Kcl-m1P;Q1t~VrkVieUH#s zQ&8%+k6(2xQ`JCfxmJB3l&T<(Zb2Ns+JC z3#GGrKqi2oyVFTWnK!r3TXmhSldw;3^BDeWX<%vxpViMR`-HRrR4achh-G-5@=fe< zWtD~ux8xiA;!}qcd$`7LQ}$KZtCqH0Vcx+K7Us`sqU$?SGV5E`a41ImQ?oGA;kL9d zDBI}C&lMWJ_i(Xjj~u6K)n#5Q>ds2K7(ngwOPPh~$sXJxs$6gA0bM1lxrF1?O}7F>$@zn`#ca#qCKG$koQG9?IREfgVcs zzvK|gm+AV_z^HhgP8PF`@|=f)PdJ8J(0^N?e_7-zGnFCVWhkb|Hk3_SDafUEqbY++ zMTe~-`*po>ap55U6L+bI6B}$uM?VRABBwDo654K_^?&&*7ibExl zHYub^;_+YYuqVn;lDMVp#-yeT0j>=qx0Tw@k1dy0RU27PrL_`Fs&|<#UnILVAezF| zs9QXr?&4%*ur?CDl{xgl^9OP>`G4<-=i~JR<6et7%xCnzH?WIi5YW{XRR=RwGf45* zHXaOR#D&|Pv$9tW>ZQCzBeO&9j4XTETL4r>gV~f+`E7w|+B>*NDc=*w9E+*X#7wJ= z;ZT~&*bx?0Ui691vJd5>hkBtIEG**N9}7AksC^t9{YGqVfbjvzP-#)dQ93bTM@tOH z&`Q^KS{NyYRqodU8tx-&!PN@BK`e4QUGVlZIZZ!Trau*QcMD0vY>45ymPY~u#>VV} zdz4fzv#C1hOi^Dn<`oGfiCQQp>?4;ow(e4@o<+UBk?>95P%}MVaShY+`T%IXRx2!q z%?C$~KuVxI-2_T-tgci%v?II{TMmqIvhaF!Oxw*l^h|p5x@R|O`amb07=Fl!#~^>> z^9xuLNz`YaU;#j8=OY#^YtE&%t;J*a#BG;VI}rnlO2M~k>BiciQecSf>XUZSP|@hv ze*datwvu*rN0PCv0D34HBgliyZ4Mo$;fF_wY_ypuT}#z5=iW*u-kYVp$z9tw18u4BC$4jXfs=R`my)iybU?a@USL0T>~3N=YD*J6(_ zSzbl;xjgz(_J?)GTHC?fNo7&&l~QY5vwnIeZ092;WxV497WU^H>>jT|XefhD1(KB3 zRbO|KvRG}~O-?|tYvFWI5$eKouCAGQhFS5usg{^3TB4g-(@73 z{7qb2M@tqO_L5T_>DtKs*r9>&s3Jh+j!KmWq2aekBUztqYHex6xLhhk8r~Q)frWvd8n$@^iB7d%WbkHfLnI8XolyP4$ox z47#+GSxya}3I_SWU^g(@^=M194tu%^;%*+Fm< zQ0)F&E`36H&LS{|;!_Wz=o>4(olUCfI}%9gUHVYz-;g`ko#7%q=kOWNEB1QU=5U=z zwewnl(>qa4>;Oi^fb^myauKO z?=Jn}VB^V~v9um$``C|@wxO%G)$u)PK64b2DTj0LJtu)*ZjWtI-fsejh})%loswA; zMeKW9A$c1dA~4_pXWRC*ON^{L&hQ^e)d6L>yf;yqJ=R0#??&U@1FlmUOzefUn|1sZcOF&aLmjPao7?vS-l>cCl$s4l7cnD>@bFL?o|oEqg@ 
z@i-UXSb9sr^J7%VZ*90%wx8SSBhl5h*pfhcSpX~F9iUfsfq=}QwVd8_0TmB&NqhNA zmP?TjX|$HLoWyehgBE|DYEzal^5?+4eDCGbkkrKb<*LNiw<~nbLFXmg7XTl7@b0dy zlmoF;N#&~gZyURg-E=A{{pKvIEqrcoc>p&-bSvv3*UR_&UKVsjfmT!n9pRAthc?V3I4-tsKzvJx;oiE(IYS*~ND zHJYw__u+FG*Z=gr@VO=WcJZ+Ak6;%BKbBj&marO=JY~rGjQP$+ zs|0DOUc?e$bmS%VqwvgKt5jzRrxzwMR41O=sYGe|Vnxo(Oh$3jt2QLgiy6>VQlsl74k;QeMuUi!`@ySNPp%6TWn$CDK{L(+Ny z$!7=SWNhAGcB2iYIb*@yF0jw6;RqIw^~%GEfAH$1YlNwc6`f+2>HgpJ7Wzc6_A~s| zakKpdBs>gy|Jh~1=mkFkDolV1 zrg)0cd(G%O)d^4m@wlTG4_IQBH-n>H(?poZNbC(TE$C=D`YwAoXvYjOOa$~G2ba-y zG!e!0+!=LZ%ldD(EUA^K1iYe4+`IwIsEDL{5trgq=cN`kdr0sr$k#$Ympm+PGXTzy z&6uwU;+H(i1=^@y_*B+uP;MP#m64Vlv}K1+L5{q4@7P1_qzPd*P2}&Rb|x?>5&%O~ zU7DLpJzg1Nmjzi8%;KF9II;@RTCecw|t`S&>VU(94s}Nr;F)Rp6}r`E4GcN z-JB*rY2n?acTRf$O7qsgtiUhzb@%^9|Bjras~yK7Roe$H$+NF*j0m#50;D|E*Tt#^nyG}XqGqGyN{os`MyQ7MRe(IY{ z{b}gq@w|r!z@@p{srfn5#WrOGZ1gHoD2F92fnE<(s6cWXxX-Bl9{p+D`4`e75sevJ z!+jG+FC(-lG$lfD$iSKNJaPt_`3_0{ufEDdY=t`WY}AnLoAehlisZGwc*v;N0(lWcvWrpAEC|ieOza}sSqsI}3DmB~KaD@&H;$Ht{0r*_Q_DjWiB6n%jld9ROL%gprE=Ps=Zd+w! 
zLQPYG?j8BzaRj!a$^m2-vT)1JM(=%mQXOj#2TP9$;9%Khgv_3OVxm_4b@Tie(t)ZE z`!o1G-8Q5aMLBaHfSH1voGCwpxu26AK=*DVc^k`>ibfxgA)nV$d~vyxoiq_+z8M?t zl<)1X{BMe~mP;@5^5Ht$brpvh0a~}f-fYj!&~@!X6q@XPM(nS;ns3~=>rur){ia&F z_aOcEho(ORrbize%N(pwubTXjYr zoKpWx`Nd{nr43biT(n%erFI94TSx|QMJ=x)w(g4OH-}I_OfRM5r*DF3YqAqC(+(o+!}Kd^I+52;SDjU&*`PpOIKO?#5dbK|aD zIXk-ia+X3{tzdUX@g=9a3eOgW{ygpl5q&aoWmijsb+}LPbn%WaxqfTKf&u_La#hOD zWO0OPyj)%&)3JCjKcpv-MDD&jR~-I`$Qmue(Dt$g!tRk|olZy>qN0!uWNyic|O6C6!CKaR%q` z6Js5MfCt}$rDot7*2eqc`N_^W?HfJ$Q3j>ZE>yqe_L%SgX{)Wi<%cAL*s0>9G;iM` zZo{{ffc5C}MmMRdmoTDc%B!yL)c&oF)iLG&Q-0(=$5fwlK>GE-Oygu?f3-?Q>M zd#F3-p{XUNNE5U%3q)k#`AFme61JjB!DVbCSlejzcz}Lh8x9XhW=NOz>;iDSm7o~J zH4L|>f+|l?(C-RiX3VF#Pn%)}%@zd1=!(LN-3>-_M za_WV~+|XqlAR{P$HkxmaCz(SR>h%8qhQ3aR;|nr$5sS`Q_dWsvLbFc}{(Hc|O2frE7-=|VU(qJcV?1fmc?Obrqj zqV0xbA7uJeD$ub81F=;3yY#zX0!RvV=k1EzcAXt#l{_i7&aQ736~O zi4>g16|&4?bcI;t+5-o{@MPLxfTd4K;_6qfaMC```3Ru?MQr17WNwsp1UN$FnU$ab ziT80LN5G*So~XVsc)sOCO{M=2jv%^4b(0F>Z`y9^BJ@w3;ZP3aPy#Cu2K|YYC=`kV zfO^yb2x&=B*6L2!!7(v1om#dNF*O`9051B1v#<~eKQRosZiWMA)a{F4#9ywynA8Ks zX;Hg5LJ-=}W?>(5lDG-oT?^?V`Af4^npq+Ry+LzfBAVm>_h=4~#2aK?ZT|NvaHKpd zie&S^8&G*@>21yYwvh8!d$n%=(gan$UoUz%)B+05__G)cC~CtTobjjO9GqW(tV`rf zR^L}$V8d1?(#R*tdlcakS+1cu-^ZVku8j*}7mfAIWE0&v!BvZM{`Ko%jk3fLBdC`F4BeOV5wPMnG4f4SKGTunS=s( zCxgOJM-H?9Q$U`!nqcs4JGWLD8{f^$x)|(JGE23}C_sg91ZhOUV?2 zxr=grSrz-GGDv&XQ`-1o9e?^b3VaCNdjTuIrxrA8F`fPTGi5dTHP@qFQ2~+kv4QK0 zpPjbRCSIk#XOK4L2h~j;YiLC1%z;VvDhWtFJ;pHh38pHv6#mlKMu);oKo}Dc2K;{b zd?eLqPcQ*t{I?Ls1OPn&K*PJ(@2(<8A66z%_X*T}BGh11Sc>_8KXEvjIGjuzPKFQk z|G%8btit#JHm68P1p)o^|p)(KuLL)ccc@T0OMV&PeyHa`Rd-W~n9E@+E zIyO~#$Y}>E>@2&i1oOt2dB^rs90n+SF%)7Gxen!2!n@(slOMV&@OWLY^2LHgwGhLcVn8(lNN=Znl{BiV zE)yeOWIy)Xjf2$ywGD-X@vnx_3oktbaROC43dITe%v?P%dO(o(Z5l`M1vKWi?n*)W zZ_Z-0v#z`_fZ^mwVPP144Kq;OazeK1ld+;!uk)zKt6cI~zMPD+n?zZK z+ZlG>;>QQZtqcwM@8!xN^0sNDOoUu3YCj2}mS%j8VoU;^8;#GYI*g^-x?+Hsr6YKZ z8bESO0VD?lc!1D7XeM-*2br)KaS|nv5O>#JMb%{>XbjG{Kjq!^_9~;CX`)s(QTq<< 
zivqw`rjaZ21DsuFpr38Xs_UL&uVU*lXb4dPgk0f}tn43=0J}uP^iFTJXvZ46=8G2q z&8aah0HfNW1bKO?=RD;0mK%OdYNALtenWZ%WxkYY6u)MmPP);5a~aE>x6zWQolXdmA2tbQe+djc2?z6WMHT>QDi@|P8w_;4CDAqoT?Ut8 zB~Su7G{pOc(N{WZVBB%ienO}D*@^U_XZOei;$zs2mKIOa)-#9-B$~7iCZrsP+C6= z0u_fkQVdnY#xb;^>{V}&YgMT=Gy=Q1DT<^{tF1s4!r)|&j4DFN_xd|X37YRAmT=5s zXYP7jGYo?!+Qv1(g#Ne;$^nGdP41(E@r+qa-N{xRMqosPnSqN-PvY+97!KPqiGbY8 zTt;m_)B;VuC)MC-(h~5&^5LSNRT360LtRZ4r=Sf-VGF^ah*hAKRv@=>S6~?gX3{l% z1OUDx0stW6Z2`^O)9(Z~W1vWo%j3?ZQ|}J|*x6ey3U9&NXxs8z&@TZ-uqB=gF|xG` zzrB34f(@huhGBfN;V7W|4toZ~GBk&VdCDOC1HnlL)jkNQ8pGp(cJTeACL>aVM}a7V zDhI!PWC@cx>RWEk0*qup+21)r_SRNsxo(0H$$<<@DnCgbS;kHTpu4M3fkEUakXrSP zE%dviwI)(fWh5ma$6H{`nb<}>LJcQzOb$cW?8<^k{JLu0672;@{Nna{tbqz?T|`h5 zxz1PtQv=onJoCfmVC^cy>VX)*1(~q{p+oEHU^+CQ>%-a+0MJij7Ld9?^-1KuCYYNq zQb=5TuzJvVk&{jZbMccT7wAtAVl`^O!366+#DJ$j;k9(C0#<^cXxfm$NXm%g!IRyu zd*Ted8(2<%O~^hBNaw(12q8&{!=(O-!XI3UH-If9`Jy9RnGN)IK>Xlmwek{S2nVIT zZJo`FV;qhRYgce8HD8KfDCydL>M9Bc8>zgoKIzmZqs*v<^pET>1mJ?`jpa?TB`R=; zYRt;v)p~=F<xgd@MEm2BcsW+yjt_22M4=$NPx=WWFRErCHmk4v8CXo z`!@{Y(Qq5!8zaQxnGj}|#w&p^)lUUwZNx$lIl3E9FZ>D@Ge6p{w^290O-yH94;usm zA2|Uy_vO-=b<}y8nIZfX8;Gckz;_6(*&BQ~f&u;|+nJc|`2{DVr=wN5xjs36C9#m! 
z&79_qKYQD)F2sSI`x%}6nJVa}7B$xJbYNPygAX(2YnIwgk@|XhyfE~S(_*WwOm>cR zMlI-8cE{nNbV2>h4b>f5m`5BYd0jdV5h)M_b`zk0l*UcPPE-FHhO!I_9UY*?`Naw( zJx7myxr~Dr;6C+HuE*G6&`-w^Va|DH9Kpgi@e`efJk)4Q8m+B78dLt8nZDqhX+3EY z)$y~BS8pASnEozeck#rm|KSDpU###6Q~$$jO_OQ;0c-&_bfEEQsLCdO+T&dD zr*WxbI+fBWJL|TXJ_sWl{#BL_HMv7HlD@pnR&ChP97=Q#ibgm<0 zr?7b(LcCzqV$o8COWC8?yeXa5=zVc8F1V`63=95~zMI1R>sKzz33?vBb|?v8yi3d* zjA*NB;Av*|RmQ%pJXE4li|N9qFn7GPEJzqR_IZHhp_?u&!3fU``nvv_VXqQwaS7#1 zB2y}wK(^XZjtVr=9B~4CxJ5|$=r~#cWhTv5HQp_!d^ptDoA^NzlO%n7X=6c?3Hq95c?&{xUyB84Pec_J_R|p zCGUD4v6H!*HDeoD(!(t&)McPgV)kUvABT+@?HJu)=*+Y=~=9Hga<1g*U$ z@BlZuQQA=E%cM=6a&>#cfGx++D>K7$rfbRTY;~D`d^s#t`Yj|h3X#}&RBb&m3Q|=) zZCJL_Tdx%=qY%Ia7VE|7(*R00nnkDNl}#lAqMD`TGONG-`R8!z?wr+vQ4+3N1zS^> z>_fdAVC*BYi57Sd*Kh!a#k4FSCJIm>L~|Fz?niV`g+`i%k}JSIKxLtPyzz#;1qQ&< zi2?iwuDj!frPVP|4`*Xg{dG_;FQT=2fDdfvP%08&f>mP*ifNMQHS}SIM{1@QYEb-C zcayFY6qGiZ#hryl5nP|8xHDPJ?lmk2a?4aPg7rXp&QpP&GmO}vbAq_K_N zusSc^SZ52+Dp>5y$jHIA(dYw{(x@`7PGo`H`k;zn zd{0YJ0opF40FFQ~iV_&v5*qVyYzr~m{$PaKcB70HU@^CK&Q`%d-ngd)NFc+L1y}iK z0nEH}iW?lIz1zJE#^k8ayp6jp%DBTVUkq#B_HoGoDml!&gGd59;{-6Mm^Ikz8azzr zJdBY+qRWO8gOn;0_ERoKRCry>3*a8D7Yx!MC0^YlY9qv5)dLM(Dx&SorU=n~X7z7= zCQQ3y>-7M%f)HBk$#pLG2d)QBCB zWOb!8-WMYIC30QKp+%*@*0wkqdcoQmY)=Y#&3}JNVqGp5`ysitB32m7!J-4>40UG= z1WK_pr8!%h9e|6wu65ouS^}QjdbEsci~V|`u%r?{UQkAz6ajihhC&;F(samiy7msi1q2WrrKh6z zPzxYI`=NVPATgdPcHnju0`x&sKWq=gR>VzF2lT-8Kzi^=)L2+TnGk0d^uCr7?Wza% zcg>b!f`=*ki!dZg)x`@B*G}~eADCltek2pzWRnezo54?NHUCINR(_`dHPM``CsKRit z^u%S{sCV3n%eaZlxc|{*+(h_;`8LQ>Dc5{oWAZB_7h4mc*#u~2&i6fm6iy(86Y%H+ zJo<+s9HUW(i4)m>eFdNYQ)3~KS^Eq^|+9{OfKMr&d;Fz8tbFfkjha=H5KLW9h{_^Jp!Czq|&ni08Sp+>wRoc9ipgSFJ=R0=6k*cKpixdPVq zG8zbEM3oVNQhbbiQr;a+qQU8vE^DF0rk0o-BQEOs=X^j8*PjqQ&tXYOgN|VsXs>tr zmXFp6AmvN59%w!rLBd!NeqrYK)iF%@5eL};7CAwajNhcZ3tBEuXyT32TpDU&=Swa=OD zk=^S)TU7)>Fv;RUV+>rMCt3$6j4--`#~B!9=;M9>BrHMFa|*SOT-L%Gq~L)h34vV@ z@)H0uG7TJveFmlGBp9A~L{|XR%UA&};GyK4Mx`yl9t#4DPQo%o_K~f(!odl0sT!k> zg`hi~GAOkpAjvfu2PT{38IK!T8x|Km$Tcb;a#3?KI_Ld^H~nKl+OMTaj^0n2i2C_J 
zm^@>VaK(>Z6*jR-MH%77O`5ki8G_GDxqZpK3Iw9k$P5oH7^1$R zjD2=ENw71D2}5k_1=>XjqLeMqz{QAxSup#M?5s{ip*=t{b^RNRLBkn1bqGVys399e z0VNEPaSfzDfq8hUK`8AkNW{rDv?1!G`WtDW1Zb8oxc0)yT`R0uraYV&bO{U&R-lHs zCU_X9y$wUIPR0j>h08J^`XHmWBDfd=egx$Jz!9XP`)|RV#`KXaoC{o~aM1(pWe7A} zDDd+sSeG$a%VC3mKw??4LOZxUhE)(PHjaz-x3Iv0U_gc&tsIQi zNPFv@Mg=a=V}k~5-vK>igEYM`w3scAWDmi7Q28=Ng*3?>%ERhJR2zrUUwRBk&k8c3 z!a}XX>aDKj zvVtVaw}X#>4lI4hYz6Dlg>d13{0S@@()H~+%7zC6PS=a%1`eS&>-PcmTY6l-D`E&I zFjBY$c$5y%seuC;(%p9kcq8COQ;k3%*p$HcsSC90_Wl!w`Syu}>fUd_yn&-OKe;DP zNLGK?^P=TDNl$X=;IK&q@Q+Vpu)FL!HiwD*fvn18q?0t?G#-?V@A!ZXbf8aXVK=nY zt99@5QaZYcqL3MK?U?f(E`F8)w)n5Q)+aeGN~=wUk{!B&@SpsW*f2h+D4p){Eq_?( z`VzQ2ALh{fRW#G{FEYLRI2l(xF0=ZUO58SA)a3I1$tp^c@knfMy*PX4c4;lH$bPs!sE*|;3Qa4YR; z`6c#E%(3s7*sbJ7lw=yo{oos_kH?Obf!N4OwE(;fi8@Frr>V^$z3yE1riQoq*tYC%pHB6s8$$pV+4-_UVazdN`zQ^iKMJQ<41v zxJCw|M*>aZ3svoG8Kg0%bSglO&;sc|VFrNmq!Dd$QAL%9OaDMu9kHm(YEjztaomJp z|99YREkY35yqO%Z3%g9!D9PkydJ+3~3-YjgnBgl!o~)n}#)q^)sY`6CxoD#?X9jW>LfQ{V*+ zYiN8JUy_hOm(@v56&~U)llf;c$I)Ed-vTfV*2UZ%N0qVXR1N5Lai;C^&%)LIg3mbc zV76*^3(AjigQ6Me5}e8&yIj?upfoQ{g8p}dwh+q0CbSWXg21bL#E&F4Ad_3b-a=O zFgJtQh>`uJ4Q_*fgbazm5@? 
z{NzE;;tzOoTiilrUOuhq;xz(d&Np?y^LT#C2fyA+;}XF?d+YEh0xy@T-g$_SVNFz& z9NENHY&3b(V>I#{!hjvCd{O!lr)w-WT^KB2l{Wrsuu*53*QHSuC1o_G?onoV^vl&vKDM%j zXQ<-(!Wlzf2#NCqad7iMV?dhB$2C|zQQO^`zD6M{FKu|4sW{t_F8;8W>&a~&e;vtP zr}6UK{FSp+5zP;AL;cqeb^|%bDv#O{jj^cM)W5rS#fFqX07edcv6H+{5uTMmc=AJ9 z`Xmoh!!MlWU{F(1sXZq>=v6z({@_zyJ!sk<=NmN{Zf(1uNjTIgBzB}bBj4rKr*UgG zf0yPRa{K1&>tLj7*oym}m}1SN&X#QgBDM~E?b%|RCQoiqCyvxnBVOu*I2ztsbvny6OwjWZ-M!V+^&ETEQ!x3| z8<`Jup2D`5ZKZ_+0p{0<4;3JN&UwL%e({9Nb0*Rlw zsTk>Qv4rLw1FXcU&v!Z8a1MQqRiyG_O_}%8x_Qmge-t<7PuWzVyE!MicNy48dY`$* zQL@fl#rwfZB!h)*da?jDE43Y!Mip0vE$aKgcw)sdsJUSW{MJXU1t!=m^^NbT49!*L`k#XZRO_?yB?k|g{!WvCXgLe_rPG$k8M1_ ztzoO$uQ$Z}od0Z64l9RPR2k;*C~SD@Z$$CuI^QTKFI@^xV{+4ur>&TC@gS40Hy{kHHJ98re&{`3(p4zJLt8bU`>^K8G7u#fLSfy|;yVvjjM2Mz$Len!mD zmS?-*^;kKe%$`$kgUq?js*gAct+Vb z;~GvfTg%2mZEMQdO4}z=`aU&(cmGuc_vpwGf9GG zQr%iJ4{Xdh(o*ysoQ0i$r`MJ)LTY@&DufSy>oh&h1wDkGwphQGWy4R8l>IkP|L<(c z>feu4m1vnvL^5uiFA##xcgT;=D}frA=jDjC@Fu&7x~H4XuKqO5yW}Qh#tW02+U73@ zYpO%uH#_OT42K)_JU5xUuOs^Qusi=`&eZ@+_`vibT=p*J(YWbMGHMaBk;{c|X6_<3oS@{?;kCZU6bQ z$D3~{QRwiuKKSY7Pv5!Hm=&B8qaFB%BIS!Ssw=)8a#hc%P3wd&Kl7CQdUmScstlHc z9bOpmT4U~=rcCjQpts-OOHYZK4*t}*bzsqVto+(ft8y;}zQm6J-`sZMxNkW*_TTCa zA8gV-zM2-Jjjpa!UEf0$(X=+!^U&Oi8ADS0%v}iuc^NBd9<#v;B zEAdyj>`(U4HpzYSmmDp+;-!6(xd;9%PBY^pp>Oc+J^Za1Zua>xY4`IPaG9GrRz4y$ ze70|v`^n&B8^7{R^2UcKSIInq;nMxNWb0pKSC=ha7J7tx2J~f9M!Y_VauK(x7@`V8>HQ^O^nrvm*_WPmLZ9NIgrl0Nq&%KB6cPYhn zc~_qKO(F-U|9mQ-sthAcJX+)we*K3~KJ;Yd*}Nlf7I3Rd4?}+p{m|U*{@#~!Mc@g~ z?Ds(XwXM|_lf+3Mo~2{eQ~d;FP4v!k{r=4NcL&bEqtB-9GQ9bD-T#5g|J9;sYL4DH zYnEzGe01^259O-0y5P^-5`I&*Kz$T2ktwweo;Nb5pqb`LT?QugDr8!=Z_x`?M9BO& zDO(-8;K4d^HegHK3}`s5KIC3PjFXe{7M=CV146^BN@)IW-d!l5{L&x#E&U9zN%R-Q z4;5*p z29ghQ@sUjtLA&{pQ=hbM`|WlU#%RzosC~N2UkTxgvXAae1xrN#*!$*u*TZvu*Cu3w zpthVJT8P*;oprANQ`om0(@jll+mZ*CEd8=qi0Qld4^YVZ>t7c7A*K=#Il%OyWVT2<>>r=Fn>+Qe8^H?IfA(YkE(w$x#I*fqv~1Lzli%c z7!3bk`WF_!?X@m)9qG)u{(pcKT=r3iFJd-!m%Y-E&ddP)1ViNVMcn6zo!shKkc9Zv zd)(YZZPTGyfB9taIk+>RN;$GNekD=Tni3bxqlt?Rp>xn#Zg}i}_%e-W*;UKhp!OL2JN#Fg`$?L#(`8 
zU{U9xwm@j;e70tMh3rK2%#n^<|AKf7GTkHhw;-N_4eU&r9`zN4wrFd{F3=P9`Jo>W z>&hbenEcU__c=3;BwYhGi~ExJIWjyxgD%K4|9-tN296u#XyZZS^}XeYaWPg|y20)# z!+YbSpQo0|yPCibnnhapuKo2G?bj6+uES?=yJ=A_+Y`h;ZhZr|R$R+2c=duJPqTk` zgxDPTn_&DX_awTlko$_+b~k?{iwq5bQbXy-)oFwnP1Ifr(qT zefjU<-VghjS@}z)cVF2B{2L=Y_Xs?;4i=sR{Bd}#bY%!TuZ!;A%N5To;AC$-T;yZW zsYTuy4V*sPTHr~nzqt?5{T3O~XF0e!Un=TAR0DwFpg{FZ#XSK znYkLa4pPV6Kl2i#b-3>TN!vk@i!efTvGUFSh%Ir6h{N+Sk4o^y>Ri7W-%>7)iIaL(i6^!^I(DGB9SDrdG6W&(;a>dIAS>%3#(kv(BdGWEjZOb}j<}1W5&;TtC zY6LUXQMDk6fA3C{=Le{*8K2ORIlSwjQg>eQ6iBc&h!R52pn2Cw;NRg)`;5*S~g@Pgy(_9IB-IC#mn{ zzSKUKGv%kGsrEPC+RdNTe{bx)1vaiGUZnnw&UZL>)4w31e*7>gX7WthRMNEQ6|iH6 z{6FHGmxeaqWrMdeE=S0w>K`mv6eaoR!Q1Gn^Vb@ELJBC)7MutWxnK68uk%0~4I=9k z&{n|y89dum7F;Lr%kSmV(GbZ;Gt%@Y==G)m(f0>H!xkSkN zV*P7x{`SvRabdJ!Xi z2|Np~W@MC;&N=@axKM>|Dl%kmM^|a$h#hZ?Y>0guuGwMywIP+5%a1njNOtdC`fdB! z;uROkKgHhEuEZEt2Rys8f`k`qnpUgcz>$>ua_Db7a(dYlTBi?f_7yWNp>IHvM z`fo!g;upNbVJ-Bw6MFegGv0fq|7{wzAoRZO%G|Bz%HHFIXWo%IQ<+iGdE1Qie^7Tm z?yeC=1-+m2eV3D5mOJEnYKLQ~OB=K3YQ=N{Uf&~(pIYW{(z%(@|ClQ4(aSb!D?C** zrk-f?E*7r{RwFQHQTQQej|6ko(`#aP9|L^dr?#i8; z94d#ELuD*C9h|Kk5|y)@$teVICi%n`VnDjR z_-Xz-H!jCFH#8W1Qe)Tr!wlt$sn_pCNH5@#fl^MchfKdrcoF#!hA^r)c2p)PjkNdU zEcaO^3s$+rYEbXqbC3KKkkJ8?%T-&Tsq0Aamb`f?{5M8zCi#tH_2NjAStsohJu`H& zzC~-yP{KO_acSST3~JR+WuL>VUr^9htuio=2#6El$S&o*eS6aCuNQrmr|&oFs7_x? 
zZR~mUr}iI9DWn;phOPO71ow`q**+`utxYI z#AMKI-<8o1m3M~};Dy}no)md-Z-L__>ButLU23Q((^+|^)LV|GmLdDtwfx!pJoyd1 zOqoE{q|ZPYXgKq=4e#_^R4v2QzfCV!c&krX2+?W5kS)mQ-KDRbs?b%9A%=9llr9xP z+7pb3sQ5rzBA~}WPZwVdR^7Gnj?3Ee*Nd5SH`hBbw3=Ga7w~{Z^eynMJ)V@JE{VaK z&2qsDE~ka*ru8C;%!kLW-irEUlkBa0r{2sk+w%Aohu&(5=1NrEOTkE>vVd635>ei1 z%fksst7C*W37~MpaUn7nU7U7eO|AiN*=f*cq^Of$6k9|PF`_V3UMk zbMd%I`h9WbN*p!z+Biscr7wLxb03k4sJt`YC^v_3}3bVOS* z^ZBjPv|kSkV`I5bP$f5m$vy|BWCO{|L_s*$AEGM&1)^P>6}I4daKEn9O?80FpT!p}K>Axy3DS-6G zg>)(i%q?dRZb029%Z$S4j6sIy=@p-rhDmf04$ivjmtwnDJjkHZ#wdAW)GsKlmP&H7 zDiAu}Pp#gUn&Utke`qr8fh2s6yU27k9XBfb#<92EvSs>xb{-e|(it5Rp7`N3>dZ84 zH@ra4uPBzxgNv)7i+wgzitoK#UZ9LwjGQV=ugj?=&LO4g4Ll*f3zO=%Sy4zNAART> zMq*k|$#MPjn{Q6A0i4EKT_HU@P?&hjV&jHdqRhj0YZ;g^^aX29V!c-S^rzz~CPE&c zc*8>W2$+*O|FgbeQ%}=TRAIl`^2;|EyL{37!6!1Da}_^fKks!hOK_~wsN2_0-+n1` zY*Am4;S8Bf$1bE9P0pwhCTA9zrmLu>IXh6D9p(hVx`Z7TcMU6Uu{mMK?=8LQk-7`4 zH7+Av6Y;Q)VGjLd3-4|}&z<^;G2xhdT|H+XA|m!A;a*XJcqc#&wqsf!eA4{HknNDS z0nBCO51cVor7;pYBgSbW^jZG8KyTQhI^CT(>Wyt&iGd*DPhPrrs~vDS3RJ>gAmwB> z*&Wd=vg4z-xiMwD4j-yfz7WwwH4koa347r-Ez}&E{gig>hK-2!e8qpQMxsO8R~$M$os3p zTNpV-O3gi2*tWby_jRW( z80l-&F&z6H_9J`0vv(aMJe?9AER6tztkGv3X4AEa!S71Jw!TVwD1-O5Jikm&O8#7l z-lx-X&b}8iWIW<$sxRkf#mXec_~q46-Gv=Xh0oohWWe2wlM5tsI&fW7@tOOM;&;I1 z24RY)kO!nlTP$DB575(39yUgDyU~{{8lC6%Oai?gASAr30gwgt5MoJU!?YuFI?LVqHul}ic1 z#Mi0m%6O7T03$t6KAIkM{A(YWs=DjA-Uo8?Qeg$p!WX&LqJJGdqbI5dJWop{XKuTo ze~s>t2-PKlEX^^552j8VoLQA=D{qEtHJ=JW$o20KPdWFt<1 zf*bdLT%%y3^L~Jr5&xqd$E*?(agU|9?(Y5U;?F78p9Q6(%GMdUbUckyLsryy8h#Cw zC+doi*QA?i)>3?mi1fW%hhLO~KN4BuoTg)cogLYs(IP7RsX^xYGWfmio-X~?P-z) zJrPG$r+bARElJT#D;uAnZqRxyVtxEk$1cGXE`=shhAbqp*QUGlBI>FDLS48q9ZHdO zO!l2yzQS~*D}s$JH3Xmom!AYkQg_|WlapenNm!P$Rg%a_9ZyQ2wSde#MsTN$jGis; zTE;$@rzmKS3PzqA->D+3;9aAT7cH5pooC2RAI+Kl_#U#Y2iW$nHPKd}i8{75vIMnc zAiXE34O8OpCwilsmX#Cc-ymA5#l>;|##zYo-{DomI>q%{OI z-x=+xiV@(9b$~X@^G=i^KOu+8I$bto?(`HHOU%6bw1Bd$#-8mSLry-@QNJr`s-pxf zn@!&7dpp>iMn~4$f$aTebOP-Gf=1%>z-@hl4}-jBUwSjyRlo>=c2<~WPh3#TCoBTe 
zqE2Z$*z7`C01Oso0Jtj&miwzSnU&COPrfL%CeG&kJ@)|8nRB)Uub=j!35Bq_y|mS! z`#g4j7i~rV>?l8UJ}DM|CKT5)b?zVciW;S zgyen>rCbM?!arp30X2`&-v81_dkNbI2ATR$>sU>b+vc%-e+V>-OxXjSQqf^}u-GdN zsa~!mUbOUAdx@-`Y(mOlj-3CjIb@TTUA{%GW1~h9o(F`ITJ{IT46^-I4$w!SlAe*5 zzp*UjS`Nu-?II9hiv&idkOV93c3gE8^^UtC7~TNs6<1IAHa4tAQI^^|j`^QQ=lJI$ zQ4Ee{NDXgB|FRRROrpK90>q=sTkxMb5dFlq5hs z7{C#&*qDCjb$A0A#__DTP9Nde&KaedaOeWKwMpLAF6~JxN6*g_2JL8MiO4rao?LvXeZ%TvLxIRjc9#Z#^YwBH4q4aOPzj;&?$ne-7nXKx6r zQl0^HO6;#<*S7BaV*|hLnR*M%LG83U|{ZRseauTh`?#9xqMO~%BLvey^VB$lP@=QE|h6ws(>)d zA$Ko(!+37vD;tJx7X8yc8vM)a-6}2QbR}@y zJ>F9*Yc$IUUeOi`_hP2noM7X0Yo&di&HyQnnYLoEvk!d(TI*w7?8k=iQbL6QA?7X_uk%1i7z z3#hy<3J;*Mt=*Hvuj)kI@UQ22(D}iy)a}K`P zadBygB-?a#j;i)x>sO^;V{7U>1`pwi(XhO_L*w~iwo;r0;%PRFPEgcd>U4yf{O~lE ztXwkI9>;f)U)FrMu}1*1ZQ$;(YK-ytjKk<=psK$LIcHrg(yHHMQ|@ zYk0aH<>^H2r}8b<|Y&wN~ew-MM^o8RaMM|dF}sn zk9t-9FtxTnAmMA>+ANl9BfY$tO8kna%f^I`w)*+_fbQjr?3oWQt|#!=TB|f5f`p`Ht1P{(C#Y z|2&>w7lzyAJX4%%9y^;T(2N!TXze$MWU;4BGvs=HMz%Y?$u)5o;^26%e8@cx>d9vZ zaqcP(DV?b}tsppgIn{qi0*!Dg>D3#<1{`PvEvv3+c95@!HzdMmJiTOlkqKh%K16wk4YM{NlH5Q-%Ty`iIWZgdB90oAp7Pmh=tKfz_Sx$C^s55~ zT;%7JA=ehg?MGo&+I&LS7O!Vj!My~doU{L5O}4fFlw14rd-=ik?dwM;z&%_)y#oW@ z@%!gb8^AIAe%e$8gmeA$n<(H%e*gUQmhA{W7gQyq#W|g5cw_Y{ NV?$GeD!t3m{|7I_AKm}} diff --git a/media/images/gemm-hierarchy-with-epilogue-no-labels.png b/media/images/gemm-hierarchy-with-epilogue-no-labels.png index 59bc99fb900b60d49b2f60311cf91b23af36f292..b87e8e2ecb17304fa47fc19a90a56a071d395652 100644 GIT binary patch literal 184294 zcmeFacU+X$x;8w9sK)}th=L#xH42D;NR@67iS#BYWk8T#MtWy3MntO0(5oQ5N$*9a zC{=m~1&3awcjmpH0o0vM_Pf9D+vlI}ob~$!hVdz@-1WMzwemzxMx698^k25;DF+maB;vmgz&ED`wqoGdkCwmPRQfMyPXBe!9sb^DsUZFvDy`x481j$$*QBqZ zP#L~QHtz0+zYpF2{f;FHb%q!DZ=YBIyB2(T%1Z3El_EyZ%J!~>F3Q%{metVA*iz^2 zeO*?Jg?{L?05u9lxPSBdH6=UrWFP7MaO-fM4NaND+1Wq7AT@pPhn&RCP^AQSj~|`x z?C&_SWS*1uac;7vYMoAtQ&(7{h=&Jf2^re!@kMqI$FX+ z!c^i}X=hw|>kY|A-1Zg?tLaR$1|gj1M?O0VaRiR6m&nB5U+mFb2lYqyqj*!eO*(%{cb+oV=}6NU zBSVR}OyVar2@dKuUApk+9!dm7Labsg+ 
zPEO93@CG9j6R&`P0Cnz#ANG1O?zxl$B0GDwkK~v4z6-hf@_#{}{;oaz-y8pTa{f*@ z{-;ZhNY~1wzA#OPhHYPM+QYkGy!L7yy`cje!$I7p-7{&&Wd+zvz%rD}ZLXIQ9G#{u zBOSY}FCQ)0>qeBwvz}6CN0Y8hysZi}hl41TtBS`9$~`Ye)ie}ZPYga2r+csAE4*b_ zcm##Q2L1iTrg?oQDK)I*rwcdcYFjk47|~9%mAn%vSuVEDg!KX08B}@ay}ua}YB0~a z107h)^`~)fzH^4d;`o3@k?qWM5jjdH@b6yfP6szsBu7Wj>_^qO|J_wE#A{Rz`}Q6p z3Pttj|DYRdlrb2L)8}*aPdq#h2iD>(3e;HxjVsb5Sqpw$n(hmY;QQ-5xn{ii0P7A- zmx+IFuF2Phv-Ha&wwL^1=C5qc3lliDHW~@No!i%ZH_2Y?ZfN44jFPg#o7vAm#>he^ zJ8NxA=fnTWak^%d@4h85FOTOu3dJiRVEI?KRNkr3G@*11_3^Tiv9agc+M0S=V&apf zr6u*aVk=u)@msfUfm`h?Bk(xeI?Q3@P6k{m;5x}#ETpQY7W3l8nZErsw{&!Lva_>4 zTUc6Ja&mFuub{l#e0+Qol9MG_EC#Yt?##D9=d?y4&{!7NAL(vA8^EeTedOe&-$}n4 z8w^F-)YLRZz2HiYyN8Fz=$Lsf&1nxY&y$sdm7bRN?J0Mfx`G9HS~B`BQ*`-Fa|Az0 zTn*zw#p?7I+h=H3dbG}w>Ak6hA>Y4!%Hd4y2T6O5g&$78gD)yye)v$kwW+2+f^vP4 zoLuy3=Bni_qpcs#{=@6f)qXhQB_PUne-sp5F3F95;rY$0{pd!tCCVM|;k;!x#F zl9H2KRCdDbJx7Xi?w|Fst$0W-#GBNyIuaRK3ON$7c}R<4B60grH`LYC)LJ__MlAjs zj-T~=U$M1WHdfT)hkcm&>fZa*jQgJ!o4UKHNl8h^Smn~FX=q|%W6uYMhK9x@CNc%? 
z)8e$UvbwIJp<#6F#EBEL^n}^i(Xm;@8uO7N@aFRup!jqD%0e>e;PfTBbsaLAn=B?a4*}}$q7vhF3+fg_l`}yHYj}y!a zwfsScD2XYE?&zaSqHOs34@d6*0$))0JwSdgx$zXAF)JL4Yeg}BuH%sP8f)xyK*!vGh?zo|ALSImR6ahQLc;S&78qz)E^-|yPKx&kV#$s>~uepW5 z+&kN5TGZk9_byssPxXHjIFZ@y7vZjmvR$4#T2lT3{uW8R0&k}{FUofBlmipS4ed&D z%3)tr5CyeE!E!&%P~VJ@Fc*q7|6_M4L6Fa7V`yd=gZd&4*X(wg(S>LfT^003_uq(k zLCI@{bnez$T8H4K3e zBp_ugSx|6VOxBd1L7|F!J4%;wI!hdF!Sz=b^)IG%X6ef!K`;sxC9p?~=qe;AC^*|Q zu(foeXlbe^aO-`H{!o?oY(QS|c|LyrmT-5B(-2L;Yf6+WNh-R8FT!c=ee2m$!sbkM zY#^GNt#n1Zkf?u`m%a%Q)k*BsSNYdUS*=ki{duU`7h9P&`Fb>liGO%55W5;U+iTev z%WuzbGwFVi&h8hOyyBwiLgrW(!sY;h_6Q}b6fq#tnzS7JuN|2ye|76S0b~@cudmO5 zP``Dp1UlrqIzJMB0QGdaf4P*W0j8*REhjs>!nSO^Z#x>R;@)d9^xD%jXd*W!^%W;< z>$y5ARLa)cf`pvG=%AiL0zpYGxFAUE|{BuEJo~dv+ejy659EA7{+;74hU0 z=N02wW>*`rLKiPy1nqSV4COV~cA5_z@bZ)4W9obFI2^2h*ykNlPQ1AGT<2aaVbgrT zc{w(K$7N$Be=4^pIy3W1-#!%MBw5(_=PY_m^J2GqwWPGLbn_HS zt0E^SN6gvTnYE*}^}3^@Bda1Bv?(Sj>Ee}OsmTFCXeqzoDgaKB0lm~mDADw(si{pL zK0KZN0iTqS!LF*RYEe*7a5W?(#Nrg{hN!r>cxzi*Y=G@d2~SngfU}DcO8x%p9A|=~ z)y7KoK>Ywp%T;Q?f76(9wah_Y#rE!i<#3XEgKD9b;icQJKxTKF$Ll{29oL@eFG<`i zWlk@?GMzu<(^?fV5&__rUKVDc)QVvczVck0=%{{q(+`n{hszbmWh)9sB0^RB6XPSO z8wKX2yl8k>VR>&z{EOK8>bj$TczBm32W=0j=rh4FA-~|Dkj6VCa(u(}p_I(ZDZjJK zXkAdxWboPT+_&F4y@90*v4P&j_qaGYsqFWm+@^BUolU`;#|Pw@^%dsNl&)|4Q!2G@ zqI6UeQ~326Q7el}OI3)OX`S=o9dyrM>NW=Hql%bFD`JgxvF*3lmmprOpU|`>_d?@zIzjZQAR^*X ztmPjL(iv>rnrwuD^Ku4iy4Xu>P8Uw3;fSNC+VkaZeV~~cmLmw8Lz*2ZQ^s0A6|FC?&c|u+g6@XdHEw`n?he6kiW#` zVF_C+)3#-y#K^0>Q)>@Ip(>%jzkg+Gxhzt(f3}*Q9^*WrhOELV;v9@4pfEHl<8OZ*TJ?K*$1jy+*DFlj)UpHGTabO!=c|G#WHhHc;(v0QeK%faR#FnZvD5a zUOg872pF} z=jiM{KCoNw8Hk`p-DiY+TL)j`pD3oJq;y@@H>>ZX+$Gg)#*=L5u-YuWdXI*N$%0^? zDYqBzBCI0cOG(v_r0ip)ogc3EJi)B|GQjroaaJ(lu5WAp8q$$GMqj)*ht)b-UW-V( zyBbF0IW_9hJ#q8}X==>3rH%R#H}s(vWSY`faOt+uF@Jym4Bgs*AOU*|a7C9bN|#G! 
zDkHbXBI5#<5sV_>((I}8D5^tlXsyqu93F_mlDjJC!t)p<+|ci?EGYED+8=~aP=x&y z@51xV2zgSl(Q~YUhJrGY69yebbi?8m^K@^|t(F;l{dpGpQ4@i`$gg6h6=PxKTkCxzg zMA^z8KriH^4wIHl4wRh-U7+Io>-(mQ=Nnc&9uy8zMK67*DO)cplO#oHeJ)w=oAFtn zE@I0m){k`Nmf@euZbi9%f&rat{BYzn@)%swC$f@z&CI)!{q^@{H-U{WZ^1aM40Xi;WD&#f zqWZ6|(y0Tht5YAsm56Wa=n&V{)rGlwEDrWN?EafWiTKSgkA03w+;Tpa4k-bL05DojO>NintZ|jLSxj}BZkpBx;iF{4N)iqEDH=OSs7qSb}ga(IMsd-AD#83 zshvRi!gpeUJPm2EjB5}tE;_!Y>3<3%WFCUrlCy+;NlD56gGdn&g_3yk)yWY5CjOuj z?e6YYsHweog&n)K0sTdv0xiruAQVshXB_oj@4B%7Zf(d{I6}3*x4;q!C_o5QswU9O zg$XGsQY;{J$lY?|d(IMq=XT~8gGo)tZcc#A+2K!H*0Gzn-_ef^2!zCmtYv#8lRVI6 zwTDorZ*&POrKYDh;vp0*ag2Qo9oyX+js`T0o}XrpR4J7dNUD| zlfy+#P2H>!r4CW(4M|DKZ$-0mgVE7ZcUY0uv(Al~L?weij;u#;sKlPkuo`Q={YvZF zc3g&xfa2ky;}}CfzUz;A&**xreUSdh6*+!lB ztkSw-qWLSMk@hN5tgR^6M}S7-IvqclbP#j@7u9Yf@oG|rxNK$j-c?Km)LUpO`8DX((3s$IWBz4k~5_K7}+V4y! zSMQ6!y~=?Sy-uo2_qCM6eo1yjk575k=_VLR)k|bgmRj~fsxx~tc$3nPn<`_$9 z{kpX9aMk{oIu~xtOGo~0I*RUH?&(}irY%r+-dIRTNO*_;)V(<|qBDchbe|QMc2Bxl z9b5FarKRV1kF5=U&it0&ko!5G8vLh~R>FMON?TfABHsGajC+CQFa$oaBSy; z8+;NZjCBE-_)V0pt<6uTtx7kMv@e%dZ773*z8>#6u$Bhqs1EoR~p#UvPE1SZBb!>F>_Z@$__w^R|8r)i2dwU%6 zp2LEH0-FmhJF2=@<{&qCfUE?G4CFO5kbHME1ZGlV^Dt=}alVOMhrf$=s}xrlYx2oz%X9s|Ku^8|}7YV5nKiJN%}6#XH)i z*-givuw(lR#-_ha_T;hA*?rkk7s71O=+t~3SEutSY+?1N56yiU(2-GmZn%NHmqcW& zXu02UD6^JTl4ZtIb3OT5m(uon2sRBJFPt7>PVR7ARrGCEQ5e1xwELDg-*^%Tv0s-HQP~}t*rCP)2Uhjwoid7@h4U@fuP zATVDCF{(#x`3SB)l&5;SuLz7Iv>iNr_xkPMQP+{DwqlkhyFiI30)KmG1rg58ZDX;r zSMdGwErZe65BtVZs7fuUpE4~nF9;ByF9-3?KRz7(5_v*xcXpq# zC44*Djl%7ze>o6@;^rE7hrE!8l@t|~512m`IHVRFk2J=h;#QUWppkjkk|T2W z*CrwNLbS=#W>i-lBTzD(aaYrrJEwZMuW>jnzC#%OrTQpdwQOjlAkuhqDU#9&`@70O z$Gotn+DKgdVM5Tu{C1O#J8Nw}xMrAsoUciDSxg=sAK%7>qVoScQH_&hjsYy7-{l|L z+B7Bw5Y8zEs2YYN z*L#4hRd9VWYulAl0O;|xYr7xhv=7s|xpW)G$8KL=V$ze#Vl&eZ5dbui**%992Q-Ot z5=a|8+UkNi6vvmrZ=5}k+~7Kh;5>hA5>hW7ejtUh#72LxtVO_??X(=dM_L|Laa7>H zN}{9BqBqA?0QZ;c1%yW6+aTEX9K->-KV6z{GE}b24uN50G}~3&J$qBf<}&Mvn>$P! 
zK!%b_kr7(`%sZYCqfn~`j%CLl0mb95gFxY)hV$E2A*Qz$5T6!+78qTEr7jTrNrb3( zEYe<-DaJDs9qF=(1LY5!Vb=PX;=`UR^dqM2UrfZG6nsBAS9zU+)x)_xXk>F-ntVdb zZv)N_np_wnr(X^yTlsh8=u|h9{-ye<^)f{hN%~+STon{YS7Sd|63ccX?1`gOepgWC z?!ZO;TN+RP=jW1xP(_oNkt0)1xqS@E;O6u7Jk{168iD`1UU9Nu<_^S!fFq)gYH0Kk z3*ri%xBsasL3OpOam0C;Bl0E-rY-3f5slz39w-8g90HXxquy(HIj$m&8}-<%}Orb zXGmE&EX_vnlB}Nd3F6l`GeTXq>2+D3GH<)35*x5Qg~9T(mMk;_cK6^P4UMdX_5!75NLSI$ao-th^Z-KaUkWThU;En%R@&^isYO5*#zA5XC=d zd<wkwb)8Uu>9a1p?ed`SW+vg;Ng5&<^S>j<;E7+_?FI%0zt5oleqNATX9kpg zTF)XG((O_wd!v}forysd;mPrJ@*f)3$`1b6p3R<%#(E zAog<*i!~$ZO7-XWeu}k7QnzCO5~G*3hP6;AHwc4?MDhZN+`4%lNl6~W zWF_+0a&Ok<;ksGM{k5mkhwT7d5uK$JXNh|j+NAT{l|XzcBhrU1D4Ow3~3P(ozGUX zxv=Lnbri&L=yfzewlv=Qem2r~;92*FsotD($^x5N>ZKn^DR1b;fwmaq$9+0Wb<`m1 zrnS`Xh%0)Vg&~yQZeNN~PYaz7;oZ7bJu)YE%ZLAL|7z4dgGOi+Ka}>qAcb8%03Gd_4v~ zhM@6a(~;!^mI(+B6vkez9zk|hY`4>75QWGNZp=5zvdnx{DeuxhMA=#xs+$iIuI9Yx z$yr}ru^&kyraBgpn!*&}#wyhfBeo#O(mover5nbyNt^4-)jG9376rp~SIte@k}d2L zd<3+eHpDi?p)+#ItVNTV`H%}KdP0oqg~XmP-MS+#1JWhhj>1djr3U>d#y{MgWjV?=B<`b!TNM{~T4|=ls zAbz)CfRgxnq<^l@mXLbU7G87fd`>Y^kw!=+tv57BLK=cJ`1x~+$H(7i7!75%x|#L( zq|SEw`3}!jVw_c=FgQGSU$AS-&dRipq9rbZYJMaU` zirEU|d3WEs?Sv_N@gNwFk&p|qS@9yG!aMfTPrEnYJQ2UsbC2M?qRxGe%hTR~Rsd6> zhAAE}Ejfpa%*;A#3uB4+R0u(K!ot14H;HnHfT98Z%_&9-?QmDLP_129PQedmgFbzyh3j&Ix1=|`!wP$WCH(EZAbO!MyN~89Ah`wz@W(YU$uTAa; zfZ4q0g@xjzf+0#0F$`hXxq-lJz1#PY?0FAtH^3?Mzh{U%4P-=c3%ED{VM!t#iBI={ zI_Fq#~8xy|7K(lG2p)%*5_Q4uAhOim57%P9#oqq zBO?^hHlNc#Aa<;reA9$2+jpC}wtaAqW$;{cUmq1WlU0yeH5Ym5+76Ul zGRyxo0cEdl15;CPH#fHhvOV~n-^L7fNO+omU{yxlVJsGkOb{W?@?htA5>NOxaD}rM z+1MD_|L5>-sP6Lx*jDsjwNxZ=%^3TJ@S6gfd_gt>4EfVryS1vuRrGxWI_$I%hN88?4&@S}GrrRx$|o5VEBv zkS&pd_xG**%RO?TMmkrQCqzqnZ^ZZx`G)?i-{7>`!KbvoJmXs(ARJQ5rQhJK zlZ^JS6}7I&KQ|;u>GUf%ru+Hy=Wq2Iot8W>=XQ;b{eG(b*o|sz1<+MqgIh^~qVH{_ zVgp_C$_D3fHCs6GYqapS<77o?X=%7QFrP69$o2Dwc88=NL{nVGTNx*UEBB)3d(#p4tfEExov@y-#G7i4z@2o3jlOa-S$U z$Z-&e8d6aDm)F#ctmWe3YU=NgsKo{D70L!b>({+hv_7qC{+3kJw=KEj#?H1yTUU#= 
z&Q2;87M6nE-rm!|u`W3Jwe!e_4fBymtjh*$p1jGT4IUr^*xjujlK(evZZ^L|Utpwn2(;UAb1uV9?S~g$) znPQF|L-llz4Im=Vx-;?kA^II{Snvj$k6uiI%(J7d&41hf%nNsq-Tto%Yl=7H%XM=gK0=$6dQN#n{>f8Vl_dhHX@&}_flB`)=c3xY+ z%|&3hRss2@-0`AhdwEfJQ(%m3;|UpYALi==Sk!jFi@h(mwziIKMPS{_?Gz5BObe%p z@_sqwFE$A2i^wr%tMb)_u@-=ZsAvH0LC9%4al>8=GmqOZUIhMj@C6A9m-(Bh&7+8i z$Wu@^{Pr9t2{~^hDzC=!88$Ye9w`Aik~>nN>DNf2o!D#phJ z@ZA4GglM$(WaFI80m{|r*cjj?JKbX;iL~U#^aH^Lnuz(HqO@(072{>3523V7g1Jpq zCF~#F06I3bz2{4z>uvo(YC9AkIDhspS4RK^@&MdxJEOS#ebt2c5cmOto@=Tmu_?p* z=}$!V0)A_}+LMQoB^rG#h-Qg_&FxP|^ z5G2vMlA$i2q;P0}sO(=|e$nccJqRF#KD$)UsLp=}RR2%-9=T54K#h}-M`DbQA<5w+ zP_7Z4b@*V*w}ky>b#+<)@-o!Vr>8Y5yERkZ<2za-jt5I2rM#@hRBz2G7aF8SaKoT?1pyu?!PpEFRl_c=7#|IziD&~9;1zHERss)1chK9eV(qYIZOE`kUg#3)2T24N(ZaGa;#N@tA+Q6~u)Uys5#&zFyupqP3eUugjG3z6S# zH`jElsfR%q<$A26z5V3k;$nfiqT=zXsVO*G;r-S%`|TZ|L4xaE`6CgJ*!3H{5D@a; z#l&_egenbdu=RMQHD1`i7Py{1hlYG>bZZ(VVPf4#)hr7>CAk@FDi|XNGKerilu|?l z%J6W&gr?_m8DTT=MErj$Y8TjGkN8W$JQ$eIUd~-vhlB{LjX+F?bQ;;Hw_S{-5=4CJ zF3%Rq09wditlj374dB++`h&)5wSgG)AiD!R&@-NThBpwoN5j^{Au2@Ca+gp7kf{dRnR#yPPh zt7mKWjTqh9{JfN{9>a?uQL9qP^9;fVQw{bJuHX?Fl}WO!x=!w*r{LQtr~ ziZM65n-@}@M#PB!tb48LpVbH_T08Ke7yx7W5{E??<6%We;#(F{5zo4veVCkJS-lam z57nN(kL1AhBAcnq0^pCbn{>W1iPH||GFD#t=tH*BhA~_2z4x zF&<=hZmxX9-T(Xse^CHg{2ZV zs5f7a8g%9AN>oyF8U1vwy41fpDOA8(;ppjJ;ZXAF4_7a$q=nTd8-ICqZMO2|Y*Y5l ztWMp>ea$wDm(=qwua(ZJbI)Yd_BXw{7Gr{@?ZGq&UGRzwQph+adgFTNR5ySvk1&H3 zPnxm~TW}uXj?)FRuuTRa3}~*IF;K(1olH^kG%I^FHX(s=yshVQk`j|dqujB&ki}bd zp*(ChebyaRoCb2;Kb-K7CWe=;9utv-4Sg?au8Xm6I$$SB!mVUw3Dhg2r2#C#pw(az zvjGTC6QjdjCgOrqL<4KBqBBDW#%C5!@(LihDF=fC4x|l*pMR;2y1z@t3uQwf$WI_U@s1KJ)WE}6r zhuf(3I?j|Z`D=QhVKn8-ZJD~YRDupx5(68w1)EFkPAgxwRk+6IYezE0*gQ|xuj&mQ zkU8<*o2Q?SQvJUC!gL>hq0Lmtb}!mxF=d-J$e}x*B%h@G4%M6?HtedTNp6>(Vb&Hx zr4)~`m=^RtBNh=g zKC98F&yW1Mp*TPcDlQVB0PU9&rt}`HnsE(0|0F}WmBJ--%9qo``{9S{3}){0d&4zl z#HfK5bp{B}+yUxz9e`fn=`39Nj(J(Tnuw+{>qB5|c##0v!tqG#mOZG?D?Nm@zQwYC z_qpZN&t~|%Z+KJ30$%e9&p&e~GEIm0uZLBdo?~9Zt~PI)%UUAY#vQDoz_Fu+_t)xO z3_!Lf!!D^9g;ZYQ#=l96!BCa`t`x6Gq=a6 
z_>S7^Hk!B;%t$wVa@EKT_88f0(uuwmdOWLOLbm9}48H?%B`PI7t!!&;1vUeOD`bE} zX9hnViK|I(qWAI%Fjb3FN|7Hr!2UUgQc=}fF;3ctK<3OJN^XRGx9p-Z_QBSzqT>Bf z%*`Ra48@-CmTg;(b}5nNp2=;AXwr&t>ncvN=*XMQojSvQccUVkiCg=fyFr#lC=L~+ zji}WSlv!jFZRtlvbSlNcMpex${qUpoue zo@yz(X?~QN_ZPYeo(rZm4Z8ZaPkaYxDKKONPScvI-cyR&Uk6v1nBH&WNqBppl83U@U>laX2i0(U<(*JULl63%|sCl_!bL9A!(i%R|y3S9*s} zmtSqIu7?x6Y9x~?86t601NKknfw^1x?%BB^;E0=ci>xABrVN0CCIR*Xg?kD1QyDRI zu5B@+k79}TUCsch+OM=kZs8O%mItuHxv(z)<`~8!9g3mI6UTg!zh{gWSBLbN)9u>QY}3A=`p=eHJ#%nkAMPK z2F<2g>tzjUQ=9G!s#CAp2*p#GH%jKsPnWJYYHxKM8LTN&r-~1ARC0)WNk3zYF5awq z)tQNpo2^MGi#NTQE;JyVR+??gX;vt&Zn1c^X)TnVK1zT$chg>;v+TygM2D~`>UKml zrK>;U-T!$`8~7$tCaL>$rGu}Wr{*+xs<7&xJ{l_@b6|FQ1oJX`{DN#a#lr)XWw`~4 zj~nC*Ebu014N*5z54H>5zH!nal@0up0WhJT-aMKeWNy@*pDvoDC8go48NQYKt9ErN znH(~k@B9llM#Y6C5<8uj`zR6i;Lc>mC^!)FdDlN$z?JhZb?OW~VTd4dVNE4fFfoNq z4%MlWEcWjWb2bBJUl(Jmi#X8kIx?%dpD#qJT64Rvk#c*vd8;`2%`^&cp6#buyH>pR z#`SjgefjPHT+x|nR&R1&64h|3yp2y7!%(2&4aXD1!W~O^o8Ea9A82?{IZ+`t0pdA{ ziFgSy|I!Y5oR3pu>UhS2bL!Y~;7ozPQ0Xey0hEVncGC^Rwf4CYapD7XU)wGcKQP|b zD6;V;<>|dV-kwSW0g!YLSBy=cs(i9dJ#MaOrps7+6o;)8MwfA!&LwN;KkU4lr`(hx zHe}G6{rQM*V8~Tx8>w&sK_M5cR9y%=zv=w?)SZs1_BQ_OMX5fLp&O+k{A?Gz!cWha zp`UW22*yp(%|{ZnG(rT|L(2{lVV<24p`yO9V9(9>E*|q|RsjG$>Q!61rm*xt2$X~% zsD?%iAdnA-mBADaA;Ng<+>Huwoe{bCMtKHn5B&rECpN1D=%hoqq6iH4aLbZ|fMAQhkn;cMN zi;`XaRW;k-S@T-zwHOEUo|uxsrVcZ2)_A&*+*R5yZ%eZBuC63(HeV=X@{5aSLTpY8 zZr}+mn>$&mtcNGo#=846k45L>+41>Jor;f58FsUS+56j79?j`9PL!z|_ORsjZf~a} zc1q-v+SZL>h?69bqeMQ{+D;>C1C5cKXoSF-qJfmw*j$k*L(mP(pDvY*sg^F22;JzO z%(hnVoR2pjx39kL$-*tu^Sm^IM_8{u$giH1x}%X!a=v^@wlYz8L;7@XPows1bh9Tq zPG;@O)>0tD;3EdK=OG=2c$VFnf!|NFHoWU5Iy7e|R`;O}--I1~kPuNbolrz#SQH9( z5W-cY-DZRAQ3XjwD%#aBw2#?Qt_ zJNq#EJ*MX6Af-e{PlB0JqU!n3`BkmY`sSG$j8%WP83=KUDI`74B;%!GF={*KC7Xs}* zF2%zlQ%2aauo6}m%|(43wU_Fy8Xj??*QxTlc^sFDFzAW zYa(xV8$1_z<0z#(q{4*C4h<)}D|r~a*-Gi_1F?bja=havx0!QI3Jf`WvTt^O zwq4Zd8;n^^ub`n9e9pb&$Hag{B;;x^2;}C0b0m3AYC`nmK=Bi1C|8 zhrX0{(ew(Z(DVvDeeQs-&p207tm#cf2Io^ft)oBnbO>`!7rInF;`A#EI>`~=D08T$ 
z3XWH32BQX)IbU>%LoCsc5tmV}6RF?CVOX#+fT#TyIJ3c^d1f0@DMSJrXs#9`K+5xg zoL@jjV73Cm7LYJjki-}bauYIFL|7HBE)y4`!WPc+82Sz9sd4#|xa8YTzEVq0=+Jtb z&0+}J>p5kypY>LfQp0fb##}|joila&Qe~tpRb{Klwmezh@3bxYfF#DyhCs{lqPU47tLA^>tr2iduYga9ecNBmzFoetaq^!{DJ(HX^WkZq^jZgGxkChcI_3{p> z2+M?SknuSUvDh-Y6qu_X0}V}#9l?cLJ$P#6bBUZIJey<>CWn* zCJNg5O->+jV;Ht_Og`12Et@6rW|o^d!Lp-jahbqMCSVeoU5^bNtjAv-s!;|2bf}z!e0zuc&P1_bN#;St!~qOY`~j6fwIfFOXGEL= zg=#PGlNGwkTJZUKK7evVm6V!NYEc4RR<(K=>*O}?d`?`QDvr@9?_?M(c)eT!$|&3O zJWe+B)UD)O?}ra~Pw4u_o?03HVz)KhG5G%A`t@zwZ}RcoXz_&XP<8`lbr0sKq}uL$ zf&MQJ%<)`qwDm&_MEp*K;p-9Lbgmx2fL_;qB+(F1cJ%_BM8#c4%EKJ}|1$ONBHem0 zXFO)#o4GaL5Z_c>smGM6HWJZ&{s5^H|MxjxKjeI*#GEf|%ufe$KJAv=^_TU9%iXok zgE$?sn@rJQYuTZIiId>5`|^>9-9e|!XG)fDJSL-U=9p-pwe2NV&m}6SUDlF#N542L z{?P7rhr>dX#yuhHZt5g^pI;oCqy>49#8w% zubwV`n+TMbXx=ZcoCj??B0|9(1eN`gq?{b`b9JsTo1SzqJsb>!?iT4f#b;%J9I{3x zxbPwS!Q7eU^((gMq7B4Id zo0>9BYjGZ2My7PGCvQ-G?{2iurN}qMfCvgxO=Y&$)TIbNA%HwyujlS1pIH!&Vc; zi#Fql@l}vp1^$dulJdJC?Xf;nCX6U(sV|t-%3%G7Q{X3VhR#63x()HhjK*q%g}nr( zHAR!61Siu}o2Q&NdVT}}bCtg+#LI+d-^rsW5n0VMXc|mVFHn#;kFU|%a$413w{Bj@ zU)EHqh7i*)VpX#M^Vn2vzC4Jq_y_MWPIPc>9UBontH{O<9y|Fs4=&=GUQ=CT3 zVJG2&i4=N=(HA(aJ{m3~B~H_B&hyHZ!2KTj_>dgQeqI7-h#KgCLvv{1l$?(efh9N> z{YylBz3qw6Gp{h)?`tP#zG(k`?1TM`o!J*bcCE?du>r-OF9nr-3C-(cSTe}d$?m_K z3`Vu4f-UQ}AV^2559Aofnch727)#;EDSR8;-WGjJDjOy9v8)>B81Fxd=s-u-Mcig!L)w*UXAPA z$re(P{8t?{TSlZB);BKF4mb%&3g`A#2QQl{4+ingdoE<{M_syw_|g;Wz+q|m(TN_8 z-FgCmA@6Bj8zl7DG`n|;(f&aW!!lPv@4l2fTV+3JInN^JLEU5sr$Ej=3*AjT%2j6} zX^cA*WpXPS%{A?`0b2t)Fy3ipLJ(5+ByY-?*D*b=xLZy46R_M3)tXyx4xo>|y1RKI zP&s^ksCLLznW0J++sa}(w{EvEGCO{RWU%A~eZ`yo%**vcyMqU4{7YSgU{M9>pNFGI zW1F8-ajZUU6S^R^{%dL2x|oeu_{QD-ygU9n<0mhuO>ir`s_;=1b1D`Qf9;V`1 zY#|OD!EsBir*b+j!T^6&5y{EqSv;WJ&`e0jm7rdgzVj~;fa*6>YS6AbNO*Fbp$ehB z1HsiBkRUcwSp4dUrmJaX1Y7eVlE8Rr7na;k-iETl1z!A!c(rmWw`9-81=+9wL2e|v zWsY2P9N=C2;5qmqywN?;FCL&8;4OKU^)207u%BC`nx9?C)^IF^Q}n^PQ*-D^9i8i67{NeZsZz7Dhcl_c`(Fp%3N^zwto4-QE!)}4{6ogZD80cvfsC4Sp?RdH(Eo)3L5ZSnObA?O*qdELUw 
zXTQu%esYT~I5V#blG#nQ=N9^K{6$mBoh+OO@Lf}XVNE!A+s|TvxvoSh)wG9YFx+-p zy5|iirhg;hPL?9K?YNc5!q~h4H%WPo-B@f#v`+P4ddBi(7e}G(Oa%D)l-s^&!RQfx z!K_t$DUaA=5}WsW7bBL%eV-_s%G3>~2~Sa%%#4P^l_03^H3G|8WcXv1F4;==;{!D# z6jpiu`@KwnD7hdfvO^komb+8yVGsv{Eji7{_56>292tbSFtOg^NJ3 z&i}bXceamM??EVe7jt{eiu_mjet0#GR3zxJ8|GooOI=-Dgz{ycd+(l(4wPX}!y$f6 zP&v%r?>aXGC%iUpITxmg(>rtez&WEh2_!xumL>P{S#u{!ylDBkePEmRq#M-FCp2}6 zrg9}41E84aI;Xc#6j@O{{T#C)ZDF@$Sr>2X8VdN>U;K(iU`BaF=8&;XJQpZ9Ud}p81*@sICnQ#g|PD zptv@rs^yJ#q-##X$r*-6PQMn2c(97eS0P2(kL5yz^`*d{eGz zt~)45*_-EqVYIfG6OE*$4Aa=@?#}qP9vC?;>*l~NorhJ6nbpllo!3iNdNd>4X&icw zTk17R1DFN2YBJOhsZ=5-O2Avcwv?NiE_XKgKI#&oMilCVWa)g=SWEmi)T-6D16Y+) zt3Xtdp!7a4YG1os2u-UyH$Ue1I(?xEr(yeL(|O0SzJr4U=jgqrc{Z+?&V>rj_gDnF zc@vi3JO%_Qj!C=u&dSZ@0s5KZd718E;ttgAJPf_?dRJrffzn8bNQvhsd_yi6 z)sbASbf+TAi0RcO-|qO~R!CASz;XlZ)Ag(AT+xcJe$``50{;Oo=d(r~ss6c@7o>uo z#tlY$LnTT)$EYpZe5eY*Dl8nVZ@0PF0j29L$j#w|COQ_?Y0 z?DuT8*Q()z_QuLAa_Chy5%?g!+BK64Nm-qZcT>jaMKT8V+b?&`+<`~{?BNiGb2x_p zIrKau>UJGrgXF#hi_QyY%?^#m`1)wrbe#u`3wCEOt!PkYmCjbt+O zWwyd2~`7$>mOt^}85bjmHhniW*VtYOuI;owUnk&)sTn{$l!8wri!!sBE zgyWSRv!a866$shsjxA&pH^O1xhd+X-;M3OGhO#8QFWgD^mP?TN6mr=b*&oY?@Cf}4I9bbSFa zQ2o@{2Oc!*k6_?@@PLHka#uBA-vwgGki$1y;gjEMpeXyfjH?E9bI1PMCYaPt^8|LS1I4na(j^yyy{b)6E339ze@Hx3xY~jnoL{8--;XU8w5>y zCU9{1ER@dy8bHoN_s~0e>Go4@X13*wT%)#&-%AJ1F&wX=>Flq7)E-2{@-|kVZmVz9 zrJdCSBk~K(p0uJ1WRqG=?$`agU-!qV0tFC!2Na#De+X^;OHPNj>rIqly2>&}w*~rdL;U^(K_t}& z{3INk_^1WbE%F!-^!$xy1|0`JKQb*Y;UQp!@gHwEFhtaa7P|l>PA*bRUg{nA+)t2d z5RQ%^IBDiAD|>8(i21Eu!dS40)mHhYIw~DGI~%Y4u9w4Md*%8XX)IY;zYUjY^dZRp zOE?LWDu1R}AE5zvoiFdgy@{1K1}pE_BFDVUR+$p7>KaoJY+nC6@m*mzX{kb_PN!FF z5On@y_Q#iO7M{NeKH;(N>)^mcIjDIjWCv-Jfg|&qDSUDJYk3t)4#a=kX|Db+LWcH5 zf6hqf@cho^Eo|5l8<&zL-8s)a>AcHLVMG0*Z@HVS+h)cXS2?{gs&Qy<61U9Tbm1OL z31BPbSed3Ouvgb>44l<%C7hJm&-ADBPB~>d)Dv>T`6`f>OJ(NMo>yx;vyeJs|L~`X z11O1>%r{{9z|$?Ol6$T@k~E36uu%&~@DSB`{IvLdiQ!X3PEqEJ$eN+Dw$C9HuZx#8 zf5iEbZicfAqp>kvHNYWUqbgA|ORTdx5L&C+)PW;$Mq#EM?>@VJdfQS$Qmd{vve8mG 
z9@?>@eVsL0$M_HyBBi4_OM3F^JC~+b_uc<=DpRmRF1u&+)25P#g_kO-zlUkx?c+#` zTaGtTy51tSYhN2qHbdcrEZKlDdu5r{3;&R(+?}z>s!fwO@@cN(`s~bS+0ka=PkwI! z?_(t#9rf(RjO?Y}FQwnEE7bGnu#v5LABPzCX(;b>MQVF~?CAV&XI_r&(E1a=q{q2Uka`e&Et4KJU z*+&lPdm+gR!PAT>({TtDVEAxhaKU4#t|QnoPw~MM>@c$7^*!`obsIePythOADcJLv zy6dQMUE$odgIWr@j$Kl>tV`980QP+=7+WD~Wnh|AV^aaW+L)*_stmkb(0!-AwO5Hy zciOkm$LQ6yRhIB#A)pew`6f~1yMC7fC3UoEgMfWF7_@hhn*3(JDR%VS5_hr@R*lr) zK3Nup+V1gYE0N%UU1EKMD=6SU+Vg|wT0jmcmgUDTpyund&weW}u`BfZpCL=7gxvU4 zyRBaDo(T^v#kc2`uk**|jfl2bD`!630n5M!j6^|g0rlf@YR=Abg}^fZCb}1;s8Jp_ z<96?)5-Ru6bk})xa8aHF$2BRPllyF16^o4go2!_o_;-f&s8c0 z%~T4JV>t+d%fUTs$o~5hP1EZ^yv8lJZHt^!2?)=a$Bq^rxn_@zg?t`YM1_6piLJgY z*7`1dG3A=YhqkXW3ZG;V zNHV?+nMzlPREf)B%S#6XRjP8%vqjam{`yEIe949O3gr@6(a=W2!o$5&BRr}PwnrnE z`?#Z0Js$J>(48lixq(S@dBR*TcbU7#|FzPhGD@=&HUSi`4~;oc^9F+ zC9oLL_eqnVW~^v%bq3z-KvTJ0!&TC69Ma6&g0JI04ya8LUHL(*|6Jknm89oc#E7_w z@7PW{A9Viu1+mIynC&#T52#A7khgHC9y9}2JvhYf$2;>xergIIW6{-rvLpRgm*EvT zDItgU^v=HYkIh3|SNH^>&gRiDui-OOmG7_)`(?(+`oJWU1gCdZk9CBjA3pSbl>N~m zIPz2Itx9A&J!}cyE+Toq7AFxD&DK`&87lb$uZ{ID{g{!nr3fzEDWDI+pqr zP)>Ub571zuI5O+9?Wc~~9Q*3tnAZ;wzzzU!UfI|OddbMmlbOZoACeWheH!U#N8O*r zV*#P$G7N|Rp+8-^<9i9CPBp#fZ?c}603m!D-B-6Frd$3pW6njJ_DaLka1YUkH3ssN zm&2xG9f3{BMqi{ZH#^3uYYk6heg0ph%l}t)@&CDx72dzDUO&g^xC^ z=ZJ8$M|;uN)|3R`MY z?5fG&k=S9|!A=x0?q2sYNW(RXe7Cpk8(+P5l#=!4^2|R!k7EVuG)I{;U*7Q!4ZIpj z^-<~e>VahY`1FG0r3>0n34t>}ArPBWEma)qo7za2+XiOTZBpO*YHW2uHz5*aYg(lF zvX0CCm|sw51}PLY+7ngtrY142Jlk4kk^fk5LT!XmocDOQ;QDveUtALcNi}aw?8(vy z=3%omY^?q<-@VbLPCa6lL)}Y_d?(ArT`WGio(f@Bb)D7+Ew>fm1HEt53a{3*L&tPExg%EVJFI zYGBMujYDvM!wpbnOF-< z4GHtn$-IXJIXw|yw`#|pLf#`Mglo>cEytt=7Kg`vXbHh+;;COc_sh(e=zf4JA#)9f z;ix?Um^{m-ZA5R54tz0_MKP@c?tqhEqM-%zj+x(L0-T6TuxcT*sh_GrR)Uz!Pp19f zP|>STd2gQ5z66hYKRQ^5Yl`>fn1!BMwANyP z90=+#+|Y(tK^>M8sK>G8&|=8J%2-M<6L|^Jj;tAMrV9ei64j}mhf)^WcRlEje-bB$ zFH%eAC9V2QQt->v{i1@J-4F2^}3RWU7(6O|X;aaO~JhQ~`}-R5mBQm?xG6 zx`qu9>@?)0uzZdKgkyt=6gaUaPQ}qHRP=;%p=}{ zWrCIWSvt-t0X!%uKWx-=_nA*5GTv_3G`Ei+fIW%Dr2Y>Hq$~XhU@++eR7cE$%gi}h 
zA>h`!gEt}g%4IE~@Jc}w*i-}rAc~}o%>EhPp+BauCO#2!>04!zhR@Qgels2|&6D{T zPyD^4XeqekPf%mfVE~pOQVoEna^}PH`_$uS#zyhJ;Cs7*nlB zwKw=Iq;69Rb?Tc6u-$$#e6GHva0hv33@K2#pOAg-ZLubu9_9@G)kl&7GTe)q(!Wb$F@#h--dZCNR0K6Y|O|t}LuxTK)$tHb*^E=ovqKzlk6&F0R zflrxlgqxQbKN_mOLT52S#_O9n`6w7blj@`An-R6&ceemV2)Kh41Q(3Y14 zek@tA%@~~&fY8SmV!{dSg-pBoe^0MC|dFsxKoBE;SjCS z0O0{T-yvpfbO`g8-QEERXFs{62wkJ%pxgH?-V-a1C#0F)vpSlKsrbzMWAb9q-buo= zPY!;~QYHo4XbTjQgF`bKP~Kc-wh3f!;_bY9sLanFh~0ajvRMQZb#~AM#OejG`uYGO z@c|acmE$m;^tfw3GV4+8eARCX7&L;wPRTU6%wNc&=$0r}4nAtOKe*`7o0$8Cj&THr zVjdab+IXk5rXVoCgmK-?U4Hi=!l3~-I`)E`{6NP>85kji<;rN0*f^2fsVjTwCIHQ4 zXvwPx$E~J7qq6Zdm<|y4IPE9T;q;IGiL0`2=RV}Q6UcK5o5X~=HO%R9?riwSTEQk| zCKv(ICGt*TEp(rK%xV;;u4R8Jx!Zs^b0Bcosxu6@GzPKFKxzyBW(4>E7-rP6t>4f} zOf`sPW~8NnHco2Wp}voey@4ePmSy6Tnbe)86AqNm+)drLmpYZTrZG#X(ixMPMl#$= zBDb5C>uaS4uidv2*P&Wv- zY=Q19;;0cq3UkU569#xr7;dp}%ic73C6XU+fONl^j(`}sHFAtkS$ol+5_V1rp0*Nf z3xeg@d^vs>cU;q16_M9fvh+o4pRYqf-|$2B-zDsO##~I3J-5_VMHF<^9{=SnNheY< zc03*!U1KK{?es-{r{Qe(67~hGbWl}gW7*%oUto5^uc(M=VI`vwowIsj2_M1VY-1?9cwMQ?%7r*(=Wa0Od!g(G> zbA_1-pREt#EPKbe$b8olUY9QEZ%OT8jE?8hq@@Ih!*iPSLG7!y!uDKWUac{Hu!Yfva z-F#O(2C34!NAI^g=qBel7DvD7D$I%drf>4(SkJIU+#>0_7UI&3l^J;S14Y5iUs@{s za}l8|34s0-dEEo*zR+opM~yKsHeA0n<0A3GIFAWHiEO-xh07vuZFJ`_>};%on1ez1 z*gtN-qv0j`KIqZXS#4b@bt8g9%Oh;Z@;Eo`Z)I@y?kbL5ATLiQU9JhH9ZS`*q;~=5#g50HE?_pUWnUQ5qykpZgc_ z&wIQX@CaEuifOQy5Ztu@Oj52VN{s(5S`GhT*Xwd_iJpF=6hTjQDr03`a!L0@G;aMz z092d54IK-@jbQAxs2`l~s)4~xnvDI{Stl{xx;z-PQ2l;UKR@0|rG!*=z@)7ix%Fnz# zsE=oV1xW!&Xp$eKGaB(GAONO`O;epCyamYl^{Bi&DM_h|7P#Yx!|9MGRDHs{=MUkv zWcFrQzHVjJ99+S{U~~twO=7qo_kBl+j)y}Q(#n;E=bggTqh_eKkC?{huH$X8#c`xY z+=A2!l`)XWm#trcox(RzJ>E#pmfcl zZXp^_(|=o%1r)EB>%|@OYzE}%&R{TtsgvxEm7N1}?RFwJIp*qsZ>0FIN5$?B?T5+B zwu-p7#7(30Nl$QGSo@!g^Ntd!2l0}s3%{+0(cK~jYXy`s3+OmPqR;t{^B(Nj*^H_E z%LGKQhJ#V>fLC^i0N_-L*4Y)H5jXZDQ>W(>2Ez`nlylb?ZJDB>3Dsb!hK)f^OI)WbVgoSXp3e3V6j;6zMdt4q4DOo|KDYA z|2FZ=XVdpqHGB7k&cHwZC0RQkCB%9bJ3;&!s-vdB3 z8>RVjHq*zu?degw`o8}k4n-e!zQVq3^UJG!_G@Ps`)Qiq;9%5AcdToI<^G}Ya!ea? 
z;>zS@TK)g`O{kH$fe-U;-^FBiMe5TT^VwK*_}##~h*-tn2$D1~k0rE{gW96V@0_nsxqzFRx38Zzk*&f9`n<%FumvF5j>2vZ-5R z#!#t2lnVhe;AI6u?TKA<={sg9(2d$(lK%x2e#8NaXLC4rZV$Z}nOMmuKK%8C4}&Kf zqBc=?oulp4Kfy3EBQK>>@mu{Brac0_o^h1BQE=s8uDtUi(hz0z=!GEi>YrMGucl`(wRyG`X_`>uG^wpMYVlX4}Y~X z6v`O<9<4A{&3eFGf%IS>T;4}%iyh4F_HAN9=Lj<${ zO#eo9M~Eh~={C*QwQ_m6Vaz(vFaZiTaX41c@eD9P>X)wcJA(sh4RjwBE*CM^x+GET zmp77}h-!5oj2;>y;)>jfz1tOsY4xC6PBr1LcYV$Lg~}p#NoNUvcxFVrM+^eEE;}d|9#6^swV5W5=y#Z#LLe z97~GV-h~u-B|Unlf1yKLtnF6Q*YGXK>D7$)oYk=YzLIIywS^@{DZ;!*R1z^Guxyjw zUYTqBK5`tLq|0t9-VAaco6Hs(bKz9*)oe#i06JplP zL8%hi&n#(TZ~$&(FBDlk(N;pg1&Y~r%pU7V zMf{;Ic3z(T{X@4`;AajN4@VPjMV0SgNIbR{7?z4##b}kX!>H#fghk<5Qfd}y-&NS5 zykVtbko0KqJlRyP0gB17{dC@O?KXi z2#aEp36?S2(NR~_$l$R@7xE%P8G7iMH|Zdl92&V%hylY!l`?v*m?-n#hB1|hU5Ivt zVAv$jl6sjZ38qT_O}z{PfhAF^M7CsjMM<=7SN1ig^~HlpC~jz zkBsVGax&_4JfZ<9aJ{s|;E{NXI&m|1>&{wj7~w>ORR5Oc?{a-R{j$gAbJ)droWEsm zI_s$`Q6`Y4oCTLcsoJDmj?P1TqntXy^9P%q#BGPW-am><$`$SSdvQ@c@nxJq*{z2hvJhlfE4kl#vq3iMVVH_gsTfY7{bbnfAOndR=fjD=)uOC(!|6;ga zb%&3bH1U5J_qtHsSxA+ml4)hU@e#0{KIm>EuoSo-uyVF)mO2tn?jyRXU;FFyFzQK; zZvhZTRQgGPp*}hc>g^D3eSa{=5{a?)TwYd>x{=}Hu4fmE8{YftGRvy=K0wZ5oTAZn z*)$#5gX!;es~F>+w?@jLzm)DBaxkM285UMn!@8M*29PIRB*CT;dXEQ`URF~iXxTI?^XQbiTn0Qd)m`PZ( zS$(zBq5NiDaclJg&9BzNLY?zrd(F9gu(JzttTmK3w`EB1pBRS9Nst z23mwodwlXA!UHf9>wB?%eM*JyN-X0Ls7jOzSG&7!u3lq#l7fb{ci18KbAJvU?|H_t z!= zj-?Oi#8*F6PQ{Rn@zBUYkT9cI!7oW|AE57XiRAUiM^&Rf{EZB@$+n}K$@8NYJ5cQ7 z;}A>*nbM;~`-~RP9bX=_P4!zWzaic7qdwWB!jz9J+z&4Eq+@|PH((> z-E~wTnL2v#DUDV$PiHO1V2e+h3ae|k?0=}zhQy_WikAJ*xgMPxek5*R>gZrAZzM_C z@&zTSo1}9xSGEYPNf4nTMdo^MP-p?5RIZriMko_po^-%98io z+G7h0nT1i&+X^4^SiiAa;aAYWXYnpo$G3O(`0>Z`zVl*Q8s2fVYq`9z>z%`xyNCoQ z{B|Cj<2WOz^7EW5GG1>`?;i2kFP+|^@?C8(z4Y5qeqVJSCnM9HiQ((=Y&n@s{bS*O z@UQ55gB!9(!NOGL!IsDIky5y+&doBs?|wAd;(qg^-am#%!C$FA%Ll0Bpo!foFEru* zq*JXMun{n*c0A0it3^#o$w%QM-y_|8Yu>)rPi3*El>RD?3mMiJ*W@1V+A!oa5ZYby z%F#(Bxzfw7;dYqnkloNode=vvXBAz|DPwswGvb3Er1$VpLUv8a_4S~C0k;%R#1q(9aQaulFu~H9p|aMNQx{T9pKdW4 
zaDsc*#k(FmAbfT}L#_dl_v$>39a(U^{Lv>@dkZ$n`@Tg}%JqRk)2=^4%F_%lycf5r zwyG;Py(?T1Qzbmq;uNSXnrQY^wl_Vo#?C*rI6Ck8ke#2ishIh|pQ#pcn{(!Ac6bx} z?byxwu}RExyf+j0GqFF!`bJ_;K(l(@_7AR5!R19qw-kCp{v#OL-AcLw3`&Vn;}tYz zdv;|!%z5J}&}Z4~;B4rz;=S;G2~CEi_<;LY>Wi(TSK(?+ zmSmHkh*j?&s$QkR8Q#b-c%UpKvWuFqBnC`eNL>r24p@q*bcH5{EH;S^h4uvm@UksN z1^dH?@p5&BM><{P!%sE2U!p-5pV?&qyGy~#9SGluwwaD47IT==lPFA{L_A@39PWgO z9&L4MD8BIhiVjEgem2JJ!fp|sx}$&24N>4unmLuBrjj1w-}GA#^IA)V>vAVrS~3aZ z)s8pZk3lWig||UtFwQvdBwIz(%=gV;G)iB{8yvFW3>F}qOhk|_R4mDvWEa3T3;k2v8(NeK1Xw9k+U z2UZr31v&pk+#-l+33)Dt7?NG|>G@VvqoLgvvz0VROb}VHr8w)4>Ay`hrjND{9?So| z-u+L(OK*F(3!3F!C=T=VERz_OA3iRZ5kKY5>&En!^aw|hO~h<=`+l5=E~VRkHb{hF zT}l)*h8h--z!5^Arn2FI-O|y44W%^0G#e1b*G2TLB8kz91Kqtyk#t z_mEst2NPe6k4f#QW}>HVyMy_#-Gvu#?>qLlqzLWxpILNq3#Zr?GX~FAaPtQhV}9O~ zfNsFQ_pY$Mx80Lz$}2&Z!WslnSc*Q?SsDIl)JSB9oI_1`I;sXOU99&uIvDs09pZ*`9M*1(Q=L_EVWtaC10A zXQOb1fj8bLKBcLaTXJ$<&wA1Df;SjTBZ4h|X`1QdXERVl91TaCm&~4IvWYSq_hE@( z8*iE5;uVr#pX;uNaQiBgQN+qteyGLEzl!G!o%5#`S12rN+-^tiE)Yv!B?**ScYazS z9zP2S7|7KZlabhPbS4vTq#BR};K70LQwe5O!lbjnLm`G13tJ|9V^f|y@#k3dP98Wg zc*2XnFy(WkX2|cIFX~n0wPA((*nGp-)0*mo1ydOPx>POTafK=sd}lWKGoO-21oOi~ z$9%%L&7}Y9M^c>OX*~GVYN?(D=}M!;WHw?RL(Pxu$pWW z`^;y)YNkinfPhQT#EhR>+0Q6_5?y)fpu;E7odnAelXaWGU=H#R+j~k+Xj?CX)rX8> zEBG8wl2U{0VJCHal^2!Em`}-VCe9HPSJYDbgrLxy{#y;O1`AX7=hiykrT=$Fge_jy z`&h#w9jH_B^g!D!MiW$Le99voKd;PowRnduwn^29>KaTM5qxE%#CbuYBDn}`Z(#a>G zN>3o62qq7bYi|bs#-Nua*^Gfh%VBkHBWVegK}M(H*8-s*Noi>88aeS7nyocXan5yW zqv5FVQ{Wc`h%i<-b`q(OIAz-MaP9x$(j=K(^27!M(+$EU_YVXjKwWE05v6%j*Wrnm zY6$P%@r88w;jG2$V9ryzSbpb1qE8j40!)N~*x!VMOZpM>L*O8c3xme1Bl~$C6|6(mAy`S({uW#1W&{QY2F$&K{pT5kD|3GcIc+ zJnTZzA|CqO8u}G!WJv`+Ic&+xBpD0s43qU2WLk&>LwQ1ws?zTp8m!EQxJdP>->LhD z`CZY|MwDqn-`hfJ^%7OzEkG*tw(erw8e0|na|@l(hzElI81r#4(&Jl|HRoQ1qF(u5 zwhc9x+;l(8JG_vtPVL7a@jv1iH4Fe&p*cN%roD0-z^YmXxlUWYJ0eX3Ws4|$qCYPe z@O3PIKdtdye$Ds154l4Gvp2_sW1^|>>c$p_L#bb*(@WAci|s73UcvKbc-dgxF@C2r zE~&@a9doW*mU!=~&o|bM>ZZf3zg?er&yTzPB;*HCzfQP;<6#n;Mh}&e1aQP^w9|oS 
z;;Dy$b;FYZTdPH#8Dq{^s1TVlB@apgB+UlddC<6~9I8z{s2et6eh??HEJnN`b`<@L z?%KBF>ePHErSPT`ukTP>sM48VRd5UJ8Sogtetgm6JnCiuLk6aki_#M+GH57z{K0ak z^>sYh-urcN{Yw+`Oa40jw$;x0w=D-c1uu&^)IC+m>G>t-^0~z`b25w1!}KUMBQM8f z%z2m7TE^pU=2nDw<^aa9VWEjnX61yz70{7#s7!k3{(-}wZpLlm37|wT-AhtjJ}p`k zG++gU*(9@t8PDR)$#V6`cvS(NbVH8Bs3l1ZQB{ETj!UWmDrF9kR$1&qhrA2(z*yjP z3JRH(5QKV(qcNzk8UlTR1~R&wx2<|3*~8 zU845vM;Q(WuE&bKb)^rrx7bvkPEi(rs1)fn(#Lqr_v0=HipIq+rAurLC&DvxXV_mo z*@a^%c421#k7OOYz~P5bim;I^=uPTLGUTpkaHk#Y5P>jliu6&D8i5j7h^#Z1ArVq; zOC0mL4@}sbIJX88h#)nEDFx8{1)YA2l@aYSV#03s;oP|Rl{NQraR!T~h#5ypVkQ5K zZ)}*sQjXNv#_2dbx@@$|rNLv!{2=u5b^p77c@h&uB#-uV8D0Z0bsaU!<5;q`w!{98 zv$}%*1r$zjl`vO1@5MhG{y}m;jM%bSWVpn90It?|jpOcbGOlkl?HDr}1LB>@=_qNM z4(Bl(P)1APXwyU#VS4J*5`mnM^vBQ$KuDHNIG-Z5%EH+jt%4iSoW|Oq<@9#&5rjVk zt9|Xo)1WGEd*u{G`b(7=Z$Z<2Gj*omKq5cp40DriM~`N#!2v7+4EL}$7YA3j%%*Q+ z@vuFA^`cJ3(C_S(qhABIomQ#zduKG%9@4OC=1U8~A!~NKQy1bRw|ks8y?>`o_by9$ z|61c;Bv*y=VDiX!wNcTCRf4Cw?m^G!B?UzuH*}j^P@*;$1m8%D7S$~-_)iC>p7&SC zh{$K(ctZT)V~8UfA40kW*wbWwC=K#mItuMDOU>?s03;`mPqP zOZm~mW^J)UMaG(z@ej&2^j28~mmhzd_I8Hr9))A>IuAEoS{VFIw_X2R`LnLYtH zpuT1zArgFG7B+6Ys>0i$kjK+tRn=NZQ`$1wZoz&-o5 z?|>cF8p8ge`WQ4x8hN5R+?=_EDw$|==2xa|tcL)a$xUH?bnkxd6oUN=L**%x_ zf`uetZ7rZwPxK83tP#ylw1VNu;K@BqtnM8bW^lns2cA_0R&} zgm;Z*3qE&t=KDIf5MF354h$|FXq+9CRB4p&lUJLgn3EOAHQ)VMgEU^ncy?9MNpHP9 zaRsf=PK;m49VKQr$S0EQ4(OPYf2D+;TZ8z?qWz$7jC8#;kO1@x?-kxC)ObFn5oGjD z3f6Mq3IKbO0@?s$l2(=63vk_7KkTU5RNp;P6{7eVuPsWkiJVA)Hqn7)9^J{OMPBGk zMaUFrxxo1@NB;~8l$TdTVSo7CXEnC=QjAAf%Pj0k9}pR7tEj7GZ0Ji&9y1EBF3b%( zG*VZe>;6r@v9?ozzX$)*5?fm18MBs~anuYzmf7Km0eJ`!adMgc8Gw4yR6Nj$v#<1X z>B|_|d#gWlx@Gohk$*n1p0(ph`iFaG{}`1-OtR^ywGuqvqMm+vh^eVOHlaoBA?g^KMMF4PC7vk5^7=QaRe?ek#2asMLX+=hb#Ou86qYhfKcrwwD| z$U!p%r~j@*5sdyDCmuOsqoJDw&h^x`sf0n3g!4(6g#g@5h-QB(x+4aP-L>tskd*^W z8t5;yFqh@6ml#{V5M9M2pZY+TI62SKNO`y{NsVlR)-PFkbdybWSG;w%zTC}khB81V zTQR+6F)HZ7?Vxoe*9JKzt}~Fdy8Od~LJX4q9##Igow$;w0gvCdR%wID2@b&sfxLr9 zT;8?&=+_U=ey~?f<_d|~OEgipGADa5g#i4GYT^(eIwSZ?&bWGDe0xdzPTbRa$g^+6}K&=&hqgE6b&Pl&#AmThE~$aq@6a-Z^HOAh>l 
zhZ9AhyMXFfCX0jeEdKePP)LiiAt{U-NXfIOla|WuTO%owd6EMZlTDD!K0_odxe}^s zSxQwl+5*EH>RxsN9MrEeO9H<6Fr+s6vhn>RRZkh;>J`5-R&q^VR!@xgj+*6gZ&=2i`{O zW^|?w=&d4PNMc=-Cy+!LHswW;UQ)qXR#$o_hEc%H$_ck9;tU4bO_!`grP7*X>imYX zY6IJSg&8bfO%`%NunqjjY|lg{mE3AtzAN)!k5Qq6AV_=2P{ck)qJ&2{e8| zQ~V0YC`_|s;1c@}mKdAp#3!HGO#3Tv{V^3rfriOQ5#v(cQWPP6HBU}~KY`itU~fAc ziNp^ISF9zyXuNb9Cy9k?)Hl2f@(RC{<@EJmnzAu2x#IWpbFe+` z3_ny#RygzN^(x#P0PB+BxwRM^r#fE7jbHVAEQDRyp5*PxtVHS{dmUvVn~a?h03Z|5 zXbejUw%u0Wt+1~mKcf(gtCe61JYHJ_F0!21xwqCD5c)-uXLf6 zXRcN=XPVI=X&Z%_h{rcs#Y}>OvoeZ?O|)zqLy_34w|(f0DSYpc#hKI|h_$tdcF=!Z z>A4|n&AWI2G@$ho>RoS4HSvdG6BFwKTPG$OVP;pi-Ckjt=<}^mcav`w z(0%frY#dDp9^pID3F*V#`EYaq)4!PJ@5o{WqpuRqjs)%LJfhpE&T+=_ZsPU%ud6Sv zMG=U@2OaR6u-j}?;d;efXOh`y1B8-%rvVc!g|l^48euAurz3sKpQ$Zzm+w27M8DKxt(m3+cI!hc&78SrM4ZJy)e#V%KC@M! zZXuy6HVqDLyBB_n`VV5WwL4zAC67szB@&@bx;+}EAtf^)b%)7!D(~fPZQo>6q}4Td zfE=U)n&9%JZGYAaWe%&t^RS4*bHHq8XU?132V?pqYcufSJ-q@ zL@H|>wQdc(>ckXRDH^GG8A5$4LYj16JQ(M6)%>-Zm=*B!C=?7`Rdu7nWLHd!tu*&) z8^iT^%*3#6O}`x|K|DntUEfDa9<<<4C{W~uiar=o0-2yU;r*T@1P*C1B-r2?1BDhL zNZGuR+R4!`EO5ja$?D^Ic~#%k0&r~y$DofxzYbOncVNbc4#B!hC-I@Tjd8}lVMKKF z(sm$l-i6w0{QZ4g4sRZElCr_bUt8;+{8q5A%7VOp;V(1i$+4CGCDc-Mag(lx=Ycqu zPZQH}YTIZuZSp*dA0UR$l?LAEFdrhu+3=dZQx;D1nMqH4__MqViT#g`_@(f^#b81( zYrHNbeG`mi4o<8yn7%iZjvymx8PolZunbl^_>IAJefD)EP3?hf`Z}h)aAGg|?8c21aqz2b!qA4TCfMblSfSdkeI`R z_RS|7YHvH$rWdu22DRvzQ^tukSy%hY{@8!?YySW(vdgw%cES?LT_-j>@rL$!_B!`)(#| zz{9+C>GNcVd5&#U9_s+QQ3kMhu4ICLoTav5_p@iQC}$$uOfY$3HQa9**_JykZ5S_2 zqoNE4e;f35)ZpKs4R69%YGiLf&)gsFbZu;4v-a&a%>~Q5dV3Q~0ZL;8IW~=1951@1 zzKC09EqYg_<^?h4H9JJDzxb%os|AuNSjfV%7AYm>d6K`7Zj^QoGQv72Nv^*Ml{G=L zs{F=s8O{|e9x5;UgV?;)(m&36J-&K;&*A=_6$MKOO`pwj?c<9JvY3|S-o9Nedh(9u z1O-`!!mtn*jr7IWl7bq3{+W%$gL+;C!!>e4j4r_}s7%5NA_S}!^SVecT(rVVC zwXcO?GhMTr5ph-l+@>k8BofeU}y^uUGl4quGF9WrB4|`gJ zdCQzHQ*9S4U#w^FTmfn|er_3`*HQW?0sS#=EsLZzetbQ?FBN3Zfpb%*H{!ni;rn#5 zECy*S4@2?yOv<7vr*P_i`HMI7@KZ-Gp@TZ@3`Sc$(Dl=&>vdA?o`Q;UT@w@(^mKN^ ziQtL5XN3GVxy~f#z|t>} 
zT58r!Y8A1D*4(jr*|W7p{nj2>r2Wiyo4^riXS3l*$F7rN?N81Zu20PWs@z~ueSQ`; zNBMN-t;q`xk1m?LNk-6m=9dNFP(Vivs5w*`#UOq;dn1YYik zr&BFz{L-Ck~nlworjgQ1G`mDUA2dqET$!fK(uy*;w#Vu54CT-js#d2|6?8b8c`M8%Y8d)=jyX8yos+ZBs0(_Z}E2=WL4*Pu)GZ zVRT=*dRL?kgJH9qHe9Cnn#s-Px@D6uQ;vz{Zu$4$f0usPGO=8vgo6BWR7p=Ub{gEP~>+zp1qXW{8j*hrPlwWyRyYcQTnoW8E+GW5aCN!ww zOh#^C7cT)#;jZrPH|BfSEm)_U?x?birVQhU46u;bqu48%Sa%o2pCM-1v}+_g4JAgX z#q3G(NU*5Wfx*QF+EMfLVz1^T=HELQXK+rp(WR5Y*y~%)@~xgzy0O3F7x}c?PfzU? ztle&Aw!Vntj9uXRiXxrwNgb_itS%SIByx>{if^~rbZ4}D{v_-fcYDX!*K50R=ktFo zf?JpFOwb;gZXDu{Y5?Yj=ybc`*)f8ITwGkVrxb0o{-X4`A}(w^=&8QRaB-=~aLH(^ zOpq1x7#J8(_usvHx96PN&$VeO(2ZbHrXmC*dGuL%l6}XS{pn+)4#ri9k$;^(e^P{~ zn*$i3y9cj|Qf@uZ3NgP|EvM7{j_|vbV9LIPf8)l(s(p6$_FwXhs`mP(BK!g_RQhHGrfIW^XJdk7FD?~w|x1=qpt9V z)aq_7d{lE0gBj~E-xD{(4CYxXDaQ&-uFS1-VLlhe0w;{=2%&gP{NsFIdsP^=!Pza+ z9_-hV1f}?t9^7F@oR&{c>0kfMZ(qH3|2@f41^Kk5yu*9naOWN~8;JE`zU|FndBMh+xXFGx9b0j<2-Vwy~J^!n*3!xXfO?+i)(-_@3ldFbNjw{sHRh%?MzQk zFKyz0M8^kF)P!4w=fTJ=gc#ii`OL=!^=!L z4{4Qr3Chfr)Mn-mB;;>D((0*WMs^i&o;AQ&qZhB@y7q|>dP-TRE6o7%7VLA(|Ijkj zsPLY}mg+bVtI~_eCbWF}Da32m1q)W9dU~H1grvHrr=_@z&xl{?!8NFRk06t*2RMCi z9suGY4}%I(zOP`PQ0be8yM^0!`S&_kE*@okDvgYvs~Qq$vt@UrMNmN<>~C3k=E<> zEvizHfXJ_N=Dd5qi!wShwf|V&r`G-0`Rmn_twLG&l=g!W;6_;zO%Fp$7HbkuQx%IO zR4?0oJgsYr{1Um!$_8~}XD$xO4fox3dzXXC>?Io>_&Uwy*r5ceY_zYFFVB%Z9$XSp zzPkj}9!o;!bKlnqLfXA32kyOZ7{b{h22cwktM=``px0iVtcd>cO6VuNfY48r#oW2G za$?o4+%W=SKsN@#UjZ!$LP}F=z83@2vF7?7(_uxTk&@}1?=Kwo?vPeBk_Xi`y4Z#&uV3SaVACcGg)qeL#~Vopv>S7I>e1d6^&8(Yw5Kbiw$b zQ+YEEC(%DB$xDX}a3u<$vb=QX(ShTr)UT@aDj~2D7Q*HTljkxy+TzWp5&t0GtV#tv zz`r?i4&BH-=ZC5PKrTgZ-zuv11vDHSzYoUOoeL)Y zbhqciG-WlUhu2{%@(RTS#ZE?<6@2lLyb{j#7y*!YQ1Xh(FDyJ5Ymgh_>x8Xy#B1$= z5Z_%l?p%;|+gZLFW^1Ngu6m7_a%pK5Q+AnWFOS_9zxevVuJ?162_Vw2@TP3r10bM3N!g&Bmy zJbtXE%o{(dmXKV0b;Ry>HeBpX-68EY&+V3tkXmD_5J1Bn10&+G>q(U#{d_=u5Y6H=1e+=pS$Y1a*X{%V_F2 zz+yyk} zKXlMdor^pU$f^KhFtQCSjRqlg-(3U!{ri!Nfre+Z^6f{EPjFxE&ceGIP~Q)Q@5!7P z*a&^%vqBB&&BCj&AIQJ}l7Wffb5VzYuJedJh?2*^*%Fo35eT`ZF*zufHUCb)oIe`rA-qFjj};8CIkzk)#TzIKk> 
z_QJhtH#gs>t+sQ{cp@w+8TGn5!Z-dOGA_>hk#0&dYihI!?P3u176v5uFIv0* zP8ohKYXeqt3F`+f0y{v1Cn;M#{lk1_E;McNvC$y@4iIei-KCvouS{{(*Xf3SIf|jT zdFgo2YnBRXz0M0N_#n4M@@+@KkPyZo-|^XWcXeII!p}!Zzrk%b>zgQjqpx2;Gpz!X zKNdfZqdo+PIvaUh7MDYS%R+&vVNw408LG_fmi3=zODdhzj{Uo>HD0H5%)%Fs!e-&J zWe*-XF!#W}EUwAizw&0l{5W)sWvZ*IC%%D^*-_AyS-#OUJ9QtRFxC#{nxDo-Lyw>l zSaTyoHiau%!gLH^ZOr@zxufZQXiPuH`H?JT^l#~X5%LvYT9BVEXK$p4}5Wt=FW68xD@u+ew z6~}KWSDYV3+i1Or{pC?;!>yn<`dv%MZ?<^7=n5_TBG>WvM!)cb+>b>me+&cF3itLe zq$EzOz3nq*W9XjIzVjI!86xP04<|P=xfezm&O-9dECs9!rmX`0sGL2=y#NH`T0na* zo+BaB^PI>3iz4g#tXa$9nhWb{Rh!OQ6!VMLAKCz6^;4)J^LBVJB-ft(W33mjy8VGb z9$#F#hoW=$+xvBfd5Ok!=~DgY8qLkk|7u0%+c|Y|hlA#uTx}m_O`ksf0NRK2Y&M_z zAX)L$`jHM6_ECxDn+;xLLs8zxc2!|UGgfw%4R9^;SLsbS^I1XE?kn*wsM~IAHTD8Y z<6&u08GnF(Xb_Z96DV1Yino~Mq!WNA0`%&Wgv1sB`6HW`dgwblJC~9rMOIEuj)oZC z&1%RL<2w34R7?c5;5Mso_@i? za^sn2Skg*fUY@^=9^(Aty*Hl!2K4%)Ru!6rbFS+fKYl#s6!0D~1(r&yDA|}W-xT7B z&9hajVvPz{M7HHm`?G&&2y~3mwIb9M2qqx>auM(nP?YIdjtp8yGGBP?u@^Hyi{b`U zR?Z5MPPOYg)1(Gr<2|&kx{)XFa@FJ6O29#e!YuvG6HEQ^D?|kuneQ)@4+$4H56^zF zg}{t?D{MYDHE~0zP%7w}6Vv@+-|~bnsy9ZXN9uqwo}B;sOD@;PsEuGjDGRyxU(1|bm}b`%g6@(c zUE3+Ck2Ad%6ZZLf;>XVVSzWk>`9MHPc)q-BoYnN7rEUs`%kfl{>HGe{k>3&b*Fj|M zh20N=9F=B1{%sA1oC$FxHXYTx=gytOuE1xdP51Zjz8^juH8FDz^C-X|Ugot^#M7-D zUpw?gAI8~n`tg5k_IbA;(DoD5eu(uX+2#w;CNS{()U&5)o6^NZ@o(L#gwTX_0#-F2 za1>=cYLSZ6vvDfDO`V-WIi6g=|B2=`Cx=d5#6HMF-x+JEXGNijTWkKcRevf)YAEA0 z>CoF3&1+M65v9D))Rawk9Efi%$NUyI0zB}706b6L2-4*-dZAXC3P&y5wZw4DySG6l zA|U=m$sVFWsEnqc(u@5%ojI3TuM~MT?=~F<(Xh`RY|scReh6)>f6oIhQ)! 
zkd-czDCw>1N$#oc9qrr6QQN#Jab&dW`y*_o)xS@!F2+xG{nz6Nz+M31XL|}oA~OMm zT+0h;ZYQyPT(NW4F7DlbOw+)_d=izZnMCUzK_4O`Lqtp>1s0wFYo*e=4N&36W>t62rS43N9NO&f5#{t5f)(aOAiiyCc7Dv zi$>%+%$HpQLNUqNwB=V@@a8IN2;?d8%%twxqlFVZV8&|d=C$uWHl@l%Sv=-O{Cwqf zR|lEng6nD?Ij`o(mv+7~hq0jNFRuOOEqDB{9lc&)Y$fsCZo^nQuNqR_Jt1enW7lO~}Vyu+z%aKp4(4;#Igud6Sh-=^7MOOm43d`#79`XfxOOdGqF# z<^}Ef{dZ<}>?hk9Iv|#?rN?5k@EpF2;4r_M!UaTcZZ@itNSxA#rOUC??y#Wt)3xX; z#h7#5VC7eN1qv*(iW&JA)IxfR*gMYS?RbhNno0Xb(c{~I6E&t>l=;r~&Rfqd?{d;T zhlohI;k<71`=xKT)>YN7>f%aHo|)rtM^NeBHF2wVgT`-_csE7dRAL)*XM)vHCvl|G z@o)$|jlNmm52in}=edu}#EZuN%x4wS**Spi+XU3}&f^t|bOX$kYg|I?l7`X!>Bzmr z*P@gu<&(oH(cCLnO7dUFKtwLmdHf>I2JxNy0={})=Q*-^09=3k?4GTjo2(l5M-tBC zY_-Or9UA=C(IV+yHg;CWbD?;33kHZB8uj73Aj^LpZ8qs4{eWW`paeRbuLde4oK%i% zYlXtd4=p*$-nv^b>;fE8;n`7{0M}k$IGu_*_Jesg=uJXLE7I5LeT2Fq20J5JKd6Db zxCet@s!TX?aNL^!FQ`crMUrA3)@|h_PcsmpjmN9jm|BMdFYe1oP=~}k(g3VC+kZPz zk$gG~)s9pOBPdKxeX~rQ@Ig+8M|G)DJ`XkaC>)m&sIW3{`sBA+dG@UBtSfUoI34Z; z*RH=UzL~-BT#qDldX$1Tr@*lDe_MA8TB*PU!ded!_Z^TaV|7DotI6D~v^8j!&YzPZ z=eMEam)`|}lGu(tw$65EWd`C)#8^;fQEs!pLLJoa{hrr%Sf7lvyE3$^P`?or=%XB$ zEAY}b$=0v_7?JZpDg4diUHe{MS$=n9)0;5)XrRF7cG-PiSZ@Fem{z?82KXdeRgF#E zP5FQb7$q2UyN+A3vPgEhD!ip@E8|ZKY{9C_>svC25!TG$N#>spVD)?Y#$~sR*UL zOWJ#>-p_f3`~JW0^B(`}IG*G2lt*3H?>xWf_xo8VCP;tuc%>kW|2^Ic;$*|bxXH-| zFtwN*s9Vor?(YCM=8LXbOu%QOv;Ozg3=i$CXU{gx^r-b8O4FY;60+>b2o`cskli*} zgU?cqg04D8(K+UG5(ebg%3YuP8;+2OLEfPOr&3bH>7WB=Ak0}!>UE1|MX0ELzc~F~ z#k0aY;Cl7LXnu`df$AvkAWPjaJT~To^?RMZi>$Qc_c!RVkD$XpD%dYJ-itJbPc6l zEpR8yB}AwK*a#HXh)E^u#!J#qp*(KvX=!@;^yy#A>M8Fft}sWu5db3kc2~=7(ejZG zd;p*#MsODaJb9eIMHH-_eurMyULeB%?ftFMbm<$>oB=n_q-;LgzyrqD(}aY|_M{zC|!KBfeX5#k^Z zf`;GHztr_i4xMdTbm}Xa+6Ovpf1@B{ygb@nNKL(g(l1*7eu9uE{QFx=1@Qd?anCti zU&e|?m(l|>j|32^IUd^&AZ&lUdQo_ z50mhqT(f)Bb27GxJF4QK9&~{6=>nj&s)+lr3ZCUENF@+Ffi4LKZnim{drIcZOf^= zp!dGksR;ap$Oz=q43tWM5LxaNC_Du&QZF|h#ZGszxd}aa_N*s<$SD*Ha)~Om1=uH# zp>|Tz{74ZwIDh-z_D&|rw2>n=m^bv4h*X=e;)aFvB~*R4DeEw*yS4S7Xg>D|@_5LL z_3$6;us+kI^%fyH@g|*hP5(Cl-S1V{*w2bGn8sqVd6(=qP7bE8{Cg{fMJt4H?INt! 
zyucnmFFr!Yt1ayzS~OjgiN$0^(`?+$Ixt|rW~*!et=qRczk6w!wf1);yh7xR5? z|CWW4bFR$4e$V67zo+NRTQQybp;#;X16+Ga#v-DrDfleyw>OYby};xV*&zI@`!A@k z9+t}Ptxu09GzB)%u_1K3(Y_9ISfYtLL>ku&ufgUh4ONI&=OzmW{0XE!IG4ZDB&1{L zluEqvE&|$EGHUc6_Cs0}bVQppG&K4T9WZS>E)^=0Nw_OeGVzqU;+l!^BN7)+3Lu#m zrrw*DOtaXeI=B(7+Eo1D!Gn0y$CqhLOIJ*a>?sp_!sGHVmo;>_t!ID5ey2)#z5S^n zDM6plp$`-``SW44Q`YA1B12`Z{=4)z8%BUf0&m4T++O65%or(*-(y>TeQMTXB5iNI zpeDP>HTw#MS-Z^d440j>C+p4qip>Kw3_N>@N(h}lG*VJnmkJX;aAhunVB@5sPiiCE)ney8lpDcek!usF3+dY&GLprcB(9I^NU^#ls{{}qyO&ab3PzLR%O|uN3HA|yJ{2$~z)S-E z6I!5diSxbH>uCIj!yP>t%=|p#XC1>Espk^4?xuu&R=OJ>{FDh(i$}O-M=P}-TI+(u zq1iJO0F%5~6rDxFhWych9Hp2>W5|9C{Yo0}6^Hy8d1i1+$uGdqIZE?ZJk&Hli(89; zbY*QA9v}betA$eJeM#Ns1SE_v@#7!|NWmUlwQAL;Jv->m2BQJi&jp&E#O$Z{-0-MlhOtGBc0%wz35mhM;hy~DzvG7zZ*wjrxP z>txWjeigF&@sjwL@#25LIFLi2Dtvi;V>#c(0{VdM5$LSGfC?zESkbud0sXNG2+&Uf z4|_PkGsG@_FD@@HHbdQwns&BJ}d~gs{1l6$6NRAby`ef3EAR_^I~9 zDa`Wf{25yt8{lDG`|IXrXrChg83_3ZX$V@f+N|Gt;o)IVLEq-T zbOjCgK#t83h=oAUj`1K8tplJLc>Q-^_dt_zy5QETvh-{~fl94(RI0Zb({c#n>oAj& zf+)Z2MOhN`_8$A}^^pQv1#JqV9W**fhzV&yT7HzCmoS^?a_ znB_rEY@97ru15dC*^V!rWAwVHytL z4DeQ_?f5I8@VXMO8oAMox-*NuqIVAHfM3}MpHeXxkr<|;b=wYIR(pS+?vzkfC@wFg>3uKd$0@0{MB)q0s&#%W@8{Ww5@m_Q zN~PDlme70}>B*0AztngdM}UI$8#hM$VzsPA+$T_aG=7(!3x5y;{1EbT-o0Vlh8^T& zFU>n(+_ZOoejX_E<;$02{`&9@0!{)Lpet)V&XH480p3a*1^*X+urwhaEdZg7wNahg zIy)m9UBQcRBR%dhfke}D&0>{Y2PBNAg~@*v2LTb$|7&0Z~CNy?hDl#=0Z#BzT4h z>{6nJEb-?J!z?Y{TW*L1sW{5n;=y(9*>SU(F}>afS-NMN17T!2wRmbBD#~U_@#CI7 zAPH>ZZR1?ed(I1`3g^UlX7j#qDV`UO(=5$Tj;Q-~u=9h48kh4*sv(1IYj;cz4Sn>T z%P3mwYmv<$I-WLOAa2&tV5zS%*gbVXE~$UG`SSlEm+P`Qob{pIV1e27rxjG@CLZB) z&;M2CKu`9kmEdVbvXB?%)iU%2d$8_viGZlZ0E0gvc5z~mZu8fy>HIAYp9}$mNo(V? zS4kdX5iq-;0ItTOsu`+@l_NxVH!fOFwAfqv>9Qz3`2txm?K&>+pRl$RUwCo$$R355 z-C#W#NE5l#d9-i5K8ma8^zQuH(0BLC=99)d4%9kLcw%|Gin+U=Dta_)AYP`bbS)tW zcQw8_Z>;pW^YMlI0xBvMYY2cIBsUWL3qW6#g~v4G<`(tg(0m280Tlo@$7QEJvRj}GPa8pH~wYBxNg9K(Jy$Cr{T>}Z#1_P|G1!L{-Ke+I#7&@~EG}Yd_XbbNy zIpnc$|8Z58sS2l8d`x+ukX-gBt?#c|<#qb>M~2G>Fzv9;1ZY*X)*k-Oew}t! 
zb}7%J8>~6f4uEIq@%za2@)w|-)o5cx*2=tTT0|2k1x;pl`E#0A{Nc4!Z%wlf;Y`t9 z`vLvs?ZqX6{t*2pj_vbpMILyYRL;FLITk95GO|aT`Hxn|sX=f=)I=~-F5sgA786DE z;ixl%qQMZKVIt8yfOKcD5#uPGL~V$^>lEmjP}rWzXpCPXZN-u)n@+pCw|8q#BDp$V z8rrr36%NU0Lw0Vg_FTqa7Wnm**d+zNXsf&f&+96+Mr+*Yk@<^@ zs^9=NlgD1SZTOfc57im}kyRFq{yFAUPhxgvhfajdP~;_1PFGp>nI<(Yd;77wlDISp zB5A>36d5)n@O@Pv|4uxnBRJNPpXBCWIu6S4OweueQKL9;(|FcXIvVw;vC;dsq zVxUsJ^n5R40$2^I>q0O-!_JvgY?_ZWS^@D4lAM*l+Z?H1jF1zRug$Wqt)eI zd^0VX$Zd9u!`IfI54g$W%!S~D2e=GT4G<%FvD$*yIgaJ_+*b=`5ieX~ECaIV%G!^P zo6YT!Ciq+);SxQ~`T1lGIDglfxbz5bi}({_Qg?Es@FmSDfmIaE@8giE=J8CTmnMR% z0FPfF!?pa9PxFhTGzi%Jzo**u{kLx2(k3$^Uod7eeKODRAHmNhJCXEBDCm@lHjOB8 zO7j5h>oLuyCZo;EwCsmwfqwJmYlfCqV1;`Jzf|11eLL1E$Wm85+$I(*BA9N1Mwek% z?O;kvx0^IlYuuVP#|Nd>KPd*a@<3~&NY!pR!Xc7?XGUmaFgwZlbpTibg{|Vi!^)y@ z5;SNWi;vqj>el@-`Wxr{OSnH|Z@9K(TR0PyGt(j4!4S%uETT-S|4J7AmGn7ZOLv&H zgt1Ak0~m5unxRUjMXw*B{i!aYZ4?2e=?mzsBvdq;^e}O~0MtjSWO%W0z{PnYdJ*j- z8UHa};uIqVg7V~M1vJMU!45i zLq1c(rG=3oSiAv%>nm<6eDiCQA@bIq7hRK zj=BHv1p4F$i!i95LSHm~t+y*Vf+vtH;bk-0rB{@QVM4K6Qvvy-JQ{gmuM+O{9yBWc zL9fq_Hp13wRzAC;oxVoK+m_w;&TtM4I}&52a4cNAEQEh+huKEY?rRHwHry5p+=Opf z^Vm)faFabx;AVm>@sprhEbJP@JZK|A@OPP+XodCw$A%Z0BPrazkv#mlLw%YaNn zdBaxs=sP$90Z^JsN!qJ!^5Ng&*$b445N~=78Hq5MUceXxqdeNT4xGcD@Z&)v*Y}Tg z$+MA@xpgD9xyl08ZzcK<%MHE~?Yta?JfU{}I@7i1=R?@PzR%+{o}Wo0eI=z97JN2O zY5t>#{!f1;!hj_~;6~ClI|u9_vZ$rOT?`=$v_ACSy?d0R#`wFWP#yYr6y1PGYfs1m zC;&mWBp_-aWkiZV@D57nZ`8QP2dwvbpv#}1KPnFvT7;ud$NaUGhz&cj52{M@ATHel z5g@dYh}GeRM2AR=_g`njJq8%#uY{q%ltLXw9%`Mb0p_2q4X8UFaXSQVqsQRoAwwY9 z#iYDd{(#b59x+9PTt!fTM7BFI^AqnzDF%|*76z(>or0J54=FrVaDRsp;!?#yqe5$& z_U@HcBB$S|?>425b8@F6tKe&VLPMT@^$-u)FuW$|?%XSZZ-A=TAWPf;JDc?6h zat|RU@zlS>X#vLZgRDpj6(8cei6rPMxD@C5N|JZ*cU%~C(48zBxy_~1d%xJ}(n`Rp z6l8j642PUhUHh~+WU~{6=-sZSPW1#z6M1eDN^_ork6C9yWn+p5D5bcl)4sr5SZ6Q+@{r) zVEW|x@&SYklnz=2D5La%Gsts?TDK`Dlm6}{7#PSz^j7RroND4y9r3A_Wxp_&qE!&| z>aEBixLMr_=Wiu<)FBR(LsM|#Rw_>Q0T}P9zKVJ=)nJ(xi2i{g!SM1c{L1UmXo*ehLf zn<2t2ve98#?MomX*55)j#ec@o|8!WPfnc!qqA4Pki+nN#Jm9ULa}{Kk`f%=q>@AIQ 
z{&DdQrQPWJc8+TnK}8-Gs^2QmSPvCzc3Td?n^Jmb6&)IK_{SwBBmiTkKoL!>S`aYC zsvIN=N(CG|V3IfWv`Em=7)>mxPdDZzr=ukkZp2BmpP%lo?kCa(()ZUeY}*Ei+`tR_ z^!l!6pk5?Fzs8P3NEa&up$WHA(sA$Sr)QdZpVr~8TTxJ1$`brP`>fw(AAg7PI{r72 zR#QxOc|=VjWY$4smu!-(gpx1K`xk{i{ZR&I7zPW$YvicH_saEAh?4Oxx(VosT;H9z zoyrd;9muRi?IsjSE%CuTrlfxg#FLC8Z>d`mtQZ?MGj`PuR6vkIz!&e) z@1C6<%*T0sIhe#vjJcdgg}7`s z4nm}LiuvJzWw#+oqKL)5K-Uzdnzzle+zgFmb!TC~#|F$`*b#~KsMRr-r-)6OKxJyj z{kL%@OULGXjkJyPM;%JIO-ANM($8_>^F03k(|=U~e+J3*FNr||fRBMo?oUWCDPW+1 zDSX_-rvY@9>~$>Y<4o|Qsoi&*%ZhdzXcjtY7&YgiRN469!-p(4PCs5%Ufu}%Gkd!A zb~jIivWmMu;o-h+;Vso%nU$StlN@a?extN8>sYd*K+53TYzq&MyOQ~pC!1oaEw^&1 z)j2w(GYhpaOeeH5SoMO-K1CbqBg4SFCV_P7D23`=ULdwy(N8Gsz()2ZZci>RR16Wm z^|NRUNmCJfoM2s8SYP(Iaf!W0k!P9SjLgESJ!?Gv=Geom1>35JJ-H11`a7-T@3bJW zIugCh$ql*pqnm%&h`yg&W*64QPXd9jIjx~`ErH*J3oITXeZiab_Zv2HKtcscB6I)A zjo5BL{d|$ND}%*Ciq?>InsGA^;Vsb^kuwm215ox6FjWsz5r+K%QDy{|l78mbL6Nlk5SqMpQuzT}q?vV1{ z7nsU^+DCE2E-Bk|S?#Y<17GhyR;&-HYpDJiNZo5Use52>cA(kd-0v+GrXIet?o*LAj(%6TQpaxoG zgx5yeQ4D-wm?LpauYWYVOrx)OO*9kK`+zacI;Dx#2X!SN1_Oga#wY+~mZ`?QB6OY> zqg`k8;R>?^4N^X@0ys~H@S5n+6GK*+GC?&51x)Qsh=2e&2uKN)&oI$M31^N3*({HH z1R3FGi>>J09gyan6FRw{(?2x=0_mTkGLc`uSXLRW>;8x&g#TSzx;9-G<3s6qgo7LUCEQV!aB6 zJy}pZda=;|a>HktR&C-yI1R*Scx`V^EDxE>n2l|6RJ0ekTK!|%HeXMzHbXP_w2Ju! 
z;fl}wxmjkHyH6^m_9s{dx7WQBG#i)_EAh#9vkcAel-VTt_+Yd?ds}<^PKuZ`^%1jZ z-Hh|_-22KOychNTzS2;g&2OV_+w`3sr#d9?tT`Bsr7>-#(rB%84Jx=m#eZ8!R_Jm8 zmAvc}~XDO-foa#L1&;GPDOwSw4Fmt{sls84FEOycw_7+2hN?G5@sDb?LHY zM9E&g>AWfkM3fvhs69Tri=GJ-I6*LeD0FJv=ZN$WIU~Z*!aug`(*sz)VdF;aIpNe~ zY$zFkT5J#<*Cn!ui;L@DGX(aYq9sS*Z6Z-J>#m4pvP0W$fBD;6Y)rxrDKC-eBNl(I zfQsd7$<~(78Y^@%CPHMiQ73_GbK-*rYrlQ_cB|WhOTHb8Z_4d>D5jy<%f9-av~TH& znEhA6MD08t80X!s&ax=07O)&s)L%QxFJ6fZ6wc^N@qTzOt}tDxk3K~sEr}L@e0&P5 zB>i%H3lI^%?J6sUTAhw^dC5o0l)i~HPc`G7t?2g;_h0wG|I&*b&nAne)zd~_m-nPw zzlf0Yk^OntpoS+n_NnnXD9v0IcD#E???28}gPQL#0h6!N;OpkfpB1AC&E{9Ub7v={ z2Y%A##ryBMNK@-z@2em;P@efYNz?pW<_F$OFNe#!eGKNU!h;X@^Xh$$@ZYiGq26Rq zz+@8xPl#@;%tlUb&hrsT)pFqy)ep?7#3iEngwy&w?QKT&R_GFbm9)Rnd>m6B$A(Qf zT&>VlD7z2>2eEW#xTvfTbF|D>7=#O#PX;5&NRo@nIr@cqZq{B{FQfT@EV>S5?iEqS z(4j-p8C-ghPJIJB7Z;UWG;RnO!Kou@V#3$GdFeT#|EktaE}zbYbeSlWhUPbh96>wROHn1N8&?WI~z2<40J# zD-THvO#F-xgpi?tf-yR4-J@mkBNphE*S8nlz68Hih*A7oZ*v2BIm)~ZTtf<}7=XqK zMu%P;v7e+a2n_4-A_^Exr*hA5^eT8>#d*i*Nkb*EkA`KejWicQW^|brv+vIvr{6Y` zGATCHH;3=tDE)TfK&2|MS*f&wr$JU>ztR9yTKZLH-eG?6Xxsgr{pY`E+MkBxN+K~f zU};cNU;wpK@U)Cw7;-SDN8>XY0oF<+vRB9rQHsm9e$dfA!iyc8WhUn23TY~_sh#=XFPuKbTK1eHib#I`>B_T1Gqw2>%c0=#KhYZ;~ zm|Fq|3pvKjAlFhjgdgC_-0Cz5-o`x+16#vcrF1)J$Acz${f~90snpMi%4}=ul1dYu z-x@nxXMQETus7`5sEYk$reRZl`LH3gR=q9jh{Soqi)7oqJ@>$iNoD2Yws3y&8}gRe zn+D|=u@~%*89;Qk!iRzbG>Aom(#~3{H?@Tbv~qcZ{QJw%&x}9CK+`G0fK~ z?F0mhJC5Cid=&h1oy3%b0)hsfO5~BVGZj(okq!?dz;hUtVCcdV_XiY4VcAzxxk{7Z z9w55P+j&X)7ujIf6S=>$ANqKv)J=tLI~CQ!nAGtj7do8AU#36Hd?lIY;}R2lqv^*w z&?_oUYa(N2OM?=9@F8joR(4mIb3N%jaWCB_7}qkC&U~dT#B0r*L@l-W0sV(~b$v&4 zsAT3MmN+Dw$xt>E{+_e`Frl=xpk+;1$>)SC{$yExC_a@~LMj|axPOjO>t_Uo5W^Y# zZ*sW27q1!wtAzx2U+9%ohWxyUOy%Z{Y4=Jn)qql zyi4M3s;N4P0fD-_K1$`RShD;Bc&iDo?md{8)B$C%0ou3iFg*fbkEp3)R5RtTL!iub zNacNmnEZAXM$V8-9|Di9pSS*}@wWOJS}PCTx`xI!#h~!s&>Pw28s{9G+~e#TJ~TGY zFOWv&BA9S@`86yNi;`u;fcz#YofVBBf*?8_&WjId?Xb33pY1&5{+-5zN6;q|AOaQw zmB@D&2K9uy;!n1h!-GK~_?>mNZna&nj^QNC5#f^IGXyLOADm{agM&kC**$`;v4MRg 
zI6VP-@VqB#UY_4&gU>h;p7{-MLN>ad{`k-)YzAOc+EL#nh@?R;kpvA-?0UK{6_ZJ) z^+k?2TAK$`-mK&L%y7+4JTsH>bCT*)|Mt-9a*}a?;I+OC`0{B8zbJV=%-VN#mc>0q z+fMYenAE74$53w+3t7%2SZHHjXCuA=q!Y>#eSYH$=r-7iB^Z0cYEYeo1A$iuiM+_Q zTOjO82-A?_X&x91#WDv*#L3At- zj^LQ=09#L-Jjsk`$x*8zgDyqWjL*LymF>)v3j|K_)w&umLad&+w*HmnHpC&`Bq1(X z+ML8@i**rK^wU64eA!lPE2bhMYe#>WRH3^UntN2cipxzq`GbUN?6I?waE(;%N-KNS zu4B{LAGR^h*s@^S3Ojh_@rJ@zQeTI^0uE2OJ#@Rcv5}D)5ywLr_+Zyg^(po}S zL%#>Z3i69i0)509-ixz_s?tNKV$6;QJ_?aJdC{lrQQpkle{y? zn9?!x4c|jrfTBMBLW>l9kG+5@F=44?K?4T5tl+WWvRzSUg08w~Pm*{jcZa$0nkv}e z0)@;uSf+52Q?+cwsP3J=Ft0-FXNwXkj`i=CZ+ueU- z&!O8EZhZvVj?==$+D~#2nM@qay_HL-uH3}}D?`tCqPXnX8Z}YXS8fhx#6n9l_n^Wu z%;zkpz>myfcyB{TY{~n=YdzC%U1q{dVv;^4qK>AM%$ndKPEw-Zm^FTpdguotevx`2 zxnMQKN335mUJ-(%l+>@c2(-xdD6*HB0yhUa4zfFlNfT`GZ7EnCUimt#Sh@231I=a( zsL68)O2oorzrY{=X@CR~@yS0w&54JfwzBWk`@U|dgg?`yBU}N60hNai=qw =5Vc z`8GPDjkRzyz&QoqJ}`w5B+DojRh3sVQaO(XYW+i8Qb0KLspWpWWp}EzH4Ew!x08gb zU&&q%S5PlHlorHN54!o*lT{-eSaY#U16($fUYVZSD=nzr2>pp!iN+9KjI! zz9IPpt@!1&#e!SKX-Tpn>aowBixm2%r#dzW`_-9iHPB^b;uV(S9kc|5d*%$y{yF^r zEP};FffXNkvk3dhAG#Xior9A&Z+=E!U$PF*HgzYNbCvS!;XvOH-6(NW0s^HwVBPv= z1>P?F2@2mHHcrpZQhdDljx7DwrDw}qrlD9NsP@1RtRPry*Xg|&KN?TWl6?$Z36DDe z=g32H1(`ew!mf4s%fVoxEtUbw6JwG94H7_9sP`z1K447JhpS=ulJVN%GCq5gA8HM_ z1`?{V_bdWUx@Lub7krlvm*zj5!}kK=?8Udq{{`m?5%vinw-Z$YPJls-(CD}Ip=To= z3vv)X3j^@2jf@tqB48Ap5Vb!BX9Ou5UaF5#_z2vMk7cs><9l>*D*nh8G&{7pXPtRN z8pg5_l2C=71>n4i4%*+mt#^Hyk~HAd7X z(@X$mo#+=y(F_9iRt8L;>GIrv@#>Dv#b+xdN zlMEf4%vfsL;J2}{oxgj6F5-kE_(F{0IbnA|H{a?i1qRtWN_}O}vk63o5iEVQ$IK#Rl2HuA?r8#|g!a8KM z7AASASP!g%zw!TfKU(V&z7Y-P-B&0R#GErfH>shY@oVHkOdLro#lKA1VmqscmFa>_ z?@bpZtolTJ!YE2x>mE&mK4d2bSfq8V|3UHV%nH0_<;W-+fM99|ac-1`qTkv@R0-%_ z)U{xFB`jOTmn0pTT8uU%xryY4P^5#gTwd56y`_o9dDAC}exL9Y>!{uu!8GLu2#-W8 zc91xNN5yxH16%wcot4Dpf`Ew=w$gp~MVbRS`BR(SzJJa-E))34+I!3{H5ou-7hS#1 z)GJc~qr&C=RDZ8nNnPp^#1PXz^sdls=*T@jDc1uzWwUQGr5-vLg5d) zLI3_!0#WQ7`lD^2(*3QfV_m6Vm-3RBkVp%_`U4Y~6ucyc4H@TvxRN8-xmTd@ksxB2 
zE`Ix%3J^@W7X?kl-?F8>U2E-nm{OMv;Zh?kvR@nzG$5dkJV~X~kpq1`I;ZBP-SgL+k$+t6Fo%N~Br7IWK&g>enF|M#1sjB3a>SW=PNlbhf!#!!0$v&IV=iO6( z+S^{jOn>xy|8^sd45^&F;em;ybsw%bRv4u2I%OYkR{oq{zQ6lAtwE||H2Zi|wNqRF zSn6lncw`Q)bqYG1Uvq!R+A3{C+{a^RCTZ=5tGVv4^;h=hHl4p=@QGSoE3P$jV1C++ zuCy^%V>AG<@lPiFa)i6p^k;|SX(LD5Dyp>fXaf`2ub5PQk~tMy5a`f7S&~>*6q6A6 z)RQ%LldCOS{p@VPbx4nHF%_z_qWSM6XZ5RfRR#K71lt&v0yT?~b2ML}RGdaWY1<^q zy&#AnnX^c=JBIrLU#Jxkww9nx=q@R05UMMA0Q97S4j6Wz^YA4a@q^ET4jMnm&(BY- z0E=uf&8J~M?6SXO1}VqBIKqJDTnR;RpUl~hh14hgA;QGu6+4dggXu5Pu7HzDA|`AV z!q9vWJ5`M1Vqo4);){x%LCDVxJ@4|#SruW|{YV$<~`H)H6F4+Lt1~Joj$sSRu6(VfOQ;yyGjqjHa z<&5-lRX6LDDRFl?jn3=36pyKVlFL&XeWaT0a0YvQyuK#3+iw2-^zOMW+pn%>83Va6 zHdl|?C1LBSS=rXfcsx+@`qv28a;yB~1Khz#Ol+DCo?G!v6F-1jL{|X#pcv0Dma?_$ z5IR9l@@PIHlp*{g(f%S=qA`n|B;^sQ1O7XuHw)arPAC&lJBel##qc?XIm8|nqa}ue zkZ|wN@<4BhSPdDF`5?S;{FV}62({G`Qfzi;3jPL~Pm(ARRDthyzyLG?H#q4kQ@45v z4PHeDVm9{uTt843zUH9QQx}1;$F@t+(ji?wkWceg3=V4CYKk~u34^N@{0Cp(^XjK= zuRYVuf4;7=XK&#p*^le`I#BzBOlvg`%o=;GnB7zwuj2H0d-UCs?$udRL5zbkxl(H* z>h4^M{N`Ha?QoV^>eH@}DSO`&BWrDrl)ArUyT`ljlN}A+hJ(i=hMXS$Trd zlYVSUqo$Z`%x!9JOu6ND>t3q>PZl~J^RcRPwdO`1t#uB@|;`!}t?ebB&b!geV*c(zud@=*H?yV=lkGMc7s@wuZ*JRu&HdO>5rV zF>m@DWP4vATI;~(b5?!dwu6LR>@kSzm-F1SJJ#068`m3Qc^pQq8pG|PM$LJ#OjL3^ zTCO`A2&bx5Z&Qm3vEwkBk)omH)m)JlG2nk>#9rm>o=4GFU9wWxviwQ8Uq6t|Zs>s0ZNZ3!@3Qr{7_bD=m! 
zG`SNq8(|O&UO@Sqb^#adkE1?1|*9KrJ=?b;-TJe1CIrkJ@D8j zv;#w=Kg2SK$Rw#~5R1q7nv=nbqsKonW(JIV5Iej7(*Js zKnE5PdZ}O*MbNcq)*g{Kk8J!y20c}g3N5W*N4GE9wRaw3Q*>7s;fQvR@#3tHDoOad zGh*~-T$T3q)i!!Z*OnaYSRqw=Fr?!rN0`sn^GeE<)^ZCSq8u@u1=F|9*oR(OhBm$1 z%7E^uKP@OOGz8u_x{coDT!9h8vO8WJ|UXu@UcZHN%LiND=RUL)?N#^y^fSUIWjqhu=@Sgv1#^x1ws@3(}8a{L< zH>|T~SqA3~wI6v^4UcUl=878O z0A+H8`__5QKPZB0CSf4VdS-syv5Ny~j8(Xpy9W zS7`nS)V(tM;abUZDXTivX|?tMu2} zU9cbBp8{|-%`Y)LZ1AZ)UEA!ZQ!GmQblb-F-7LT!>cQ-!06aaP-0e%76Muz)F-gah zZhlwF2F{CF5qnh+U*f76snrN8`##J=_|KVnyQ)2g5AR$qPC{deW7 zcHA|&s*S_*PUtFj!#643?0S9)!Y6&QH&~yzb@bL-Hzod?TpU!z_H%_!A9M#pxU(zz zyt&fjiXT1QYPNsqRcapNjn7#gvo&2=ThomddxyN=%Ishe7T??_aHa6C>B~*j%A!Fh z9x*6s9<)-;usPyQD;f7SSx@olKR@NWUk^HQ?7Lc4!tE_^L|R{_Go+MeDzSRio9Z6A zNdJASK1*&pcuP9;I3WT_P)(wve~QfJ;U)(=cB}aYWfwS5UHZiGq@#CPbv5YKCfLNL zI%Y~g(p~qy_UGV9A&WYVs^?O?CulZ(9{9Y9hGxci;c93q&$$S=?z~Es#UCwI;wqI| zuv^gB{J%*0t^m)T_SVyfMu0-6(7;K{LI5uXs-dwrVaB5{%%7SHT}EY+2c*1=MAFfi z!(94TSlOHKz)Gz(TMwL$<|7y^ag42K(IbQv?q;Rs3Kp_|%IX|Npc$gdu;9_f^*n%Z zoO=Omr6%qp@>CDZq&;Cov}N z`4RYd1#-2{<=NZxRxvtGwNfvJooT;G&?>ywSWeB+T^>k6CHiVVM=ncz_6977PT#j~ z;ev0BfXC@HuYtlSDAfM&>L|SQP0zYeIi6&BWgXoYv+r*- zl~SA%-Bb^roNo|SiO-B`u9~m9`hIq#Gik|4S+!JB+mN)7{ctgzOU~w-uhTpG#*>v@ z_-}6XC@U+Jsp634vN_-Fl@ur=^Qa+SGMr&c@x91q|AC8HlJoo4t?Z+nGoGEvH+(AD z+Ue3+aEe;{e9)Psm|B6rxe9~n)QWh4nbc3@fasCWduV^QZJC~0L@Ev)x$B*AIxZGI zrDq>_`d_>H#?sq{{c*?eW0^GVS~jZ(4eRo|_M3n8k<}5FYTo0}8RVbm#UKC3OUhW* zyT|xU`kRZ@Y06wG&GpykM5%AA@hl6|@cS70k6Gs^m)Xqc&k768lk>ijAN+oZHl5$= zz~7tK%smsA<2%T0BQ#qkUlkQGu3BXsg;s?5Qqoijj?6nX$LUWkTuq}k@ZVolqj=;H zy3ZpkZiSncYj60tje-wYSkPd`Tq4*chC$4ULVluWc=LS5`hDHwlnz ztz@*PCwnJZE@@J%H~`~bDkfer1K}rR*%z>Ag~Lq?B``R2f~+O^8N+iRxsu?~xhu8g zz(@F-NyNm3IMrI3dkz7Zj*HPNcc9=_rRbdpRykx%G9=-f)y@!Req}pQI^vMKU7#cN zNS&!@RNTtq2ZhnvZCzoJ_0K{_DmvcF8ywe@8r50Jxi2OlXS%n(%Es~1v`tw&aEg-P zy~%N*jW4;DWYIk4Ij3CW&k=v2L_w-)kD^=??|A5#ZDO`tjOgj-If7DtDax$E^@hHm z)jU+{o7M?y8`jg0TuQDTsjnATN$;JMN9wmdH5V1ft*r}&M>J*Ls;?5IlYX`QWz71( zTY5Cottl?!PwY;=c)02(|7-Eby|RAOkGfP|^EbRO7WR}}&F}(0r!*b5p 
zKgK9ix$8iHzRQY7))sfhqO0V@xJO^oZ}bs6XNyMYl98I%A)|Anf^0gse6q5-%f|(? z9;ljJO9;-fl|K;DpzZR;t#;o@Tkc+V!>j3DPx8O|)Nq`z53_5^v#GK)Qt8RfNq1`& zQ12QD;#yewbGCWX=lllT?vtJrPlziQn=g6J(_5^Rs!pV>F!(H6t~IYvsXFHLs{d!_ zd9(4Z75c)x2EK$G_uhFrTDl~Z2ntiV| zT^Y@umv5&}_I=D?-#4dToSboE)H_BfOZL5S@b3O@$35|km>c6h2bcRb!AUhU^` zHQiq``c!d)s8dN(@+=$Q)a%iY>w<-mS}d^ZLqv2@JML`N+wk1$mfmW~aaCTahtEDl z$VAF2?3SrJt~aKjnZ#bXN55(I)YSu_9PLe;+2=F{xR^>E_C8X-c6qM0P3#1_miL;r z&elk$r6!?M%){(zZ*F++(qZNd`6ngk$>>GBvrlj4X#8{_HF8bNfwS+cx9K-_IdZ6S zyg22lI{!0io1xQ%d51v_r7zZA^cw_IX3qAg|C61zV}C{U=1_x_s(-w?6yNTTch1$y+3U|B~z&*8u>Lw0r$qN4%hI!(H!yE+M!+Q zy|JKl&v0btcu9O2^{ALNqG0K>~v`B1%j@RY9@|Ky|nxOCbhgVSJT|iq>1TyeMFO#XaJk&VjLWgrsf~ z$Ls1A8~%Sza}>brdL2_dv9W;wpGLxY1Xv&FK3awY!dSW#=b*ykBpzDJ)>35#V@|;> za^8;dH>%zBr=Wh6nsbStT4&j0CqDS$rv$-W1lASv@GhJ^R#(}-yZzuiE1BsY9_0NI z%M3t#rq>cT=_ltdY38(Z;OUJj9@#tSdyRWe{Rg-B*C5u1%H@wk`qN4SjpmaE zABQB3G)4+^hDed}T9w%rrC=?yXYhh8uSDPDVnHwM)4`u;9A7?(eMP@v^jwKYgjHvg zK+=23>F{}$q?}r_+Ua>&p}8JCu5BDowVW>oUKtu-mA? zYrihBH>3G&@zJj3lQeW)NfT4b)8>_7q`T}%ig>v1Nt0yh!r3O8oA;X!=4mrMxwuC* zX7`dBW|6`De22~&EpPl-x}UIKj%PD>u8?Q&tSoipIB#{vH)rRF^z4fM&hqgWtJ-at z8yXuT6$eV?Qt8VT6a*rR^$ky$Uo;~N#_T!QmH$=9E_uBB9KQ*czK#EaKDSlAJKb1| z)3x)DW*Ni>Nl;PLf6GIn8}Lk(Sd1D|^@H9%>iwsN^}zSf-7-jQ42uMkK&s1i1_^`x zWNhP$|3n$wYE#9=#zaUy1{84D9Ta&%VAZWqMS9<*7|orND!FqBQ`+S*Uq{CsKW8PG z7TN};b-l0*j;gn_=q#vslytpbv;R<0>Oshb%qY+<0(2P5#;AS#}>*xce>?f~jKjtaYjgz~x1K{#`ac;bm? 
z#$7b?Wuq3>rNOw*^{R#wW9tDf+6H0bcYh*W1vT6XmGaDrFb zYIF#!F2!F0RF^}D+!xTCr$&HRxWXV|?&_XHyo_cnEVhnT{jLDuNwgK(n$+9Ew&Mfr z8e`LMlzEj7U3hhY=Su9~vwi>PY*?h+dyhaXNHeQATr&rL%Ntsi+?8-4X-%w!IE$T+ z@8eNDKTdoU5v<*%zx z-D+{`*lnH4z}%#FPSFPQ##}$nD{k3ns}jF<^gZvmcFE01%v)zzVtj)+&91SV&D1d} zIcPtkpYj&ceWxu5!Q)qNsgUx&)8+%==EN%y4Dl;dsCyVs*vXR z>D^^7qoW3@TaSe_aZQA}z2cXS=Hp!$N!lxta5g%s8^_P3Y*pXorgYbk(enBPh11zG zxX+tY>Up+`Uxd>9toq|=qmUhWqj}xdc6KUZ@A3+RjrkB35zAb0=i5fc^T#biYxTZz zbVNDyOpHkN$g!E`P>a)c?6sTh_idOq*QbX+WMk6o)SlSThCuTuox-jJ^B|2l`S!$H zraO|Sn;)6|NWn9Cto~r*-sg=`T1|Eu88?0pc*A3_wN@l4_&lDcUCGe@E$P@u=Z7fz z;CFWp4NxT5z~9nmtTOB?X_6&KnsFLN+?VTme38Y%HCoLEJPj zzYFkv!U1#j=R!`O;qQZ7=X!rO@yEm2q90P$1@s-cW9b;J;{?(L!>a>0;@8*X+^7+L8=)9zGqNFSxN+H((Qv6uV#X`ywUpC`*rW>&C zB3ftXZvi)=C9^5|ZNxO6AX*v-xCw{9K6ZfC_DG+lE|rVTdB((tLW)et*AI~_IHFHA z`LO>8uf%DIk7b2_kC~Ah(JmW~%X+%olU&G3GfFa5au%u+Rok;fk? zCcU@c!VQpgH%QE#I0dlTBMp>hnPo`887>q)LIS6l>;zbjA|K9=5>7Co?Y`p4WK`eJ z*OM?3DQ-V)=|npKc-k9UDa6KA9?&kiNF-nqeOgCXeHbk1Qfw!QWlLJz13a7qU&~%q zjHu0@>k2X++p>9iZj1ZuZFO~r>5F<_lP*RqktX?iDMpQTGfaic^8zkQK|FDrG2`mQ zIPL>x?|&=C&-p#FsEt>`OF!hzueY)!R3wQ0hqs{4I{gst69-Cs*upci z-~Fr%!Y?lpJgrV=!rfvt^f0qLP$cR?vFZ`yzn_2nel&^ua*%=?FQCX(-={9~y`SQ4 zCvAU?R}J4>He;*QG$eQ;cX%k{fMHW|ZaCslC|9)6XIkv4mZ}}r ziA{(TmgkV<4D+8=ZQ`BJv$nC;s}cIkcKi2t?tj_Z9LDo=l|`&o+M9+!WW##zRok+E zd!*cTzz;+O7908jfvlAKTYmHe4z-fQ=e1y$c{|62_jk$yMo*qGtmd%)n8Q$_x|! 
zLv>S;OHsS2!G_Yx7xD`}E&MsFl4-^cTY>Ynae#5}et7x^banLynGJWH$h7T!VtFO6 zetWu3;r%lL_8%l|?96tS%vcz4&Zu$f?Vl56%(P5FFFtV7lcDfV!5#mDsqGFNbZkRM zvyN5`xmZP5eGL`pi4D`9n~i3B+RI-%@rfhw)3sM?>^r&i)N&#(l&C8uv3)Rj=*d9k zWsWv4W!2}>)p{D9TZHr_%ieJBypyw8rKxepeArak^8-QJ6T-IFzuQVj3fvIip3w^7g8x zQU_QABW4c!?iEi`HD-Fhyf!%BR6RP1fuf3$+n`F|gcPo?veA6H@nODln;drt-Rt&A z40x6Pclq@3o^EBHnCc%%bzzv*Ve&C7@imhtL)st}* zCHpm{!R4&R_H!Hq=#=mK>`{~7#1j(jt3TZrU#T-TV=DhP z%DLQd{GnyYVD?42h~~C`(1jl!?oCjGWB;Sy+MXMXr^GdY}wrp`my6>7q5$) z<8$8`w9vRaN_JMJZeLHN)MiLa2x3>BRsh z6-&vyoavNX$pQaNc5QnyyOu?);YWMx-E~aYLxOJPldwEO*)MgD^51@JCA6(g{M2-^ zSC%$~Xb!c!CO0x&8#f?h19HYb{^&v0{6`w2`!rptNPCgQ#oFZNJZPa}%%3tDc8Tnv znz(W+yj-YvQm{F{(1~ZY$5k;8euHm(v2O@ti{$!e^v+Qi(~Z=)8Y=EkKM_)*3T=dh zDiUdg80gVog$!T>u(tlV2}M_js($<*&>_;W`NV9Ghf&ar zn027o9aCn#y8U}+>tD8+_t1bO9HblDk=|BPm)e`M?2yOwP_z)gK+@}WXF(V{|1=q$ z*!b1MUiQ;{`xUitn>D$1i$t>!q!D%&c1Que1vl|AW}Z9v73WjCwF{>d3n{g>=;~KL_(pn|DMBj zjMXN&I1!~`7mB?c5b)rs|F`fH591-#ID;7FC4u{1Mp+JKZXMgOM?TNMhEdSgjTm_1 zS3s#{fxKbjQ_;URgN}_{d}h1$&Cc(gwW3BJWukiqOIU<;B~2s~EJI~)WvtS+4R4M- z{XAft`Nt?^U@dj;4Yur*q2qA%rzBIjzt-nL7L3GI$4P8>C%I}ej+18^kxP8Cqo{5B|8szqa zzibnC-ck=;heP|>W1-1@R2}8}`wx9vv)=OmA?&^5ss8`|@uQ)4rKMd$LP=4fWu!?Y zsgPY+Nkt@kwAZ_2L}XM#WknRS3Yn2j*_jd9oA3Q`)aSac-}SxyzUQBoIOp|xz8>>F zZj!Pd{!IQqrIlBJE;@0FAI2+YL*4pq=9`^!eBK-7S=*{u=1u%N3wWaa?l)FF={aW! 
z(y;PAl)14hhuJF(t~!7Z_QG+Nli->bzMr_6`?#)ZJG~@(@01cr`LDD8?kAKo!DEmb zAJjGWMa9sTMnwTP|Llo6r4%&@35kE0tBR{bUtg`-d+yLzr;9CLR}0qK!EqzXAu(~r zaj$vGH9MM23;~3CCv{uTvT5plOzwb!!O3H<+xz6Q-p%C4^ZD^jC46S?)I=k==GF$U z?_Tp1ow;^tlaC=7Ec2^Cd1}D(LyN|;@Km$jGfq&ZP5x&6&$b0M5=pc^d9DRwOIVL_ z?$clh6`omY8jkV4Fedt;LJZ2sr-(&vDHRkT(q|-@UJL}ytn^8;XP7IC3+r(nR zl6_#}{ub*Dz19&$9SUsjzDHCF+2qz79A4|ft}EZNvD4a{#dWcWs=MXA#E?h2;)gpQ z4j*gNa5lMm3Lm``5*000M;G%g zKF`Pe?1d2^eomzFqIbP8$3^tiW*`HLZSA-6U*dmF9gAvy2Oh_GvvyD`RT$TYc6$G~ z0HBF0l=_q~^5g*?-JJ-iE4E!Y=erVkH+pTKij&5CDYb(K?*cwR$LD*ZVd+fetDZ17=rI!J@OMz%U@E6$ZQ-;ASz`Z%SZNB)=OCKWbJuu4QeHbkH`W) zd1P`yNDdSU&6ia7JhyISXP9T-ght@1jBLaEqn0ddG`d5nB-G0^zSiKsRX!~_=agEh zjf}&N{!0nKpNF0dUbz?V6)rrOW#02}>;v>LXMg zH<}N8c|_3D#7wrAJEgbqJM?b)U6P?9A9&}{i*NZQdO5E)ew+T=vA<(VQugq@AA`?6 zEZ=?r4t zBF_-eoT=;t(|{8zU&9>f{2W&*fyDM1fKRdILgjW+14w`?ZRz+&Y~nNFjwqZjH4@r) z@v$>)g6dHfA;y;s@VZLb+i0a%g7j~(Sfr^_UnBB!wYKv-M22!6EzCO~jg#x~N!wAgU{dip@t zfS&pLk}ZuxO6nKqw9OsKs!jLL&Q$g{N{`&|>aA>B;l@D2`|lgydpfvq*G!EH`e8N{ z`*D+zO1)Z^%hnH4?2P#f^F8gf9jrQLs7-vf+3~$C)Mr3acx6rHx?M4Y2c<&$o<$23 z9eCaJYJEkaUeT^Pr)!>@6au=n3@;nSH>>{^>k{u+TXTNou#IrXMg@lpJkv84MYEasGB&F$4H3QUV-FYJnVU^bc^Fk=du&R8Cib)61uHh1g| zxSub7$&}Md``6kLgSSV>$0tS(gNS5oTzd$_{>HHP)vj1@r~xOvjv67Ds2&97pX$Wn z;0eL4GZKk$uj*n>d{G4b7{kke&u+Qgeu5a2n4Ps=VcEyL-;6h0c?{eQy!KtG*mOM3 zOrP}e#C}tlc=O@b`KvSPq4iYpLi)-SZ~T>J8FFmsJ^5c9fLO-}$=!d67OfkhMh-J+ zu8PqzO*(LesS_+GeEsfTdcd! 
zh3)L`%FyEFuGOfBX|u(GvevH2P6IYhVXSWGP1N`AdB|sBkmV^5(I7i;`}64GTE|Nt z{?!sW$Zz8P`q8Z?#iR1ekBrz`2bpRe^yKkXVPRYpIF^0r=*PEL6I#Ze_eNCcB%SXM z3$?bt@AK1IPN*-5cYM`oW0^;C_QP*`UuacaKDxtF|DgYd#PL?!Ljm2uN}jbzX!}@} zK5;$SE_uwh_3aDi4f8a^2li>By|Zhac(jO9<3xOZr;uT|e}OGsTb;Ds!Uk5tm^1dC z))vIG7X%xZ;+-=FpsSYyhV7X-^0evFgSt@En_i zqeePY5~Bwdb*w`ovXY=m$&HLrq^A>9cL_d$q7}zbQib08{tLur>uvzD&r$&+xOwkM zC`xWXO&;yNSuz`?4=*=&icP3dLoZ2uyzyR4jAlX8wQ*+FnQ`Wxq1a!ulzzX33>j-> zyA;k5aSaO?wL3ffW+-}#w(92FrfPd|sjm(w``5GeJC|hWT(~{T`ST>Y=cpY<6|>Ra zxcAX3h2ZS?{x;s18P`%9ayviI$m9CDC^`4*d%53AHcQqc<+IXJ1 z#Z1Mxc8FAp(lQ*~VsY}`(}>Z*1D}1YYeY{;Y8+{pw<%=-^RjabMYc^EiF3~)t)BCC z)_oe_n01Zn3gL7Bt$TwcO7s|Y;9fE+v9(}fF zI(b5?aF8ZV$)zk&P8)YTi8*Ig$j!sJcD80}*AGJ&kaPx~N*&N^%s#Jfe*_AIjWJ~_ zWCuFJAnw|v4iJ+)SvTIC$3?E^Z@dri)II)SnZ%|pp_x=M7y<^|?UjH{$H!-|*?5&( zJ?F4*e!0QU;+dWHgn4Ui(eBrculT)=8WhT;hoSzLtKqW`{Q7v$mJ3DO<5rnzbmIM* zN)jnoIrhz)Ot#zkHRJ- zB+>*pjhJC>8HMxA@6gc+gywwG4B2|#TZQ90!g6?nOY$z(88g~ub>V9TyT?a#vJ$M@ z^jnWQhV9&HbIB5{qzi>Fm7t8wsFgb8c0@;~=EV72v>fx*;tCsGgJ(-@tP-?%&zc`&2Dx*eH=-&4Y9-HTUmX+~;aCD8yoGhR97sAcOt@~aihz)G)8qSsHKktXCCE+Gy z7c{x9Y0qCt)~68BSOmJS_0LzqBqLS`J>S-{2!GP8BIDI;v_$&Q)?o^@_hl1773+At zhD?`X?b98nu4(h+u!VvPUz(s{^??)x8e%}stxkLUKVnmVXD!~(8t-;g+t$o=aa06lNf>NBrF(O^3^aJJ?H8XTHOXMk^SWdP zkqcFCV_%S$)0rIcFvl~(sBt%+T!R}>x8Nch@opb{fK*^A4ED4eHGTYwe&na(G}!!d zX58|v1M8@`+&R=HTXt7Vf6tGZQBhTL6$SdYMAubHb1j(7V|QY0gs=PDx8k=4tW3O; zjCX!10w`VHy0UBGd@drj!=#%0ssxbPRCM#$obSxRKFJLy(&(-iw%$X_VFG!jOUXV$^Rr?iF7Wz^0@&G6~2T}MTGDF*g6hOS&jbV>Sq+%f=4v?5`gi!OI}tJlB{hjTUje?wnC~8AU(?uYO#V4YcJK)X)+h}r}>a+GSB|I zEwh}vV?P@-W><~aYQ~&>63ziMcgC({adT@P2%LH~wF2ZFA9EPFq`fA&Mty2{-dane zR`p*B)&YA4%RqKLC2l(IR6HL3?vh`Tnov#rXuj~cjf}(z$4iOD!|yzbN1pi&`v0ax z;$*&$!#(5a(sE`Tw2www_e3mJo2o)#o{$Fwz9olZDl|yN9Hl+eRuqaEa;$SX))66# zTuZ1Ct*)*f=F}AevsrbZGQs*7`dPic_D%UUz-$;*#mBfqm-3rQJ9feZnMKJ0vd>;p z#EoM8ylL7b*)~?>@=<1xWxPa0ReN&cUuckjMBUF|@LrlrwDt!T#}wpb(;`bAzl*iP1oienc!>lZ_ZY5t;j3HE>YPlLr+`ryMM$Lb8&iMt|=mwauW| 
zf!G=a(x@UuaT+|DBfY{l;`^a-s>?cdH%_x`JHoF)Gf{p{yyA2+G7FN=Um{wQv4{>Ah0t{L#D?D zAN9N2?VT@wH2K8&u-Kb9+ahFZZ>%wEC{#kiWA`SyQH(&zi{?*$W|D*PDnAZHrDdh9 ziMiEq&|fA&G9mDw3xBt|S6RcaY?GW?^=tZ?sTaG=@A>PG3-$zB_Eo#?R69H7hPRg2 zc;+#uQ@Q!z?^9aonjf!Dx0oD))H*D~(w$zWT;`E}Mk$`W2iftXCtvfe@lbk@=s;8i zDBT3R6x-pye3~2PFV2s7$+?OI`X6u5OQ41lzj@b zJDa-Ie$*Al#2Y6b{_Q()>DBfoBWoMEsd%TAz5)CVR4kQ9eth<^yYBA1z{)hUsJe32 zumgv?d9(OrzHuD=6mb1`zJ`|H#^KQDjo($O!@9%SUPv z1^zx#uG{u%+aJJr8*FSDoGD$o9Lkx5NFw2iz>*YC_8HQZr~eAz<61f0igy@`M~Sr)5F{uy&;>Llde>VamFN#Z z|EcXgubGa*xdAzS3#faj6i}aK2}UIFkiQpnjEzeo!Rk3Nm9ISGG`75Yn_?w)FF zIQi9% z87}TTSADc7BuG@S^pLh=y+x~9OfXxR$Z>vAKkgz81y8>79+@5wG#X5uQ_TYN7sr)! zdiYokTyJlWI;Ik%!RMItJJ_zO*Le9*u57DDK*Xi>H3zAWv6x-_XswyVYVG9-yjtCt zhTC!RIx-42FL5gWL2=f&>>XWYH zeqPw>@IsVEDww}~=LiCWE~H*W zC8K5;7V4n6M0&TkVY=smZ*Oci54p9}6!yCd<-yS5g++pmvo46!F4)nYiHbn^l!=Um z`VvAY4FD#3w+n!NL=bb-;?N`_4I|n{rTDS3Jq9LmmtbRd%zM@(iCi~Dk4K%!ox-=~ zP4I8jkP1X~X!?G44Xy~MI5!9WhZo0KH5U?VUi1z=izu1oq+crIL%ki>r^!q#S+U{S zsnw;w4rlt#=ya~%xUa_3x~*mWWmQMzNcoQx)y-Z%&qs}9&9eM*lMnWo6m?W~jnI9XuN?I8E2f=|#jkpz94C|zeU1fP zf@62SEX0HXJ7ut$OMMlcCE9!gZ&XuFk2SGep=Ff|k9GEe5fyHkgIONcIq{3Lv_9AA zJ~zvG6!-9d9#avn;M9zL^&Y!qE{=W2cXL_LHFyUgxK9f!b_>tb)%KSJ;988qzq&n@ zayqn(uT>c_f5ZK8kjzP`vjTTX9^*u2Syih92N{ww2P~-p-8l1Hsg92hq0QUs-5!F{ z2Fs4HtVD7Yg@Su%RwPx7sDStgw)SgABKgdbc3@VLO>fJok=2I~kYc=5zg5W#C-Oo! zrhwjv#LGO9Dn6yC72($M!}%NURC82CynnVPY3XLkEcOZWP@QhRTh4V&gDRm7IdwW{ zkiP@3`viijM_@n;RUPJwSMM^H`;6KyN=;0eLz01&%0@FI6B2wIl{>lZE@XabO2B5? znB&E}EXZ0F+BZ}`!QdG67-<+VrzX*J@Ksrd^iF@8xjmIT|xh=Ha^>3Xw994 zA3g?}8F;i*eGho-pm%UQ?qZ+LzwWSwK&tuh4;$MeJgHvWO#H~AntC1k8#ivW{^VTo z2$gTzzX7@%82u_Zb;c<P!$2Sf5pOk`0Z-aw+jb_drSbwhCyCyeMzj<^ z*EhjUj~i-P(NZM6M~J3#eUqp5-9H5$t840}b?g-6QEOv)fjgi50-%n+Qem!qZA&Bz2 z*~9KvhlB&E`TnOYLCxGt^rGvad(XvhUp~BezerfUvd-Sp#4k=F%y}EyqNCX#+;KXa z`}b9>so(1SjNp+x#dv&rJtwA?58r? 
z*FYh)5aKz;zDQl){d=bJ*o1s*GaJ4p8P)8|_*2obHnLbf=GbADiTx~d+h9ksN{|FI zp6CIXj{wmm2W|Y)3L@B{ESso{S*x4ZzIkiDFE%J`_E?{3QmaEy)EGstXUFN%GGcfA z9T914Qs^P2B5KQ+DUbg7wjGD>F(T zBRPEP&lUP(pqCnV@yAVmrpq_&?ofm0cOmN1!{ubFZ88!+c>cF}_OH(+E2DJ3&&_%y z;+Z-!k$ArMt;38D(49J06IIov`~2ef1f#d9!&Tq8iY6Y;8@A^T(}?!C-?nw9-M?`g zy_01haP1JfGGl6B-Z2Xfsn1#$5^eiRBiI;DRYcGvlWF2l5Uv}25u+nZNmpm`V6vqy zWJnxF>6@OBAwys|T3zOPi{*S1k-#*@eqU}?vVT!~35Ew5T;H1Fq=f$B(7oT%aSQu3%gL#Av=KZBQ{~<*JzP2y9IJEH0}oNVcpusQKo?_fwU&Oyfm z=B9OXM%O}yYBX%Ew02yE;{o}Uk-Qg0J>P+?ww_hyGIN|B8+`7w zf18v_zY04s)VPohb1O-Bjka(yw?B^m9=*);9VVml*mp1kRUYA#$-XQX?s+7vp`76` zK4dtANnQigZ-|S7j~`D6CfX+fU*!%7&LbfYybg$CfAa{}^Mi#*B(MaT04x9v#D_w6 zVc5=h4r4Y{(e?E!(H&x8VIdDoJYc^~Nt+X=KMSbw=qcM}e(JsA)6M=DmDCEfc2f-} zJk&lk<&4_TCN&nkc+5l(&12LTA#acb+OdWkKt2Gb$27*Vtg$b%35$nDhmA1vt4+oJ zo{x7ANhio|@C{W{XeYx_Mu6+vh*c9zQP3uEldKk1AZ1LoR4PYCzClNrHezm)Fh0$J zz=PNh54ic#rVH?UyFm&jcLiwraZrgUK1@^6gmLB+)5DXBlC?bt(aN;o%!yvG< znFb1JVm;zDG_r?kAX7SgXaA?!!kL(4ydajGsL|1|_+W$^Szsnec}Cs(q0vKFW(f$u znMK!etse}3+=aB%kD^ZFcEEG6uLi5N2FUNe%gb>|+vJ?cGxrqGJb9VMasl_}GE{ay;f+&rAF45U;}{b{J) z+=qM;)96n6T-|A<*jYu3?tk<{+FA9OWg3=8=u^Qxv1^~d$kGy6C~JgyvHe&WjX*U* z0|Mr4VS<3BJRsDjn>`jU@_D}E=yKsxwVnhqXna+PtV`FP+Xfo^jWAE~q@prC5=a(# zT2P{vQonLad#~vIJGq?+O&lKs5Y{qmo?zUXrU5zWZg&GirTIS1CfyOSRz2;_*!KfTb zQ)T%P-v(L<1&i!ap|~KNsYjSom!xiytC?%W+LC5CHI+QpVXY!sQHJ6$9oCO-k|p6< z-_PDYXGs>nCB0N?+!pE)SN~Hkbpyo+Km=sWG7;4pvGypQ_53yFjf8(ZF?G8e3dWKd zURNG8S*{RiPwz<8XJO1UWWDWmvhlseZuN_P4!-KyAAKF73~ZmO z{4s}UamZ@ZQWoqoNq3!I7_{GBfd?R^s1?s+rxu-fi%9SNNx@4@^XJxAK{FX&hqK-{ z?5nljVOKu1%kc`X)_!23q2jSxokcSh0g3a>G6x{LC=l{9!oy+rNvxLw1IrSmww(IGE8rR8o@@Q1AeH4u(Ng1}?1$+A=^|Q(D|!w)snned zyf-9!vi6ihUB>g3sv-&Vr5oTeF8N#hjc@<)_V3AxcWsuIEXqq=bFW=0{=l(f-yN3f zThD8ogte7i=*|2XqkvBf#cE}?1?p%$0i zvnO8oY$j?k|9 zx!crJTnj<+!Q;+TXm(3~0k2?la!~$~mu@L8Hn_;jWNKHcAtK9N!YW?7(lww1{ zeCVvzBiDij$hH@#nO7?9HxR?vpR2t!%qqXAE5_oai@IgY#nF9)8&Vl>LqUG@t%}0T z@+H6Xg)dDkI^>uyvueC1@%aO(9;j0%TUSnaiVv6e7Fo$E-bYl;Dcmm~xUY*HxqiFb 
z4ofN=LLxgcdf_#1%C(ut&d!>c#&Jzsb0Mb4X=zY-(>z-8a-vOyHE-tcL_hFKp4q;TnT&10qXzz>(-EnYKJ?5m6Ayt^M~@>P@3`zucMRWn8A0K4^wW#Vq`!x_A?B_Yb5$~mfl|Dd8c z8fG9?5oIC&g4HR7bx&{DkmKm>&ZSMGWAVyuyp;BE8}2Rdh76Id@%a_8s9q}7*PJ@_ zRX+EZM2zY?&Z>>e-3&cT1J%zi&ijn3w)rE-N4Rj;w0-XH7A{D|UtP>%L$hTbvb`VL6=T{p*VQACaek5xxjZ<*>Bm#qcVYH#L33A1QSA0>oAM8E^axbYRa<yl=q! zSXo=E&8d61>Hm_-^oO1;u6I1`rgzX&Mg7r*=gQ6E_I;r~NoGog{QVDIE9B2C&Qpnh z>^zN?vy}JfGrQ>(u{Htx46Epr6lsdML8d_$JE{^lbX=$zw?A6^;bo91C#Ra;K!KR| zbEfBzk-62idYhP)Fvv7majy4#b<^6lid>!?6_SCPt=;PP{h-@FYHlY|rmng{b$^sW z*PGG;rS%hTX7z&t3U!yyRjlTdcQ0;8BKw?`ThD!6BxW$rwW71}5TTt{iI0nGT1_Uo zZLp~J+N^ss$P|u(?~*t*vjD_e2ex|IA~71fAbuEMy@=h=nTbQj6;Xli!d!1Va|-m6 zguV>)zO+H<7KnDhyWVqXn?Zk!j53xrXsNe)SE?dE=_77>?=yofOjzCAi@)&aR9?q>^!@ZRz9B6uJNB2Kcq#6K2L8iz@ZqQ>evf2K3f)V^=mt#@_tc1wb z2<4Eo3d{DD*R(m&!47P~K&<@>n@!y1@dwkUjSlP)O<9hIjvPPh>bJ#m;Cpm9?-uSt zDK6bk^9!+mx&X9yA0Hdp^|iOzfppCw{V5CQ)aWTGC77>C@1dyXvcPw@uz3C#&*;{o zGkf+N4Lm8ogY#0v%=O4T+~e18-YjhY$GPF~HSK@J&9p9LO;-r7yC-%`KeN7|-b*OB z-FmHbi0M}R@Y{EiK{X>)2>~A?t8I3JrV1U1oJdERqty@L(?)0smFEWV{VbK@H9Y-Q zzBd*>UnccEPmZ1Nw`51h=!pca^oCBrBVQGUYl$m#<+R7Z8u??{8qN%_U8z{o3ZlO* zT4ZgYc(>w7eMjy?UsS?pnc+AOS4$rkv>y8qGzO6u3>?k$rcax88%pItng5S~>~j_y ztwP#K=^CdQj8Me#5e9Jw!?_U9YS@S(a^a)q2JF(!0g?H<(nR8`-Ny^LPd@_qX=Ju4 z$i*oxrCL

    iC-Lw3vb^4k@{9xzutbOzI4%BqSaVz=8L2^rQRSB+-*8YFjrh#`l6} zwxsDZG}&)FpOi%$JR^IX+Be9}2^F%H2m?2Pm4rr!ldZVrSi&^ze_Y*E4?|1dDR#k< zB|#y!Kv%yIOmR50rnP0RHK}8gnB2noBZze`Ltcv5S}*IK5<6EW;{?W5xzeFP%$>Nl z`_P*2m=&nsFT=1##0Z?qhCE*luA!&c8p>w5c1Sw0(*dAdzIyfmB=Zv;^+@>FTB3{DIfrqAI|^GDZMX9la!PaP}MZamm0U; z4f9r`#ik}}RP1;fKOgt(PfiM;*>c)VXi~PV|A|iNGdFMAD`3}K*8j{cTgzy!Yo5wT zJ4A~hbXmkQB=i^E1-uJ>Kp@$v6$2p){xpR)=MWNxk_h6!aQeFiHC^8`B`NNy3C+zkD! z4}iH6S~5dF%XL4hrnRR|okAIlNti79VzG*2q$r2f=-r-U_W%QJn!mT7sWb_Bq4`Li z+Tz-u)%F8g)O%UX6~YLaa52()+iiS3ZpOWN+cLK>6VAU0*jacxm6Ll0o|~dYVwk3? zq?$9rQ#)+y7~E2jAe5sq>#0s0SPz)lkyzCWczo3 z6F`G8C&9XU*4VoU^O>lonL{rt9~!q1egiB1KR29kVgv~6?slafo+g>}n!f`hSX@6E z>QJv2hSGq0%d+09*J-Ur;93Qx9zfBc)@CHi5@~_}TFU9&0%X;Unhz{%f7Ea@(h@QJ zk;52-_Mz%`R*{PmHco1`zJMuiq#+dgverrTVC=hnrzVz3+C;a*Uql|KN*NF3UTtv8 z6gt{aJd#ClHjs!k8)NszWM3Xpk{2WK(gUxkXHC_DB?IR9vRJ!_-OyM$e_dR+mC;=t zEp1}~hbLRJP1<~tHwWtkdzhI9`-fI=zFgcKd%5dhAMtpdZX4ASxlxTsy(`g1N;+<` z;hJ#-`}NlR4mC(}VfgJ{a1Q=5N=1hu@YaowVSU!t)+hUIbuT zsQnylq)YFPkufUfk!wW^^NR0EdKs((wCG(a`B6wrjP{g%yO5jik2Dv_T3cQMt_dNZ znNh9$7>QEgaXcFHx+m{`M&N%+o}?KC+>4!j?{kW)+awgFsym8yY_pDXm~A|VG0;~2 zJgWU${BBr4t=<+lqQ#xxWyjkl*;Lj! 
zI9ku!utzPmp3Ti~wls5N#sjV*-lK7Hd}^`h9`KhH8f$ZTy}9V>o}(<+l+ble#kcc+ z<)?Rr>2|!Ee!t;&nbjzmiLt#HOG?11?WWSS5fjkFj7=ZI4e!eDNn;}~URo+C)JaH0 zWp)G@i|(%?b&sN^--%!P&ThlQ?|P!GCrA8=tU)`?TW&lP2}H zyN$19OE++q?%XNY=y>!nN3!LE)bdFETkac_j=GId-k0df74hcDAKg`*?3kRR>2}up zNGDW3j>Up#qHZDK(|U2M?q@cII*O;*dlfZkV!9Yf8>+hrc$#vsDWa{g`o}Sh_yWdE zERiltt2Y{MDocA(cU(z7=_Vj!;Q|iieh&P0kU)2?-y-pL=MW6>sDE?YQT_WNS`FhV zjM%$>>l#UQq~$=s6}zFipL0x6k`HBak?qOHZJTqGT1Mt@Z2!75n89rTi**5{`46DU z$$~BuGmD$Q!_SgdQ_#Ir!_yu6fWTD|x>Y6!V}$)vnCJNt^PyA3COLrMC3k3HFX7uE zU-NA7fbPu%!ZM@33|-879Fee8qsQX@^&GoZzr`5564-@LB$--R7G1)O1o4uO&6Y(I zP4j;hM5r(ztcQNL5=h#5nLR6dkyMrw=R#~+dLE!HXU5n>n@4{EI_WX#)IL2AEbxa@fm8Qp<4zo_uf z>Fc|NHxDFEm~3(Qvdwj)!ezBeha^mr@UG`CG_cx_7GCs*Ef5G{x|_sJMO*kj7%mwy zx;Uuq+Y)Ov^t@gCO@{i2d)T^zr^`*vC!UR&)z$An_`kO(Z=&XzxRS+M-hEq5zUzhE zTe4ngdEu_nz5$to9-@H~6U86it?V_MkP3Tg)e+Ym${G@W6|L-#pVzceHLpd`FJ5GQ zc}3phrOSHR9odJUdLk6$Un$dhAKzfmZ=LU%qVmi0jkYreI;FVC|SIH`b^Iq7r&ITJpBFG zpT)AKdW`cVD}ykd%x$fc@rOSkBc>+xB8M(P$}b{&H?^a!eR(4Cc^JCmpet{x@C%wP zciLcM>OGu^zWAaUK(nFu*5`g*WT`7OH>L+d8EjIKUS&;kAGo$kAkB5-)%Tq!!D?9~7-R z6O|re2SGBb{EWOb)6<}$7ro7{si|4PLB2#gqhZV|fr*j}lur+0)e&>` z`k9um&q&V-&kH(T-f3h9impbTX00IIdH`&z(MRhh+Qfyi#0LHUX11%*^%7${W8ANa zkLi8smk#?Q*Htj)TK3_!N`1{;$Xh7f8%7(|n5{pj*KEc@{zDH@*BH;*reA9M>iY|C zeui}0+8S388DW@o(sC*vLUa0^)rp|Fnix+vor4L&e8v0;?c?_UHB`>?2^L*Cx=dtA z_#?B$RUx|_5?=@E=Z7=eMji}}|9Fj{$8;m$C5x${1~)}f7SkEn19-ql;)L}%r+!3R5?|lR~A?2)TjD8WD_FWmvIZAmgTIYb7wOOEU=pw&`yX2s8*ZQV!)BR3MiUZ$CS9n@-AKPed|?U@y_=A zQqZ8zx`C6oX3$YMQ1V_@M8x6Z*T%9u0=$8GCDT-o{NI{!YyuC$5JBt)dN+S!nY!72 ze?Q@$pFfi9?|(~XNrk=W+$cA9rYpZmt)aQ}4TxBJLYyrnzix0>|0xdl7RKWO(m z8h*(->;SyTEys5xc^QWZdW7((y|qFfTlZ+<;Es!<)jQ_uW_YYR8ny4BLPsO+jY{&J zuV0UZ?R$Xc3A$rdfhz_{$C50!1S&xY3){(=*DgNUl{eY?4*4~oGthZbQL{uDV_6SS zuay^a5ElXA)cCG9(EW1Y5{RY?7p-h;`Z7bu*@^>IH!ZD$29<%mufWnOC{x7tXm0f) z%cU2cU$+|3=m-_HD*5&jO*hCC@D6rhzVUqA@!8o|Q*#A`hnf-aHO2VYnrzraTACcN zfs1AxoO;^}=&Wq4b}!8Fk|Ws`mb5hNa?XwAC0wZ5Ny0J2C|HGMk}GLT%-RMsFQd{> 
zub^8a`A?UcAKh~L2NKo%iQIy+QzyJMI=D==L?PtwA{#if{1ceC`clAeO1LGyi~D z$J_I6G$&x@FJ`&@@+4-NX-Q5_j1^1PsKgdMUHanzQ`^nWc+k9g%8*gdp)Ofpt)vT^ z5B}-$XTHsN3LK~RC$F-tUt%mTV=&K}SSbcY8RK;aj(ge9LN`#>umT@WGF+mi5eown z_DAgkNV~?!(%-lycXrLR|0)mYMHmZzQ(~=?4(uk#0c=vWN1nBaI)(BY1cHNe`w`s?9QUz zoiO-f_F+xtXtMgn435mJNHz|k7M+PZYRTU0y?AQ)kIZ!4sP3=oYL?p*;v^X8o+rRPI;4tjPsNK0ufH@L36(wR*_6R^T}-pafBm+EHTO5Pj;e? z2@W$Kro+)iBD1XI<=;$FM#?|9DlIEycgn&Yfoh#e=V?liy@Qs8he^mOIYLGA<*?a z&xplLZ{JYcFxM-tO<2wT%r)RgZm(ps*}caU;B{ia1Z8ekKq&19oq1avtKmnG9aeDp zQCGJbv$g~*L%Hj_Xrq(jK&%pSXT@gcp1;5UG*;GZG)Z9}Cjk}PXB+a++h#~d!y35q z+u)_KVG9awlON_F+Cmu?ciYN-d?aKG{}>joiP$3{sf~dOKD*@Qi2vs{~Ob~JzD#3=UhPz~|%0IGYst6vN7Xtk+WL@^CKKSCZqiDsU zEf^Tb*$qXh{CHq32?44n-tDbe$Fa|!59HDxnE6SaqTmM}0gjzzI8aWpN5Q(T07Qi< zQdhuRn9~R;qM2aICYZwcTtqeiLxuP$^`)3uSS;Q_6fykdGgE5U^KkCku_9;&BjtzP zSYKH;7I+6~#0=OmSsboAAn5usU+nKNzQ_Ws zSF}PExCPqK0^ExTw8f~97o$Qf?XqSICp;AxC)~i@B^%dU_cW3D?^&^qf1x!4$SufY zhzxG0(y(fJ&_9FRq!}Oh==DnqMTH3^Jl;q<^?Kk#`bFD!BBdEmV}na7B#e;#$Ci@SFz=m=Qjmh*Hy1l>gplCz;K=>L}3va!Fzx&cZT`rcdKgQmcQep&(zwvGBX z(86|ge;Nj_Lt~Dn)KJ4hTKUt{anf*(dJ;T{n&}aDYB>S%1$RN!!_c33v6ylBNb;GD zUX?%1hZ4GianfSNWg;GVe_ZCI1t6bTc0}Kkv1!tFC-9(!^K@b@UbvWN4FD#9h*zDd zf;m}VVp=j{qxk4!PD}G9=_>5pchK*Nv70nwU}gsuDgtd+uBhG(Nz86UQ78MJzp;q#?2+$$6J_I8m5H3Az6pjw1KJ zL8btf`mgkbZ|tPoRB1p1I>cpiv?m0qEhF~B9<`pFw|qb4$9VG3K*G6$Lu~at_;D|; z9nYPg{qpzk-w&5+TkT#r|NZmfzSbW<=!z9w&jeiT188aM)Zc@i(`tLi5pBrD!OfB# z{g{xq*xq&on&}e#*}!x4N5H>%42W1WsAfrucs#gqm7avn3!&QFb z{z{=^V=x{*H05Uwq8Z!uwq8Q$D>!>#0D4&jGr)@TN6VC>Da(n#iBgE3;@bFJhC57g zjiQ7_t_w#kjgAY!9Pba=>X7bLq`EU^l4MaxGzxk|WnFmG@5cjzpMx_Ej2X_WKGb^= zYOr~Vg2OlxY03i?VQ)S|#Z}Be@shmO*;ef=-ub>C5BEV#1N%4`E`2_hB155FrmgCb zJ^de8J7C64EgyJ4lsR{NZtPp;SlTRCEUd|LId`~MXAIe?E!ts{JoYY}5$g@kLhO=R zfS{uwiD7MiqAiY_i5{}BMRFd*6L|zxLLZXL8nK!bCAzMQ?fV7Iw8_}wRP`4xy}cM% zc}n2q-3l^bh%#MGdi`F9Z3(jx1fKQORw%c9)l)>uaX|$D+SFgTRGc3M@%!*6^8N!o z3if@Tz$|GKelhOyFK?C3&0lo!q+n8wSiKMk4xlEFB3Kdsy1XE1)sx2Tz}b|t=daqR z9{mt&JN4$WTnAHz(y<+ieAZu@`CjRSzEjam9Zk$>xZf)IB15O2>-apz%o_+pY&KW4 
zu<{U>8xhL=Dksu1{q{TpIf6f~;7piGBdqw4W%=v7+yYM70GNIlyw4XP>Tbado;FOx zlO)6xd6daWe50F+4IPuj%)AQi5B>zvPU@x$FzDth>x* z$Ceg2);u@!?%ojnwysqONZc2;Bd?(P5ll%0jqd)B0861&g6#6ZCniQBY=mGulDn96P z!E3qpP1ijmJEMV{vdC(aU~_#bnVvZ26U*S;*OZW{s$;J&#{L8>Z}eU74z3KA?djBU z9cZ3gMmWJ#rG;ia>S8Bn0K!tB+;9mJcB;~q_p-|jX+?ushc_ek`RxqF2RB<;81uE7 znrXnV53o>_Wv4dnBx1rxmmtrFaB#Kv&6W)++2yh zb{5hiFW~8PAT`U-Q#4i>`U5M8v+G7XC^t!8gmFrR;AA4pESIo)!#~gmHg*=R6!Esy zhlD}QC$=2Y5(N}N%uC!+_a&BbqO4zoME)E+Zg4z_oq?a@v})D2h1_M~nuV00MzoS0 zw5lB^?At~gX?0EMkkm$MA0Wy$+w3Y&r<$VA3dAqsYbi)q#*=li32#)c)D0A_I0JY8 za|X^)C68HfHU@Z&lyFgXYLYAnRV0{Pwg%AkV9aDe2}&vXo}3o@ezYY6?Xb={4nC>{V2ZA7gE-iYJGI1Tz0IyPcZMmDVq6BbyL8(TG?R}e%q z4uPDX<>51f;eC1!wRy+k@(UGAL7xqi0fQ(^xB#iQ<^eK!#qX;y?hqKCr zOjo+#QO8M(pHf+d)m=L9p+!IXS_<~!FaYVAsF`S^T=atk4pkYN$^hERr1qi^pG_qt zS`-?ibBJF)MEdE8bhsH0=l{tTPSY|)!!THLr3p1KaiI!t>ECNf)Qt*iEb*7A&_&=r zkUM=0{JZEX?1v{>T82B;wY|_PEvj^XwtKdOo+=4wjv?{=& zSPmtI}8izg0JC0*LZ!7j@k6L~qB`k>;DWH{?VC*3 z(lMyuh@cKic_H(ZNvy-X;?D~GuZlsqF}@9REznsHODJKDo((Jm@?IlbX56&)md`OE>}Jgm zaQ>XG|G?m+J5Rh*77Xokj^m4^d!vPVLO1;U{b6*xTX?cQ{LdZDoLji*+6MG8FJ|7h zC2$h*uaS+YqN1Wv!Sfr)Ivf6Gome+wZK7--=sa%L8eID~{V`DEmBu3!;LN-vgxsKO z#E?zF!s{7T+RWGdmASufqyC{vWf|ekOPMS4cdUlx$o#+9_Z7L5X*Dg!TMOWGv~hfV zy!4x$Z?N2>Z1oTESi_O@pjd_H^25gyZ7i5oO11I#Bk#_uQwrv;i$(V%<}`@ZN7Ps{ z+_q(;z0Z9mBA;}~n$LFUyfEbEZB*mUo?W!K){j=);?$yWdoVZRC&{TCZxvc*SLOqT3{{4ptOHmWp{Br!jhZKvt!&XWz~&!;6S_aJp2v#< zi0((U2b3gB@krv_gS1Zz{gUfN6__C`Y(2HLBe$riXjl^>ANKZ}?E~FIj>;p`Nq;(( zcWSKDal#GnCmZXTMXfz*jMF-uYir!TRH@_?H}5LCYdlyzv5m;xufzkBh&YmmY_%fh zQ`JVOwW>OoF{k@ox_DRdHqpq=nY6>2=@fF$YP10acl?KXTyv&gx>-BYIf0hvY!)^7 zlK%+x-{I1x$_$cT4rcu?9)1N;xU+_@gKUD(RHl9w!Hsn zgtp`>^_bTx9KcFFrq!%UkHQKe<|D}94x&{dfJqOb7*X($J!Ps;Rl9(NzQ%O$ac1S#4z*B6jShjRYogeR^MJdI8J2VUy)^@!ZGoQJu1FWgQ)++y%}`Ds zHti~7j=_O%=au(36f)1SU-*f!@E5-(;?x6W5GyqKxn};EfV^4$wyWjNTE91?f0$ZW z72{PvTm!U4%4p6*T<}-$^Js2_;M{qnEWnb9=gDsy53Vjmx{kZq+e5c(vQyS~@ZWCT z{UWHGW=<7o)MCPcKQCH2jp7z+c7Rm~Py&z_&42K!)$Z@Ci4e>H6=WlJ+9lIc)DA?f 
z5{yYcyP?Xkf3k2XF#b1acZlx|*0t>ZJc|2ARCYQlXSuzKwGdQ&83fi`qssP-g{33w zeR^iz8jxu;pbe2m>u3pw6#tfOouiNXOmfB^8+>x9QY@T2M?VO839B6i=h8s=rCL~D zEO7Q;_LPlbYDirQR=3$Cm5=z}jps-X*by%AUd+#*l=}Tf47ls*=^0aV8LR6_iVn9B z0mSIF@+Yg>$Me7s8+es4^vNJeio$){nf8I{ z0)=vSU%U!^5-<{ABk!`o0#ktIF#5xXkw`D+qJlA(xWMt|{CGigg*L#@MJ0WLgAK2i z>r5?+f?~g{y|dW;Nc4RXGo8e551M>0*d&Q6Lab5NhLz=8WbS1rv~9 zBt{1IrMAiDsps3n0USH}eBVrken9dijja^ViA*#>4q>AF)DCLaleX<^s?To8_7VxE z)w!F!@~fl!fkb>p*RkLO*L~0Df%P4+6eX7vN{C30V*kWqmCnTb>>>T59R|}XA1!~e zU^xbo9vCj7f5-qv>r&qtB$ATF3`?+!vnDTr7L_Ro^7YNlYf<{X%@dI5FsvqrW@cWJ z)}%!My)LYWHx%fxND-DKbBRxk;m_LtJS-hQe%GZ^cWmrCpEVL+mgX*#SLw>5zaEK* z(SP!$;rp{C*(;VwSp7c9oTEK6FMcStVZ$L+r;$6k-1jMNl0oCl+q1Gc&m zRRa*CcoF*(gC$;UoL{S~(ug_xNNgGZ&UYp?LfCR*JVQjHY15|B55UWmtH!S9K1CT4 zwV-yA_Y%7QTQ_e)#8Vhhtw`-22OqL_gAz^n%-dW@_pVr4p0$ZZ8z=`-6SkO35e7Ks z6isEMgy-f&P!MhzJtV6Uc!Vz8ZllS^(9+E7dzqHHR%Id_@%{yT&qpt@UALsM+6+mBR zA?s445OgsdS3thT3H8b^=%{f-J=1bvUc1nU<#`I9wVWh2x;w#{WRA61{zrio@wdSG z63JKl0J)Jl3+y$WfMd<2>1qCi+WT3T+xd&MNBJ2YynpMfa@q(4qN=7RmO3wlk~Tca zbl*Syz3+#xK@MZRh&$(rLr?r|B+CQ9U6Wji<9Gsf2+SsTOLM2rB-#=q)QWM7yO^nD zigzX;4?qD;hzeFc9by8b_&j5#5VW8LKQ7DyAhxjgN=h$e7m#MsWQP}MTK9?B;YpUf z3DnDR(O~`nYFV&;-;Z(bIxO3v*qD}d&giY2mk(c{UOzEFFoyhFW_r>X2%nns2AWYk zv5I<0QeXUp&BwMf1{VH(UfEqyhMg!AqtC>sU-GArG!>z_D#}hdX=x|&5e%V!p1%!p z^I>>SksH?HG?g-wTE8x94B$dC#K?r_WbtcVkWD=R_9L448_z9^jb(4rM+jbl#@W{FM zsa<5;7N=?u4=xdL2|PX7wd1MP$B+|~Ie;K0N!ph%KP0~{l)-qCORg7oBu$Ll;(y=e zFLNhqY5ALHhkUjd*hv09wP)|%KuiYfv7{j_F_rxQPU&Oi@jphRri=3>2Yd(?NK`yhWku^y*^DhG3%F52w{zD&UD{*AU3($7t&1{3!{Z9i zYJxmTk8aj^g37tD;RLoQFEE2a{rm*=^8)7xWZ~W_V0!&*ehMI|I`?)|Y0Tx6csW3y zpYiu`o~;f@)g*ruf9oV3XVvxm-1m11ZuSV?@^rtUT(RGXqOzV2;X44z(r~j6gPaKj zJzr%ddgV7FA(S?|VG$a3muAuE&9M3dkFNoxDf0{~Jn4?(7_&{LOF!@i%`27C4ZwH` zGAb+j-B`Amm+3?en~cnpo59$wQ=c|*eq!Nd%Lr&|-Z#TjNe*0>#gR{`&YPGv-;p#@ zz&CSFAT5I7GmMVRQJ*-Sv>!+@^X-YuZc&v&bY>*nqc4xF7SKsbu!EYW8CGBRi|FI? 
z9{MxLNx8wX%Rv)qVLgJtBb@lj$D6Tifp8Lztm*#h>gqy3#2QYr01vp70+aU9RynNqMYJHHQVNgwC!{WVtIkOBiON0&vT zUB3P!s6ohikQ^ycJVhLi6AW_tkRoS{Sz20cvnW+d7V^#Lo)PapLGXTYhFzF)b7Y?FaySx*tb-=?Ut~S zsGbbyJli+hRo zbUjvniWv~q+B}s#ClPfS#ITjiu2U>@r^M2yQMhESLBLih=`WFx(E6AlAg$eOSVA@&Ac%$Tm z$z7TMZM}xE@k6>wbC)8suIAQ8Yxp%Eql;m%k6`Y(hUyIDc1#qp#OpYr>wuH}z>s8T z935@zYfNFC{rcCBdTsJ$fReF{fjG9!;S`6W^hP#%LYhW3q@w-z;Rza1!4@=-2{2W_ z{E9t~GbQU50O=?;pmwicc0tz&0W<>zFIrK26i zIVvzdglN#fb5AOXMcJZ+DIt^TbHZ}2jC~sArz#6?)cyK@LQO!sL@+$!T}OKTkx}Hx zO%Nc{2gen{p29;(Jx%#~vs{j0f=1rDDBoUx&!igL~zFF9xnOB=r4b~7K=MB@_FGSDg@tHA0u4P^oQM;tTB$gxcI}M z?i{z#Buac_*f}NS(Wm!E(vk|w3ppt%ZSyzLNp$EjjIuZk-&efLk^}2ZT^y3+oF8e2 zhcP**>-bA%B#k4y747G{D15M0{*PUWLhX@&)%KT6K88FL6#-FJJ3W|+&M z2^mttFhiLQVh#OM`*!7}|6vj&@VzEQjj4(oKB6(g062waAKw@mEFOofSs+w8wC8bn zWTY8MoCwLT}D1#gRUKUxHY zSWdcbCZR9*d8^leGSJ$Czb4hv`ebPxdC)6>)<6ae`00`Tl7?E>q)^Z>*Z%38@yj-M z|2{Qd_}7rhTYA@<aqSt<8q(UCjz*FxX&eix_pE1B~%C>1^xdjY@ z#rObAUn$m_XHLj=U08BlA#CHHnguQ0@M&aO@awLzOk+&z=YCo}`5jJSug8kkB2 z> zzBJpx!GRrA$dL!a<5R%Z7B+gyjAFb+bVpcu8pjKj6~UkK1sXME}o55PAemFsGTbVG{={%rN>-ut<)uq3{g*E5N^VUaYuvP^(Q4bEbYqVM zS!=>t-Y&RQkA@&}kf_w=?vTgyCfbHzcuxVr+po6a5UbsOHPZpjg?7kxd3>6DUMM_w z^hwk?^{A+jrNb4!)@|kuZe(Xu@Xld&Pe#*$(k@&HS}EIvX~(lmFLTOUyOOh_AYx)9 zEL^p%OW@(i=Y?`grmrWD%ni-a6?pcvA~dWTvguyOA(^a>2qRlVK5q|Ds&9o;T6+zR z4jN`+GA)PsERv->kUV5;`XFv2c${$jIcMdLjl^dK8ma`Jknw!yFNb(Pqbw1rx=)6a zR@SxtgvJ7HEk{kQUQ)7RvvhZov?&^|QxM{%gbD?FKm}_ti1vl2$EhhCUT84OKNN^u zo^W$>;3{zn_Z84-+(k+F>G&jbm55^}a(XVBPMWw@om)tyE2_|R)9bTkdk0Qy-aHcW zAGL`~?3G|)jWqkcB`a!$95SL#tsRX%8ZZ0t*~3^xUclHi2~deCHk;X2ladf&u4XvP zKUJku^xb5)3o*SuXY;G|)SI&ZIuFgg5n3L&p1FX1_59hyk2%Db%jci$}N3We*0CrCu*IjGesd1Y7BV5>M;6M_3r(bQ0|Js`xaUG zKD{T7nzE9$a~Jy=R94KQ^|mPvy%88*pI?zNebi2-u*9lo?dd_0pqn=j2~-{Qm=^k; zcdJnamr2!~yAZ;`^E@mW6R&az(BjiaDsbSTOj0e&Wod(h8YfV_Hg% zi+!}ULmwV)9nvj6v^3uSnSDf|fMXrU+w~#Q)>~F`%#?nj7`|&@db~yNUKI!x495bi zFWdU3h`9px_@DG9y%nii-}i1g)0Fvt*hLhzyMa!eSn+?DMf~8p9H&<}Tj1Eea6V|= zm!u*psC<6ovBI>x%bd8;#DSPD4q%3y#rat7JFKMVViK(wj%h|WCY}hRj3nel 
zr0C7j@V}Vbj@7H6yzW&?hLPU;`dRK%kl1{QcLJl@(b^5Oc?@a07Lsma%%1;=UHip- z9z}GwqqeSPE9h6|b`;W(sHlYpdz_+pXNg9K`SX;FlR;l=>t*ZnLL_=z1+Q-2edUMP zTXEDRJ!dC5zY?j+`XpPg5F{c{de`2=(cZ%xhh<)9>*iN`i!L>T{Cf09`^nOim$IqM zUiQJkb@%%7V=F(e?!mMkUpy_On?&eMTFVG; zq(0h9Z>mH+r1tJiMa=wli_dk7E-}C2*-MKgG0^ZL&slcKFuG6gVD=)E9CCyKQPph| z4V?)!9r>9bpPv;XAdk418;dL+);ePc70C)s;$vY_L)spcj0F1vT7SRLu5+bMBiYbK zXSI{N6yoI(y;2%>VFE&ol#Z{1i5uqc5>`faJ1HVg4?Snjr%#_4m*(VnCewt}!i4|W zHT5$Z*ZaPIj4rfnFR^KFfpufunaSFTb!;2QIspDHU#zWN_0OoE)u_nY1h&XF50B7) zcmxm(j-$(NN6CwE07RT-)UvmLM=*pO=5g@yVmk2b((w|93pW6usKhNJ~W!HKd1 z>nFGG6iE|_ttZ!4c=0?rR<^9>iy7_A0)BJ+m0tUYNB7ICgkfRU&d_bkqcruZlT64= z%I%>tnA}jSy*SkkzZ|7w8Rrt2k~KNZ{YKvR9bea4#BZ*4F+ct6Y}$FT{J3jFPfldt zwV4g!#;5woiQ~eJ_2NBUH6v9;x!=BS@k;r=J^t8O5Ni=wlYhKo;tlmE@4+kh1~m zCs0A7(67I)!$Ggq*9OoQf9YdYsK0;Zq*>k_Z)P5`xOh^G_+ua*?Z%Wu-+8}N0H*dz z5W|xbJVo&69bp4pa6csaZPh!@vLgy9>l10RCRYl}t0ymNB0j4j+cJ$09!jj9=fd`y&y1n zzgOh1vopvq_Wmjd+j{<~V80lglfo=l5%edMj8SpItxceCW#h-jN z)d6doP%U-F-q^pPy6Uvo##1u;alfP#(5uC~j8g|QjHjDUr|iqQaP=dJD=%%1bbuTG z1JKhM5-2qA?6#(hNci-Xa}Q%SnWu5f=s#+XVJrYnfd4>f8V&IATN(r=!NmdoTF!6) z10Xved{Y!q?TtcdLol8rzhSnkCPPW;dj?-8LSFeQjV=`$LeNV+IUYb|+n0gC!Q-4p z2x&M}7fyqgX$c0g3hU+p-PCU&jS=YlWW_bC1Km6m|kPn)> zsPy?t+MN1^#LqW`Bk4`S~gjT z@e1&5eR5Y0F~u8rgfeP_UbOo<@|m&XUf+YOB5^yo(g3)Locf?Z9Un^Iav+~Eo#m+o z?`ON_e>FUx*to1>MW4@$YchUbQ9BS@@oqDZEJmk-50g%qUZni@Jrwq_f5Dmor{_q* zmKUytd8uJvXw4Fk4>rYv3~lHwvfM8~Vn93hFc6R=S}&m(tfyHJ$q-WlDv&(Xlyo(c z1?SB1oV-QkJ}3mSI{MG$0G{N$EG^u6306d^@;K(FZU zb%4~;070(o{%9JTk7_2m^qmuG_Uz1avqO3IC=XDl_gkneCx7O7QQUn#Y&n9QO;;`E zR~QzcnF#obW*yNuKlbkXR@-9Uz;u(x)V1np38&SggSW=hbIY}`?#Q7=r=t}?{izlw z*AA!OYhv|4)VXwhxwtBXOV1@!AH50MlD$Gp$Xy) zX6$%C`w2`ztN=IWPudud5X*^Fmhi2YCcL`xsEl^MrpIfERY~(ZRK-Z#?i& z_c1twTfo1!_7^cDKs}@$6Vrk5uivpK8+hF-=Y!bB3v=`L`uow5-h!Y3m=W4mYK29s z)d4Y`cAb!wV!Q>tA^c&X&pw#xrw_ofi|UGo91Y z`#zdN5wJC*e{Y-+mOg$VSt^pTm4;B_j| zoJW*ZX^ksCt%Nh?|Izb~8Ms-V{*OwrAtB~q+}@?=?d!$m6$)~`!HHITgIf=6+Q!UR z5Vxjcdc!jE{Kc{u2%lor%Z9n#95=TL$X{P_GTLaP#)jtgm$Htu3$`8Jw8_zS%F%ke 
zqW+cYhauf*x4Fmk+2bM(-54C!ii<%gxc!(NT_e2Uw7|ky^veI-%J7oS!Cb}cm~)z2I<5@XcUyilp;9G9y{fW zoPxps7s;aeEz(T@=n}cOT*?+stFLYVsWvPMfLIad|ABUvOArh(u_tfsTIJFH)-N!z z<)MXmlUe4A?D`%4)9MIU>M@ z+#klh@YZ5hKA&Ip$zj%(E@X-mjoRaF#KLjEc>>R~Ku&$%S-{#S(LkQfbNJ;rzz(QJ?>R~qw`q&OInMmuIDyqcgVMALJr8!J!+4tpPwlMU3BOAFl_AF*{>3jQJ?O&s~Ym%5;{V6Si1A>%N0R zs1OB3^k)G(&mK(<Y2z6&PEdGUT`Sk1yV}{5zZ;OPKh}QHKnV)w|xb3YL6I0j0(8 z!G6WE?f1M#pZ)T_Jzv!0;!m19`Kf7}&(B0G)~hX2wC!2lMc&}i#ivPi4RR{&03GG< z0?}#IaSVxd9q0*T;=!LF-QmFmen33MXt~FwOP31BcjXF1hR1dv z?4%)8l#9PZdUcbn`mS@cdv0^wjiLkK31DSY5bQVimyU z*SdLZ$Cob>sFQ9l{dzIT9f zoHRAyV=ZzL@8-8%JhsBg%`j4#DXj1zWIw5(X>1q8K^_igKk@$82^;|k)bmD`JcufeLcC>bJ3xjyV4ZxQa$Wj^6{c5{wDR! zq4|WYnUXK4#I9*R0as}K9JRn9uj}izQQGQ8!_QzNve>hVPGjsAICy{WEYs6LYcMq} zKECGjxAG`|C?dSB{NM-y>_+$|+43yP77Zwt!a04u>?m#6jn{oWo$gxQwJQA}6>RXX zJMQ}X^JU^un$F4=4ZLJGuk^n%(07;iCjZ!6#la^ja##ZM*h(YATM=+hFt2l@)AX^y4QIT|$Kim$dz#-L0iUR~87{0WZmp z`8NZFA~P}MccR1zsFZEfd-F2!V2ti>)5OkuTIKKKjnv(GopYVWEDnS1AE(8i8ij!b zx!4-V`(qAAkOGB8*bZT|u8eWkSZ%$IJXQMe9@|d}CHTZ36h8TEOR~lz9p6xj$4`B5 z&febmwM)@kZsDipPPHEcCFCKCtCbH6cJ>ATDIZ!I$T)KcrL05&@QsybEA%`&D>iMf`*t!~<{tyRy)VU1cN-miV7n$??T-7An{WSDJG?Qe^iP{rB+~!yU>&Pr zU&@T8TY9;kCi5R@Jn-JAAtvu*`b{*}b6?dq#e(MbYpv>j?~ICBbtU|!ulL~v8auz~ z?wG&C{(sF{C|A#9@LBsWbu-p_m3ZEHs9*8Gk9$Ec9qjeJYUL}9ucivOsuo5Rw@2GR zX0Kb5=iDPEBjGTZ+?3ZT(5&QNdFofds_p-U%-tIsw0<*z1J$8 z`a1y(=7Ntat+U4cUaC#?`ySSu#c(32UT+FdC*Y*~kA?O!30Zlajt^lDT^_5gO)K;S zYVbF>lLcpZ8pY%;5jMOYVQ%=4W26>Z{9@5o4Yyp<#tZ|QIkJE2;00FPmovA+FalD< zX#y(X)=qUdsbkcV@sTLP`$}Xxm;80Qt8SA^&6+}y=g4>-o+){7!Z1Iu=bFDS2}PV_ zmhfdvn{C$?U3j84T&Xu`&}-FQ6R=jPIXbeYL0lVwkJD}rCar%j=-x!WL7-9I=)c<1MkcG;JI_$+qn}C zlJ~a5*>aX0S&u?QK)QDrRpE{t5Y$MfjS_RU-y0RI&OaE=u)dq%WOGEXa_Dl%M7Ir( zeIxIsxAb^_5IS;VL(J!j&=Z#5S`1R0+T$te+=MNM=0)LGyYSBEN&w|zX2Pt^u(?Et z`IiVc1}~2A@+YAxQWz3gU;tn!nkBNkkvs9O5ow>ZJx~NwMV>fa`XidOCwd1jWn>?A z!Yw0$r;3{uB^}1cjvg^f&QoIS2nxbS>ak&<@uE;6i!RJdO=k5?g~B${<(ioH8h?NE 
zJvy9JEn@(oi~%ogrA7%~`Qc?+ic^JsvUN0;8)&==pCzO8dETN2jmf7_!&mLhFkwchsu()5pv$01X`;O*|^u@Tf&@bjsPKcA(w(Tf04IwrtEeN1B#speC)}zS7 z-2;V;$rO&mpmQ|A9LtRFa-luQwar+!S2=l`I?4yy56s{tq~0(wP;~LWMh#2-WG1%{Rsu4p?f2>g zdBAxfi2){Bjg|&FVw4}03P3={Fpw?9ZmZ974Vg2|LQGiy6w`XbwGFzApvFrBE3CmK zWXQYVc_g+4Ed3)9>BV##rt0o1q81nVCPaF`g^6YB1(FW#gFB>!hmOQ*Z321*S!QV-OBop+p8kwBgdcY{0_M?Dbe4EQmDFmV_`Vfid>~o>V%UO$=O+gBJAluctVl z0f~0WU>R7peGtf?$uQfy62EYZCT$u%C_OlAM;VEJ!6$FN7TqKbH1YwcheKU{+@^!N zVvYrTc~F@~EUHq$kb5CEpkm^kIB{Dglz>kRA7G=o%ruo(E9pV7G+yIIkmXdxAh_vw z6!F^Q!cX^4@QyX-pfRxKKRbp9W0BPlH04>tSO&^AF=#Tke6Bja5qA<#5vO~R>FK_C z!I?+sL?TNT0OJv*Z_{*zA4n34W$BXvgyq4EE-czyS+hpjPHlmHe(}f!mKyd+zCwv)mN8h zuUJnDCXcM>>FM#oJ!2DIv$YsP5mq;qXBz|ZAgBQ}8^bvcavsZWp<-*;=`j_kW+o1o z`y+VW}$6$Yt9vPzRpJo#MF8&v>vep%JdU#Nh)w+LlHu7IG0)! zB94+SUa;)vTuf#LNcX*;_2(E9U=eK!-_WW0%nZs^xWG7U z2cH=qz;T7{rU1Y>2*2P8H~(jCEk?~?2op!QrSx`e{s1s<0SLFe;GJpwI*t}&oU%Uw z*$pr;x62Yw9fLK}ENN+x`7{Xl#>5T{gK-l5oZijq+yiSYp+*`Iy^p-KnYI z^XIlrYsjnpONeYg=Ir<_E2x#urvE+Nz^KdsP1k`o1h*I)3bGxWfb%fKnk_KEX^%G1 z3IvL2`rX~cEag=B`V?sSk=F1fyDxBt9dNCC`I)59VQcz)mb#c+%BX?}ab!NUx|q>_ zac+;f670eisq6JsUmq@9r_dB?HXIUhtnsB6@s z%=xT#*3nLHSQlV;O=|-Ph(zY4y+_nFtFYY066`pC9=_$eGS~IflI!y(?X|O0i1fX) zPGOzE)jbOLd3gSwKlSP!rN;rDfBbEY|FcuJdu4o7TC3k>)a$#O*~XHTK8Gx={)F}+ zDaWeWV-l)0{k;U;wr-An5!9cAyCejt{KF$tHhqrgb6d|h@q-WL$h|0A?N>*2nL}Os z_u>bIcz?2oc*z&WF8Slrn#FZHR_B&h+an#G|sc6>zNHFi_CRn!(qi#gZ&qi=Ex z*ZbMIth}>;v|Q(@SC*#sS39c9$~F`B1!8)ws?)jdm%Op6x-2}Sbi z&*i^w8_D|dy|G+a@QmxzS2gnZM|9Sm!yE2mMnG`QIEGa@;5z;Y8=6FD9;trl(t*u? 
z)ruClv9A5e=Fk5$M&aNMQLQp6&nB1_RSMv{+&N&0uOV8C>ddXnXVsDA5^WtI@+tvb zs`Hlbdsm3YUiRILn!VugJ;2`RAh+KJS1*GL!bD!@P1wZ=TZO(4cxFVB<{jYN*FfR&+03 zerzIaZ1%lS#%CAWlUU5R2blivp+V*KS%!dZ+xg-Y)uR%++^0rW_|=BzFz)aNb^{K z#-gI*Z<<@B^A7NwP-HgNja|Hl{5!W^$zO1juwl_nsAsob6zbP4F>nQyH?-$tvBWF- zyroy}VJ9|EL$Q}y$>w*-zl?x$HTtGIU|^}&4iP__UG}}ZR;^(U$?5zhOCHiUsZ>D% zYyyP;K|huP54Y>Eg^KOg*Uw4gsV`&|cN8_KXfi>(RBRt>Wu!8TxI3r}A*&zJ z>es_{7KvW|8(Jool1C4!q6hs^y#|G;1~{v-PQsV)hw33jD^6SC&q zime=df}p4C8P$K;ekFWc8eSxytNC@OwH~i4Cmdb(aO}@!_R&Te36TxH_V0ZMXb%_u zOJF=64l5@N^$zWVBJC=)pYuZZ+QJn8)yQQ?59BDZsL5TFh)ajGqvXvJH>smZW=n&g zWuFf4IRU1oI$kWRa|6I9lEJ9-Ay1trnAj&2BV>?e@l=pSq?+d^6@tb}F~uhoqaYdt z1a%m6-cN}YQ}SB**}p6X~v3W@_$G;yW95cl;S_IZxXK5$}d9dXw|}dUEhfN7`*f>`9QSnT1W?Rsa^- z@!A@%+HGwW2R#8+Lj~p{{4~5mk9s**j2>Na~ zx=~3RvL`ChOUH=#+vH20o}Nwsgi7kyHq0*K+XO4c%E>uGJ!h3cWM#w@vG^@zoMzGL zb}Yj6!+Nb#@w8-%E%c4WK&JrA8qw-PX-1^TCx!H}4}mE7YtB!d zR-#IX>1*OG4#1H^bj!iV z{zm!$Sq;m?e(%yi!mIRMd670|udP$!c;b>VnTaRgm#$gFH&Yx1;D z?sZ|IGUY@!sh-|m8&$DXe_AE!@WhCAq7T>$$&CE+*%vIP0<7)sg0zd5E=l$Wdhbr2 zp_Ym|>l~CFTgO~T;D!6)HneqZA6DtS*N??JQ$XP*QZ0Z=2+hn2Tv}{VSh)U>f=>oT zFWNA}CuL9+&?TdNHsP-n&z8)_EVBO~nkEvX;^x@6iL8#FEtmDeW#b?h0oF;o3qm~& zg>!I4x0&Lfd+eL=Ii>r1ZlGV)GBV^7`YB?Nv;$-1#Y%P|| zBC+%14vv;q>S~@wo|Sb{^xtxgw<{qnd=4Z%Crb*qw6?-8QWJ;pff~y#ZN1-v0PC2V zy&rv@_wYlQ$|T#l&VUTxm#@H0s9+yz1WfL$?Ad--8!ot+?kr5LSq_el%CN?zge!Ta z0}+puKSr8&cTH*ilF}1BPz+a z!T2cv-?kb5PnPDPdkw{=gLo25LM)m#;RNGj+tq+|&rp_x(Y`6X>HFt8PrXmHoMxiY zXo93%re0Fevfz`!(4!Ry$yM3qdz)g!Hlll%)sLCp%|tKyF>^Vq)-vS#xo6?!z1~y0 z+Z}4A1NeCW1OXM8U)7>@&E(#%s_EDoYDX(>$a0+nwoMeq5N}K2s(}C!nXiSBANC@| z=Bq}yi!w7~KPaRaf2mj)UtB><4;}k`bbtvc&V=n!U8tR1(lk{X7EwNtbt{* zk(lFIbbfS@u>brtB;DyaKExbuJc+cAPl=D{b%09H`O6Yk?gWrVl`DaUv{tE(?jBDJ z^L_kWP80-Q)Gwm)<|SC3T<@6r{@$CA>mr~G5fh1QEF)uRX#%O)QJq;A`rINH3EHIu zp;OyrcWrckKHw8Vd{~J-zyNYAcKI&5?-4jc7(|x|9232`v!B_&g|&cI2fku@gc3ov zXv2|+_{rglQOG}Xl` z5m!!ClUAc^fKW`38IHK5&B_+uZiI8?^bCg?(>m{h*wjtSLeZt*o;Kh2+(B%YhD=yU 
zwE6OVgiy|bY=u;8X~+G(fXzF7GH85@Q-zR@@FW>Kqq>9WEv6M4QJ`PsguVT(at4XSjYlb8SZGq@SB)t}aQPqzai)av}1 z)G=hIgWi#$Y`Rsw5j;sh%*|d;ELIQ+;sjbRR+ zV_x#y1KI;(K&_fvEpF#~jW|*w{<<}IX=YD}6uj1!h0;G|F3((BIi|qpSw&P##tE;s z$Yuh0A(0Xmuplx3zT&R1i!IqFv@?|OEnE?p>yj)L{2YHUXujqo0u_+t-#iQOCyDIK zlIL-8J=wrBifzn_5O4AExODZP6qmjzkn)36IF8sX`dUCS@fc*VG_d!{0JVgd_RZVTaT0liGKUs^sPvudDHdmX)Vcm-felWQFPx?nffv{;t43)Sj+{uk*Q?Jau-@%yH1^K*w2w=f3Fd*~$}2 zu}9-xrc`_ZxO8h|sK1>lTFmZHm4-LHA#=D}*=S7yK7c}~+i;)oxlIM8LyZAlX4*zj zmREG?BEhEsE4S^rJck@C$>s~8O>#nGB_OMi;@}B(I$Eao&3@_>&Tw3QQyZk2A_fmM zYI)N^RiurR=jT zx9#Y^RVfN*pTF`u89Qa<`fGlX%h^=4JAPEYBNT)*`a5oE?OBZaCe`-)dXl=*C3OqP z+2F;zprdhCt*Ye0*-@8qf|R)gG_b<%g4Y8X&d$hmvU`p63S4l95;C<3_#E0q2GJlk zE?isp!*rs>BUB*W=;i=2R3ePU1RvXuZ;kWVUb)>dGfG5K25 zB@;FoUx^MQv{KW|7HYuiRGTP19CCXMA5_fcQJMPy1eAJELCzS=9c68K9isgnx_YD2 zTCcXYCLlo5Dz+D06MU z&a}k6e|qv2d5w{Agc1;Db^zxa#~9of!3vMKDiYtv$WMUz$4eR6d`gm0DR)=jY}Yhe zW>S*tNYZ8l(0Ruk-i20QpJdJ%VZDyz2*4Ix>bsfC@Rvi(HtZ`?=^F%xP9GERppJzL z7xI#uAfF>({{3XKldpwi=nuBIX}phs>{9VFfE5D!@4#&{f}`^x$|K9h9FErdE2ys8 zMdG1AQAXRLk2!{KKU&c4*7Qe~Elpd{k7oNJaBW-Ojk>F_+IJdjZo;tB2(O>^dc`o_ zKoq0y+y6T#+;s$hmrN#0Br0f;#J5_?#6+ux{f=ZMszv;brQ;>K?nRJpA@5Md%J>b4 z@=Sl%@g9Q-qw{Y8loRt57Zv?S=)@fIF@qS2U2M7C*f;MhY*bLDVSNWvciT>f?>L51 zrt$s`GNtKnPD)wrJdT?#TKx#~h?1$XNx}xlNf$NQ&S3h9;@5ZjduUF8Bk=&U(s&r6 z;iU4!yY`EvoEz{)_-(i8A*AiLf7A2ar}LHNZMq$c0+f! 
zuNb?S`!Yy(itcs)Zn{Hm?f}xPG_#?e3Wru()t3BkOKeDx?+ZMKxfJe#ZJ5~IClNWt zRUly2xt=Ti!KWgFi=JgTs$=$#lx6Y*W1r&9%nw5=rHBL<(u2?R_wzNJp82!?HML`*wJ1~LAOrOS<-*n&73 zG{!}t-jj;CDv}r)VKTc3>d>#7CPm^$N1WDT2{WV7=QB~`>hmXQ&?5W*mz|s?l>t=Y1S5gddSwVTS2N|xCd^^gWjhpa?-+4*HT)~t7Xl~}CR5Vy z@5R8XK)6);e!doE1B~oVaD`WH4nFm<6N6ZP+L-N=K`UY9J-FWKM*Fghh+MykGYu(PBb5SJE678Ca0GWYs)j9ZCwY^vEo@&(hV%kat>~aVd#d~(U z%v>5&JOxE90;v2Jz<044TXL94DZNqcj~)cp0%f6$Ch5-1+c+J{nN9;V-@}K8ujKQ& z+>ZOmj5uFwtlxEvpQ#A=tycL67$nx>nuqdS)q>Z;I`Rh9kA$yF9-?JLz82w_G%i~( z|8KBkwyiAQL*64c6)aYo%ILWOE{Uh49^~+>K1w!~@lFGlSfT63ysE18cNb+S1h{L@ zA2VL5IG=x!_HEu0FOj(R(klgC+YbC5M^i7mdh3r3)eUpEI1UZ%z@n6C%v;SKfq0ef zqdxKy>45TK_?IkjPCeA@{%0P2dUvb=q(t_#E<_68%2 zhqXno(pJ1XpNs}?{-nilWM^#jY6bn9eagmrgzm0dEg`({dY_l*=Gbt>Th$Vl$;sOv z%U^Qi`WHFw`!!{>KO^u*Svi1XFVWN7{{>U#p(WgFF&E6XQJ`%z$ zv~r4&yf8(LJ4Z{-L$D9VTKBGKnfwdrof57eZE=&dISq+Mcad;u9@aW+#FHx%_^Cs> zeKc_}CyNEzq);7i)c< z#a0rI2x{Am)4=@O7Q4KPyrdZQh;0xolSMEVV4Bgk^p+`d`lGYNz`N=2)t|(Yt}HL~ z=iT-@2@#|t4h3NA_EIEJsXcz^)((eG6xDa~5o)@2?lu5fof)A2YF=az#D*8+B?_)y z=0DX+lfSl$7b;_t(X%l%HG*0IG}Dx$WKatN1ICdtVbxcR%55m?vD_cy1IlLr&UVBe zG6^+~SA%|BA3P8A+rab~F^XegrS!gm^Tp)9gOpUxuG`6cY=Rw8m{h9^QA>56h{yo; zLP50&K6AL`$-#Be%kwI13UNMlJPBpTc?oIVi0ZfmfYhNTm0;yI&X_ee-7+y4CT&`M zMq#{|y~{4baUh(=e`(H{a90r;mylU1)Q!+eQB2qHDT^iim5XSx32m9AgFO{^NJ)&k zdqHK9VI)dSUqA%~Agx+oy>JGnlDvlj4IW1))FXwnoLx(33O-6STW(nXK&X+tr@kb<#Tf)H8@_RK(4U(R91xm?DFo*zBpEe|U9LaGtMw8o9^*LX^=1~dzc40H8G@qYfC z$};>m(*+_^OR1n2Cn8PQW_5Lg(Alo`AL5ZkJ}t)) zUC-P-9v50ZwMY1Np(GkAK4v(;u^Ywt z11J)3KS3H2K%8Q*eSY`%VPt>5{N~3P6jaOt1y$f*R@2X&HtHmqzvzSmMlPHIu2Dd+ zA1$aPa}lf+yagMrJkmr?hhy({?Z+EVR0!fkV&zMguAyTL$WyYZJpQ%>Gpxay&oQR9aT>?+^1OQpTx@CjxLyvoyrplijt84Hxpt z;;H;gOKA`UtEU;M!MfejmxQ@k?7>Nrc3$j6k-X7?TEFB{FbXO^@bkYd3^T-`FR@a{ zUI9v2e@r|zvUUbx(+)p5dcY@R-+&R*JrJ-zcFg_MB(>SP1=^?8fvNwC=`%L#`C2&O zR#(N{s`Ry@Lg8zh0N1|f;qYudHVL+5v7WigFB%Py+8UOQ3h;@>3Jm=I{gvfNKbhvhJulP_}}U!enPcXey`v+uzn!)Y0z%mn850<<-y^vUvgyeYrMy>seq$z(EldU;R7VWuh> 
zpTE0HvhVIKfpgIHJ&Q;S{4{mu5=zgYlHFq7_0#3-@x)CxKw5d#I=sBRIVLZ(QQ|N` zCwoOeZU_2&E3tXMtdYX}QJ&_bQRwtlsoPJ3cGTO{Ww%Z4yoXN_!1Zzj;Kyo_C9Y?l zAQNFYxAzCFT@R_XODn4MD{-5BkrZ=1Y#LiwxOJ_11S0q9Lk2kaJb7+0L! zT_D`p4^xTgvOG+SIzyf^tpU;k8)5sY0Z|4e2bDf_DWT-W2g#ab}b~qdt@OtFFF#k_EeirKlyIrzj;&qNEZB;_S)=XnhlAixMLwiKk zrV7JH{GYLe&UgjI^H+X|M1on>)17#`510qoxf=FGVy8cb3!F?o*ZY{9x@cfXY2%S8 z+F);VFZl@Q)_+&oVh@ajZASo3FUFXvvsdnT!hx*oOw$3zA+m34X)`{na7;Y%BG7mT zb&!iJ?fC_Eh*c5-iE-dY4upd&hoA@J;CwJ)@zZc8^H8I~-b}C?P; zPIjcSqoHH%XLJw17g#w{64bMCa+#)1Ftok@F!W4(w8tH=2aq^*Hqg0)9k{Z$(ufr? zSe|*6`*l@$>)~{hAfdC%I%~)o8=uBb-#W-1@{uGCv*9(wb9|L^`9rFWsRQz{$b_!; zl-4jG_yy>*Uqn;ax(Ej#RbX#`lzjX0|Xkstfz;v!%EAH8an_!R7v|3-$gYHffN4E> z&d_uTq-vmiEfOD@IF_$58Qi~fDt(f;9wZU=$pD{!9c2&wNE91VoVnE{HMx+=7*CWw zkBXFJ##Hvu0z6esDVO17wLd1YII72P3?1_5s7F>{?;77wxn4bHBY*^`=46MHt=A~b z*4nmOUT8%b2P=Cq8!!ot%H4-F$*{B=kK@1^KoUh7dRif-`ok9ETUuRA@J4pR_*850 zmf*hvig(f?7r?g+ip<(ib&}=_;wdQN;}=bVC6Z*T9_qHxR)v0D)yXH~bT5+s7QcJm z!sb73O|m=bvUvBdUDP)w3t4SZ!fs-b>L^V7=p3ooPXFVNo3x_@V*(J}N$!s5_u}Y( zrUX!Jsr0qIyU%3X1r1A!9Jpw{wl7Tjn3nzLDWRzq(EAbZkX)7d(HMs-`X87-G=N0N zV;k7`0gPT{U=3pl9aC60&1Y(E(KzK1&952Ri-*IC+o1dInuL+Jt~~<&kk#Q|^Sdk- z;Sn@yfjgiN+6dum3h3r-mX&~N3-umZqqBaRvHO3wJ>bOtFcV8OC|vB&^_r?GJU+FW zbcZsDz>wAFFDGDxMP#&J?RI0=Io7O(fE5Jw0%jv&}@Ja?sRza^VV#_buEEOaoy zvG{yyx25`Wf0XzQvzfbH;}5hGD?Q4qXMBg&=))r-7LfxHGhHklCrKTJR$dG)8r)2Y zDvbl97_y+q*~!WJ@2?7B(kW!YiMB9iK_!Mr%})EUatBHF6{v_96+-3u-uEdvGyo8Y z0QyOjKpNfQRl5z588{6NeKN?Q7v5hL`usgywPr~$1(`Od0MH$fVldxJtek>Fw>ws( z%q_9Sw0mfJRUT*|ftKj0*nULF{Ciy<7l3~?u_;xwk%0sZ$1ph+F|F7a83Pvry6tKoOS88Epi%sh@^N;cu!nJMq4C+FAK_Up9@qsjccC&B( zuj8!l>5quSw#LRc-8IMW;^hFW-%A+yiSFhB!HzW7(^;VQGXPL=U|C^g6;b?jIX(}97yN$W}w0C~|#k?0~c)o=$V zOS)KDb5OCw=_1gMxz04+!j+J~85A@AGu&gJ%jXRPkGqbdqcr>c_-iw2uhjT!&(8)t zsqS@r=BYB|UvRhQQ>fne&WnOF{OfZ&zqE(Q6gpYkI3AkaGcw%s4?-7bQg6v1bp2o2 zy5_UXa=fp`>Fi%NtU}cM$_3LJNOV$};@wS!C(X}P+dW%4qHF` z)U~{@ETqI?r7KDud`#I^2FiZN9=F8s`o*h%pp+FO#tTY(YD~xc#f@tn5*A+T^Ag^@ 
zQ_){K*s3XC;{!3V^St@~!L{4EO8X5n_p3jb$@AKvxAWh}ZAQdGQ;qLOu*UD;6iBad@_jPKJfF^9MS41G%T==pQmCQ1hkjT z3WGX&L5IsjRVF%tBiKP@z~%L4Xk&fRC>DUng!SH{cq;wRnJIw2>tB3m`~v%%i@&sh z`jU{clEtGGCNvP@RIcxv-h0#t#HH_%wuNjqF+UIsof+z-$7l`9hO`S8uZAmnt@9LR`hgV_ zSYf&am9Luf{>q?0m`OuNoHfEQIqtpXWX-j@YP6AW){IJ@^(h7IIopT3DlNZl>CtpB zdck6D<9~;e*Y-^CXxK?o7Q&2We~s5ZXL8IGtiTIqV>LmpVzo0c+2W8sfRt$6{1I_- zEzl*qisr(4*EMe7*lK}hwu;i4VoeaVaa6k@uTu!bOJ{>QP&|g=y(KmSNRPiQ{q z;B2M?IA#Q`(q79wNw(jQ(_RVMyGbs^%+K#UjWj%O9TQ&I z4ILOivZ!sTzAqWXQZOHMWMFCHi{qNWIfu@a)^+C;Sw#_7kt_V&+*qM^> zi@h#hxUg+`5-H&U+?^S*a-kRBXO*~WfXFG7+SD}`W&Fm6Psu%r@Fuk9+g9Gd(5ynh zYykccAzX{#Q{xSKoC3mBQ_htI%x%2K&|`3WCuL@4l8=kSuXpkWIHTS$y2xP2FgO3C zW*u%3?F;{={1_S23$4t+FoF}q76*jf43GC4Wq2>M|DEhxUv)Gy60{~GOK==QrPNVv zJjqwZ_NAM&WEup{?B_s+;M#Zr_nN#XNZHlD$wiFw4N~V}+NFb%;pT$sYGD~6!jCJ)4s)IR{!x_BlV9~j1jVHCE%D8*cF>#%Kc}M z1z7^^FPK=0sGEpvZba%qHf0-r%Hzp&LQXpD6bLJi$SZQ#)FS$d0wwWA)OZYQlU-E$ ze$+)MhEzVYJEIjPP?H&YqN$=-YIz^m_NVHC+NpOGn~JX7x9t01p|aXvFvLgia-T_~ zsr}1J;9nI^X(mNoZ!zNNu`>vMSg1`vSx5 zE9+caheSQNr3GHRSQR9vxD(LB<3p%DaMi|K!4 zVTehdwoD(KEu8Z4W=;8#nX7D_BQQaWpO^LsP4JZvR!CxoD1QdH9AP`r;7j7%6kulCV#6*H~0G!_Z z@}ku8b+vQ;nRu|SquV4#S*Q(a@la3MeNry=S=UFBJ^8#EK0R+b-zkJe^KIwFnc`Z` zCG2NPN6oXnK6&>7ee}Pjn*_-v#c1+nxkw!>JMVnM>qmcs>F}nH+F!IEte9amI(%DO zujVGqK58;;d}8l5wwdl5IpZ6VmG#DJ(wtwSqnb_KJ-o=KZ#RzD$Pt9olb;T-3ZqRPU`gZjifyjZjYl>UH=S~Tl z-&bj$=n}im`9_M`*!A96i;=cjUVqN|X$%0t7t|Kht3~Plnt(HL^mu1Fl(l#dUTjwg z9o~2Rf-1g2$Lt@d&_L#u*NmCKK5~3(Uu_PkMmO;sx1Ngc*Y- zMYxEhX^(CN<^`dm^SKk=8N(7JeE)4J&f(RciC$M`8-|uMl$Q&z8VTRPQ}72o)wWQ< z9+{o*{@#n)&twozztyHaPng;eCa9gs^{w~iq7!qJZkb7lyp9f5Ki~Ab?uK{5@B595 zO8Jim{dgr~a3{RrrY3*am$ui!mYrX<^0l6Mt2WkoU+f7lPp}cWg1O%wkWC-wXOJTi z%s*olNxjP03bJ(g)qaw-R%FNPba90%Gg#8;*q1we*5{Kek)H{*#Y!CEShb*C(t2&f ziw-{4`0bb&)_6tfwLzsvbjgnuJwv&g+xx2YN`~adP!wJ6sk?H62(0)qzMw(2=cwW= z`rsdw;CxY4Ff48$$U>ELOSR({+WT$GBQXR}<+nWts#Zw2+Mgube1Z3iZwJ=eDIsZmtMEA{A^^n{2r|(vA|BWmC$K)2;8VElS(5p_w{XcdmUCW^LE3FyOu^YIj7!c35@G z$%|K;4MtaX4lHvjl{lOKRrE{vp^`fa9q 
zzmgDJ3~D5*pBfeNHo(*^=p`E4jqh!a{XW}r?m8evlj%lU@;a{Kyu2Xyd*K8Wg4}Im z`J9ILa9#bb(Qhf#BFS%+Q6@t!fT;5w| zfoATWy&~0Wf+YT$+Zb)b07>EjB#po>-t!43DFWzG(>*KIGaSpT&nl18lV=18k!V?z z*)0#i1aI`!Dy+sJ2hA>*wL+Itl24*bV*D7vb9t=BvNxysVsPa~JTnDjv z%n1Ke?Ig(Z(XoVxXSgYd77=A>0EiUSKVvag+rk6*3#>U;Y2JgfauUfqd)}FGVRp9# zba;HtgM6p}`b8ch5=_v&C2)o;BYLcCWoQ=)jiaf6rV>{5uipj*R_Y*4GjogwD#1rE z)&rM|P;{8}n8vH&@;!is83caQg~4^f_dK`v1@v-u8|B^HNqe3F0uz*#?Yu(T{_Q}h zsnSX8fs3VQlyQ0A&0T9dTs0=K%!@PdHX~sU-sr*tW8o@j#sOw=h;SB9hOhrF}SDe6+U^wHh1o=`>Vb z93<_qm4B3{F7VnOc+{*Xi$xT@V5j=p#AWF{hXaSh+3ALs=97Mg~ z{Y=*;v~uZ>X29;`RU8NnWq7eVq~s>=4SaJHt1fV`=Eu`6Lcq2LZ}05%#wG#u4sO9O z#GBvW;Nch?0M~?vHPamNf2Sl)d z*0u0Sfj6i~aB{xZn!VR=f);2zu_1;c0ByW#3K|eM28hmRDrl^RBH!&o1IRw9mYXaZ za~NSQE=$`I#eOTxptubgJHy>7sJm@ zq%hw6ahw%)7K7aq`JSN^%-Twpv96a6+W2n&BnJPAswT7tC2s^UH?b8v3VEz%XqK%O zrWzX~1#bfj1#pilvhllfznJViaY3MN*|n<|f1zfW`XP>TS^*ABCB4<*QOmjCuAGzW zqQ0($0S}e&#IR$U#k}ze?`3Q1hOxVc$YxCZa^ zV{tjzyBwns5&xxb_@JVNunF;%^kP9Q`>hEt`q4Qxf&EBUJ8=yAe%9yP_7AG>{1G(x6ZzDMN`Q%9L5Gy~|EY zDrBYc#dZmvDSV6?rS)&^E|IL z=;3LNn-QmVR+ny(qdB>yQtgMRj4S)kW!es+BS8spS9x9CxRfqZhzNMHC|rbS777Uj z@ZnZ0E_~lYBPMoC)TG6HG5gr&A(hyA`HI8DG@qN(?cbwNhVfn0@lzWkw#KS1fI|YV zPHWVRs@Myi3QO+(sGoflP_@3oeZ-;-WLOT8fJ?H3cM2WPY}qxJkMRFf)S|lx4i2VN zarbYcZbIV?A=xGs4EEc{uOlg)qE)}ai(>#9q?s3wPbC|c??4#y1dx zrFa36ANw5#aGHeZvI}H9^lS@xeX}po7NUCWeo`z`kJa8i3bJo2V4DZX(3nR9ufzkkV zBUU^hDu$tj_7U4lv_}rP-=9M+-Tt`xZYxyU7GS=@uz|`UaAGQ}|HW2o;O{_a=k+-7 zPbwI(Vd|Rk<8SZ6TUi4Ve+vTM!>ws$48lxUej?GV{!uHZT+@0@KcexspNh@RMA|`+ zo(}zt@i&-$@n8LR%ocJTLChm|xfK~7w8yBQkcaH-v-~SvaGMGa?b~x)j_`3jkEqs0fKQcz7k~+sWl-MfkZ787MVg)SaVbMW2O+?A}n-ftmDHP zgG@@q&<0pG+%i=B9K7eEJ=#HdX2^AlNi#5nX(f|Hk3go5ilqq8VqIb&U!pTn{jH` zh~0HIJplNp9EhL+ui3?f%}J8T@+_Jvn!S)Djk-D{bXqRa zWHgKvDwqi>Ai|MkJq$k^U!p_|7jif zitJ1OwqA=9-LmB<2H+d8EI69RiQ)I4B+EyYLGlZ1s^JfOZAJSq)OD_^(Zh+CX)*{h zk^-tv4u1C3nbD`2$Sb{m#a07YCG>G41Y6aYZ{vNO0VXO{$lENVYc-Z81~T{ZE_i>_ zzRJ`%@>d-hrMJ-&3V6Hz#Pm>y)rs~3JTRs)3jJ$jf)GmNUX$pKU_=6L=?o~#j6P{8 
z@G4}^fiK@v)xND~vKirh`smsgYNj~A0*9b%)gKgwmQPQhvaga`K8@np8xrCA#IC|w zRc95ni*D`)`wJvcL);0uQi7-0~86VexIHzX5c5mah%rqbE8OX#Z1>cI^?(9yku1l)d08zLHw* zF;2!}Om!7eAFUH`==)=xQUF{8e4WP75OjIQqF7kn=DF!~g{?^vp{Wt7>rrqu`o=#g z+UTw)ycs|OF5^iv)97D+OsI}t>1-!XMUL_IQIm?7S)GFBT|>nk03ivu(zv+$Uf|SCE6Te- zEw<$MsOd&PXUkLLD*;mJfrEviO}%~Zw3T^2_ZCIABDLshExfmdfNX(yR`Rx~O}B2$ z7qegK(H0N#%&P`@MrlThhoD8+J72B?sIjDpf31(;TGG4~aDH{u#9_UTfnx)O4^|MI zKO|r4Q~R;4RlKi%zK5DryZl(yNaYeq8^=hCsQOH{1yPlygWI6N*Ih|(TfZKYEFwim zQh#jjX4&>VcjnuBcQha`rz}%<_gb^G9ak;evF%eo4$fgLy9}6UqUt6&CkX7McOVFc zQ|-pFBE6BWVfIy2n)`_-N?-w;GAwi{ih~3BHsGN_P^tRUqpz5psouy}1rXlqPmYgP zXfFpE&Pq83i*tj$hik3^eH%b`d?DV$HrDjNSmhrgJRoh`>Ob8r0TUwfiC;I?TyJ6X zzEEqSp_CMufKiHakaWpPmc8UirBbTZ_jn<=K}ec5vSKv1`G?uqTT}}(vImRncdO4C zmpS=TV=w`M%;oChSe;B*rD0%bUU*n|neEn32YelNtW*B9q~ut{-&{0eJ@9B_YoKi8 z(Tai1CO^%alDCdtOdE%LlR|534BNd)TXXjbUD-wWOxKB2%GNP}FwhF{oUI!K;jPzR zYxK=U6pR1pWtRw-_^_*oDcC%5jRGfIhcXX$6F;TO3>HoK;II7@chg2a6xVb8i*;3p%7?Etkm{EF_n*c zlq?+Y%Q`t`?r8JLdrVuL;V#df3vev{_kbEm?d!_Q%IY&cj=Y64%<==yRDkWE7*sf( z44vwJ#h6R=8P%gckQwEBiFhYiZl#p0#KP&gde%tjxh~rVkk$< zX&7d;RX&R2bBI4s_+2-?7EL`!^6v7R5-N7`PuezIHXbvZ)lmR?FD(+#cABsZOc4ip zl>Cq3m2d!$GTE81uA|=9BJp^8ssU!rY#JQHr$FVQTN29uYRc zWmJf$j&;{VUOR+Jd98LiS*G|xFa52G1 zjlFPOZXEQ*K*An9>VV~U20R%C(WEOU5g)M-in6s-;vzLFhP%uoKE)v~{PUj6pjW+r zVW_G&M0R=K-ES(2xm4HIS)qOKqOv_+ojjS?K6wzJNZWRXVk9344*T?8yhZktwo&J_ z(={~iVJ2(6A&_6PEd&lJxZCa^!4Mdw}M<4nX zxc+b9B`0~P)cD91Sy1ubdmTihiNPwiud&x}Tz`Ur8Ak}K5wn07FaH+8q?jm{pn}0i zfs|JNV`N;2#}u;^Zc$pF$_1Avnkt~8XvN7bziGr~AvQ5`0+?S8g`;J<3}XRH zaVhpx2Ni&(h+o{p7s7Q%{Fa))W{RbV0b9zaY(iFI1lFh8{Va_zjW&zG?M zD3g?^D|*~TY$2c_WWD$g^PTmJ2wHVEKn#8xSHR?EPVdd{g4re>qMqE3>3Zqq}ZqPco&!(L2Sre} zl5OQuz&$$@-f)axoU?*KqWQ+Y)@+ax4PJXbs*G`5g^S*sV@yT6-3L= zjx+Am>rG2pGHJ2pwB!tDXV9k3ijaRY5%kD}8^K1DzMxj2G-mrnS4X0BB!ZVBV^J2I z@8G4{=D9c7xLPGVCJ>xNr~K zx@nk{^X;d2Exzu=Re_fy+eheRaYo}M@iSC7<3^I%tp`mu$*q(uH zjyi*CXx92jjOC|YKx9T+xQNpvBt=N<_4z1Gc7BwAP%uFbZj~DlM3vJnNEaNa&eYf0n#mN6Bb9 
zHpZk1+ogg6AA>jFXA;A|9tDdgF-NlB5Tlz89+~byN<4rWJe&K3!vDvLdsKqc$_j!+ zpRbxulu?Q3_9N6HAAE?>It|P;8kq_0jt=@t>J%kIy;xFPcvmWe3d5!Q1{t4>%B;j~ zHeyAtlGZlfJ$}XB#fKa@%BX$9Lrr7VT5mkWdcd7MGLbOXQ+U=}Z*ffNza$5t3ZZPt zL1yo4HSvQ$*9XVnPg5-!K{)~G+wF&pQ?GR1=i7DVoo&F3#`?b|8`dy2{?L_xgbQQ8 zc*AlbgLC&`PDI4|>(T82$?J+)-741OgX|I($s>S`yH{%NR*Ihp*pD}rBQW@>jH_r1Es{`Jr3j+R0Kp%RaBUit3&0QvgmLfMuBv0xT6hY`aHBibRbBpR<9O z|NQvRXS!)alxi3`k&+GBm6g6s7LhLDkzQ1NhWXpTz(7;`tgK6hpYaCIpAcqrc~ zX`L&VK^#JUP8R}3_@U(#aeAnPW4MUMhzn(5Z`QwWZ>n$!f&)YM6m-}mb-PRHA8C#D zM&BpKgxWenz$8q1xL?zozuf&NeSh{TfF$=Yz$ZF?D+uvyqZ}}xRR;#pM@Z_=!JYMA zP(jH`w`M}&$dScIfFmSew3z|=3Hj;L5*$2AlC#BPk9o3z{)rU9fy~u^=Xf3+u3O&(;h{=D;-`*z};3**zH7z{ig0TrS1f4K@A58zG zS!*v#mIwF=Qtruk>--g{b_4U4B14%LwBs*`$e>m>ueA~X6_aWg(DJ39v{8%vU6*bW zrhA3|6~H^91)84+By5MQG8COH6>AejF|Bf$p2JjZ^$(7LuNn#;Cc@TFFDzDjzK_^r zYMp6^0+|g&7?FYk7x^%vEip1!DKHb>Yzt)_QoLs$4;DCj1k$oU1kW^d7WYf9_l>{e z-ZG*3&`>17%mm^j5by9SO0Ldk@M;0bgh&5sa!o^9(}l6JUB<{0W9ZQ9#MeLI%uzl zvr>|V>b=BScz6IApJ0(zT!&p|Cgin6NFlCs;E92q&2ki6X5tG-!1;nGK=@dgFoCPKRjbbI`xiB~uU*3$86}X8x!_((VNGsa z%v~`UE4;~>ft(o4fx5Aw=%i^)D=Tm7dhMT#aIgqu5|}l(g9@;<`-@po1*Vyh%}fAt zV-XbDvvrie-Pgb@1;F|uXQIFgxSt?i~Ln1t-I{0je1 zbJwDxE(g}7_fHBpOl#t#Z-Z}>kK&Tuy}+WG2x-b6BL+mwf7EHkcezCqs<95Ui?psub&cu=?U*LyM!LCB#(GmG@MBUpGL?1%8VlpPm$F~!~oQb zH3rE@xC0<0bjh%;Ambn*DI(;Oze>I~hHv+g1j*C!jr4!blWs4^*InEhyirF|E8suu3!>fd&O=!Q83_HZCIrQt#VF-Sm90wN(hMQh#~*$FEh_Q%23 zTok9tv)OBX=O0uK*!elCSZFi-BYepZDtNMO7eZL9ylF&c4Mc1L8bj&{%H7$SBe0@m z8!(E%1eFGQ=-~V#+Y7u7Lv9U1Jx;S*n?QhW>WNOU+b<;f~ufV8=I`PsJ$jwemy=~*tgh^%0t7S*2pHWe9Sfc+@^t(&HZd2 zo%;Y9WKfbnaV}E=+RI=00Y~MjrG{QRW5QzUY(Pv|LjMAm3a(Ts{ z8}iiV;lj5Na(%|1^z>xX9V{J|N^`gi?jD zVX~eKJYuMffg$1)5|c{eMYZYXj!wEqAgLS46AnZ9%e7V2@0K%`Y*jr&9tmgwZsX|e zgE|002Ev?uVLPD27#_~qCVKF%jgm;EJs7v`F4=E|R?`b@8XNol#YxBMP^-7X{qqm8{ji5#{+g+DF0)w{c(d1Uopfpc633Rm;*U8((`B1Y6ech4u(K-{)`hs%7#38+ z){?ZzOR%RHl{^hBUQYfnQat#StOIC$`v)i(FwG7rCy$zTW8rZ?oMhB|Wu-+?+i^-% z$oSHM)h;cFb|2SK#vn)}%#^Ty>=-g!f>}tLR<22#SRN|X?KjC>9BkBuife$Bv=}}0 
z8Ci)fT3WZA+|ch~$yDXPmes|Y(K6395Q(%KJx%$s64KI&oXU|KQci9_wv3ikW5LB~ zN0{O*0>oo>$DY)R6mD!YcG_;`&3A`rE7)fK#^3r7zgLp_$Qk- z@)I!;U~>8O>(4NpQ2Ja-Rt`;{06ezhMBPV0y@xpWbCg|<1aTOev8T+8 zk@ucD<295Y0PcaIcBw{)D6#8kML0S3hV>~94tG~SJ`|vPV`OAy*u$O%|6%(t6D}h= zV0dFuSxJtsT2r!ZMpU+Yyu%EI|A$dZnt<8p4l0cB^bY!r_4np|UO2F0l3BjR2|BAn z-S0-rG4s7s8QcF+T}LvPug69f!UOKLI_L1#lnf&mpVnEsxhK!*svEkIe3J9=ap|mW z>dl#(DiGt?`}SKUjL1weH;N#h?aOPnfw-tuf_NxPEZp!RT+v>GI4F1I1bhLNI7yF2f)`c9KKTSsq3fgV2FwIAk z*|q0)&$R~@kYJ0TqV>bX3Z^v})N>VhG65U~#X#~vA|Hd@|3OGA!s;m{YEzv)ys6*+WnfWGcOlTdH!)E?c-G|Yk&>9uHd5ykBZ_7z_!(C$B5`;|+ij3Mo+rROK#xBKz^M3$KPMv&lFm!`(o9L9!Qf#0FY@J#tlaZu1n~fnWrp8hGBVXm}pL+U84+40hz0YBz5m{&=2r-ZbVRCK^{dW)D?N$I(1| z*Av88KQPjdeD;)dB0JRe_k0_6v`LrJxQVc!m1BP`5D>1b?`Vr^;FZ*P_R+RNW9`{7 z!=gnIn&~g~O10$HAME(F`}7_w<5QdPB;Vsz@8Ry5Xhzd_e{E@9dxEj7g!=vPBVimI zhjOLeT1P&s`t3Wf7%Zr~d(=~?l#F8u)ab5m9_ACo8B(aI)o88iKj*hk`1nTA!6~26 z{LF6wgAZ;SEhhe~uAH0w zhET--qdx(luC4FSNS=RY-k`~YYGrfe$ZjSF!}KtX4FF>raY`Uh8DcMBm_0ibO6)6} zXN>>dp1a9TjCU(y7lEjAD`H|_sj z*bY_>XYuFNGBhKX)G6!$FEOF)geL0Hn2jS}4=}ZpT=m`^iJS5I*tb8QK7Z8FiZb~9 zqrG#Hr3l-JIdhYQm^~_Radok;CSE&hE-t+|eAnMzTbj9?l%)>X06?>ucQjl0 zct_^S&CK~4!`6)ua{UYRMK z0U87m0EwML*g1M{dy}Z|V5}Ylh(h*=1L)Ta0lfsQXjH;|q(uW}?0TAQq|x*SLYba` zSbgn<;zolbyJPS>8}}TYz5}^i+T)1giV+@LxRHp5(gtP~{QUBwJ>*m>2JNgln^m@k zrrP8tIUpV5s=ZZDX>CK%iYLr3e{R;fpE+hF)pb9&a3^oI%vyRc9kVtTU z1!X`C9+WTDW0xcy2YT`zXTRC?+}az=1Wi@=N`%~)Zf;#PsDM1P*j4+?TKsQcXh$pJ za>@Q+G}jQdk-zqTJK;g8DYxz~OS$#@d0+C9;cXnDx*v`l1bd%dAfPhzt>+_^X2FhDZ39gYHE}WoV>6-qgZPTNF&Pbkucj zSN!;eZc8=l(+21sAk+l;V$6!UUKbm&8?Le3Awu~O>BP(qZrswA@973V%`@(lQh^+fll3*#gGq@C`>h*Sxz zxpkOgfot*p=GCIVl*71NAa(ltW+2IM!J9MU;KF6B^_9%}$Q`~yEb-6+n8EAk2QXDe z?hV}ET>i$J;YJqnu>>@xHza(G8ogGPCdboq(6?4tdPIFjIH`;EzQ8&#xPHI6v+X zw8PzlqDCd?>H~{d){V0z_xQG)VPWg3WUM3@Se=^|0`~YYYH{w*(U(V3xf+>8rCE|F zwwg5E=YSz6=00O>k1yK;jQ@7XGJ=VgPi?lNk&}hF;h8fJ>p55dhH1oqhA1TJTastZ zsKMzfIzRXhRdd&-d%pAEPN6HHJ?Esm4wtK?f98M-;CB5u!eRCs{Fx;+2Ke*|fLL-fEvxU{`@)ox?o 
zLvS{O7@hau5eAR7Wz1YenCh$42F|cKxkp0&vhxLo?!m(SmU>{L_#rnf*^K}P`g`f* z!6<& zF!aE9;XyyXMQYKzvJ~^@F^qg?+N!#alaLSHB;jqf#j3Nvyh~5COIORk)&KQ}&Xd~b zpH%E$d+b8bPJB{~WJ_#YEHO)ErE6u{si|9&lMbQoEc?;+w6*~GeL{EJ>GW8 z5Pv;9?N!g(S&0pN)rm&l7O$o-Ae;vGaPcgI*4siIy`Xk4gm0K5h(p;+8^KAw5GHj* zvI6bsCX^b)eS(VFy59D{%vrMn-4aszR-g%7s@i0W8>BdthV4sab^%`7xneM(LG=Yl z7x2dFysBM7XGSq>%$YEg`Nz?Rk>d`IMFBSPwQ}!hD8#MMhg||Kcv!LP3K@s6?!ji1 z+P8#BSlp1sH4Lg4@OmHGk!sT%qM6EL+k6s_fJfx~0)y7 zi#lc6KHt&x`c6xES*QC3zJH(4ncaZIEeI7O0Ax7_g3pdeoh;L6$50J?`ktuWG(c#@ zDABTlmf)d~lQLQ-P3kHFbKux<<7*-D4l_&%)dxp{|@twOO2GtKnn1Am5Xy>|Kn8#a7Drr}ib#1C< z3vnvz*Z;tTcsIr-zxj3R_O#<*62Apa4v3uXo_i{M)u^LIZ8C}!Va%G z+upb5aYD@|$F}#LuDJ+0lhWcgH1N;3q$X64AfIamRtaJbOsrhRHDvA&B?-pFrF~$E zOS+F@F=&9r8L+#elJ|*U89CZHv|qwtqW)dJN_0s=$x4ozp6D(?i=o>4Ygm<4i(={$ zcpw}5h=IkDR5L;I&JNW3f(L@q1-eYw-OlX{G2w1?G39fTjw7Bz_cY24@>P{(Li2YM z^T}~?)PU^81<7j-R5?ct3j?ondA%8J+K+T?eW&s^D)xk8oTK;6&rC)5u$b@1ANGN7 zvlbU--5+)m5cXBoyRT(p{E^voy8qA6*2Y(R{q_|+V2T($h%SRHW)PJ#!6n#*iz7e- zO+V?vpiKKGa%D69Uc+VwNq%!Bx1~Wz;Dbco6Ovu)FIMmXIW}0O#Va1`;SuhB`0cO0 zp~w5T?s5xiY;4RDXehF(XjM@2eDOzTfaMkLLN!GtKKLfzh`ZD*&QwWp5y~N7#C*U8 z*e(xiiEVkAWK)mf$ulNaY+fTyHg?{>B2PeVyG}vH&zL+IejVK>=kTvIrn$(zS&R0b z5`VT9jZZmuNkI6j{}Qg9sex+EU`Ln)W>- zLE_NytNs1n@%HW7h93?+=`vllzdSG2wP;e|$;q3q-#+; zRgCp8Vdf%q9M8I`K$~SQ;JFK&29w2GE5#<+_FJZVxicwqXODz;TQzyfm}Uob7_dnmvd_BC$k;=G6aKqrz>pDe;%H~b>A1keR-#A+Nq;c=%XaBON`n->A zvc5$4Kg;oa&kUbJhq)kzP$TkA)q5_cxDxOWHo3e>Y>JVg3XO@UDf+cgGNG^&ue zKSI2HP%15w>QVcq66SEoH~mU{*we_Vlk+su{Q5>|pCN;bFY{vJB z(_68gcI-jM@~~geBf!ItH<)cfNw_{<(i>Eh9J1~&d&B*z$lrRnZ9c24tGVAZgVA@g z?dlXjjuSLG2MB`~zh=DNy<0|rhS~lj%Lz>WxC(Q&N^)GGSI+!txHKmPE>}{^Hn8F4 z5RhF?pTZ`g2*!V|qVtkOc& z0g_m@egX&>;2!p{%TaJU^k8J@T#acIQJDdQS3Sc!AHaSpWOo{M&~gIuu#izq4Ms6! 
zv>pRSaQkVzLRdE>?^x_ou;J1$Z5lw!gH$E$YpYdk?6Pp*huEuJ)h``Gg|Lnhm)N}E zIGiLbH6W1yw{a9hEi8_@%9m#PE$Bc;Lo#oBjFzcwy5#1%f0@gyU$;G{^=sQshkl^V!8MUa^I$~wIS zIH44N4ztPcg;@4Pzco@BS<(-S(nxr)GnC+ zfb-3l`oR?QZ4QGIrQWwKabe!ZjMt9_H17%5tiOEc%i|z3nY@nOU01z>&)s33Hs~;z z!7y&Qcnu66UsOZu^kQ85rnyOUA*NmQ8H<{Ul|kws?7Zz{qbPk39j;!roSh?^5E1v) zzwxz&PjDWvRYP-u^VbFuGY!pYgqOC2iLrc)CZ?$Vw9RumLeK@RmB+F+A|KgT#W=&EsA@oQrWB5+e>R`hlT% z>-4%uR4z0pz*jMF^<=wss)J&ukD(Va6CgLm7)58Gtyl!cmEMS2U)WF2+0Q>zq|b@< zFEKgP9e8rmIK40Efa^MDKxx;B)LVr$#bs$lg(dBJ3iqmUyifKO7`Iy#i9H=+&|Tc` z%-dnj!FYa*C~KB^D6ZK=jkQh9Yt`K!v9AGsZxZ`z!54?v-tfEjMsR}ao&Cz5@VWID z6z;#7WPX1@=7Bi9V5)&=u@RFDXjh7IA#D3tc>&29F+V?GGvG@sdDclzVK;>an0aXqi37s3$DynQA-^N%+L-EjAIAGFm_Id6p%ycO>M4obQLI^N0tZ@3G`*Q$RB*esjJ_s8V0 zY&hJ9t4XXADFspMbjDdMAdqVJD;_uYE6!~id|GHs&MIL?R)G6-laelbUeEHuWKNib z7HG*kv0Dl;)f>Ty7CRu~>4AC$35v{BJL*29hifgA#EK?}ibSSkw1)xnEo?V=9t~g* zE}ht`Mg%SFR#?dN z(+r(Hw44JfA^d;#tZlq1YA_A~U`XO>+J-x=dt1Z>{u>fE?aO;hPCe*W$-f2N$vene zw^!#I7u2F49fcP>t%1Pe3~+cVVzfjcGBcf&`seEUKRM*T$AN1-s@tA7AQ4Ue=(@j- zBkS?>rGN8LsCBW#*12FjcC1*v#Z>KEg1{nU@U`BxWhRe+k%J^b!Okj*Z(!%bU5R^ zg+B1H`w}MFq@rT~IQ)g#CIlmP7KOPsTbN@>PN)M;*!(bniPf#2Q$FJXI6Xicdof-e zBY(2lVJk<;pDmvjr1^eq zgDfrDGJr8y{L8APAlYuW`f$>=e{|W78CGBFqtkzNFnB1$a0znh&4rQiA;J!}&~EQY zQ*3mZ=0M=ir8Kv8b@7H7NrHL$eVO(wMk5EA)B$e4QbC)x5!a-QlwZKTpImbSE#VlW zMcql^7)*6NVFdMKo z)vAI~TMmS2ZKigLZr7gt`6#ba#a4i#q!;9u7|9gD6NEU-aT0%sRCJ4t}KtkwY znMN%~@aU|-D8_GaB^SRYA+!h!WOFx)@&&{?TRtFsTd7P179fwtGbFh&3kuNR$=-)q z3$}zH%xV1?DN=*GS`P?ZZHWpm(fKX3z&T63iy8FvxTe+DsxRa9Vp&eLAJ4h(Zp%>q zhECfKo%UW7Ml2_iBN6rS5DBE;!{c0$aVGfDn;Qzr5^0iM!~2yUT(k_btT6#soa2nUio56i zdK2@&Us*T2rK~TL{zS;EWD{xUF*syfP`qG;nRdRPT~PA4aEE-IuSK~2*T2+Ce%$o4 zM2VB9aiH?Q&GIsCD?MJ17}NH@p=(LA@e(7xw@Gd_khiVzTciVegb-?qa_-A`R7?1V z#bMFx3rDdqQ`$NU3e&}zthp8ly#26yM%=C&`vb|d8%0;iJUub;=R{)?_U%9euKKO90*A~kB&B5=PdbblZ7Ft#lxUT{My?u@98&By_<=}da<}4a1 z(x3!GJQO*!v0&YH{3@G8gph}*e$(|CtOG|xl(f!N`mqTwtQ2*mIkJ5aqRn9iKr(jN zVlcVEE)L?B=>Y>uVc&z5Qa)Ez*nY5bN05*HtEokSsmuPA%7a_vMY|VK(VY#ne}22$ 
zN>zrTl4pny(khVHAj!`LKz=X#8OHa3bdJGB^i1?0PciHraQQ$FKsMo6YFo3{!IT&){@C5s|VPbK%>Wy6scm z=WX^awI7>H%=$cOcaGVY_rJB5ZpFrOey=wUvQrKw5c_?XTxia>d%~OTALydFTikVI zg#AJ3v(8MvPA!=!49g^WYY4n0b44adylh8#d?}l99SF5a?^4W~uQYwa#izALN@hjG zi$R$>`IR(zl{!NvDQ2K$u=n0e(T-D`2Z)0xreR$gq{d+P0MR}SNLFYL!Yr^?)eGlp z&I3GdyexZX^5lZvNP58s{?X?-Ds|bF((>>e^2SO#3!* zaAbbvOTOH2`wDCmYhFqGv`zeQ2emritA&H@6BAQ7;F=be^+$8^Q17r`iKG)I0m8uMUmXy6W_|6 zlhxGJ;#wb^UfQO6@!~~J|EyVZf}gIGD>lCI2W z7r2fukZif@InV_k%iFmA;hxG+M!7=S_QX?XeSqoj-@2=}2{Kl@f3oV}bE~)zuq4~S zqIgFox8sfe#{v@@Fk|vQfdlc-Av$yDli{;O2v1+%Tejoynl$U4)PlSptHj-t6w{a9 zajU3~?-+|Jic^Yo;}j6FKi%E$B^lp7_}Sug)|;jB4ejN8UDmZ5Ck$sWM1)U#i*Fib zAM5Nd>=JQgAY{kg&yHH=|U^E#7n+Qg07z!`v z``(}*g!62OR#lP|+?@ICz30OhcVHit(&lHR5^@}m_DY?pAC&;uriwk3Wi)K0fH_Z{ zf*ZK)c91GHQch{aFhaja;s20uX;8e?W_CG1#Mz=ipg!sz!>r^#(#LElx8fmczY zsg_*5zD_fEI4UUmMtYNKE~k(5yE|#xd~Hf;zH^w6fuk=m$bS&l>U!zI0>;sst~`zZ z{d2eg8<$9`v#Qzo<>})ld>|$ISDyZM-!mNc+D1HkroiPHl;v)D0H?p{gQ~I{R@zG@ zRiF2N@xx5xl?FEW>Y}A`$(sBgYska+k`%S2d7;zDUWHN1H}fTOpk~9Fg8|4}QOEvU zIBVE|dK2*SmeBDZ#U_qi7>FNX1_Ra&P3cMNQYnbV9?GP?#F>oI?;Tf@b3vn3nK@z{ zU+KLWUaTFsO{Z`VSOX`F9}npA`pse-57BLx$7*!EWd`Hr1uM>33>Ie@8t9W5HfF%2 zqR7%1=@5E;|R zf-hZ|r!o%U#(?maI_s>lx{=QdoqSXr>+E|o%b=t59AJu^%IFBWh4lkC8-7$IH(`N| zyTH^)9^MFpWsW&b$VsP(?~lX6gremPwJblXiJ3NL<4|2r`TW<;R!>!xHKLH}ti+Ku zK{ggfaqe;AmJ~Q(&DXhdspRP(v<-NSX#aa?o>jdeKPdDa@3yUT_l4QsU3ZGzw zPcLoSFq`u0(chjud)D;HDAp}=xLzt6-+lD*v&hKinDic3oQLasZAZSSXRXZRvpAw~ z^wd~)g7SFFdf7uaGpa5|MswzFLThr1jz_d(Ei(GNtOmL;O>k?*nK~L2zoZnJf^bhA z-1OUk%%6BFYb@?UkBCE0Cj_u5E+X`I+^%y5H;$QX9W&Y13#+unn5!K5nS=&HtM<3% zif^CGc)Bl_vyYB8yIo;@%$wfp%V6lQ5$UXT7%W!(^F1vYx&rpD;D%^`o%?ZCRo1vq zo-V7t&cVgm;IS11JZ8qky5w5(!lP zD08{2qeBMP1Dky~b|Ca))8@2}kdYGWmt1t)Pt9O`G^*6HcAI{t(mwyi+w}a+g2T*U zS~oYQd$$OnI)8hBg%W}XZmc}m0(c-Hq!FDQO3n)P)UmELI{>kw1O@s@$2U{CQq9`R z(}%VY{~$1|6`t#X5O5bH>6R6qGK%IQG8+3E&U|MhHj<{G4d&tO3ZM+J8UGe8t4JZ_ z!w&GpZzB84D=I3U`r~~XerYonQXb(@=)xzxMNbR!SOi=omV@Y`)Y%Ltd_LGHZ7BMA zTVVI4KzN2TnGaB!39zdd&~4^F4MYVmFL_j@+3Qcob%a)>{%684Z~S}jLN_<@Pa7&u 
zo^yM>{|3JKgCelLTkwe9iWB~x)3YMX|ME&i%S$-N)xB9sfLA;prhNJG#b>9PZmrYQ zgxHyE){9fM`rcYgYm5^VKfjIPF&SdR^Ypt}Di=_(`8}|3<7LQ2^UUj;hUFQdY^2)! zAu2^P3V$G6HF>Hk9h$x#8fAsRDq=dE=yFRD_SwXNa-b_92WC5JvqQ#?RMU)m&n>mV*nVTW49R$jmAM zn{N3|(=!UAUK705;H2QIyN7GJZt&Y(KssPKvVz(as~?mDift}c4m;)uU^vFOGb4}X=O8Xt^9uHb2#FHyS%Xoadz zR|uSqc3IV$<-N|x7xJfwKX04YdJzFbPznRg-mWloR`>u-i%rYD10k5C8qR*Y9g_qX zfRPznf-OoU%|021^PGPuUF^H9{alu)gYJjZWf2yYo+o0RdrR-!j9uVvY2UqE?YlzF z*}&cIJNy$}+rFkf@p^CLRPn(_=JUsn_|YLl{qMX(eb@SnuMI0PLjb&poqc`Xv!Dhp zh<|w~U{-(^sU$M0s;bHb2cVujq19?ti zF{lj&e5Tnmm@`^RbG^0sW+^Q{>fM>AMLy& znWCJry{Yp8Qs73P^wrUuy}fo>eH{v}Ot|ekDYxA)>4d`Z`(M*mE#b;jR$?~Qr}wyB z9nN=R;)pOtG4?5cZR8~I34@d-e1O9dJhx#M%c=ZX3covKaM z%YuZ9?7w~*-j+3I{Pnx`raC6q@V4yc++RTAj!V z`FA~eDJ}O8t32BQF8EmbD}yT-t{(s~jC1(@adq|Zk1o;n!?)-1r*8`NN}jr`bL*_T zH2>UH%lbVImp2Lj_-ehjCw-T0uVZ>%v9;3{xfN-BwS~=<2J;mnh9;w6tN-@V4gQss zVouFw4U_?`UXFaBkQg7VF_+)wj6!hKi}c?RSb{%6gQ7hmPb0`tFRcJmC?c1Nt2>+D z(7iLtMdXM05M4uM@3U=Wx*%B$ENIA3y*M|2u8Br9+NfsBLxh@ z@`5RGJgeJh9KDHVx_T!HDU>GgbJzxn*L@WkNMqJwMNll_FKU`QA~7jzLdQQLS)40Phq2n}MD6q19fn6`E@P-qAzmTs8FUqIaTR=V zW>urcFX#R_T@-$#yvFcMg?b=w_A&-%Up|U zX+@8v#L-wTVx{z=$@XUgX6L^ztjBkD31c$twG-c6PW_hcal|^*DOzA`ZbUJwKl>LT zHdIAc_vZ0q4WH-3j2)OIC<;hm%9GL==HK+hL&?>?Cv|=F%)9F`&2go|9GBa?aMNQr zW-wb-frW~MFhY@2;>z~(qLk4YEl>|Z>c!Upy`77UPEufI#m9<`p-LM6(b+r6=RedP zNc|e)cu>^ArpJJQR8uPWHUK!!gj^yg?tY%^)TpwcXtuCFHaz-0INhHyUFle|&zYs1 znwr-w+#QCnoYi0*QgJkrc|VQ>4=2Y%K4K||>E^h2KJ2l>Wz}`tQSt9$#_30SFsH=K zQokJ{R97?PCKE4w6o1Amt}tC|m9lHCQ`O&FHLB+wm5H~X!dbP>Q{dN`9Plm=UY%Wl zCvy1xi(1!qMf&%Q$XnDqY?y^}Ya;!l<1y8eM7t1> z&P+OMJG4?9ZHFqjoOpS15uPN+LEY8MiLWLUbDN$%zT=8!AnX8r2jg8NO0p_Eu5?=s zTbuXaUcMO6I{e%Gu5)$Z>jy1f4A zGwTK~70BY0d*jR!jC4Mo*L4(NbSo&`quaI3!a!_wWunhHnlfcD*iJs!f|4nXUq zxKa6YyvD98`W%**n1RQihl}TpehoKop8wZgG^&QPSt?(^s^+t*Lf@}GG)PaU5;M30 z-O0~r1V8(n1ybF2RUEHQjWv{Bl`NaXYrJUgsw+oP;D3yD+=1Tr0abWod;?!Q{OWkv2}%9sXtBxupOrt(Xfu!t3q0=7Etrm@g+9nJfV^ZEhbR5*u>-4kcrU#;c4{I8 zZ6OuSoIp$i=b6anV{cqAyTRL_gJ*YDv0w6ZCn&5OH^1=KC1dSnn@%q+3$x8rlQ|{P 
zwMdaSaRc8a|5(*;S5LHEczNFcbzVXJQhSHcJBd;gW2yS32_0h&u8-9chB}q!F-D&1 zw(rg5oCXh0%w>1Lx<*lP`^Ue>I=O91s#?Qtfg`8|4)xX+Htt@~+9=g9suW<`=F8k; zxXLVNaet)Fgs@=aUf^$LsKRrxkE;oPb@9}u9&lstV3eJM+q?`4Ui82Y^>5t;1nBuh ziO#e4zKW`VACF@L{tn@L80T0Hmo9gdx`VFjR#t{UX~W{ZSsF1>JZg^DK1aF?OjmmjnYl_ zLAmvS+rFNP@|b+VV7UuhO?XcapC2A)>w_a-f%N;+u_`ziQ!XCYyVP;+Y`MGl{Y5jd zW{#s9*o1-T=?y-!8gGe#mGyh|@1*qEjAzbEMTTxZIs*?CN5vp%N%+;_@uR)lFm(w5 zT2k|_5lRz$SOn(Y(A|uqos)L0Cg*CqM1Qt*;Z=XNYOkHe5Y~!olA?Qp(-uKcEKBq# z&XXo!kT}3HZ}FV4fjWnH%&$W^6hv!2*g%$m`=ak8F!++W&7o|;&PG-h@`f%vl$Awt z>rCIwj|2FbLp?E?1hlCRr~EIc83x#Wi9i?whm;qA1rHo>g{9MBEOBabY^wBiiZlos zMG+eN&R|6Y>i1l3ICOVVv9$T$S-`D)s=x0kspMq*rE{2>we%cAYi?E4# zYD%Am;mKYRJu)*8$H=TmEH4K|2=_lV6RTXR6jzkCgZtXchc12vZQnEKOcXBhZGB` zfI;@~G6Yg&_)M)k;6Ia_&ck4!IT&8tqGp9)lW08Qa%LDNy88L~J=Mu_L>Ipcuv1ff z;pW3AN8zjEhO8nk*m&y$nvx4B(Tq{v#GytC|LAVt>ZQ(^HFv5@XKE#WAizjH-bm`_ zait-MSdPx;vcf3JzmVTYHR?0b5mO>X8mC}NT~DY3R`6WBy<7}c)=C)%o6kmUP=vwC zxJX=;2dB->Suj`%3A+7SaTY&OAa?EK@CTeq1qj&4`1by=8E7-%YPcvPdU!}N9pB(a zDbd+&ETQ=CDSzYUvBR5F*S$p7m-qD8J zB*cNat;0LI1<_I1@h&H=-UakMee&!~5L^bs9Cq9YE+*tU^LxcDoSGLiL-D+-=fVOA zHOP-boyjE-!I>+_Nxls|Wu92+1+#=-IcO(&gi`gTTSFvYWz@~C69`9MKJ+ju$$m0BQ;D$d}& zsOJ$T^8Je~$Bh9~oQ=GNj(yhvszP7M3!0;Yf?O({0K(9F@>-$iw>5AQeXtG`2wt8H~#F#fOP@At9#BYG3W1{sx74yzamyFddp~0tPtJ2uA<6Z2A~C6LekYTRH7L}e*1u2o zvn(}LR2}}8k~C0YRL>8g&C(nSkx+S+#huIZWTmxW587?GIpk1BOsMSpFM0a8|3cGf z8HI+pzdU$yd-Ti?yOpK;_c=^-=_upGe0wawQ=l*iCq%=zV(}XE30YGUz30$Y;7LWT zu?QYE2bsa~7BO^p0NnRjXYo=Vr&b|zyvahWJ=2#BWy`((iW3UdqMpf7_~?e;n^8<^Z?Mn56J^|;b?oH= z&jc;2YC(2OKhP!WGS7~5;~4~8bVuB$gRx^9Pb`O{hx=&Z21iVN1JY-xQ2-- zc6X5btNXJx1h1RHaF~NQ zf19ESj!a_$Y{s-5x=uH^qS3|UmI%AH12pRU^ZggJbKO9LnKNuLSPkEaY}E|h(-n0W z24nu9yZtI4BTYHpB%^;nm2T6ocka6KqEydQ@?u)?AMbYf&qNXY5CaKElq7jrn&pX! 
zETi$37f&V?tr)TMYQ*gai-vwZLYZ+wu{d*vmoeD){ca`>YsIVeJlF{5rputCIuB=d z#}~fm%wA>!E73SyHdOD>rRe<^v>5__9o#s8HR>b2zSh-QW)OAaX{*E7$_#TG`~yJW zrnTsnsDA%eU(^`~InU3V$_@+vPJ5wMwnyUz*%6sF-DPw41Gdgne>vbqy-$;U!D8iOan`kFebgWOn(RKEW75DCe+)?k4n8n}WRRm-|=2W6%?{MP$!+fL^HbBVwUQY=&q zmf1~}fnfR_C;8kGZ}ver!CppvG>1TcL>kc|I3k;W#+L>z!AHTlm|qr|V_7b|8U0mq z=+QC=v^=4(n3^(&X5NN_5mwiF>98|De!}gU$Ls9WaX|LRz0&?a)JFN1!Sw`}z*u!S zZA6m#I-veHX$EVq^Y*58R0`@((&6M0l}Xe z2HyuPEiJ$G)Ja7zm4Mu6C#sre=#$(Kh(py_YUL;-e&2JjUSHy%L4(0y6dt(gSz#oG zK4qZFVYrw+=OJbzDDYPrMJ)!t@gGn{TE}0%#pwqJ{%k^zcPD1)@&=#@o%ypIPK|gL z6BAR%?4E_fD$>Q@a>m2j-TyDp{Er6SCzs;$g`C|+pB@PhFjpXN(UP6bw_m7C3MmKi zVV}^&@ivL92$y2}6F1O6zr~bf2?l3}MdU(PY&j#R3HrG6Xn;k@O^i09x6Eo~)>Jv< zHY}47`x4#pA=OABvhww?L0*pfjl8toR=1zog?@cr5|rmxd`f=MO6$FXVTP>H#UsIe z7q3)1jJ=5eG2edV)kxbb`*7|u1f{7e~^Rvy-j8Cbt5IWm2}xa`+%Nclf6Ferc@@>RMZ@_ngwYjDbv9VvUn_>DKcMO7d5-2Vrd{TGH;C}N9Q!_`DtgKg;P?oe;_i$nc+(c33qY@a<6 z;4twp6E)jh)n{IHqA5*RHO5VQE^bjNpG|%N1b1E|J z-_CNcd+@%X#wj^A_xS;S_iJ}F1!5UuUa-et^HjI zb3(bDr^4bhk1yE$?e7$@TTl^tx_4W6IluJUdZ)lYScW-uP!My}r+1CoRw zb&OHs)r#kAjP@= z)7_f~Vwtw#!%sC$Q%$?2lD2uv7FsFUYEl!4tdXTik`RULrp8o=7E;O*S+Yi!5Sk__ zEwU3*gtBJe`Hu5`G@19E_qTn&@1O7aqiMq9e(w9a&g(pn<2a8KDUiw3_jQS9(%I>9 zp7~@RCfoP%5!my=@)_$0Y;`ECtlA$XSz08AphDj+P#Q4g>^(N z1^8&zmm5HNa2Uj z+hs^-5~V&1ysLzEN{@K|;}Kx=pI1sy@eH0$$HuKSyM=JU%Jy0-J|pb&pd3&TQr4!# zm{>sY?3#LZtB5`i|L)5GT(Y(uPpiM?cCl^j=A0M0iG4K&9>S<#h!}?V?$%Yog;bc5 zngrFBg8rZiiFCpok3sjs)!A&EJ*05_ z?R{J3{wm*q*Y$jqt@3wEA*Dl6qO7GMIVTT3eRQ|^S5XvIyJLYf%94sW>oQ$#7$3V( z0_0kl2mcb34sxPC@QFPA6CoM%eM%u`tHlucJ=23X(%JObbS8k}d_!Q4^3lusAfVde zC!$NlV@F^1m=qPx;A=Npl&p^88EyRj8vS173+NrY5w1O+Sk*skOM{uchCCQoC3YED zO=Y0+Wo)B`B_$;cM+`X~QwI5H;Pk468sH*ZhfxmJp^l&^KxzQVjhBFH5SZlibg(&o zSq;K@o#%$vzd#!X3wyUzBrBc(qBxEOTQLPObVg8N5a2*|lsJN5xKzg{Gq4*~EsnX| z8b|~1JOIGwjfC%_P(&3nibtd#hJfdoTp!SYbL@j(2j~u~*_K#5ShJMJI$v;Ez9;Ye zcqN{Fn@6_{z6)u;zdrZ+bR`aHjoy^CnbmSej*_5+2K{9 zI9L^l@S$53TufM$tQ@>9cTr~Nrnr~Uwx9ySOFE+^`P}gh1M!aaLIpVCQ91$>K9A8| 
zIe0Xx^LSc*4m%1)CMKgTosI4qwkDTh;UsP^#WUTEI$i#GV$d$TGJrjiadG2#5AjTd zN+qy`8Fr`sSR`{^bEi}gn=ptYIoaJ(Xg7=Sao5DT2lBK$$jd#VYo9CeN?dTs0rPu5 z7xZb}FdRvd`b*@NRowbw%-Ou6YfZ0oP(%6gYO|ssF4iRh4V%0WSYk@&4@y7~cpBY9xJMl~+uTA+1ga!hj<@oK8uBUGP@@NW> z#;DTQEX94(Of~!e2+O%;>sRu8x?Nr`Li&RDb~W zc=gcOtG=wh%*|NzhES$6HQMqZu?nloqoq|($-|WO2At@|lDs(aAn-eg7QTnD!A-Oo z_X_O2V+8R(3qbiDeNR9eRTN?Mq{d?9=_BL&P?K2kdyVq!tge9eCzz9Z1Ce3gt|jxb z2TS5rCON^U?XwP?#7;C6P>u2Gj$a8F*jRFl1qzn}WJiyyVBqq& zQmGU4-zMTg7jA_S*CdTp5@bPlcmtI*QNUSkU2D?ZwVW=M~4BE z64k3}9KISL{!zDg)9V`>_^8w{e^C2yVc*~jHaJjEb@beIP&#fDE~DqbY-b;4lCf2z z>y+8&(VZ$u8kl}$?AIA-srhM;@kQT-`I))i^!>aYs8L04-@HgQLTyPC1!Z*q=GyZ= z0h$J~WzN*WSiJnP5R3NKvA~wE zX&A_U=X2fn`z#=qZUZj9qKDhXK_za#%!Rj+-BMwe{b#6I?(6Y8DQFY>h9VJ;wngro znH)k-n69P$0d#^~QlNl9glsV{fw$PRa-=1}ugCqE?}r=%zdm=V!*Lyl)?|T6$}g$F zBRIASMYUV|&9PVDo#&$AXie3}2e8ggn&G%Kk;gpw-i2RE#8lzA(l)9`149awqIgjp{=%udZ_T{6ybZ*%9Y zWfnoqUfOt(CxN6blNqYWH((Mmd;*-vdr;<5!JB|>6X9DM2(12bCC+SdM%|_J5)_gC zX|wP9yD=uBhiYj2R;=7DdX61AShFbGJD`?Yjg<BZM#>p>Iv0jMd6uh$5p5}C zLjCokE{mXW`h`K_k&AjlC=jBo_XY({l1de&O#sbp2pIw9ZE;8aefINFhcx_NTsY~2 ziQRzcwBq+IXru$G#Y7u$6Cf|Eq2FRxk|u-|qu~sAIk)bqeJx-Q>9RL|ZWT8i9qt=1 z9id#0-Y85O&&Z8Jfnyv(f3nvJJJX0D{&URvj}g+9z%7mQ2=4c@vnXP70Dkg}hu*Q# z_A$~^?*-ydsN1bmg}VhNebF-F`6CGBTF2RC!^>ki{4$r`2mOL+K8GoXV=^E|%Ph+l z6VyCU$CvFfl&hdk!_-Uxl>qyW{E=3j|FLAVN-;Ki8%BL;t`cF$ewi4*7hX%J!6b*G z4*5)0;E_Yb#*u!Iod^*x72+~Bii4pCD#pD)L3~*Ib(tvQGS)sO2i)8M0*XC2Ac5uP zl02nm;I*v*9-j%PU=jKfDC@_XKq6xRw-9928c!q$!09vvYDF|TRqGWlFdaauguYq> zOWVGY@u}O(SW*(#UYpw&w=W*+t_`{`o z5BlC%O(6j6%AwH*RA!0?MYjzJGy?VxOy*FXe9(Geuq=D2c-4uYiDowmd%zcXdL<*3 zrZ9tUA*E>L8;N5}z2Mt`)r%w1k~T1XfbyaY+jjgLl2Ip&+2Dt6!XzS~7TVn10zpdw zn{IBnZirHW0Ntg|{U3Z-u9Aag&x{RwV-~TWmC*sqpH4DJE+(pKz-D4?n06b62jG?W zE1lPQg%r}**sW}uDe|q`G1*}0$1^gxad^i&wVGNILIeimipC0kn+hYp2jJ7N&LPGT zWy5z$a7SnI8y|gW^t6YE8)X1WgJ0xf60#sj>bbu%rf=vG)4|e~_St(KmiqWIrxYsk zaL#Kd4M`d164`QQA78NgIUsII`g25`FB4K3Y(;`I5>ZCpy}KKwi_1`DO)XI;Slu{z zgncGMsq7^^_=#4XA)a&rupDHV1pHRvVOc(&&i4deZ%kJH$ck^QVJu{b>%e$^WR`RB 
zrSU6&PPSrsc@3P6tJ(Vxzngy=1v!2ynmmBl{Y29AeiFfl?z|}Pjc<@OqI_nFb@4v8 zROQ*E-eHtN%G$7twqygj%5miF6khud0%sTQKZfJGJaKc^(<8l(jE2VXntROqdI)Hddd99Xs+K!Ia6k3S>JG#00bhR+6p{!eQ{L2pa7*L1Ih_uO8`9WY z;#nOR*=bJ%js~igu=rZ62k})%mEIl=y=X8&uDZnPmvJ!zDzSI8Htj8*ChdQC^botg zt8h0vtatoRkbja4G>?xWL^Yj{FTzF+$W*n>q?=~T{#mkm8{zbdC%FJc6*MFu$I|wr zhw{+tcYWE5)*1ZvN{C+w)w~myqzftlYTS6-oMDGiN6jRV!IdsUE@p|AUVzP1;KSZq zAk6VCo^2g~Rn(!Q{kbwv%`;jV;f2gt$C!uT?=s^;BQa(ZeI-KJ*BNkpgBR{Sr4+$d zhk3Mu5K(&&hfgKXYvT=B#3e2zIKINV+~j_;%BR3f564Dew!inbCLe3LLlpvTg62Nvtrm zeTBT4c4dNgNxfwBo(t7;brs{8TpJ8>@C(bC~eNcdry zpV^ZeW{DcpZ+h;ibV(*hLB(iWaH-$P~<2&M{PnH`?#U+5qJBaWdVt% zL8v?0Io%IQKSw3PoHr(EEqy*Xr1cJE8Gcz%U|QKadywX{q;AqsvXKQ z6WXPR7-H92$;R7;IHIT?q(oxyg~E1l@$ zRW`vH5QLgbgQ>e%okx(iCZP8!T+Q^6>m)j^!_=}kG9kEe%@-7U_#*Dbu(EFYInJi_ za-xp&I5HibVQa+==O;&LypwX7V*_r`4jY{g7nk#;W|d**g4-O16I`AIcjbG8$4Wej zH$VMwB~Q8WYyU;r1Mk|_S@d>47><(JIn+Clxtq81aMewVC(rqoE<;_4p*&jf_h3|t zhr&B=S>*|kXy~z>@#>)=-3u}|g>-urQmDKGVHbagGq|t{bpTGA;n1~0_P=&Ue5$`H zV6Vj#^S=1An3wwcK#UV1SqB#OmuPoA4L^B7`{$#!G?3DuIaa2!VOhO7YMKI2A4f6p zT>E)Y%ceGz%ny~1UNx?$DQiitYFX?n`9d$0WEK$Km*OsOK&R>BS@9M5>5?y;y`X{G z@-|7UNAYqV;$?6l%s8(v$VxDFY~CevQMBis02KEenfKN+$L}R^LaV?bMW{8K>JMp? 
z*i7-RBd3tPNo9(<ey*EncIg^9pRL zU)4Tn-WgGncf~AxEF<390s7?3?kxObGwc?eE**^A>X&3{l4w0IR?e*#%QY@*X!h$7}P8kn&QFOxfy7x-ynH ztDn-5dB7}L9clK|&*=4zR#=6$yGO<-(IPj3oo>inRAlgKwPI-cq0pr5VMF&mt#X*v z)9y0l&;mgE^JQ&!y?%bmC0HN#q3x!QP5n}X753iduuk=uPhn0G*DP~%qq90%Zf&_= zcQI8@=%tic^CI;g=}PN9i;E5$-rbbiP^y<97gK4JnUsF)*tOCpk$yWl8hXmA@11ZN ziP|4(|EmA>I~h&jA2((MI-r{MFuuJY=$!|X}%y0jzD7I<8BX(nO z+`t++(Xi@S^MDo>!543TXvgA$l%Iumxh*kG-Jk3M7KA0Z&*sxi8s#-O0k69W9Y%2GuC=a-qyr+MO1R+jd^J>+pLlL;MLAtUsXXb61D0ryrm!z_|J>P$E5#Phb zx?-qby3~?#nv?Im#PX@40?S#V!Lvkx;5yjJ`Mr!x|?N<)A1IZ*e{cXlhLu|HHN|&`tZ~m6dDe?(BP& z5up~1QE?sd)4d~-I>Yw{&Ck#JqIT$=X<61j)cB7dM(Hga(0RDk$RNq0Y@K~ixkPN7 ze~Wa}KeL`#==q;Mgc{yg6jzwe>a=JnpYwWE`dDMy4jZ!u5PdfhD{o}3H7Ll{_jk+x z*gUP3_v|zoFw6kf8x8x6Iw?uTsi9Yv}Bxyf5t66etkH>gW@~9eDr;H4bd1XFa%3!2Z{C%f;c#YGAK?f+wr^Chg8jFg`og!`b^vqIY%Nm48^3Cg9V z@10#IIHp`g6v?5;*hjV6KJ(0El3R2YB_AZZr7x33Ic@!Mnb!5~mUZ@p<; z+L1Q=7u(^r({qIe6kV^apDTo;mxOcQ4_9=T2b5jmc9lQ%W^>xCz$1IB?;>K~b#YUr@|wUB4?ie*EqNOKNbF*Zzs3x6ZAl zE84%W>B;}(@+^Nw7wpKWZO9T@bME!Zc7%6BxN3Ju8lA#KC=0!mwS!Of>s+x+hyPN2 zExV<8O{G_|^mwVDe`@m;4f(neEmgz8-!wyCG*m|TmROYkIp?YPoOfHp-o8@(uTd{} zBX#`(YWC#TcTD(Vag;w9&n8^C*Xrpa<2#)E0fgj|LX7|;AlTF4yZ%CMC`KlCc4uaF+}biu2eX==^qfu9J`C;c9f{BZF=O5VY2+Oa%fj7!In=*BpY`Q7 zY{a;nmHKZ(Sbu-z3`f_zYa`C}Xnor|J@|o*S!K?oA(u|O0^nSA_u_fEQfn+7v~L%h zm6%2^?0hsgJ04B4%ekSm)PDkLAoN7Qx^JIowcEn;KkJ1QZ^E>&>zXAqe+u%jjo9Pa z+bXd$9(~Ov>Q}3GN&UB4s)}l~P<73J zUN-0L*6-&y1G2-=^w+sLAO2HeVszp!jc$PM>(%p1)*5?|j2wIhn2xb&tdVuhr89oi-BK>p5{dV^YI z5Xw-6NddAFl}j}rc(|0zbRR{%7&Y54+S(zo@3TZw`&=SVn4>7ArRTMw*Ot{`+7(Z) zqWfyt*~{wACsLR@auG>UOVS+qG$^|5j!<$u&=-#WAAXj+vb)E5P=# z0rdD&#qW=lMG|M7;1<^QneU^K^1K)LH$Vq;*H9aTW9M@K+he06hSU_sZiB$Sw3Hu6 z#;?!$W7@J?D$cq{tqZ^_ptBG&b(t1wXo8i!=v8S_&u+qU} z%2s#1oHB39oT1~nUv7UsJBg*t#h$juT->FfA&#+kgfibVIxbx>_MQYeJ6mF10#F3k zYzmiA^p*I-q-+>94?6h~m^9|Oi6NjIdWnv~Fh^op>%bO)7bf$M)99@r%NM*{hY#T6 zG5u>Lgob{sh6bQeEqea|FA&iTMX{R5FTDX}E0<>W+~>XeZjEE4iwmgmzn%o%LUuM< z`pr5TWyau8a4ZEHaH`87q8ZJ@;W`v>00+Hk4?DRp)a`2bMj!}IKXa*dhG~{U7c2W}uEab@u4CbBneSoc?Es!a 
z<+#AM&&kItIt|6$w%;Y?r^KmGmvHo%K)Iw{04GTwcb;_yJ3h2tJ7B(uTHg+d5Fpfn z?&v3IZYi3PL&HuwVlA9N`Y&F4P__zzkQk=bWCH>6kyJ!FkOAWpkc|2?3IVJJI8N|m z-UUojc+rBvoV;wNh~;}5F%4c(n1&w06Fi0Waa)3EriSD)c45d!v)G@2#n}SEl_wkq z#AscCIYlEE7t=8g$ruuy8yZ>#A{j07Ntg?|0IPQk+t2DQ@Hw|Lp%1tA6ssCSWsXY& zM>2l`orEoipGeIf5oYW(*uOA%J=JH5FE}e%ysYJOC}jz+FU86J_1?0PaJ0HZU|#z; z{5d;x2~|ED`o>PfLqna&_KhSjst3`nT^$XjwI{y}rLiFPB@2qPA zQmdEja%nz)G^6c^WtDYk(J><}G>#0nnN}M#+!3#<*cT5%*H*!sRe${5Ev4?i^0*5{t30-Xx|apik&iO!CCWCEfDMCDVAp_fTM(0v!!aocZuxvzGQp0ebBgD z=A*Hj9YMb6-Jp(wnTsMN|C^r&ANljB#PaRTDRzMenq zj0BAG4PPCJ4YA7*F$vrFE@;!HaHZ3m(vJg&(;E3*YWSdI*L%R!naU|~Rs5$A^>*TP zQgtr}L%^70Z8JGMRg^ltmY}5>LLpO3+H$M;?D$zhUtUOAg8CX>qC+R;?Xl>j$@Pc8 zQZkJ)$EXvI8k^=Tx0$0=X)`oMGt2fD8$m|5>&K;%o5MtR+Bo-Ps5PUgLJoo!X>s&^ zl_TVZg6v|*X#!@cy|5Nt{JY!dQG-M1m#v1Z_cA@$;3u4N&XI11sDtrX9jrp)r36U5 zfc;%j5V^2y(HCDNsFF>t7;L)yGOE#}1$+(ZJ zx+O4t#vH>z`tKF)s`oBwe>q>+Eb|?Az8G9K_Iiq|)j#-N>?xR;7Eb9@u{v@?)PBB5 z4!*$HA=rH1Utl0|2tb0JAF#vD6X-2aB87c?fm$IT@7(83ARcq8eZ=X3 zvqlN{CVN-kKwXn|B71aK0CXdkZE20+Y5=ea-BwTjbN1FJb>5;f#z!G{-xDCQ35|^5 zpWH;xg!tuSU|hce@V9A$hIuhOl;2#wkA%939rG}#I7*3@E&)(k(z=L)NZRA0G| z*@EIOKYsu1t5x9lW>`#e^;M03Qs2k^DW7vR{n#Y!_|H1n)Y=`A5#?!f=}WQ#Xtx>SpgExo8`eC_CJ7HJ``hUlscOw>`F z22cxC)a1e>2X27DaTRc2sDr183ha=AR5UuR0Hp7PIPUB5R0z~YKry+=pg=)=yqHwO-r>7$b=8aN+^I8pLu1x~{{0n|2Hx>6FG~f@2|Ca#>5-Wqfw1O(37yHo z>)~&=38y+B7tZ{%%)OD1jY?0q*YY>)e3_Pz6a^!LzG}P5SC-wWUguMq&AD=Q#U)}j zuEpFKwbgIt+Q65&DBsIoxKua$fc7$tlTO{8YvC2*~9`%x|M zFBm(2OXIE0tw}{QHYf+eSz-YZu6IipguZj%8;}v6WH0=G1{;>>t=W2Ld-a3cAJHW4 zIXW^F!9gZ-Ie0h-E3Qh${~6R+%Qj);gLkzfN)?^n#fngWeND&C6Bk8t0iTuRDnRl& zRY4g;eO{hw@%BKKSQ}pL9Q|bjX7WPqnHB9&Ze*!lPi?EM~m zY|2}~Fl~}^vNO5>t9= ze^#&azhvu7Y#$(lJTkxnVkrzIXWlMwJd6sgB+vN>FT|PxP5{b2!if)+r>21Y3U7`~ zH4!<{N4p3$llwPTk6DQHhsHS)($_FJy1ww-E((gkDv z(@|^NlTB-=fBKM~x~Qk4$h&~`>)xg|FU+(*9-yJAXtS*6{p~)^1~UgU?$`s#0xzQt zJbG+2x+lx#q~oYC#3NF=TS~rw{ph?TWK#9Nus8zt09CeFi;6u zuLGQ@XvUZk46MBIag<0l_vz~LLLfBmzsQ($g6u%oZ!;R%y!D4`xhGy| 
zASqApmiBklkv6n16YGa~V)UJ#9%ZC|%CjdFrhi?Pk$KL=*Mu;TDr!0_*qle)T<2T+ zE)Hk*w;CM=9b$5(@Bs7)qzQvwt);yVnz)IJNfNxHqG~K9v>^L$&PR1D2AS?g;ndmz z=9H7&^7Yv%uTH$#f~~W390tHRJ1D~eE_e}5@Au&)({A22X+O*hNw@{jo#hQv2UII%tfBTa0DMVSyB>3@pMf{pgpd1!A-cowxGI%;l`3OD_@BuwpFx zlrcGyeJMT{T7J(!C-{y`d%efEC8;~t5+;3E>kO8I)D9+C75JQI5?^{V6cFDZ}z5wh+r56X(2PFUEF_V%}Q$1_`k? zmN0fmSm^ld#flYMUbz!xFkV+NO{QNjWc>6$;g{sZ+TIl@#J0$5Xsc0H)?Wa6F+H{b~rf>x?29b&BwiZ5b8 zqaga=#V`8pj%oezf|Q6p!QEnL>)l>4)8{M%&7xG9u(2`b5(QNgX657F^w4Tl6yhp_ zYY0C$2llg)wIj^p-(S|U2_b&Wuda{PJA2+QH~U^*Zu5d&IX)E@oL-vmmg~um2%p!_Ih{3RFb1xcsYE1>Y9*!)^+OAQ9K*n2YVKCdF?DQ zb%YJHR8(`B^Fzj=rfq*+4P#JK)9A8nRy0qT%oMn?v96(W+7CCLOYI(lzG&4i18Ei>V-L61G~1Xp?E879P0ju_kxl$x82Sihe2 zQ20$e>q}?@35(mbl+%J077EOTzI*&=%wyb*@!MS27XYn(fOs*JJLKmrpFn;%LUv30 ziH`h#$9m()x|MCW-4w-{Z%CAzqW?y=lX5Ub5#NI0_H91th>_-HfSdkc-#MVD62Y!8QJ{QP0U|sH#1?&UO6z(RLsE@*70tS-*0i1FE z0(g*i9Tzfpm4*TJbs)cpQ`Ub1elAL?BFi$ibi9-LO6bLgAjvj|ZCvuD6()asWk$LY z>Cg;+j}bUei06=RPw7U^lS6$ObOL`t@o>k4;aDhw-Y1`te|r_LfLuzc$SOxnej1D4 z7&x6G{^#{c=G*dUb~n9T*RO|qVB@yW!q^jJMUGnsv*!_~Z|C$cVo>yM6xKW*3h>G* zrFK98XL2YRIWSMohwtlo39IIkqc|4Eqc?RK?v?5(#%F^6j!X`G#Q&Ta8xPPOPa=g~ zaGX}hu+iCBV&?()vd^1|4*~G`Q&5~CdQzevl$Y*B2loTCE+Tau1iP)w@9=E_Er;v@ z{$apDzdm8K>_DDH-Kk;PWJ)%kkokQ@R+*c?{v+7yGaNSjj~RhzM)Vo|L`tGLW_(FC6BH6O z(qfEor;CGv1nCn_+($6@@TU~%I#MLBo!(1f;7m&L@l#IpuUj9Ip3X@BBQ^6CHv63W z$!qgUa)P1!0P@OQ>YVzvo>)0z+eK3jNuU!+J0VykBiUFYHOyKma!QLcKn)KGjGfO z!&M?2wngOZKEB1zpd%!Q=}?P>pP1x0h5-hS`F`0OB%!6&z;*&aGB+%D#12T4jWk^I$7lj+|!?y*S7 zId&E=149#$iSPV8E@3zlTDV;1TJ7#?X08czs{f!at>@Skw9fhI^~W7m<+~VA zBtC0s``?e~pJYz64cmUwcag+=1lg>MUnFS%~IgS!bJ-47o&cy#BDN{eAS&bNb)wZ3B0C#sq6mZHs#N-bwHBPOGyyBZjb zG=7|qe%r)oV*$4`SQLqY3TQh4zp6{C6Pwr!c|rneuL=3nqjUB>O}^pU@mYh%NG^|0 z;p0c0bQB<@*-!rU*Y}=q<7IhEKqhZ2L8G~r=aMb!SAij25BU~TdNG?+cFXvJW+|_x ze1S$ad3&*x`uk5ey{pg&#nWx_IDb|SR;{h$Q`RfJvT!{H(1pgjT zO&6ERNb5QEAj?gHA|v25Sj!3f{{7Fi6pO5|m0YcQpzcAXN`Vbc1YZyLZ_gH(CNorG zA1qh#Y^e>1W+aefIyGKgH}qov40}<}vHo0GZ+J5_{=u&lq^K8$I934^M%wJc=%4Rb 
zDRaM|lk_t{6E5B7bC~NvTqE`6dou?G|F{tp_Wlz}!vMnqjkN%6*U@m(0!Mofgb7d4 z(+8aWuh+8V+V4Np`b0}001f4(u)A^ls}(5H^xMrt#;^mV#`u(nSs)+Gvj!UI41}`z z2{Mui`6Fu?b#!bLre5y|oEnm3;H3D%3?>WgiX)Qe+Z#h;NlM7^Fa@@&mC#AtKne5` z)xywa;tljhfz?S^2)%2}X=lJ#H$MT7V3$wUXaDK=n_UU@SO}_pff9Sn&t(oBbR*_LJUl+na~$9?5935#DFrLlZwOW%wfk3_B!E=eX!3y zihjttA zL``A&JYJwAc{yC0)-HjZOu%qW|MQ=~gj_S6f^M^lWaM#VDbFT1XX-JNPiE99fUMYe zBy#K52m0+7vSfKv(SizNyW?=?GY+&1zn6ejFOZ2Ohx|Bwj!_jdz966kWdEek1>9<~ zh9bTJ*zx_aa!d$jXX$bG z#uHQ78;F$dR0-ffqn=23p^rv?$u|Pt(OenA!rj2BW z&Hm2f@gir!O@w%B8ZesqJjd^ClTRio7=S!%iGw}%)cOY%4t%R{BgbQT-?#dK3>XPU zA765y)xI`W!>huUq~{YZ@5^o*7*hcPfgxcngl9^3PjQK8FSH$L{Z(E1`4yQlN> z-RIu9M%#a~?a#lB)w~Ok-K`!tQ0L;WxNqCO-{yTcPxI1s z%c>)4Rmwkw@;`ll#_Q9b9}n%wiJZqfvTW6b%JUbRw}oCf`sBQX#PthP&bXhMF2lX! z?9AYh#0Q?T`w!3i64ckLAe)iCVn%l_OvRLB@>RvvQ=?`^r&~3Tu)~Rq?_?FXhssyU zE^)hF=L?;q9EvO0MXve^CcBl~=lHy0+Eo&zK59IePj(;s~gj3s3=6`4R$7y~4IU?_O_h zU1Hp`Mab51K}4UllueCUU)3bp)=r?+kVy!HnI&_eV#Z7P=GQlkT%i%Oe;(-~kXWr$ z3bsdCt4cbUkF%r7fX*Aj;V6v_rqT}v3+n62O1V9})3r|g8cJFimR9k~3}ZIkZ}D{> z_4sn*T?U$u;%5R`vaRcpRO`e7&plT^gZW__LDQ-S9r~IxhK{y9-oLN(U$)9aQW%F~ zX%nOxqJB;)WbBPhy~)TGt|`MgcemzLRkS~y8L#Luo7XnakF}}7cxFS+>sD>f9#Yf5 zwOpj>?NRU%w(0U2J;rtgBEZm?m^Dhb2ZxbOc~Tt) zw{c%SWhbJ+uA{Y`oRjP(Z~7DZt57J&(uBm(bxM1`jqWY;I#@RpG&T@4qfs{y@_%5R zJbe!Dgf!zkm+24IUGI)c{(e+LxA)V0aUI*Um$+HSl0jIBQodK^d+rZ-Gw-54Tgges z{09_%x)%FvRt#T6z=>0yi#M&etv~YmmevOct(ax51HKcSte@Y^j9GR@j}>W_wfl`iPu%Z}g6ZpBW&RtZNvb^pA8hN!jmv|yIOWTo!tLOqFJ@A#~vnW%mVnfvksa;Jn=v$ z_8OPYG*sZ!QtzAyRaUst!%=@7A{*fQd9bt3tm z)wGgksLDNIgH_KYWz3JbS`*Op@mMk*k~rR_xrQ2@A-PEtN`<FBnFLeeul($XY#6+=<)PuikrWAyvQl@TKt| z{ydk7@80ui_SmH-R0M5$1V*%4HW9uM-(`YIhIWJCkb;sy0$7hD>V; zB0UIfO7)5XTexG!3)d#)4X}V-P6~kxtNd&+skbSWD{Ohn3gx;CON%cQ9LkT|*X!5k zTOz=ld*oH>g;?#>{Hsj~N8j;veD=Isko97cU1G;Y-VN(tZ|?Y1#lUJ9_i{PI-wb9p z1!&8zk|XsS#`Kiq#ym>Df#R1cBHMbx7Xi~g`*xGn;_1AnL19(C87LR(AbO@XGI+I= z2bKXkZFy8g5P{J+JMwvh=*9ghE5+{x-m-aIunycxmG8|hCy%(b8ztC~>kN(AOB1q% zEbl4mu4`N0HzQo})sio~uaM?WCS--Y>aEPV>$LYJ?>#Y>C4B~-%Z75RdvdaGFn80+ZK6HcQwJ5#=mp!% 
z>sG=3V@~kktOegFF4XM+8DcDi6;NvS^Eo!= zp=#@%cE9E#x86-IUWqmz(MiV=^SNGn1XLyqHc_J=Y(sR4lM65}k&Kw-s^Qpr9Zib+ zRL3^LN(=0(&Tvrs(9$I1;a&m0krM9Eot_@fH5nn1exmwIVPE7mQ{+MLr(Xw+ska=7 zEq4!?AOnbUZi%43Wl$5vNkwXe&5t*q~6KMT0S>H ztINzFy0o?Hj{O?{zTJ)zhR2fHtz-j_I~wMCdT!V&DPcR0Tx1q)IPx*>2oJ_joOid5RsbPogXzEK`2iYnXhNBc^ERcshAx4(39=>-A$dtUz)xQZR??q_gHrn1B z4O@GUIrOmo9Hqj}=($>m#May_eLh9ZGikO7a1or&n{a&e&)g=8t(Y>ud%NzxK820W z@17j9`_0T^ku!RNe4mUqQ5I2bi^KoxD^j>{z>IKm(v2vQKMBLECc(B<33H)sNGs!r zK?TUtEnvz+b6NR?gkCHN4sSvw?-sne{0&`_%)oOc_gVY13kNd=9?mUaTUg^}BCNuS zoJCDyr1NSMZ1kPQ{LG#KetkLe$_y5GQ=Wie z3S>Bku*9t_*X&nsZvRZa17#=pe%5ls-vcI-nPvvw^I#U90 zr+Ildhw{Jzxtc} zab_xcHCFjLj3$3BNU5HkR{7wXi2t(t;d?!2JJ?)IJT_bQm;diK{3Zo7iPRx&#ho>; zPS2rsR%D&#)GiKp{{xO_cC}M!gLvg4CRFm-$|{yYphK1sR?~P$6hnL5x({&9Of#gC z`<2A1PZia6UCwsHQMK`ZcW0PMqj0Gm8beUZyJ|H@s#}@H$s#p+^i~&D1QEy$aXQ*_ zJ1~Rm0torW>B>b?FzsB2kW%%>Q%ekY+X37ap8)eaJ^p?LNk`r&40)N=#&q3!uc`lV z;B95c`|2^(V(d(m0vSKp-@O&=SMpuvqro10nq6S;4AGO2Q}af-n&Qwhmto6p*e5Cl z4>AVah-gpImT`+udBYFi>gQmw!HRrmE@~ld%OmVDC!{4-+czF-T-g_UN}8GjYaJ|2 z7jHPSX9@hq*ymcL!{5*N5cO@M2vTbXllGgb&nyieDQwW52^3Xc=~8s%`egEjS5ZPr&Mx!B3c%^jD9-1 zuy0K70`8emAR#$Z2AA&ucp*TX3D z&sQmwHmUj7)W>R^wO6+9ypp7({$btcg_^v^>Zt)S>8+>qOS#tHZyHsz&c2$y_? 
z)2MTd{OOfk2OECpF+Yz;{~&T|rtB-ePhnPP^fV}Gl6B(yRzSGEZ6dQcnwXMypp~p= z5MmV7 zT@GBg$#FC5y+Zh&8}rFKlf;>JiNpeFe)p8Aor$HWS& z=MaKt0?7A9^JMtXNWMnyh=bcFKA(hpzCYT1hH)401-yda@q0+nUgC$mbL z(GK8&d)BJ3ZF+J6tU>E&`WPtk`su|o0+0R`5OYs(Vc7a0mr*-_T6ccnt3c`=UDb!0 zP$cd?_6tR2$=IWicP7A?sa9g^sZU?ZE?tYK4MG&8Ts2xgX`qaIx(`fkvp2uyD8}#H z`q4zrWwa}(4tTy`6OF7ycB)+*1`ourey~ypcEY{WBi-ke(`WmuK;M%4wd5tTPTks_ zo^>kIcX!{`>akOfpEnC#HSZ&zN%!0nND{qqbbeIGS~H@Dq)oRa753ojc9n>Zg63OC zQnR8)3fl?fmkT)aqv??8l%4~FsT3oh%8Yf%AU3Lo_083H3hEd(EebWu$kQJazcUeJ zin;RT!NpZyOD)aOcZUr3Z*0HXf9h%R(VBW!$%_7`UW*nP-#UG&Z6!tq^1zDYneHnw zjCrVi@3y}`Nleq4OSTM z^>KRfkDu)-K&YX*33>tJ)Ro^3f93m5zz+y(qtgYsCsKKuL!MJJHd50#ohAE(rWMg( zjkY}64WrH3qtSWRDzpk%O;-{{3uin*ILh=pvqNKQd>Z}qz>0>hY# z19E@y=YL%W=fhzwK#e!gukvvZdBgQeNhlqIX6o==qD@IInAYbp7)|AXBOANBcviocs&}Q-0!b3RanJVkp;{O^> zrhayOKyLm7_=U)j>j||C?VEzFS<1J2+lhD#4JZhL*HljWQuSCiU9DWE)_x2^dfAJl zTr>FaM^bF-#|yw^vVS;ySMcMi3J^VY;1YyW^0x1|!m+%QtPs&ztZMX3p%Y!7>s9oO z)K`c;hu7dDV%9yui<6D(M-vn3r?&K@iR^(sDX89vdvL$1mYx1-#Qz$KCJ5?8mcEGU zRZw}q`^sdcsKjI0hjsgOD-C_P4lV-~mk?_=qVGFdi-9-ccI)93ePwAb$M+{;Y*4@3 z+A!;WAz5)E06>|-@(Ny_P@@|?~ z1wQZk*BFJ`9Yw(I1IH8Os27RGWjv`FsjjIvF`*IflaG+tOHv>fWXw9K6jmx9#^%p2 za#uOMd&Uo-W%kmnKRfNWU}>0IfgJMeRJ7q9SxXZV5Yo3Tf*5cXQgg zj6J+_+OhCj8xNAON>ub7eIo+z=N~%*%fQ&4_9f-}RSufYnDPBzH}v42ztrT9U}UHI zt<(zmhDc6j^(VGfXOK{65fF3F#Kc7W8(xtz3}m1d3ao&8n-Fv-c~=Wn_P@$TFyNXb zg6!d0Q6aUW7##WpuxqoD{8seIdd&6Z*m%_|bp7WP)uW1U;qh@PB5dR%pUeeH1|Ez= z-I4X-i0gu5^Xb?=KDU@VZvr{wRed-8sT?)KyJGGio2c=cP+ovV3tsq%<`)$f)^%b3 za{jSVefNiTP*It~lO{C}2W48m^bjVD9{RI0DG4H^$RAc}A2{&}NL?)`k2NV|Mx~^H z;^IZEvCNPb=}hn(C3c&_Jtjw^B&1$QBj!9(G_IMLa#OtPv(pW5d-wHyB7WihL0Wy-y%b1lA6cK29Wc29(&a*N(fa ze{)=h2oRuK5n+@@cYw#?8q%!C1`wURqwsM#bnIA#5I<=C79$BWc}s{Jur2FeKjo_r9fPl61l zB?W|A`(S?^TwXY;KHxpp?@gAsP8kutrwu)^A?=BZdh*ID%E(0pymW(m!`! 
z9ul{uKw~@zb=qx$NI%Ad!FIwaQD;mGji^f=RgF>XhjldZi2#lqLv3G_^iOE)x|Dr4 z|Mv4yotQK5@P{(HkC%7hI=aiLj|=MNv9OvyG-Gwr67ycbbncgJ_6)NF?9*8$)A+3a zayLjwBsP7^V7}g}dmU_oChHoOGHpS^plP&=q_7#8?UM^3Y(A>7KXQ3(J%MqEktnm) zMb(#M`IW~z$=ixAjD}I>ocT;Sr6F`Yc}4D|BwTsoBiO7pXHG_4IkT_OlDIY8VtXAg ziKa+0dU2n7Gm7lq$d}A|_Kn@8nkQbt*~H`Eb&fr%zDbc*_MXnbmU0m2uSA4}$=;^f z>p0G_M95NI?)1xT(|FZ}y#%}9ak2yA8~4U)^v(rDgkg3-_77>hvm>II(J4@daOZ9T zj)uv1iWPV_ZJXj3F#NGS z7T>_9ulKI}5*rlm*$&ge&~0b)WM-J43_nF1)u;mN6uXZ`%G2&66>Ha>B=P_zyQklB zmtZ9!(m}s&2#8Vcyu3P94lCcUHY0p7P{XIKorbfaEI%+$Ow||M&ch#Sw}1u1yKp2G z0ud+Q%Xzp_>f1Jy%+u*YAmm;lf~hfxI8=0z1-TBQ(K;|h_}o++GFvWNkeakST!ScL zA^E!%!HKTjr-?vHLn-N|i0BZQXe!8t4* zeft0hXa%DkYbNs)A@|<*u+PnZ`AD@Df=I(R66)*i5|fU`8%^W{9yCyH+g2nq{HF}kHUZDoQqnLqetor&&iiQVId`G!R%N@@i z%G#$fKG?- z9U4ZMVes_l^6=(ItTGX4!(O6Z?kKtaizD&;n%V8tV(ai`q?h;eWiTsw+&0BG9 z$$V(-ml+@zc1M&zc8yy#q#xf!h!erFawF;jp5f$v9MGGM1QW?S*1J?2vik{G-r4gB zhC81w;#(S!K}i!XWrLOG{aH|zK)&ghah{-|*lim3*4-()tUea|dsB)*j(e>!#jzme z0F5{ut>fyMw&$8?`paoxIiOX^ei0h|2?zi9jj#sPMt_Z*B8{aC0Bw$YI#`+lM4gb) zsQMRvx$(|D(KLZ^-qIo@)sDO6m~{+RHBE@`lUsZgpb6Z}tO+>Q_*=WC1IebpD;J`* zF$BjcaM_}O>*h)IxK;S3hso1GRHyS`z|C|wYjb94$ zpWry4UGz5}HNj6)OOLz?_Cv*<1G$DuF?0!Ii-?Qc=Vt|aK8K2s-%=mq5uxxS?2O#p znv%xwh*gTij$;}wahY_2GXs(Xohu+&oCmsr1MU2e>@UB|mxg;cbwo&hqov0$rfELi zF;qpnDK&)bZ9POTm-+755PpSK`N51}rC~GW{DsOz!+}V)qcHUu3^7a{qqLQk^9~;nY>7jO931%>*Ls z>&2krV=g9aDag;wWI2BD-d?fFi~Y~9;9pD&mXH0@4>8{yJbwa*^6x)hx_z8#|MxH8|7Cqwj=G$^ XeO*T({=iN8`fjpTB>;-}m)+F!$Wcb-k|ZdM(e_a?icX+G_iDAKwi@ z&_2|qzw{vJ*c1e9{k(GrIFpCrx(`0Kd7VcY>;x@nr`@JpQN!-RLu z+cbUcqARL+sO0wD$TJk|(=&?C+Hq54wsj=t(Tl0b`zW>csk@JbFD8sq6Q#2rf4H4z z`qZ?7-?~Ev-58aj?SVc;YUn&3$tp~557en_8bU&xdjlr-ZYQU5ZzIfr1T3=DqEz$ViJNFyPxg*)wR#FMmem> zZM267IVC(DPSlg%1|6U4LxgFleZRE+c_L?REjDlGyBG+?1+sLa2lvIxpt2`;e3l9FDZn*QK+{yQkrD zxjumi)XbyEAn{wb960lx-YC%(VXcJH^gf<=n753VITC+D-utXW(tL9JsUIY)9gxJy zn=9o_daCL`&CXd#^}cB9R_S`dG9b%Beyb(z=APDM$<;2O=@8`v8CRjCfA3=5p>|{W ze|P)yw+j4E^Zxk)>)24@r}2NesA9O6{>!P&ua^%uC~bcG<;bA%e?$InVgEnEK1&+x 
zo9vJ7E^#t5%3IO|YH6Z-_Pn;dliTQCza+7cuPchnUobJPj;~8yk)lbPkFbvF4%wD( z_976^(tcy4ve;Hf@(Qz76Q20^Rnup}FT+EtI725To*3fe+B4aZu76zGwP%hHCn43) zU?m2$W~^0+oRE=uA_BL%!d{FXFMXw&^p%3u5k!@`^~L9z`5wu4>TFFyg3c>hnvY5_ zb$WTFyU+$R8g+4bnHH!a_;12ApBw(9S}wuMJ}^5l3n~pgDD9Kn|JFkuxi#&ZXH;Oy zBgCqa{1)iAkN)Vtdkr0Oir&9@YGoiCAuA|{^WF)au{MJ-r+?{q^UnVr{ci>S-@F2C z#a94w3O!9g&G@3IF#xU!+-^X*Y0h9@{erPJR>9yMu(WMX#aMEJozv)VkhPxwX}l^b zS(tkh2>!HSD#1(YK+ov+v;SRkmv8CwIPvkZecL0@0r25;zMt;fJbbQ?0m7Slx)Mvi zB7gHwd%?xK2==br#U=$BAzq=_`?o%?p$D>JJuhr&3~a8}jK?TXYI$VibKcD^emc5+ zY>Ex47_IfPE5Eh0d@W)x{Y1q?5%QS&jPq5mk&M7;=8~_^#K(9<18PdxY^&xLTGRgz zByoHGXASbx5Uj&cH_89On*V{f|3OrP^xB^sydilc-W9c^ez$5DA8JJ&&rH7%P^XHj ziLvtms$(QoM*#QId)vQcsqXzxY@}(a0h43xv_~J7SzmyMQ9Q!jlJ|m7;+0VL(fip# z_cOdB_A;2Rps$}>sPR!C^cyX%Dkf7_EfD6jJtOpYIV}nyhH&Dg_4gc|WJ~2K6IeVC zJE-6Ne)IaL;gvYs{vr2Ge3Z&A%=~1nyx?FQ(;C-iVO7DG!26BfehgbbEwIQyI0%|1 zQUaUgyv`ekL8*!`JcuvywTf`Is@|`&4Rf>yimo;jll?y5vx;$ZYQMRC zkG6T0-`Bbd^my{=zIL{PDx2%Zun+!WMxHrtrY{n?@~wZew`^Lhul?DL8#hLQzxy@I zcOaOazm#c3%&$S+zjtq9`SqbUO6g|u)<$b^>&FaKA{l824KM-hntRc<^C!Z; zys4JL-$upoFXkZFo6gcN!A-DYqO4307NLXaUh>&0m^L^#sDPa~cKmonhPOtCXzlXM zwE6YM!$LdQ*=r3e(oxP``OIpgocUq&cxRGHMY>HMVt~7v=hBRMeassR|KaFWc}Yo6 z=50>PZ|UtKT32dAn7BzFMvk#n@H{zPBX&xtuj3gh$9UjFSC?!Fn8TBH5pMCypIDGJ zlRCejc&d(z{uPmOthz=BU)FTiR~-K||ImD>QI2$4pPPjWVd>hR?5HV z2)Cd@dXmn3zhmuFo-&3AQ&&?F8=XelxR36D9M!FU!o6|Z>=CAMb^-8glbr8adgtg^IEKqmgDp|b9P$HN1%q>I$c4E};@V}pdE=LXW{5X|Sq$9XEgqTE> z33e;I!SM?CV`3lwxVnB2`f3dUY@Ds@6%V}0Xhk;9Z^kDFOtIZ z9Zakg!}Fu4{EPOPX}>%W+TZr`6#8%8KnNM2dLYu%z z#n4{M1LHTh*q62$30l7~^vzEjp32pU05FPE;4FO+!K>}54reNm|E(7RGiLv+S`GcI zU^+rh0wsihdz8k*75od$kaze`8&xrwASc-)rMR?=RCWI4&d}@)NJKZ6iE?Dp%DL)Hf088gmq z2R29!MDZZU)*{5J9`M$FX`#m7$>~~pz~b3d|2;oDyc@TFIbG|wuJ#4QF)J$eEsss* z0g&Y>3Z(0Jefd=1e(aZiAt?O&&TD@1N~u(8jDl# z^K(^VDtL=k-?jvZlYT}w7US) zuvNZ^UW`iYud}_(`IB<*qYI&o5b+6ig&x0(g9`d5aX6ES#KQ>8qi52sxV!L##>@i* z5&h)XM?%x)VmJNoxbtq8JZDGdGZ4K^XUoL!3h;@ki-&$Cy@2#nkTLbH=A4xA>h%xv 
zpSC$XSorncLx2CYT7L^+0ch}rSFX3ZqCA7)S?sbGXT`6N`0aVa*!~~Z*}ByizO?I} zUGvO3E2&DH*iK!9x#zBT2Of6(i8ydz+aZzNPyczjG<))z(zTZ+IiJ4jQcS5u+@h+vjhsC2w7f>#73h}28;=p@5&mcJheh80+K&qM)VgU2Z4r0w{4h~XCqK1?pf)N` z$hX0#d{Dwv4oX|2NE9!dUB-u#!A~H#%%c9!1`6nXC&s4z*{=9APUqiC03r4o@5~dC z-%>c5uby7+;0ZTig|j7Oe@j>Tb-B4OT>qg0+wQfh10ClHZ%0!@9=`K8P(mG%Rr=M^ zK(6P0ek3{C&IYbQNWsflZ^#xX<@ec_e;uQQz46DmbtaPh_yeTguyNMHgup_uOG3}) zWfcbo!=Uw#YYqe2Ght6JZ1fZ^d;f>&4t+tXKrjSw67yk!HIFG_s3o%lthCNrb2P(X z?Yf>`IP<0VSW_CkQ)G}%ra-h-{GYMnqL7JH5U`|Bg2Kg~BZ z;-xYk9$UG&quBW|*ROUR;_0&3#AMclZW(+fmY_2;58`zW3;HN~e$x*(&A$^5B{=KC z#zztd2XCfqc=gui-`Nk`Q4m3Au~d*6IFkboWiS}{H8`As<+;*6qs+0nwm#Y z_tb;?#tFcFYr#kZ*&pL=930jEvBGsjt#UN|#cxeWeeb<~H>alZgXVK+v~NQx`re12 zRNR7A(OHx+T#QXLb68MC@wlp<{6D?9Zt#+|l;X_WY_b>ef@)Lh1q;7HS+6M^FA(Jq zb5A*|C9S@_qIrJ0!G4H)d+KCMyj14M&cbvzjiS@|T!dO?$o0tfO}w=+#v9=cMlL*7 zQCTVJ+CzD2+xEn*W|8`oPsL)qi%!k248ih6D&~%ygj@^Cd@sB|U|rq^SMlSQ5$-ze z`nGPiFa|Z_k#sEeUVhcgJ6v#ZaOIIC+cQdE>b4%#ksFRWxWr&E1U2J4uHek4MC3ah z<(YFGm9*9*Mt*W~GV*ZJ)hHz;6fQH9W~$15Op8m_O_?x_lIO8hY~pd#!Jvw^t#pGZ^%d$b#_q zq^Cq(KNdcr&MwgDIy^^@#$6t&(ZgU?{~-pFuQzEWi`$%Ul^@ULq|eQfb;rLpJ=M!9 z(TC%c7K&pKwX4kO%2vf$0!zjH+&(4TWpzVT)VWm zJ>y1G^`z@+j8MJ_jEI+Vxaczw%(!`wt)y0s*TU3$+flb`aG4REGb7{_(O*kv>xUAX zhC5Ok>FqdG+EFbjhgQNZ4&`sO1}9Z}r#peu)v}D~+Ep*u+|WBlVD$NbG0r^11q0G~ zxbnJ_X>L6wPKzHhm7@`YhGh{&U|E7DY$B-&K#}1uvRg>@ALvEAJ(KM@c#xHCY&QLGmp=s0eLO6s+boY~udv_=`cm(plOQvh(E)Osomh~ZxDdwYR(eW5W7#&#;BhU0ZMI>pl0Cq-RgI$L zsSf|1;)vp=KDM5a=f`9CPw~s0!gE|B>_hkm$XA`+Dqo+KnOq@(fAiEUyePk(9h<57 z?%Z7@pGBX=OR87z0}ZKQ>IXuyl#qU=mQz5T$aPYBS7vQ364gF1fb@8^OE6dLZ2NN% z6Ks}=Qcw5pnHGvhN?{RUoMKaxg#%U{Rtjq(_U6N5x=w@1f9vx$$vRY(OD}$}1=!Q!T2Y;;EIZ!m^EYvat%?7a`tbq(0MJEPviOI>Acv(WZym=- z{cz&ym#*1Im~WU!ZeEW%Hcm@4$#6adUo|$V0`sG(0)qY+m(tvfJJMdNo#xPf=p4xtxOSIJ?Xjdw} z961_(z1C`}Ni#v2KgT$)<-jaQwgRJffQ(J@O-RVk_w189Dp;|Dmylen<aWGYz}!o77$F9CD(kzOGJ% zMaqfQ3>_-Ia?;p4QLvnuJx!ZxL>iJCj1(4b6I)@mkT@=kzghD2PnOcOr6n2a>swX{ zdZ+wb%ye3M`&YX}GtLU>IwaQu98cStbaPzx#dxiG(H_T#<=`e@@s_q^ZuzL 
zn|!p+Vce>l7^MW}Hz19lb|&;C*f9_sC4U~OGnu1smo_UP=PJxQX2(zX6Y9|p z$dyQ|x20b8!*6Z(> zf8lDa%uSGUg+@Kcn8gO=wdS_-gvwK=mMK-E6UJ06huWek^Xa91i?0N09-*DPXyu%| zNyJs+-Lck&qn*;X%1PjLM@PurXBl<)YYu2VqSJOI2Neb@^pSa@v8@Cb_B7ABr&V8WFyHCZO5+G6k;t?N`_HTA$2Ru&{fRyLi@xG%_2hG4jb};}S5~eg`bbUV+|l204PT%G z%)uZ3kTQknI?TlmIqDZ&^1KFf3ivz2tHv%TSQLf6AL7D&C++e;D=N9e+FlpFfso~o z@0j}wa*4Tz6`&NRQLi#^FRh*$7hFl;!k`0r{2i|v68xl6%tK}L140_d_7=8R1Ctu|M{~GzCSv40j2y{-AM;djCqz*eil+mcpOJjB{A_R z z(jt_eO-;zwdS91WAT9B+?Utf^MhzEzPL@Htp`Gn4Pjfc)An_GDiUav^3aEij)pdeG zuRB}9VOPEdX1g$C*D{N+2sS8fI31;}Df#f3<%Nbu6csUwnn|EGZAtCRTj_W#Ty5K(5uu|~Bp;1* zPwl>Qgzg>f+#(@kuGe?GTQ=J3RV7xj-Fl%AA*AnbRe3$FvvLn1D2Nlr3q3t~i#m2Z=(4EL)G@&+otY`bEWBJ* zKS9g!s#W*A^utn2juv1~$h)|r;i0q=cIf$J7s@j3-QvT8?0LKilqkf&>F*P`{CxsX z%F4td6^pV*j(QjRRt-(c0T2w?dEdwD3%Fi8_3jlyM|)=O^KkN&8TI6JN8gKN?h zzU7BQU{6d=?k>*c1WQgQI85n+^oq5Rh*eG1$j+f|G-^L;W)?xrm!k8MTvdh6vx$qM z`Ee(sXOf+d6GSyiXcuX`g8IHtfp2` z#+$@fJkVoTTbSu@tRm;5+%U3^sZwX|lIY@D~ER%D{RfWDJytN=dFOrOoTyVgtrN~T2d0PD^= zqwg{e$fy(&^w$FMbvM+^6ZztSOGU!?rv~phAZy{XMFD}qW3da~O)2glHq1}Mkw7Q- zmgGcj)&XJ788H&G_L3)B>cYB%u5xf6V$^+b_VCy4)g0JG)@4!wB`Ws&9!R`2SPXv{ z6>+?zR3Fxd%?&R7$)Y1PZ`1V;_328XOdxc$(%VukuM^7vqlfb|()IfFuZh!TQT@@B zBE${VeqiE8~*J?jylP@fqrqKT9H={hN~g9_)IkuWHIU7+R&vjo6z8L6bWTyK&Q#ZF>PVPmIqrQ z*J(0B4I!6bPqq?Rq$6rko|lR*yq5t6&sNEJ)ooZ`*@zJy9(f1fyont~gB^NQEIUMFYAM^)a z4z*I+{RB$AH78rXO1$kCDS&%|H+AG~xF4vlIvg6%Ny1H^LV*y#A9#32$lk=s z;q{q@YNOR*d_$+G`mmQA;HDSy__Dist_hUI?RM)U|NWC?O&yAYgXB@hr>)*F3gnAZ zTu^^PN3ASlC?g03#AuFGnK+&kE37^<@HT@+ov4}ucXqU>rB zI@wailK6e7fR1nHd!V|Jz%%$&kbAv_+m1?pOfhsahtDy_SvnjnxkQd^^p>`Om}h!2 z3*$Sy+%FAJ3mOY{UmZ_dm$qcQ~L&fsU{y|8-zB2|GxUTiAGnwLCP-v1GL{i-R@f8lVU;yi`zL<8R%>I-G0zw2o;U;N{co^2FhN~X1m7d?byomH3*R9Mv}Y!% zYFk6KWd4ZWV^RHl8-JaFJ~P*}xBga*&yn z#@|Ft>vP0BjPN57UkWQ-NP%8gev?(H*z7yl4bdgW_dfpi4QgPX#bXY!ii57$sox3dPeGJr z&X{sSb&eG>cqXcpx5VfCXvo-BT(s84u!mj4s3%){Fqr$ z_9=3Xu0C)BY;Likzgsr-cWN=3Uk+#n9eQURP82_8AgWyL8s^Cc*)}zxYV`ZY;!`M2 zq{;S^Q%)~!8kN$YdID!FXRPz>ivP01_VbVsnC 
z@UX6^;90F8bNiBGKzjRd+;bDgG*Yau?;wvt&1`FL@ikJ)Zp{zMrQP-k6Q#Y$k=zAV zwK8R7Pp=#IdYZBg$BS$(SP3PMozm^udMGW5(t(;%_q<+j3RHEs_6Zv;;1sqABj2`t zPJT8M^4GeH*tW32A9eO4latM3r1ARzPO2m%oN>~7e_&+-)NLf%8L@z~uC3bz*nuC% zhhK_8Y=z`WS||_@eO^xv!Q5Sbv!)) zA`_1Q@z7YQwa8^jdSnpgPOe$m*B3)#!K3rtS7S%F3!Lt3QZ7B~+^J|;H`6Fqqg**9 zAF}Gd*QeK2Xy$=CdH&MwPMJXuk(3+*{)AZ*iUMo!i%cj1=jY6 z-%8;Sk{?&}&+kxO`IjMsnpkn0ww`;apq*>(L5b22}Dv{lHy;!Artm-g`}*kpjp$V z-~;WgKYd;K{B?zBTbQ38KROliE;V}Iw06b!_OO#6r+FLpYBP^8D-~4IUW4-CE?wwA zRI{}Io4e7dkYv01)R+D!pG%(Z2X3&mAtV#MA@3je_PW<oSSVYi7X${4Pd&*Syue$b92~qglDl?POYrIx*@-nVIoh&3m&_c} zC#cg@@M;rx(=;-&-mPNvQP8-V2CqVY2*cFBf$PX2&gT!zgXhdWnof32C2kKV&?_}1nQ{*}D#)x(`+7QlpjDPJ{^cymdhgoyx2-lzcYQtx9(zr7cl)g?XeW9yjL-79#2^|co(wX>rt!ZM|KPw zXQk~Fu;M8XP$9}Lt|^!aDRmXGrKGKw`PoCXQui9+-qwVbB3)tK|DPNV-SMA+3i`@4Bb;)XNmSC%m zrlovAxZ+GZBjVz~gSGP~qc+jonSp1BY1GW{4iF*;c}SVY3Ol(WpiqgS8pQZG87T%* zm+6RFSJhM*>5}ryW-#InBSGAgdIACAYfR^qjjNW4uDx4>9{+n|%!|~>!p@Gd-+*e6 z*1Jx;G_A8XX3s_6%}m~#8}TS*lRl&Zp4XWmqT#8h0qnVVL^{gy(YsiNn435wp4xOu zv+L8XHR(?^K|XgY);qP7bu|DOs#DC0w0c`@?IJgW_KiVZ)HoDs1o0Lutnem%9IN>F zo$tvNmva)jYlF*Z;Wpz)w%tqf)wrREI|EKuX4Mvt>phA z97++o5*vW^=o?%#P&DLPEtoe$@At~`dfVVr_^qfKr2PA}?jc7IM{_~COwRmy6M}|} ztl$(5vtGeyFT?ez!<)t+v;#*NueXqNN zVS?M=F)XX?5)lX|M%l$80J=_!rJzt$nYaXtr1dCF>e0KAg+6E&7#Zr5Zivu8?HBoT zCu_*Mj4uy197-ru==4Xh}lQ#lkcA4w>rR zb@p?31xuu~5AKlZR=LHf#l9OKOy5TZWrn~PjI*QbZa5ru)MHuYw4Rtuz|xhE`Z3{i z%4;w+Rm7X~N5nF<57ebe<0x>ml(_gqj-XE#(zFN#H=C&{FQ+5IXCAb$<>>MJVr*rvse1@5)A@L5au@wN_VNWZ%-G`0djL zV2+cnwGvWRIZaDOvZK`Tyel5=KK=epM+CM(sn*WepqCZWGua5D=1e{M`jcE{uZ@bnC`%-;^bL121bZEW{YeHB9gPM(F?Uk?#@%EV*4f>+6y z+%aPCXp0sw!lAkzt^h_Tp2AO|VvZbSXU!m0<|hiTzuG0%%pYt1$TmXmG^lLUZNYWb zEQD9hrHc`27*kk1{B|N6boBjeM6VHD%vrqh0!#v_+P{-QnrJvb|B4r+%CJ0LaE~2< z!?lug5mxHyaTCi3hwq`ex;XU5oz9Ji=$ZuOQQ}opvfkU-NZ0E4xQ_OO=|kMxpwjl2 zC`&n)$uAor{+@?FfVDOG=+UE>+}+(ldbMY6em=c2kfJZhTDFO**$5GBcBt-XM}i$4 zQF+zGsdZ<60tF3Xs&?n#ms)T2c|0#0sq8ArOYbfTD?IU#6-yO%5@+;`O^9Ka1DZGd zpm5W>5rsY$&H8}6d+o`2;u3h-C^`r|Up)PrqA>n8%l!wf?K|54xOeN;t)tVtCt1kh 
z)*PKqb3mF!HH+2C2Tr~Ww4Be5B_#&e?};FWACvN&dry3~G|#1XI@-C_?SUswE^-f~ zVpb7&(Ei9Xy(uZamQ$0cCmD+}?gequndTPQlL=EVE&F`~(lRGTIa zL0nGbF@emeW~qmQFTeRKw4^i#?pJqrJ0-sta(Tj!>J*GWz$cN|T+#(y~sIaQrcO3X-R{Z`Buv~Z5`FkK9mRYqhQiR!Slp^jEKFj9)qY48UZHkS{d zZmk!FeB66bz}w-~w$(sxDD|*Wu1Wk*go1RJRV~6D!{C~I6uGS5((TnMr@+hj$S9)s z2bPN4xB|p?1iW3<)9EX=Y#M?l0m=S&t$JMc5ciGN65siSJ0HYS;*#H8{jU+n6|VLV zvvAsqz_5g6fAMmSNUhI7@hJd-91l``p7-}BJ}YP?>!SJ--i?2y!NPsn>oHQ5_ful> zDp>9_udehSq6=4yzT%B3{W3IumnXpEb0lU4%Z;TxJ7`ttLM0rVR~~X2$<8V0o+_)g zkoT&bEH(2b&qh@GI`asjM3>&&)|mtX!W}ST(@gL2A{v@4ohA0ut_liQG9r83TzMcz zsW3GY1@U#29V#>Q4j9q3refA(pJ0lk_cJ=Wx?)ws^0O5x2SRk*bx%Xq&LXwbHdjwv z?zKYrUxB;#Q7z+vN0zVQX%AOQ=9bf_Rs-GPL6B|VO@tS$m{m_6L`CGet#(s1*ZVee z?%(O`%2LwdW_;DW@v)TN4dK2V`9tHRrg!FB#A=A*6qS^a#Cw7~3Agv8Sa>a@iv`>8 z_DYsbfm{+|A2eLmwF zl5M@cmH8uzX|6LAyp#_xb@1nwpyST+`zH>Tn;Zs80aH?gffIuNRck-$9H{PKp$m7Tdp~;IXpWkI!Tk z7@R)L6|^T>Jv|?V70^nM^P+~Jj5w1V&VW{1j>uZ9P{Cu+>ELDo^ z-H|6Zl6o7|`Ai2fdXdi+80lin0W!KvY&MNIxL9=*f~?I0?B{*ZuvKULxVmKe;&n(~ z8${Sp_JX(?02vHK6^qT#waMJDCf5UxwK3|-J?eHkhnXdBfvoNL4)2~Cm|07ULFk{* z=EzjWoSuqb;B*_hh$95(v&Ae^661O+Cc=_vsRYrlY#e0e5PxS)>Mj(lk zJQ9uoKd&H#a!`WPuJ20;xY@HcYM0O0t|s=VEW@6UPqJxiXG#Fol3b>NEwqmK9F-a8*A){PEdaUImpu66g4J_ROaV1f{)|TXI zSM9TdIIv1S+KOL49?9G`5|plYs!|ahxjZt^Z{!{II5}{6PgsO_r{*bn5E|bBlg<0a zz?%npAXZdjrEsF|qb~+7mbva@2}(Za_fdVcXD9}iL)AWOG}L`#xn3e7I&6@5(WCxY z#kAp5Yb!an843J=Zjr8icPg_=9#youLVW9N6?n0wdm6Kf1cGgCUh%nN;1KtdC-fSI zma3W~yBfcoC;*MEm)fmBrCalV*gp8$!mFqv4BUvOX7DdoB4YV*G7mKPp^rf+nx2@J~~S9DO@3szFa)Cj_-{Tf;Yj#<1mN_LO#ub|wZ)XYbGD}*^PeYD^eOW=R z3LiySKIHQ!Nc?0&=Yr6>Kcyn_L*%U!mj*e+@F5KD=*conK#Xt}Q)v>^ehpU+Ft1GE z98k)Oc#cx>KcJE%V3~k|=OXm!R$E~0+>&rN-tXMrc3y3N&$N?xZ{O<^5F$l$rWC=e zm0pz@(*OtYUVMKP@O-);YCzfp*Ad8Bn|%Rl$;f2#g;R$l5)x3o{5+{dQH#||gu zNWLo{uNz6dC3*Aa{oVItz1g>Q7pq)qxBFvPw}OU)PDRcXFYTnO8CtvD0t2^}LVmoM{-ch!V-Ag0-hrOB1iUVVe04jHQ#pRhS12V2+7n7G&v_oeV1 z*|T{;dF}}Z9sG_*Raw=r)@1GK%519=qnkL9rK40$Q>N!8ugqm?pXG-%U*8#{jE#-m z*lus1ZFQ7qpnK$VGTkWFsP_)9Y*`&SVsJ-c4?*T+@KNLCFR#WvjWv>Zh0!OWi%WfD 
zH^-bvoEHh)hSg4f%g!VYvFq!}8S4Qhr(~~Lk#G)3Tx<659_3T(#MMbY`BArH6b~x= zlBx&yA@xaR7{=;*ujJWWf7*VkWnJ_7M28Yh_q zUzookQ9RI*T17*Of(2tW>t}+KU%xGll{x|GIy#s=*00pMQ9>W{fSIuoGcbZDd#A^; zbKD;L?1ii!1P}vPa*@jozCPgXWU#Bjg|a^O_F&$iHLdyALV@>NUBF$Fb(r80TIkXyg6u#Su-`HJZ&LoshE zbHrp$evxzk@W~fQp-&&Wi|KT0`rx$5k|QMiDS1qnG<4jmZQ=8d#sn)f=0y9OH4Osf z=>_RVU}B~B@pE8^McMW~D1=loQo$-c?9ja#Ck+HR>WHi(Xu!d3mczz|w zhcGS~TP~)DEM;PX>T$+3GwIB)ysIb{gCx&A!zv7JyM42nq(mnxP40x68*NBLgM=Kb zEaewZ}qrEByMReC&yAR*k{z14JMt7xOJpVfjk~H7RfgoAR-dGX**54HfFh7$T(=S1#@0q#H0T z(U;KmanAjvJ8_W7xhe2VL|aps6AtuyIzgYOa5!9^?Heo0zRSTG=OliEI%e}o6=S-# z)Z~8a>-B;L?#HDnp9FXetcMNi?)&yA(x_&?@j}*eaRt_2N;d3SbKRzlg_7KwA}TK` zV-~uJIoI6W^7n{-BP0F<%90HBd>}Q-AL6R;=b&kN$TTPBD<|ef87QBQ z%vd9TrZnv@#wABDimiS|uc6Z`7pY?2t#ZVa z%kog%Ka89vBX+~uv%|tru7{5QJAEG`jt58mJ<)_O--a|&c803XbdKeYy}d^J2Pc)o z%*iwwZ!({K(E!q7pqb<)7d8#C=ZIpE-AN??%3% zuIPJc1W3sfq5SfSJTqj^#DvQLvv1(VCvuqO+)d@L_lVwL-<-Qdr*8uQX!5s0PUQYF z-A?0y%iNZeDW7>lqk%524Ls_lW+=Efa+*tu5w&9gS$b9C!Wsoe5f4cBfGoPK$>NP> zMr@;!6p8HpdF|@n4M7Avtu|sYs_=T zQkfjV9zH&9)PEWxD%~aW*`XZr)!D>ebJrwLa|>E%--am$hfhH8^67X%?F%3U@}5(t zbmXct6^G|`rEjs0HZiV#5paGUKV}RJl@yuR!qprIU)?JoBP-*ij&GlnQ=Zpk_vkS@ zzu2;wtgLvJKk%Y@V0_Vy|KMzMrn5w|m9O+w=l~ZGH0zK|g4a~onqGfPpfeU0r<~U= z2&n(4uFmOCk9Bh%?~L?mM@5c~*RC|~X|4X&DCW|Y-woa+TI|EDZsqp-#CA|~SKV%( z%P(=~23uYC5C}#E?w8-pKEKFG4@#}5?J-gc`TF-3D6%boYIoEg zpEs{xmz(s7R(V@7XQ_MeYnkC{;Jx+N#0q65adpgK(t|anvC~|R_+4Rq__12nhJjz| z*#oIkR^)1We^;~B3Ysr0uzxNiWkkz;>E5o4t=5OnmMyh?`DSH2eL|u#Pd@kYF|U@| zV{V4XYOoNYC==tpzEr>IDv9FAg8jLoS=1OHw_&u$fX-u;+*W7p7cd3@2}=zNE=aBm zv~x%D9`+hPv!RS>chwwM@oqb#ZQ(naEG4LLYotIaC z?B2SD88pdV`ea+T`)e|(lG~lft0mSrAv<{j5TR22u+?e@l?JL2NK?l8DXlNyhAggSjRv@oi!e2-jJ zs#$81nb`wOP*)D}c?~9|V@*pG9#_mi=TDqs86Tm>WV(o&!Cf6-l0d6N_RMk?t`Q8X zoc=r!^35jr%Pq4?5Qpupm`|(z`T$w+X#z(Jncs$GD zs!R*gI4Ah|#KIotB~Uiq6^C=rugUAV!qi36{zKbhU9?goUmkEC&5Y|dHWD<5%nEV% z&ST3-l51>Lr+&Rrk*1xcEH(Q38Y!!nHl#pti-Hfi_knCb3s4qjx3sjpbnWG2fVb#K z#mV#G+DPWEJ*~iAIH1S#dIQJwi6b-3O?mFSuwLH=@9HRQ$(2?!a(}L!xAH82_D^@t 
z@k4YMu-&!K_h(O8goi@V1=_x{>orf7I>$()%GfgpppK&ArRBJIEz-hR+}JCLVt1Fz zGhR0H4WvHC@pWx+6$NN3ge|0hNQA+LL;xh|mhUt9I0?Xa6B0<%WY>%)tPrznRvERK z9olB)#)GxYvehM>rW&>4L;@igZYdR?pI2?{1wcgzbP-uIdZscIZbv)`@_>3V*iYuxog#HhX zd3;%TV*6U^H8gsZ;=kr+7~L15%F(~w%UoOCADrp8ZNGwx{#|YvH!WevI;y+7n@Xj! zei5UJi2Qm_(0k&Ans0|znfGXVtoPDrWW2}ukG^FCA-yGhb=SUDlQ6tI$jqg0Tf7vq z6SB8*MSTsI%>KX*Wa!(%4*Ge)@OVbdm@8=?srh&m_m;vX#k1s#xa@O-y$?v_e{k-( z{kj(BOzZjh#+~b&_&TwuX$Utqb_!GqeB_!vyuZ~L1eZY*ml!&dwMIG6(5hc=i<$Qe zS$=g_SQ-&rFviT+Z>QPY+b_MeVl0BdEHPk}vFdko7h7B^#;m$JfF6hI8;HaC6DAJk zG~{$m1`9)>SBI0wW+bF&d+BPjt51(Wx!g#-B+F_)0{Ix}+`iqO)5oO(J*A7M*Yn#Z ze?<-qeceZ@W~stL&isO@-XU@_U3QJ4gD(e4)`CvpTcOMeVCGkri33)oMH(hIwsv{^ zNMbO6F#dV~rI>T&=;tJ{9Ooo6X5}Dxg>+#L0W3k+HM9Bm5-E%bwWvdoHTv5$=_N^* z6mh^yCwpc3n#O$17?odVM_maa!hZ!G)5+<`76M&-Og-eLHh+-hE~dVoOIaWdE%*fu zpE(Z{Q95ocdk2O%7pa}Bx0ZGa-83r<$F6(}3y*qQ50LNRCf}-TknFtHkHcyQXlAB_ zTxsIPxab759~4OZPv|b7ppCdqn7Z?meD-D1C&zKw@)Z>!H*g8rvci8~ghENw1`xG~ zH5iZ}w}KL|x+@EUtSH7E#)X{TCW3P2lIJZ+xwVCgY5jwgws6*W{*WgiFidQo7#7H* z-*V>U{F;i2JNqM!F@dF5(Fy|QbF9ory97@Rl!c579Z6&^(~#{10^!+Hw!*X8c_Ghw z%$c!c%iF2;IYL}j_dI9goLAK6sn=+KUIn4rk|u-7<57Yu4L~5WvB$|>Y=xy3x=a_e z{rspzF~*WxpJ_V(4!|oGZG|yOXNWjw0ak!@V8VaZQw8mU9y+w12qf(L=hci7c58oAW2O9nuW`5w*}S=ySCoPY69vYrPnL+b)F~|| zkTgV8KaqbLpo?Dp2j?Jj@>+Su(w$wDE!K+_B}-Xj2|2Dns@*U5=+8GwbSYLY^^BDV zgLE#7GMS{afHJ`~$_T!zk^{l(&g)+jvXkI?u80~px?Kmn+gkrF(fKWKBbJv2pQIpJ z&l<(Cyx7_~joR+J$Yl=jmT)Iu7P=vB%;B^9%}eGf6U5ICuhZnFoDUf zsp+xge))R+eITQ>#g#Z?mWFHx^m^*e<5BEu>vwXG!&XS#rnk&})vBYl?!$?*tO-Dn zq9jOl=wO-Ldr9~S`177S#wT7;F$|e@IbMGOOjH&z;zjgHWJ`Pu$&1t}Cgt6+Y?OMlP zMzIG0tZaD=3P&aq7N@MJl^~}NTHH<+3u=CN^MIY~S;samtF6F}^`iH2lz7qdZ{;L) zfR`nJ!K`adQeaR7k-bgI3t1s^YPGe%c>B{VzucsGf3&%~K&C0@q(v%wzBwEdI4kbZVf3^>agcemug&4n&)#qCgCP5kuA##?ye(yZ8Q-1GTy2WtOPXBRKX ze;rvg-?K7SpiC-QU1GRZkLNS?=TD{htOdk&ge*8D2#+uJ?dz z>I~b)qgJT3NUbd>v(|yF3=tU;AhlMhf}lbMBP5D~Y%qisASfzT1X@w{Z~>A)5@qiw z3M#9LhnX-$Km#NohCm?U|C})Te((4H&hJ+nPEO8w&NJ?D-Pe6TnJ`+=Q!DulJQ~!i 
z5Ja=!IWKK+sttoOJR9UByvZx|#2Z$zIiuH9!GN94GcIp)#`>iC{(!*z9eb_U#Qj@Q zFb12Df_(hcYo{`rntEB5WKR?#JrWv&(ZSZR`)g6oNTa(2ADJhpHGem?b!^!Z^uKRjyejR$_A$O!UsuR`g}|8BctNC^)WoKq^e2HDao0LgIr-Z~ktZ2cq?WFaTcy(5S z-b;-MvMBHDOi%6Z`~lc@`1Iv>PhCZhSjU~okRkB&eF}LXNZVtKA z#G7&~C5eFdBa{gZ?f_x|N!LTGHaJhW#BKrUi%B`g=w!4e%{S$u5(0CLP}qktʑ zoO2!O+>{_#HsM(N)$u-@A&w#ZA_Yza&4t-)iy*%y4}O)XPr|4gdu@r%qz*ES#;NS_ z+7A`{CW6ImX_aq($UGxXu_fYtYXG&#SQ-`_Iry@B#H}Z&^BUc^WPpk&TyYDAVq3$84aYhrxQ+d#k`4tC z_w&7O8q;EEjDh^{_u*i^RhDgY_~*a$CMtRV?ua*gFIA>OR=yknK5zpjT2s5bD^lHk zEyj}UY!1$4)zq}M3H!7JUP=hunVISY2W;j^yx~26Ze{S)|Elol@+ zRM1F-U_QN1R9lz&n@SYgH8;!}1S!JF?Q2OSh^|t--JjFXT{vG-(}UrfpY;tYUxNVp zyqOzc7N2Tf{zlY=VCC7OfumN~pz|>#2!NdO4b2n%w{N8M&-W#e*=y!)hwStyiWVWn z=wz`Ru*L2h`8ZbT)qoNZsKwF8=bK)DsEJqDrKEqV_>Kgh*}O&|`U!$ak!7y(g>4m6 z4ZV4?s=2tX>(d!dGcU~1s>a?N!yRZh_02 zudIfpEp?$y4=tb@(+{=G^CdI$uX~3XedAc~ptMst3)1`3qJ zzyn%9`$jmHH948FYIy`S2WuMiOP2Ket(8JOi<~?aZCc=DaWoJfH1EuyC_}Iw>u5HG zSp;(clb3SijT0(HIx&iT$^jtP6wB)r=ziz8}j-3j)1>`&O$}wobrDH4vX^4z81}AJ&Iu zu#}Awb<1rGpx!k-_Zk4JICW{cSn3gjJHH749Yy!VAOEUor}?RCWS@>Wa|tAO8l$oH zi~B}MdqJtMs80?A35IMlwDB~do%qj`&P0U6T7*~Uc*bR|5*dJ)A>*B(b5nD_}%fE%Olh?!Rf=KEBg9t!d zO($t@MtDJbo5})O)%Xd1CpBvRPUVgBp7CbPvnavQ{l*K)mMNQwgbZv1!AdkAH%7Oj zYw5lbt~pv!C4OFf7?>0;CO|(MV?{6)!2Y>BKaADG>a8#r1j|~!`}buhyjifLUI8)F zpsb?3pL|Hj+e}iW#i;yn6j4C7M572kjJ{Kngv~|0%igsJEk8U4L{WiZ-hn+6`91sH zg6cd+Z{@Bb>dc=kd2CvEXbtyw8{d<+P6;*X#)7GZMGapB{YAXo)$^LPEVE&4cF#py z;R(n+7Z4&5GtaJ5%xZ&v{RM$gYV;Nk%hm0bSW@}~vL;$zSrzgxL~rC?n7i9sVtSQR zDE?=!gQ&}*oL?@o(6sL9+3@L#vM?XR_PYol`+&AyRqbp%;fHcL<{-kDDoNr!8ss#JEL7mvW__Ds zvW)TsgY}_dhu^=1^00eWvE;j(@@p$AOT-`kswh-APO)(Ed8FYC;(J>}AAyy}=h^KT z$#RKsHFQ$PaqD26yjXSUQd?s~xNnX*JFN`xAO&gZ>GS`1jN-gWB$8H3bMrU{)HU6h zmwA4}xUsRZ2I~LntOtlgk=o45XedcLiW~OfA4`75#fL9~ zfW&KFtOE6gjIEI`&!%N&a&Z7o6j1}4l5~pNTZ>-CXQrnkFO*2iFu>GeI1v`!pHk0m zdKS;4#59gIG)rT6Igq-7i>9N`qV8JOevZ94)AWJ(OG-CD_0G-eH|i_}%!wYb<2k2P z2?ABeM)v+}wh}MDvWl^GRK@||d=0cnlTQ5AZEVj>|6@+?QLqs6U}*H2@~`j)hmn(b|{F>@^?hW(NKJ;(&nx 
zNzCDJg0b|8{>D@umIf+TPsM}!P8Uml)UCZq6!xVr}`68*GJ`i`7Qe5c)zTU zC^V#KsgwuH-X>#|6TXqV4pFdkn%mYK+`lC2AFhVxytokxN)M29Q}^ca8mlZhqKDZ% zalR)>meJ(()CR1$2cBaJDD~*f-TXUPqGimnO5?)#Pi50+X> zS(L;Ojs&pEN2;=y^MR~0RwNs-#Pq({7V3rs=sNSWGdj>yr~`lpIO~`LxTxtVsLHZ= zf*=_ioU(H&2flM1M&3Rd_!~uv20MU=_w@G>)@d|wV(EEUEqhey!|MPXLDCgbH!Bja z%}+q&9U~h31Hn4V_JT*cJ$Zrfie>Kzc&3p64Er>h@Ccj$tJYv@$qP+eF)l%HlV2Fg zGG$d(zIkE3NcsK}B8g3k$jvS8GUv=A|;86mz*U0GU@QrfX3YBs|mxc;IMxC zKok=0TH*!A5S8AjEA%avc$ASEZGVw*fmqj8mkj2*<}}gV;LMF=+JqDI9cwuI?CdQ~9AA?@&_|)_C?S*qDnlUk^qbrPJryfx z;N9kG2tt>q_lhY4@b{K&kUO0WQ49La&d#XPt9EbxX5b92|E{$+iF`shx4EdT(8V5) z97)s6k9^eAakts~2OkyIH{NEmr+mPFfZzeD4~ybLr5S@#G0;A&{R?e_LE#US*PB;_ zz4xDgm*T}I==5JZ3-=mM!;R)DGLePCb$mZ&D~r@ng+IaHuzO1WL$);TS68 z(LL+5wHgoX>6#nk0TjZe<1T0Pt=v`A#%_!xOn8YmbL1r$Irh0~nwY^GZD@|L5>D}y zi}r0mG}SA_>25$hGG#s;yQ3Um(IHQ#)5>&hfZhP@xD#JsG#p_WA_XnIR<|K=4*vXNmvBA)t@Q_s4_h}-#5S1X6H8c1$Qh!Xgi35!&XWuRre0v z>)H+hTGhg(BXAv8_QgO5sOeo0R-uf5fBOf5WunV)Ibf!NKJG$3DtyFy<2n*wpz(;` zS9Sq@3hQTeX2_?4AR5k~ZFTD2A@7 zsZW{nr=A3sZPUi)h`XYKQf>Ang1 zKKOKSKgu9@XvWKEqN#+&r(9^f2D{n4{K_e|YMEaOH{Gk*(4dTY5J6m@&ivcI_C3JX z!8+fyl3t0xd1iaCh!>!Zu;>MAks^qaJ8zawx3%wtRTWWxs1YNIs~DkxrQib<#@_lK z{649n4XsJHVfbq5i>~m3Re#vF5E-2vx`tRW9}NPZsr3@^QEGw{A;J<+TIP69BjD{W zKzDl8zR#LdDkmhlbwInY5hE|j21D+nfLtKr2lqx8t2vXq4VBd&PerYpp{b2(;;Frc zF`S_OnK;FQs_7^z2B+M1*Kg=vLCxUbfZ)TZ^;qnt1oc%0_%R+wa>Wti{;v>wIpJ5Y zOmi3E=OMJWtw8eZK2v(w3G$4;)1!B1orFoeb|_}|76n{%E6u{pOq#``-b|5%N59S5 z3&5dv;46TatlF!xNb-eZX`_l|IzToP$#V~QV!8DKf`)#ur*LL)%GPKP8(uw8!=ANQ z$NbVlZqytoqq{5=V4aagz5%ExJOKcA6b*|vIW~$7754sx8+4bI_b9NLViu%Tg|M4k#Y!3&fGT}eiR|tbzhXB z(OkH80I_7%1|BbLr%_R6k%xqRn~PATUjm}gyk3>tS9};*&RhU{4lf0X{cu;{G&9!s zJ{MSj_dcK{#HBb9G7T~uKDq}WaK5@~zC+`+h!KG0QCXFRhhEJOySl;amiWtfg}+1{ zijc*RzVcD620x6>VaT4**A3z62K_9WI+PR4&2H{;Pp}AFAz!Z{$nO-BT;AxU^+eL zRRV?X3;?nH*H5x((^0c0QOjoH+IN!J&lJZ(j3dA8>3tf*Ol`#bNl-QWFO4cf%+ znNB!A_`s~LXP;T^d5<9k`^ii_lyD&O-s3!Uo1M(xUzio7Elenw8M_& zre&Cf@feK1Vbd+$pd<0m?y`{iC`wyV~D*|%<1-9)n<%LLF@#ff{h3Zfftx8oJQ}OT5 
zj@cT%?AL+b!gq49V>Kmdc%rYnu%v#dT@}xnHc*i69(a4?HQX)xHl)+lp+Trazeo$) zX%y1(6#_#)m1Ax=oB6DlZa{AX{68!iBQ%DZLMlTB$(Pc8G<@agu=&U^q zfAK#?C)brkzVKjUaOr_PpUR65J#Hk!9dcQ-Ao60!f|O?kqVe4WW9&OEi@6WE66pOO z@RCV+lz_EF+j>o^yxc>@o9BjE3I#B`P#fDz>omJrk;=R2d3dNP$|ir)BVFp&A_QU) zMG4k$pTl>~NzrI-W(m?t>#L(de}4D%75v{iUz-d)d->Dr_cxQbCC8XvOiIPI+atN@_;^zs1mKAdUC#6D`a_ z6L1nlbV5WcvmejBBOT5FSqY#Q$RPo33JR|?)*y^-A?a20#7zf-XCq|96}BwiChL+C z(LZt0={oh2y*jP+MGz zq@HJ>@Y-~fjfD#9hE0B-v00?ggIJ2L*W|TNdsV6}s1F4qFIqpLFbcj)I$H|4>|}LV>Ayw16F_!HR4AfXa~@K0(Jf17yeXdA)80r(Hv14*5!E>79T`BaGX8K zks!cSq*tKPGZ32JM7w%SZe5hZKmtt>WHxL==Ne-rMDkf-(vsBFE;%V_@q#HIKHB7L z(nwk!+!+X?T`T6eHzv=%zni$d0rf~4_-FGNF7{GFP=o3)sQ2|OS)guNxayD%qTU0-U2iEJu zuy_7^W$bM@6gAM51Ki{;>!ivy2VNvBn-N541!^G{r4<5=(a!~`L!FFcKM$OOkk=Yf z|9)K0>BDmEBmA2U^#G;}4%TS{Xv?)m0b|q66!UwBw*P%r6JcVC#rD!I4}?g9ZY$f{ z(zCK;kQZ^t(es`-y~FIqH!aN_!`O(~;?&e(b7?D*-&ILz6KbI&#M4za=2KU;jACz% zihCJ7weV>jZ*I}>=e?-#p5qL3iR_HFcB?ZNoJVm)AO!D<$# zm*n_Jnq+AE_#1xhXnbLduHZ3chFdbsUXWJM%qfDt%6%mNRIExDWgtsOeniwZfhrGf z!c&s5#iU1)>G@G5WhHn1L6&=WXL>ppR1QES>F%*#0wM}FtWJ^x33R}`tP$7p6n3}~ zIK+V58n3@M8r`3!S#C;Lyzrd!3fdCWIBKtH_Kf2L1>0lMi>#&GXyTm-W*_+n(x`Dr zme>K}ACp@fb>ibTsFe3kpJdC(+Q)G*1a$MX{1L1y--69EnPw&U`AwR>sC-g>(l+wl zoN=pmk?&}dlW$)T_fD`HR#;wCHB?;^_$kMpI6&UD_ zW)=VW)vGU3EzJg|$}%LAG5sYAdnKYfN7qCN`AMGlYVY%5*aMzTyI1TB)|>FkL&yGv zieL+(`&Hv5g$W^Kr+Rz*=CAy@b0C<1gU1G@)Ki{Jmq+D`#|EX>|nY zQSF%Ft&Ql~H-jBg!qhyjtTw$@2^_3Yg&T7gE|L&oFnu$uJ&9L05gtiLU}`T;bsFOH zz8qdj;OcljJ}YCaAP^?IId$R$fYoalob|PhegYWbkz|F7PmdNARkA8epFTVHz*(Pz zDo#_?7l|NOx?f>X;ensvH!4`CY)`<}csFy(uuQszqj~t-dkA9U+$Y`}=%DLJE8V6b z>*5cc;lCLcE!8@)9BQt&sP%xuVjZo6N3O>7E6TKo_4EH^=7TQ-ykxNG9$6Tjok3Zf zDHNdGjd0%M89WhPHDO;cMOXrrHt3z5sT?d}JW|4F?S`A;{cGD+eqC2`?33jEodXBU zUt9+Pr?}C9ls4+UFe{L{hzC>bAb2thQq-x59W}JmvDv-^6kl()LBj6|j`Y4*niI$^3u{AeBfjkhxD1;jvI{Nwvy=i3Q$)R$M{GDQ<1YA=%AZ zR!h1ke*TL`uy6A|sq|#@Yf>`WuduK4v}C%*`}^= z0VXA+Bjkm@D$MJ>VBZ8W);1Q?%SoviMc3(@MV=9s0&@tFbO0+E<5kNshvPoiKq7Q3 zln<#V#=#=7V2itUfDKsNtT#I3I+B~PyJV@-jJNKJ-c7kZia`Mz%uy(^OiA}UIk_(V 
zsVmJGv^!%1!sLl9uvmc|H2VNfW0lVPj|k{Z$+F7yDfdBB2%g(vlk8mne5OGs z;2na4_36LWGuCT0R*^-)(huD15O_qLODglFdO?E{N_dv(vwWe zNN|^cBfWK1xDBLCJ?EvwoZUL4H| z0^5~;&N5xuhG@iFDnW2i0-s0TZ+@g8@g-EOtmOPwu0o$9aS|w1RvxF&YTQbB?MzRyns?KHjkIRL%WEr(Q){ zHcL(j%K>ZV*>k(6Zz$H$SMLU4>P*h^e-nNjoa&BqxyeJS+_uL*UM>z}^ax3-Q}cZQ zD(6fWB`_@D^7tkl5w&3Ei8p zx=lvV*D0*4yjk8M$W>LFP1@OMpI2~XkIC~6xg4bB=X$6q#$eI)`M40Wr0u06>Fr0I zr|at_N%|y&sDlduY82c@I9DxO7z4 zPHUnGXux`f6=zrR2+d9r89WUqGg&_!ym~c8Sw$w$3ryj6Dkve*Vyuy0d5zw-3#tz8 z*#>AP=^!t)BS-jIG*Te9D={H0=sVouA*HvMT)R7X*B+!$!i9xfTG#sT%r2;997^2^ zp2BvUisv0uQ2JgCjqnc_y4OooX@d0d1Hi^|G36vsP`ps6RgRg+F*@t(MRp_)y@u0_ z+1HAT(#*%o-!-7U7Ftl=d&q&)0p}rFZbm>whrN!omUvT;3WtGm7B`pE+}nCl!QsR`Jtn(%XaQy1!O#aUIDx&Fa-nETsQ{pce^I9tw0LPUIDx2VneFv({TS>O;gzT zvxTQ_Zlh_bsSX`OgfDTnggf9_SFc{}6DOfQUM9>H4 k6*mF3a-K=xk`yeq`eec zP{5hm)YNXrk??ej0i|<6s zD@tD%AGUthS3@!Oep-KY-i034)1DQ?EjB@d>E_oKcJ(?0<2K|n}=t-=ij5EO!sEz2Q2+?Lqb zjd+QofS+ylVM;ytef)X6>zOmsMYaCLQ??Vg=gkIO2(F9fAggm;N~6skW(50j$@6jC zupzYhWJaFuyo&rxe|rhGx=!BWy+?~rcj@?;^YkYxu@eO``Sq4mZmflpi3$pl3eL&CM;67p5Asg9q4`0Qb<%u%)f7?qX2Tj2D>dE*9WM6VaLr0|C%* zd+QoC692-3Cc*7&#Y*Q}c8( zv@=8+Q~X~$bTD5y;&8Z>%=FC6pxbf8Gm(S;b}A{_Z`{$~PJc1jfbDrZ;APAklP|Ds zJOQj~)%xk|#RU-U&dYQy!G}-12-efp?VDNUCfOJMZ&%W2{*#st)WeK7_eviE)Hd4GfxYj+PV}kUfB{) zcr%Y-R7+{<#{Hq~*C~~su81N7gI=HnkrH)mX3zVx$po~qpn~^d*xA*$NeSh&2|*}* z7twZFWC>4l1bqeXP~}F@8_`sAH-0PcP+~&S83{3(DO9vYzLcAMsD&DOTt{he$jcT! 
z%**aAs~(?oDpfBdFFcsFln%OPa~I!^R)5}@%>AOQm>QHMJ-qHCi9-Wq*aWP)VZ;z;Gh8^puJ9FpAE+s`h zD7Tpt3g1LZE5+Adff61a|CT_(*aV;OfBN-|X6NqO9w^s?FBgt~+11K@GV%+i>1<{I zPr`Tb4QT7=<~k+xUi>VjWg=TEN1rF!IG+MIc%#6A->M_>J{Zs&FLBQZtD|5*vip`& z-~KWdYai+LE2KzQ6h<=GtM?s1Yd_@fYMpR~Wdm4?xiG8(KQC_lJj65vxn3Mz+UBBI z$j585XLMy=Pcn1R(}d)f`=YaMI1fxTE;4lf7>1V6(-Caan~u?q|E;DvOBa4-?NVfg_C5>2>M1kDhr@3LCk<3NM$r7u;C0#F-=s;%5EPc=vS}Aru zrcpT|HR4ZDb7bg`Eo%_%lM8ag=32m?a}#P_YiwJ}eDQ+a1+GG$<~jsx8|PFDmK2kL zT#zn7sOS|46+8eOz%(xkvnOMQ`mLtZ0C&`qu4RP#?rTKXR*)_f`U6Q(gRiX0*`oI9 zI@Ak%yKJ<;;TRkp-RejK^ORVGyKu6P^wE5I7boIy~vSza8te*qQbk?GGku(vuaviKx&LvU zBXksCT7*-C&v=w4e}0GX#`rP$*i7^U;FUNF$34Bsc0#WmZlfxkM9?PIL7#jZi)FiU zdHNRQcQ3~kqR#g?)!Y}0~s zWK0RdPWBDe%K9YbuhqBNY|}>SUV01m<4--jAy>Do_5s5-d2Hti)9(=e&B%Qt z(}G8R2RTPWkB}+CRNmlTH~_=eKbsHcs!Ur~CjI)+foB2Z5x}qXhRe>thskASdnOZb zNc>iZo2BMgPmdO1alcw%`q4B!I8gt>!KcX}BnJKOCi_`zV2pPx+x>T@ID z*ZC$ebhFRxK@hnV3Yqova_fU{LNy3AfDAscm)4&ivL9TIIh()!6KvL{xA1qZfV8{a zf|k>Gi5to=i;sJpB&W%ez#-WKfV6psRTLRxveop1!^y8dfYXiTD0(e3>{Cj`^tJg4 z-fO^JaWuczQ~Klr_(?$&SOify0_rjhy<$B^xk3W_gh!E2O!sm3EZ=$yNGLh1s$C0v zo1uWpEl<+|aVrEdqKXxE9!#W0kPL#WsSuwOjsbWcyloEL8O#1m!DF43GC@0>4ORkj z2(J$`Kqi@koP`g0Bgeq*039{dt-eDUy#pE#Jb{8pMi&7Ng;o*_Pkd@vUdvr4vqD}@y>NB2^Sm)%U2C@Xv;uZYpIg$o6-j|TA0X!f6KktM?XFu_owr^@jCs7rS9ucyX8i1~><%P=9k>KuSUBc^w^=(s9BJS}FdVd`e z)$$<~S_ep;BUz3${^d2Y{Yls(aPkFlp!5e3d*?zsPaeQ1CngrH0@^OhBq0vpOGA0$ z`c|oq@U6j<^XRJ8Z$73%NKg*R9tAW&O+llD69!mK8obNjXLVPI(?zv&hYSpwC<|HUlu~C(ue&X?@?*Wcdwg#YXW{r~=jk~; z?>bOK*|wa#cwIiMjqd{ohYZW`kw-Vi8W?0(yY}8Oj=UfM;k>f*W6zG0z@r15gm({H zU^wg8ZZmpzJ?j!d;9_)#q7CwE@1Kh5eQF&uK!)m8cPC*CWU%b;?c1OT7X>*s+_Gco z%@B>b8v!GWfR}ukoHc|JY(aVzt9!v)NP^a>;y2Bv@G=IC_|o%?r|;q9=CN1a(Qn@# zE}Va#b@gi4`v+}xkwK{maKp>E;;ZXO=uz{9PsSPHpI_~D{DbT!I6v<$a;b?JZEY7> zd^*)ueP&lDP5j7TgpGUS)&jJiU%&KDL9NB>*m7mf zuRVxg^tuj06wupj~+m<_!)rYvAKl3pJ}iV7Qqh^WP($@#l_ z+`T&1%K?|C%hH|r{fyDsv&Zwk|E} zbc^90@xfb(G^VdV;d39TjPRhc#+3EV9QY-MR}74G7uZ;DyjCAe8KF%mWVNqP5_Hnb zY>{83HzZ&P$cgV%yV|geZN-P#a+4Gg;mpHOax>N&-O!}DI7Q%vt-rXhiON|xFI=d% 
z{Z&SKLlD20(g49i>%S(V1LnZA4uB$Jdi!m_zJIK3EeUfAub{<88cq?!=5b)Ffa`Yfe(wAL+YQWM7&;d_Pt`7j0U25_Xe z{4UIcis-R`FF^pdg$Pa2Qr(E?`os5;RCub@qO`}9bM(#(rwqh%1Ihky9brT~Oe16` ztAm{=NM0GT+0gaXYFu|+S#`JJ`1-QKYBN>5Ta)L>jLGt;08+LxR-#^OGx7`Q7zHb?{PLVz&0$=+FDUh)M1xD$fp99`hwckgO!uh^RDkXbemTX^ zFGArW7-AxM;xoLTi)!;P%keUhr5C{efA)aun^sVY&m99h1ukS;p_Ipldrx~fLFM)o zNWTPeym(MH%33Jq&BN0hfm0094&cYb`q_ge3s1Rqc}1f&#=15IYHSs?X=XGWAO435i2?Lz+iF8Eea zKd$5xFW*?LNr@J*!-5({?wg#ANCdk`v3?fhw=y)a@T3-DN57R-Dr7eV&HnyHo^B3P zm;Jw8u}W4MN=tWRhXqlDx4}AoqF+=8Y(2a^{-lo0eoz7QdnH=iw`Hf|>3kGW=Wa7U zLeqfgn)IRufQ06z;oHSIN_4<3(dK(kI`hW2uT)3evP!62dH}9%gsun*HMyKLzaRFS zkf62g8-Nzip!4iU?LiKDWQ6buC{N3TgE)wN+u-?G>wT}XsweX03KsuIbG#?tVDaYLWfOOt>Uvt6h*kCQ>Oz0d>9j-P;H^@LYf zcuu__X2tFsjN7q#NAQ8|#$t+dSgaz{{>pSyq>e+3)) zXY`BxjohIO`~-6;n$h()uxuXtm4;0gA1*j7Piq5uf#s_ni8|nvY0q7$l_0m8g1Xdx zVyKIYPhl(Q7O{`B+t04nJ!*pLA)%VPqGj0NDRT8zPyZcT9@d>E zvBkd5%nZ}Fj^kwcD}SnbJKPUT0nMczk^O7k(Pl58+jyV+LfT2NllYU*2uFC=URka~ z;I^O~%~`*ePOoqQ?p69WgKWcWP#Y{)Ji8MRA%J&TuOw=2g?4F%&P&{r9Lh*-S>{f) zA)|Nl&^3ZY9pBzC&NE>8QMk%BwZhh3oALNLjO&+p?dy&Og2L0c^0O$DONqu{U^}Q$Z4RqK2~L_bRE6wdySlzJK=Y zeRHD5-0k$%$k}>YB|#YNB%z*EYN32 z746e?I7yek;=W_^^nRw)Q-Rvz2lAiP>#o)9W<_Q=*Dy^PPpR4Q9}0L5^+SsG*~n)g zOFEnBi_Z)JGI6tNX?dP!=^@v~i<(s`oer-(30~cuPHy>|+eyzhXceE29qv8aUFJua z7YVY1z9mYQ*I&T~=E6$%!T)Eu0AJOtsV8o_5yZh`1|C%dqAg$%#e&Z7GIMdkJ4cYn zqAS!wcQmz+$B5hL?VNyoU7@8|wjw(#W$Eg*RDWP{u_$DkY|N8lqTC-zs=qmsHp$WBb4_J@R1YruV{P z#{KB#bvJFV?loL5Of8ePx4M>r<5_VRgUQup28^0P23j(=dMS=== zKl&)SN>3T-MmuO(ygQuIHHGiHdYIYd?92xHEnTlPc{O^g_y_eX*OLCYxBlc!`CIn9 z>cuZRZEZi_+UccIAXGdi)p@1rRm}`N{RxrVAOJzxW^#$qT`ceh{=5IaR=(%;T z)iORNEq-^7=OvGk_=`EahF|}whs`-~`*{2BRo+?ETk71drN`$u(mJh*$`{)E!*zR0 z`6g5ED5_=;(>_#zjqJO*#F@}znBChvZx3CLZGA-*f1<0z>3pMAn?xVJ_ib`blIBQ> zvr|RStGmoX+(k^E<#(<5{7nBpf zRnBxxBPSit8QoP;@bRbie8AGea8#q>#YEaRzSEc4jt=Tg)5*hU{? 
z{>w7v{GnSP3bYEF?x3U(FK=m4r_Rqi?LT1hx%02&I&p==!14^A3kQagE_2V>zsT-PvjWcW!c}I z`1I00enZs@5cd2F$IkF$pKF+c*DalqsACs;BAA@9xhFr}Mw$pGY^t;{uaBFt0C6H8r`+JaOOMjrGp>1kX#l zV_~hw^hk@Qj%ib1<={0{U55Ry{hnT4RpXzV^ia-$S)YnQNry79InD{JOXzU!{i5b5 z#?Fnr&W$PaeP0?io42{6z_wtdhD#BY>1haI-?!&ETn;8VH6ZhSe_8Ngw zDM61KRxHcnz5i|b2fyXgLHtLhYK>&@_tRj3>O>zCH^@W)_|Cd&7c3Q)58~Nh>kn^J z8P_vyC{J@rhPSm^K1NhwqoG9I=55fkh-Lesc*QDWU$!sY{flzuTB6IIUHMAtYoo3o z3=1J>a3Zj8^czYd>l^OxSh*299ENc2mewEZ$nqyF!B_hxmPIUlQS+9}rp}xi3K{$q ztvj#hvGU7F6LMSy6Gn-ica8nL(uSht9Xh~OY_n6p-f4eluM2Pgwsh<>KMoJ_K8@we zZx%%Zk^jFA_V`${tLPHUt=qR;V`ksb|7g+b<;}H@>1}&~fxK0Jrt;CQV@{h7!QVw) z1%CDFMJpfJ<8b%vt99%=f-eH>d2rzV{eRr`g}J4hIbcd~?CIrs9l|Z}#q5X6J|q0c z>X8zGH*AG^?gFD*f0+!}{9mcc)yoiwB4$xTMV|w3>Gm0`7X~m~JM3iNZcO_>U0d17 zsBT!q5W`Ctv7Y#)nk?4;|JQY%8sfZJx+_$vOCw8UFwZ^@uyxV@c=+;1*4H|KiGTW+ z?g|m3^|Z&u1XSRx|MWl7bw0-~w$*9|2C4gKrXA0cSk9DcenS1*5m<1#r(%@RnqP2d zr=?i26#eU+w-R~5rwiS7sqa)y4CnIf0lc=h!)b_fd+}S@v^80dG5z=OhAPEi;nH?iY0e(PdU^Fu+P@zc zaJg@g{zMPq$KTrH&3h~Jo?vdNeWLn%l%&1wixvfKGFJCXYgRdN!U_x|d)D(um5yI6 zWId2_U}=W^*N&UQ9ml{fXd$-hENaY|TZxQdLA^81s!u|Q(xKEmiP8ser zaLzM7>;3+Go}Wz$-aYr-*>rty=0DDzoHM^-AX^Ep_4&met3r-8lt$ij`lP*5dNE4b z2DxWP`KjrX8~9I%xC5cA|B`EA+9Oe%r`8`=xIN0Shn^vK1to?@QO+xsuOnbSkDbo{ zK5#Eys}s)U0cWsq1Ghg)S&6=}@Bj7nwlixOJ|ufAKv>9kM`q#wk4Fo&i%O%Av#ePD z;nvMXjssI4`;RyZ@;07CETW&Lg)rE&|Gzu%uuek|!##9j|Kre=7D_|59pBmAafd85 zQr*&R{QjM|D^6Fa4th&6XUY!#!#@Ta<8`(o8G{S{p4`gs)Z^@+@U9!WH{GJHKz@Ph$JM$B z%D)T_xY4o8E`Hr&Z*Hckcl);ZssJ-7DPe8_HX3MlH!O_&IBx(mv#;At5p1Rjp2WR6 zi`xM!@K+JdBPi0DhTe447PLpOB`r0EU#s0f{)63ZOE?#mp|e!IZ#)d`L56di@|{yR zu5>8dJp&QaiW*Y!aENrh-&CP@5qC(0qG%O)qfCH?0bj_GwO#p~`*!4y_w=C-%M%#f zrZsGgn|-QaX5e!X9!E(D7BWbDLHDJ$(Zfj1E4V^$YM#M#Dp? 
zHBs#I9Lj)Lo~K&bjj^Q00O7u_pyjp{_ana2?w04)YnRhYHKC#{7Kl*KX&U(sM>}V% zw(z>E_Uyme&wfR3i*I|fYVE#Pd@W{|T%$jQgD!vUaI+Teu8!K?ricD(o64z1i8Ffl zwGE|HBcOXiEonh7@lH?P?(@{^jHmKJA(&s3?HX$A=P5K>R>E+gCw>ioy}EE-`>54c z5{gkLOI(Kg*M?q{QwI8r(ak-!7)EH-+D47n`6(sVD!-z9__QaoE17#U{ z;=x9ghCDBWTjo`TT5KLU*u#0*^zo+xsX3%!o#S$O16aGpPS%q5bqpO~@6fWY-da9}pmjv*aOJhv@gKI`Q5{na=V-0@Q? zmwx~kN4xw|S_2Qj*|FKTb;-YV;jGREG5hzphsd8u=^Bouu8Ak|It{S5`t~XXf1DBR zt<3f9UXU&((T9Rc2k~iyl-maHjF3Vu)rEa7p5~TjuLwNX7u-C~BZlY4jv)6x4QGx) z{8=}E^4h~zAkV=2#m!?Dm9)V{^S`8oL!P=m!-d%?mP6-)+_Pb-B}8$~p3O^_!W9~H z_Sn&m;x-*%aM$~Z6C|Mj_^yI6yL;$qdc`%7J^DRu;yS4!q1kI!mRa^|{^Ddx4XLWc zW2!SK2j-va3+*Y{YNXA7bKR$Mj8fw%lGtPcE)V>GO#>!D8Fnq;ew3-m&9oLl-Q*^I zJ~oeCOu?#pZ5v{|fp+@(|CmVAC?eizvojr~tVF(aDLV8N*|s|g>wNJ+NVzzpG=ER( zeQ>4g)W61?pmK>RLmxwAx1xt87u{!C(&DMFpxY9RcP>YG!Wh}6-iCEECO|qrHGOdd zKf_@kgh8J>T%4(^|9PPowYO`1V%9A6R`BD7*=AfS&Ovwj=-Q?ZToGBcmId98ROpGN ze(IQha-@@Y+p_Y#87lAR0Qq;9T*Dfw&o6Bmn@d5o z+A&&@PL74j1XV&<4H#B#abNN97@N&(Jf-+=JLTE`rP1xqtp72Ik13YujdgH@Q-oSH zHWiqAfk=v_QaeCpma6Wo_6!s$fr|Hc3Y8>pL>}4T!Q@x zmFGxZwk)okI1GiHCT|^MY${25{&KAJXV-wSO+{zXc~L=&le=wr)J!iu^lh>Q7bW2- zt5^T;N#I=CXK2LmFI%_2pH<2GKadl`ZocPS`9P2W89xCW28W!=+f@2+a5csOl$ zn0B6T%hnxKILB&y<%S*hnB7_1f7;AEVO1j+#BFFk`f&Hb#2MWZ^%L5ft4%H{%c8H( z@$yqP=b?}gzar4ND#%7@qn7=MO5aCKgC1%W^+XS;2)6i3D8t!OgSWEB{l}q)OFdD)oaBwV7=J{+ZtA% zPnC%uiYIsYgL~9P$49^>>F9`zy_m6nboAnI4d+-Va?ALAHotOgHMt;ns@Em2hq1U? zRq_Lrt*czk&nq5TK1ro;?d_HI{(Rd3Ut`!2BF4u2{CF`nHm22F0HyrlBSWH~BbT9m z?9}|!W%&WSaxD0zUoxCg3_{1KKsUHBc4;p}7Y8mtvC{Pf^z_HYS7Y<@%!W>sZbaJ` z^^vZk9BaAW{u=Ihb5?T@*JAG4q5b~V$y?{iGveo&bd;z;=X<-NnO{dQifI~NE|Djv z@sEcnlIJ5uirX*WX$@@Ovb~hCHUklR>;}n+Z^t+CoE(f<7ssRo629Y?s3J84k%C_? 
z_ejwgs~!FoVhrXz=xF}h;AW5WX7e0-^H*H8ne0}*A!~$~xyU!kd2mX1;ehpp?z7oLq|mhwEGj2ygW10oC;fGv#szvocS_#SRHrG7wH zymCkaH;@AM+l92MgM1-^hdY41sD11dGv91rIRY9=h(RBHL2nJ;)&krR@b_@W2$ zK%5ECj-dX_HmeVQ;kZJ1TY*zMb7HqffMTM0Y7Csl%Rg@F)1e>(fdmaEkQ4wdJS{V@+*7EYEpK3}9zp~1d-bEO81K_RvKlovMx|9%XUc9yOJgfy__JOA4=<7-_+SxTH|&hBd)jG?|dc50#5%-Qqxw)l&u zzJ}Wo)te^erl*AZ1X}w*zm8dqF+rlRK@{4yh@3^r@rA%Tl$_+4PwDZC zjh=lz%S*C0n8E5*iDFGwXGYNC5J3WeEb)R3&-If?Cw40L+r9ZOZNcL*+XW;ZU4rof zNW6hgJNgKmadEe%garE2q<5NRDvFHHnhB97OHH>}$501QdvR7N7ai5ety#*HJn!$` zU4CP<^`t$NUX2@CYn&&cpme{*yBWudE1d%?je;75LfM=CGd;qj*-J9&fm35b0zi49 zy}EkMRljq?K@=ss@SwgfX6Zv~Xg21g_k3`t!m8WIj19Y|8g6IZyn2&8ZV& zs(AkKAY8D&AN`JsuRaw2P((?RcOwZ!k`J<)XDFcdS>)jnYz;0TWpKJQGK%MnNId6Sj+ky0XK8{eh*TTlI|w%7GuREcZ@$oZonv zU{hLrgQDz)T}vLp)%1vZke}5ds^#L@i5E;wq_a=C#f;_b|9j03*`Fl;)a1w-21?ln zu0WY5QQi5S%%X^M`tnztX|X6-A$8)J)d&nRiCVVb5je1-8!XD)oND>J3wtpG+pDg* z=qo#1H9@TNkVp$QVh1j(hCW}8;YatR)s}naH`vy#y*hfcZmZ4T-O!?o3A(#nJlZ-) z$Iss$T6itgQ)pdNJ}=ykUiO$VmA16?8tg5fYY=W#?@~1iJ*Pdf%dW4x z@8Lhc&N3P}7 zCX$M4D=Su(evhV1@TR34)I>Sve)X$O6{}B(+Cc5TLM8& zG{GnP52raNr{NsNe#GF#$R9s`d@=zpqeM^%#Lae*Bi8^CJ&p>E$*ya#a))^#(?}xT z*#jozk3R|05@j`$l$Uu!SuqbGpTUaat)5mh&4Tgjy8F@R>ZV9rG+e)`>fT}cC6aH~ zw12@yM#&;{e+G}WV>jeLB|mpBRgdk>*m^%fVNbC)N}FM7sqNOfzG2m^S1%Tw=7&NI z8$0RcBQ!_r@Ht^!@QbR}MD%=s#Sn|3X^x{(hpU&AD!`rgy@3`yPSG5hAcRRPb66>V z$I|4u4p(?SlTdP(ObWWO;Hr03Fsr+q#isO2{;gSFpqtYvP|!Uc1rVrGeq^qXd?+^0 zH_?G7B*^vk^%T^oXT6f>wSsPE(}Hdc__naNbCS~xtd?aWiLJ1}%RDW~aU=A_#qi4> zeGlNLJ$v^uAGC2!(AhG3APf`WLP2%UGQ-z-xYSf18K@+<_R0MNB5v6Az&WU(acgM4 zPIcacxjwwvbr46@B#mFI*qUYsu2!N$M;wc*{Q9jmt-Qz0^E6k=H8r&zd8PmB$nx3frsGIcGcx~#u;>K_$p<0*gj6Q z+nZ2?ix*dx@8`y`FMSrrZuvSz$%_Vc8sCa^4cJ=k$*|5hp?eP1G9H2vdiCV1W z(YM0LEY-^cC9Ks*CX#gyUAg*vb3KKzRDNjbnOWVmFhoBrnPm7WjPw zrGZgs$#GT+nVfOfEi{Fk%*PCv8>H#w$A|LPD8^|)zQ@Mn)%HwJc%8riS2w4 z@7P^5h>s@ApS3!r9$Xu+Mr;!z26kmVIFbym8MpEtbHp^~bbh%1#RYZb)QWQ%DE~_m z$+@4i-74fwDGs6Bx#(+V#aOxT*}E8LNE=+fFhkBue{btW&t7$prqk)=Qck%{A8XhE#UVl$NWjtBf^$Hp1c!F*MNS{CyD>m}tLcUY=QypH)yd^~YMM 
zW3zNz)0G?(PtRutR0g;~ZTlD?t}AGEO`plgmV><5I_2BhBSY8jPFZ*xb#P0Dgze6* zLPM+X6%3{{S0jz5b*@2Mvqi5Z2@cNCwus29#t-kim*$)}^b)fyP)tDX@=7yBRj+Ga=w=Q%Ugid#i@`hh} zfY_c-2_2g+I6#FaSNHx8t;Un6mG0Yi7~8HsVa~KHnWg}9OYVf5$5~r$eIJi#m&Mf&Fuis^&LK>!hc)9JQ1DAZE@(Ojpfp3_Szj+!ToGU zw7m@_$w?Avn98%{2dP^31(fDD8i;h1GJ3A}QgvVd{oMadEQE*aZ{3&5853@1wp)eL zMiB5MfIyoCXk!k}G?jws+c1tcaC(^QgODd(c5WZ}9}4sN$-2$VqtwGk>%R5%Xxt3l zXd?)dmU}hTJSvUU*Jw%}TR&B)k>Bw0j6c)93S4V(=Ttu%uY9$tMsVhy=M=0K5mCz6 z*4{ecSsP2zs=ONSP)^_*<4=3pb_cxKSPnwq6hA^pL9tPOJOzJ#Yx3Akd(u)6PkkkK zXa28wY59)#Rk(6zXxS;qxWk+$@#_eLM`M?~v#NSkpRo(L`{&B9BqoJ4o;1Hj-uz(8 z_Tfw?#~?!zwn4LdbNbZZNe?^(EL#Xux~#evY{rS;WxoEB?3{cA=)S|mh$;_N%v>RE z7F+V^DOBtiFcVcKIyMV*b&g+>6s5crer=vvN5#zqd^-w3vOGeWTH?|4OYGAJu6$e9FL7II(zU3N8b!Qk;%M^asAm0QiKM)@W<)P${% zPM?a!R|aPT1^~c+MEE<%r?6P*s5(ZgT4u`K~iA zGUq!67CzEmRe3R4C}`~&{bt~&YJ`VScPZ)+f@gCmxSvUu3_Z@sV5&V-ba^QnB3j)- zpe#mZ}wMbfz|2y^!lic|7JsJa09l^t#xQeKo zwFFl}=HA`j*dK3@Y<7SQp2SERjkY4(o1w+LeOTDB?R?O!(7wLD&K~CsMA9ZkS6lQ; z4dMrRrOmr7{f_WB{qf=9I1^{`0I+N21dbPTD$sy0%1LjZ&-zbSqeO?cr-sLFS2Y#Z zEt!dzb?TAUmPoQRP+xt2bC_5+iBwgP%j3urb7$-1+@*oOU+ibL`cpJZSF5`(G55#@ zPw)-BAWW(O`oRmK)}Fpd;KFfsBG_TajJddxoRm!r72rh=wrlTjU+eptD<*v#s!JoH zmQGg84fQRh3u!#3@!GQAz5!7?zvDc)$h~S|zY7`2pC8{j@)kQsUT!o*hJ5K_H2Yh! z?BPUC9U1P!$0O$lN$M4}zT_Nf5A+nSSVrXlPsDxXS&b8c@c2O*g8qZHGu_wMHR7qO z&^c5U{^M!xL*(Y}-MeSx>1I%6C8N@mdKGVKoNw|V3i&->n|6Bfn6>SurgFYJ5(#tS z5ZaMv)BeY^_uy*fAul*>Hz>@(X<@DbqYstF6}{Lb<Z`gho1a? zaAtyQCCZ)P?ut!HJkD@0iOFOe-Z-{PaC}=0SVL0O3!=5vVPv<`* zGP&Z;b%!BVAM1x=jG?4~f6llJ?|rbJWBGBPsMqS4XZxPL(j}|>4!%}#1cdNh&6?h} zho?O){_R`PyUXlECjKrr9C3Wx*q0;C87|F0LvhrEOpspsR}-I|?RKVWAN6co${9o? 
z@S+R(GP?cuH=aKM!y{|Q@ZetPlgQRcrw+r%WmSOC{Ij)4m}RhAtbADt#(;M9jcCo- zuu90xFsmmCN)Keu)7EQz&^JlEJj>Vd`r}BF!TTMHh`ZVCZd~d63>UA@^k?ENRc+bH zO19PR6WHdEae`(tCZrPcGTOt{)nIKYYG1p+{{1=m`Kh_NK46LLAkycLYU7Vox4$LdJw&ehyfHW+(FC6ENCo#f6 zV=S9xN%HO|cbn)ctql<8Ep4lOe`HR~tE>{~)_HTZ>snol4%S0k|>ERh5}Yq0bOO5m{2=WFfME9x;iXnx+5W)5?`KGiz6zW5JH??6 zqZeFntsgVf@bIeZWfAWzIkOncc@mJNWq-XwQE(rzk^S|q3l?svLW!$~N*~Los$oX? z0^%gh2TIxXcjB`I&R)Y&r;b_?YHAAHT_n-OGn;d9Yy0PBEvkOjX2%IBb#M8Get%#< z;W}cq<)zmxHt;*AqWz2_Vw)k>*zX9VBLYeC1kLW^w&fF6HAjy;v{ra|X$8f$xp|x? zQS(`MjMDB#o{Rlcf~vFlq`z!;)jT;coCjJec+fG2bFLXwI~gaZfr~>l)_)BPF{k-?7r@S$v;`N}eOg)-STqzE3DB;OsCFZsk3>ime}D03>+B z9)AipuQ=i$aKjwBSXPOrya$UvP{NDY0 zy0)9M{4+|azI`#J?fvh-{V56=%PV`GBzbuv-ee;AQ;Fh)q8$ln%;gP?mp7n_Pi;Gt zoKArlU0wxgWUZaOtQo}4fqYoFc2%G5Wu^II#fo6hShZ?SAhhx6GRpyaFZ)+6}ZCoP;qLuk>BUyB2hX)ZR-=SfatK_pqW{; z*#Hi;I;3Q+Gv-487a;Tvb!ga+U5IXMPG9=PzDI3DIUxRiji+pau6E-tyCA~*^Gb3v zrvUwzJB{^`-1L{%t_DlhD9w1PIkYF@AtFvT*|US_V5&b>bD>ACgzRK^OW-HZ>f=9Z z|8V8N)qi{c5V7}N&_~05i~Mr8h=5C@gCBSQ^n8wKHN|?j;rX_>4v9$U19}DP%+xco5!^K-pQTcoW&c>3?~pRj*$fNPckcX|0DtY_&hf#WId(RONyqy%WXX*W zB0g6={%54R{ZNo3)I>PinxugFh_l^Tmp*tbyCT4n!LFSau<0D^9=!c_WG^0nZNPWv z_xZ}T)U3AYn_fcK<;`_4F!f$s1%chDKb$yB@W3!bH{0?;Rgy|}$$26=>$VO4=l=40 zd7L39w|yz=#d*2lM^T(wGW7{6;85!4X2nHc9fT?M-nfu?Rc8v)g@37Wa%#I-d>lJ) zRX~hfttjN=F*wjIAKdy$IbtKds`0x)CXHwoI(jf_)@;VCEM$2iVLs%wHM=Hz@-)wT zPC8YZ*-UUg1$`LlLcM{j^tLynXFr2WjiJ&|f7uMk7s5I=ZGa!KBB3lxrhH8qOgzRw z>!4Xb-8=Tp(^%qi>%7b~jpmAO6>7Io@2n2k>Q;@rGMjl$>CddeDvTb)UdGQ;Oeo{ zH1t_=2t-R`axC=JGNDDdpLlNJr`v<;QJD4^c0*suW9c6>Sggz(3IEdCveYHw@%3ebY|q<7x9W3HU7nmGJ7+vcLT7VHiOubiT6kx)#DUe(bKzdVJo&SQEheW>*A%Y5S+<3U@=blsIJHD-H7WVo&@R?@w ziQ1sCCEJ0jh1f3J+NFB2y|kiDQ`V^lySMJz=V9$?feQvWe?tuW;%jYs~ zQJ`$VH_+P&UqC?Byb_fP1#X+C`J`BsN=d*AYN zJc#cPob!>C3H##NYk@p@Mr zwH!rCa<~}o)BRdsX`tlen~Z^y062K9MCu4}vpWwjtzrz<(tDlptT|^=ZG~pq{ljUF zm)Lcip{i}9*LpBczc`Rk-?yLrvT6 zPW}4v1Zq0U_Oj0#<3e>+@+PV{ZSuDS`LUvW``e#}=&A=rk@yoY|G zhd!McW>0>XK^)<`Bm;!T$M^tS;xok0pYASYh$R-@Ypd4xl57g>_uctGeiV-Y 
z-JE;mAG^KANeix!Ft`1ip)HPo2pu@qr!jJh0`NWd{y`_nD!<9mcog#p$ePdnZG?AR z3X1LD#S^&`Q~qv8KH#pq!>mlLzxv1{H>D;^cE=r0q2@s4QO{`P0@fXlUDCzo8!;g?SfhXxkI1lVdbJYcWg==^C(>w3Uo|lkx>mF8@D9 zZFo+T+z6|wlco4}YVjaFuBR>!peFb|G-b)xP880x+f}ujtZblcMf!*mT**kZ1{2wFQZg64OGU47krMo`LRZ~&!>BNGuvx9?!_?a5hHbc0dR#!x- z!(**TN6m_`F0?mA{KD$yN;7Z9ct?v`z*hV@M6$b6T}mov)mosXH_pDStnF!}(q10D zgM_xEKEZ8bzKhDXI{8S}z)=gI%$3S-r0{Wk?&(|$d_0j(D2|J0yXplpkxV4xbP}w5 zAl(jg@gH-HPAT>GnBq;F!GE>V{Xw!#ZQ0G<5)?U8JHi6sZiZ*OdlL4C%qPz=O=S|A z<8;J8O-3)*@?t!A4v2*tvx0-$F+;}0c_MQ=M8H$?v%#K-+XtYf->*`vl~%kRVa7q> zXWyEM9lP8KQ_BK9uv%&Ue@t0Bi+aCFE2WQ8hGQFlkKMt5$)HREqxDJ(gU&}GcH_^SMxr}xf zuqbhuDH3rumqAkkOL^+g%005xG+ay^**P{`EveRPD~3kHs<=dPjwg1uMX|?f`DnyZ zSyl>2PKtdUag=?v0FO!l-*?K+?K#M3+$tATWnmzP&ajglMH9z3p4V5q^MX4-T1QPKJY%FDnATFODC zohYnPCl5g)ZdZzaUl>5GIZ=xjV*o};jYny^n4IKE+ynNzS056%P?^!~eDTVkyICu@ zUHmYK8Y%KAC?wti(|{xkfWk3jx=@jn%1?D5Ge^KoGL9dhgc%Romm*P8m&|>!dm=DR zP?Y{s-Pe?B^8UJ*C_Nunm>1skggd?(VL$o$&Vv=Dyiga$?mYI`G|NxK)GuCmYm$}G zy7@kuv{6PQ603%TM#VHw11I9r7mq`>k3Jx$zc1Y}pj5befM9LAEeq{7lo?c)_U8zJ)*Xnn6T=^vORf${Ibo~7AauymIM`&Et zLK=1vs?kFbu7P6yT`o zeZLHwJUDRXbsSt-(1l0^D@uScQ-pCZ2waPT_qa{>myB1b6(HK2nL)MzgT}W z@m?vizw1y>q>PTw53rdS!wwD=a-0)YM0Fgx)&Htuj-^Se{drj1_sD)=o<9Bo>=Y!4 zQ|{cW_LO@^zvn3D*|S=l^OxvJZ~gdq#Rw)DE`9oWln3`V9@TzQ<&Q)3A{5Zhq{(mf zKL)_5Rl&`NWo1>~nyf{n(3;R*wRX#Eo?DyImKM#8N79@nX9mi0{04~5?Yc`ZQ3Vg$ z6kb~=Q$L~14hd)P3sxh0An~b3; zxmguX&#gW)l)}h)(&SdiNt_XMQT7Q~)8(##;;+5*m!6@V#j0e;q^J@)`;UcCp2XRE zN7v3SqxbV%P+t|O(a3tl2?$;=-hM;9ANApO_;`CdU-v467th!zy`12{Kzy(o`&1O+ zMO2k|siC-eBDFr!W+1C94?14g{xw!KaH>X>Ha%y1jRR0z!18A>sau{rJoXBkwPZC; z^1I!DS_iT(FO(*wH~^pBu%!CK$?bP>e$LvlK{ALPqXh#8`^6GOb&L#_Mt_f1oLZHr zI!#W2{lr{>x$w3A?BlykFU`OaxCm&z3XbNBywQV%RwI!EH!^XJ^Eg$mhH7;np?bu7 zmB_oGw$tYh9QL{O`wkc2vzMy|%8@@knM0hMxPz&VC4qoI>h%1#38}0?<`sBN+`Q|z zD_vwr*QP9&kQ3aKS+#uR2(YgUl!`s7u&aZmn2Nac5c=1}A=%*gzt=wnH9yC0aag6l z+{;knMv~wB*`;gnUis?-zAk%)>noqUz3^E_y6RyRIJ7^)0TNYCzxo5Z-T4*o>O;$F zaYDG`kEjX9>MEVJ?LX;h>I$eh4(c9Ix$#&tZBE;{w?mP%vDmUKTVr<>-fG`$HNyhz 
z;jr+&u#KmRRv{RU$E*)da!VJ->w43A@FDWrvp7JqMxnJpu6VC0h#>yi!d#!6Agh83 zvMr#VI<(YyFx|o}kRg+qfo(QkUM$zQ^}iheuU&HeEK{;RuvOp30u~VK^@#NkK=sB_hfvo1Zxyf2 zrcv)2z8E)#+Mtq4vGhZqhC?@cdD~c@=F=B)i;3>h2u?3uM)`om)9r3<+_y%A6=;!` zc=jKNa~no3jBS`8)N*d=;*QJi=d?JguU5aNfe+)$8A*t)TT=D)lic5ACD7JDP9C^y zBz$9VPU|QrFo{gtIuD2lxCv?4KNf?O!`t;a_TVhM<1R_eW>7JZhSjalob<5#B5Cqf z(DMypa8v99f(|Bq@(wCz{}Dl9>zju_Ij8(<0}8=BfEz*RR`GK403Ir6kMjdzgnkj0 zpBYOnLAC4n_`t#k7kr-F0+R1bz}{Fxp^f1T@v4F|9l4R(5G87f zp$mL<>I@Vz^ynv^dXvY<%=E1GeCSEeF5ZTZ!^U@lC6QMZ$!IOn;lcTP>5ss<=ej3z#a>i44sRm-HlFNUO)-FvNJXf$0i8h?KJnKAy z&?c^!;T7rKf)~K@?S^pKnrfpzdGG63;{c_exf9q~DwC%3QeJMugVeG$&O zWWJV#N)WG-3E!;{IhcvMFW^CvBX{q1dh&bSr-cMR($Yc~Wyiq>9(8=NVi+*$B)Im= zpE6P~2TbpujUXIHjeIg!hf_QMvidO8=6N}D+}9P>G+26w5;&TASH;CW%^VEIojE}7 zMPMMX^w)_bjx(2Q@s~Nlm6HygG1Oit#OQN>E(`*aT=p`^H?qBG9)JZWis1^Q`QLWy zMO_9nW}puuUqc}1mgbOw>VEUb&zabT_oUk8tvupP2iIo9uC=+VuxQUB);>^%NfU;{ ztnlXMF{6$co~O|k;bRp_YTDtkI%M?%&kWnX*S7X)DN3!Av*!IVs5Q-%W_0PfR>5x! z(-l)pTc?murTZ;t=4j^T;&CZR1w-)%7HX%Kd#Iw4N1b$yPh`ANFItQz5_rPm3W~_C z4)b0_oj|4x_nHqEEKbhcPt0F=0yO(yO;3=m6#iPhKq1JxJyfh$9SS}1qhb;cPE{(y zy1`!2C6rtK;sbBeTgPfBtY;^nO91P8T414&kz8IWk>fC!+bvvUG0@fB-Cd0jb+?2I zhc{g=tvK=Du4^8jp`#90veF zHU({r|Cw_v!p?9lJO}UVu$Wf=yzWx@7oW%I&NdA}igTNK&YNCAe=Me3PTPo)5%D_2lPKes#)pIfL3_hB4sowvLv4CKq44tGoaT35 zf@qimxdK=FbF6=c{?Z^yQ&&Ff&c`kzOW0U`J!myRtXKYJ^`&%SdFBT8k;Ha*UspjjxcD>-#+;bA-X3$TE!7tq@ z5@;+e1Q)u1xJOvbd%@Gg!wbOfF-Aal(6%F#JUs8dY?J`RrdpIGoND`*8cY-{_Y%77 zWoFxPMm)`9u)j+s3y^N0CnT?ri*t8a4@k1APifBwdrAkfrQtqkAGMICoRazGM-m$G zTeHJg{qOHBZCon$m}jn9%{PT*uvMq$3-qWX&JxPX;l$@*2#LZK{;yFvWLh^Fc(#T&TEzF9I$AAkbtbrHqC&PTtuYD;o)^woi zoD>bvg(IOU)eKVAx1 z;#n>J_Pa9`9^VU)%wR784^7iH1=uW?{_WN235sxed&}f&b-A(G8m7$_bB<_hW_hIW zvFh6hinE0yRAaN0u#dR*BRY)Ucb$$>jE_cB|^3k=a3~(}}79z*LxZ z8GCczBD8vGVi`;$Go86PY&^vKPR*?~*oXEd@Nu|4WA(N~xsYY0;J5Sorv+8M6HfgU z{@z;JNM@Fv;9%NJ*U(V9M}PTYVbJ~DUveNBT_F3rk5&CEH<_vks=qL<*=OqOGEpEe z1_y7N!=WOMbGsjY`I9eZge7rP?FM@yS2NA`L0d`Ij7Vkv)@HNJ^;m}t&GX{{BrC*( 
z{?`R8*sHegh@D`_1umbeZig-2>ss!yoivsHlZPNzcO1LYVeY>iXYTLY0f{GI>Aezb zLm01)GvDMLY}YuCnP;zL+UtE~yZN>|+AD)WAC-4zwS3W~Cqk<9)b^|zC!)x#`?76( zw*G+)zl%o~2hZ-E8n%Jp+D`)$juqOadq>(5@3^0Tp=EIMSza}jMj+DL#soK{O zKL`?Iu6o{f-eW6}2E1G9r*u?RP;C2E(P)&iB!I_CftC@7>=%w6QfAWpn~%1ob)Q@Q zgEwkQ`Y#i6ruap>-nMUHrAFrX7V01f+6BqzEY1IgD~IJ?^*oi6LqJY$UQW(z$T1_y z1d(BxgiY+R-y1t`b!YOTpl99Bz_}F`Z5E3jlQ=o|+?u}3_8wa*p1*H&ik2W8>o$CN z2B#)P4nHw)QyY%J4a88J_U(IfD<^W3&lLmf!fMjw9>MtETxWEZLLEtl!wYv$9Bnn1 zyVYDHYkSR4geMs(rx0dsHdC8VVQfQoiQ9gGUF|rEk+Gk6YRO)!U*K@^mptEwzryO^ z96SGAPd(T7?T%6IvZqE2SWlr*CF6MhCiygf6#5ceinEHR{&JPF5yHdM`i#<&$3h^< z#N-_>cbjbzw%C_URflBSJ_?yv9QufSK>7;5;>*sU*+KjVyWYq;oD~x6fA2e z^w^~MCRnF#g^a#Kj*bvT17@-|u-uodWb!^j<)eUq*9&>J*r5$=p8)SnaNC0@@CP2m z=C%jXvOS(F0^KJ+N2d~t(s2zxrYk}+f{ue2TrI~YX)q~o9MI-%oMomKr4#2Yf&RN- z-7R=}e&9>sDEwn$UVJz~PKP}II!s94acw!!0X-jE5K8F%ka%8hwJwnCU6*%S*&Azl zBv5wm1>bzd<$Ma_9HbYTjy&PAGOW1NNxM#r7OX12ezD2?uU!ZpYq!LB^RDq2ly2kj z?+@k?6Q^lg4|nhN6RDl7Cvvj>j; zYN^ytR2(pGmHc*1XBQOLtSpRQ>VwNTp}A-O@0pr`Km;^e{MgCAVP6k@=iW6sqtKo) zKiFk%b_uXo>kqP4%MRKszt$Ye0zN+i<}`D_oD^W86CQlMZAZ0i@5D&$oJpkrJMH~7 z46FriZ)!>|76&=YuCVR7?+=~rOUAo9vD~({mX*Bv)vf^YWA0sGt!@Y*rW#i0ahJ8S~a#`I|!;;2Qaqh}=#At7`81B}iX)w#LY zJawsPkm%k_p%5%h+jL>e`KpkPmMFUS;W_o^<3zN3MPbsQWuM&gy|GbMQ@F6V#K(qU zBt1xQyXa^?9A0{R>>KGZ)A^Zyg}o?PW!B^%t_fHj?u{UJK^QFDE|C;NJqoeBp1AIe z8vN<90+dJ+W5o09y79T!Sf$30}@ZqJo>6CEz<_FsTol9e&c!lcqS z*UG`Q2$$HA@#S~QDM?$tPHjT@wLUF<7JYjpbKLUn-xOMF*y!q-11F#3KJ0|jw@ul^ zp(3a=&Y=Qz|FFatm=>7~IwdE+N8D(1Qaz z#s_#_{jW?XI8+bNs{MC6-#InHf2l6y^)emXMz1REql407QSq8V^O0=xknWd7Lv#1H zTBk%UY=Fv{&ZXF)k@c}R23p7T%=bv=>bY9S#nTzZr~nv>kM=g`!R`46MGT5Dhbk7_ zsPz8TKN^JkvTJ^*vTITej@GIznWVeC~Tjv7aV#S zl+)0JZ_1wgEHuMYGfBd&i1oq_R~yV9=UcIw=jRv|W}%x(nzy?pGE6xVU?hCZoj(I9 zPiB$xr=Qemu?>ufzu$`sK1UwZ-Y%lD-RZ}<;ULOPU4~4fpF0@?c{q@oU2*0G(84^g z_HGNQ8jA^oR}tGn>h=SUx(FMm9Q@op;=y(6;)1v)j`ePBZS}*)!72q|)@p_a zgNCtdTl%o3l!~r;8ZZv|=)nL{)plh$}{ z?Ua1uPnA=-Amiw1rtkZH0ArShY;9!kV{6bpYHRO(T4Cu`rwE1kPIjIJh4^40ui7(^ 
zfegT=@K#kzFIt^~3-Q|w@_LlFAG{Z*CjW8*18+hw6KB=`^h!OoqTLg7Al2s`-?@Ob zRS!wTDNzi5Uc0vY3q-1Qz8ZP`8)sfsw>|Uv_a@|z|E{N_B*Z~|oTVQRa*pQ}f#J4~ z3&h-|@`}2-w7R;wv<&9J1n~+o?emynCIghYWQm3>fIG`FCr_{XEs-Tey$8G-(*j2B z9%b18Oc3PAMQHVh#yaTUF`uTjvp3sitGDR=>}AzqMw`F3bAH~5Xf+>pLfx#RD2png ziKgA-7YcgNW^7oD8JuaeDs$;jkgcA~j6Rg+_=O2B7?id|y1m;1rZxe>6C(m83J@M= zp|8J8w6M8Vf2K}Cmsr6_=qp`@BpMi{fiZSigS8kfdM)2XBlyiR=go;1jUqKwlWMDe zw5kc2Qwhz6HItbU$Ubw{GxFZXaPcu(+ zeA(=C9d^!+rK3t=RQGH;>Qv4Na*K(vPD%eeX(t56`15oIDX)()lV&|#6jA>j+ zLWv>FGAR{hrC+fwD(l3$eaP(cu83MI#f2 z{}e^}twP$!0^14pmQ2MIV%BMy_K_~_yiuoBwr8aNr<4@xJg!y+FyUB10Gx5T{5_B0 z#EYiX@+Z2NoW^H2+ujYKwGyhd6420X(LT>+yBBRO`;U-X^_PmxYB0^L|hh=gUAg^arK`(EkU1~snXo`ZWy0WIVJxef&20*U;hd9WY zUo|@I zx;61ip(mi&%d&^?#6xTC9{r(unnxP=H>JYEb7r8t3hO%`U(Od#WRH)oVOY71Ui}o? zw*K^6mAfI9@6oLH8-{e6wHoD?Vfra!yuI>ev78&2X-4KCQIOd;?Kf+&HqU}=tbG0b zf1Mi{yI>8)i$PH>B?BAlEWXl{ScVC#51q@2eHiT}U?aMDy)C!Q$#@9YmbPw@Sba~^ z=HfHJ*Li$y`%ql4-nKdtQ{%-L7xoNydS8R3`1CnIir!aa@su(`%PDRx&3Zpm{UtmI zb1$v#Cpwcz?|!PUTv=UkZzeg4e^97jiGDM+ z>ItT+rERD!DFvg{KG0b~sI$NykWJ5@y7N$=rSC;$-oUVtu*a4GFuJ?0<=OWLT3M5Z z&Bpdg(|Q5jv+e<=obo}b#-P!_;vN=chMy-(iIOxnz1r=2q>gRUPU|X$=V%z7UO#r4 z!3ffL)SPjPDKPdHg+1kLywR29s7TTf&lrg|B!vRnZrj4AS3xWc$LL7B$5l!O31WPjS{ z8Pl!MLsYBn#nWq%CZtd4cO-;EE7H_{JVmByC5*&Nyn29 zTW=|fr9A=ojKSU-^!NpTyFl{mg9t+~sp}kbF{3qNLT1RcI9Ldyl412tS3+9Shmup_ zVk&jVPB%>*>!a!AJ9V@kg{{)Em2y_HljorF16WVrCP0HqXu9$PsG(^L*irZevRd_7w+r$rw&>56Ii{xK>k!N>xR zHEsoRcn)$BGruR*Y|&%v<|30s|X0RC4k6OWG}5%3`1lF zWCUf&46;KApjP-Q3L212A|fCgK~|WGf?);3u#>2aKv+V65JKR81Gd%w`#cbyki74` z=bn4kJ?C?jqmQsGf(g}bXVN@oM8eloU@KyTndH%aaq(fXOJ#Dy%GPttMw>*njfj;w z?XIr!r-G|U*VFMv*c7?=Y}nBAS5t`}5-a6)-v3x>mJk2MVkuOx9)7N7AH3S`Q`_m~ z&?6U${wTZm)b8jWwWEqLH*OuhpPeldU_AT^7Jr80oFbp%KPOIqo|2vpEl*DgPMl{k zQMv$!p^k(OKktT$uA{R>)pE5IFO_L_4yHK6ua?QuUEtv@fme2YRFFgDqd|aM`Wt-2 zjT74&-Ml~g2Y6Pcm~{+RO?oid+5Ke9>Mo_rS^~2k_6?4MpFeY`U)fc5i+4E<)#0?W zCg>VFA7K((e2{C+aVfx|UX4fr_GqaXH#Hm=UZQuI;v0SK8%O+HEHf!9;k}Zitd*MZZa| 
z9D2($RTq`KrLgu-F2FXG-hsr1PwsKsN^eU9IBF+Kp=_`~XQ1zoQsc)=urtXIu0V1Z z?;9e9IN1FWW49(UW6^18tz=JSnM%=to*>c+T1z~H@jCWlt=MeXb=pL%Gz&R9(hz}e zt!eL~aF-G65AbMlJ@qpKbyX3x^^0{YIrSzT;tnI_IsTnLzf+tonvE9hoSv~gQ{wN~ zQ+I*eel%cN-W{P``W&;+^kGHvjv-x9)SB?r)0h3G`1O(fi3f`r)+*gYr*rLCdyqPF zHb|dNH{k`8(gllsNf!|W3xNBO0>u++%YWNtlWulY+9us^`IIyYCzpq!V*OUcg6BA8 zCtz9O>NADqNK^*Kn9OaBD70gSoPY)MPuzG(in5Ex3sqLH&AY<(0z9q6fZX7)p4{xA zyW@s|al*w~B!l3&)MU{AvG?-qvbpy+SM$X;tsB+}3zy-Y@q%RL`&;EHTeJlwqFERj zqQ99XZEcAhq_PV?CuNd;NhwpRnLMu(#5+BH1z6F;Vo zeD+F>f9o=H{iVD~z|3G{Er!;;-n_Y%(Qvb*u_BxPy4bHKJ@FyNUv-{KO>Vp_h$Ht8 zE&XLCMuhKzT;YhAZ##%e3fm5%6io%DYfag3#EM`3(y8~2;JlZ^qvGZ=b5oc6ZenYB zIcSoN3+Edf7U(4*E~Jsl%;gJiLUYB^^PZgA+Q#ayO--N1BUZ{p=DmD;QkM>>=`23X zriFj0EanT+ezn@8lXg_^Wiay#wYlCqONZ33fn#&lSR@Pj_@$zl<}zPxOhw!M3P;@DmCi;pI>eASNv6Y$8CxB%+Hy^W&`=Z9 zfPS0;k(x0J!(CboSzfpYO*^fKAlZKCeOefwU0FA=WGPQ!TN9XVP#{7P#)~Zhblkn4 z)Jzq^TE@|-W@5{V<@IP*#au{Oge}&tbmfE2enGio@QTi+jXtxrp{S$53c>@vT}NtL z{fnWf_1C#WhS`3-gR+VPwLY^$L$`|#P1CpwnPtH#JDV<%9RwZ_j&H-uZb5V#RdR{1 zYT1u(9y_i*bM#!~VT>e^CRL+J+<8|Czsk-fkAqj48K7c@^wuLqkFncXNxlK zWSiBvySEB)M>S}VsT$d%1_h8v^St`-{Q8DeBhsbCHj53t*4NNz5GDaJUnSyaD921j zgpsGKXMJfaC-c*Dp7j?$wGO2zXzspyqe!>Q)8lG4e(csC%}P_M7UIZD<55=twl6#& zQf90I)&>I=^xp-m9HY>6nvlz1$D=o02Ib=@VJnNVU9tVzn{x06*3A|rCUGl$vHE=05y58Vd89xdp~`;yVNe%ss#%aXJZqk)aU&xQR)oPVyRC7ga3v=z>;4?kH}P5%O&7i+HDu-F&QpKZ2Uu17>l zEd-zDNs?cA6E4gLAVa%~?5KvO(-=n!EKyBFTil2aHTeN8cM29`eS3yAYN@>P- zT?W9HlV=+$7p7%(!-f&|rD50lyNzV$G_(VrPff>B>Xt;}!u{Fyw(|}fUyJn#L83$T zPpL2CX2QMPXB%Ry>`LbYN_wb6y43{cgI%7QaB+3?6oE@u&~e+m0NgX*DTbTaqzA&U z`VgK^=|W(x9x;o2N;o3JAd@5n>nhX*QR-7|%%w4Tm3ecoHiBlh4h$UNjxb)RhmKsq z>d543*~WVrm7Ow1LC4q66PQgWlpp@(-^u*&Keq;D@k_xDDPs%<4j?_^$C5r|u%u9M zg8vYn0`#Aq_f3=8WwNt}_-Gx9{POt@dOkVb;P)#CY(2TP0m_M*rQydMu~f$B24iJ| zF#yMmSn6z5DtGW;vm@^X!LdLrHT=pENdFZ%rgq?T?yk_iHPu zL-(a5>DX^{4}H*lmE&(oQclFx&)2!oSL_H3Y&|?3g(9Ks=}Ucj5>2oGdWpHryapAR z4qOf8^EN3t=}-Kua?EDRrNAltG=Q>F9bGuwW!gnJmE)1$(3J|+vy)#&DQOf}=gLsn 
zSFo*quRZ}Yb8bd_+!IKE68n%1i#B(nqT30Ul#s*Q%HN1-@#}s*@8@$R=7}QUkiZTQ z@xts?)rx+2e4eR`7ECyZtE#sgbF1Zx6KsQ7eb}-C=J)&@ooA#)M)8tFjH2S8Y^#%- zPx&BM46?g(>;fMBT=pALT$){yH}dz={<}HfMbiQ9%i0VEVZHEG%zAoU%DhV7Jj8WLu_1o1d8W z)Tw#m$da?SflbOpwX*^YBvgiB1^igp0B0lSNNX(Ik$>F@wiR}J)K^q$^t8l zt)jmS$kig&PKO1!x1r!G^nSnd;j-glXDCPY`5><8L(|glbXsi(o-(0Afw1wsFcl&b zJ0m)J#pq`+|eap*|03F5p*H2?o)OBE?@#_wHjc=3>MFZ%leyZ z&b=SSKkN{fL|wqr@nfTIIq3qDY{XR8Ja8KctxKTp&b-Iso6<%nT%e>pm0L8D|S(9zMiH7^$Zd5{$0@f@@+ z8%|m(Mg+~x#lRh2ZLK)>nSb~VR(|Y}I}c$8l=c8Vt+Z;bVu&$oZLqfr4YS|UXQ0#h zqazk=sH0MYscPd_Iy>Tzu+k0Sf!gy^I{xuXqQM?kFEtnG)QFo=?lj%R1o z>-K&-LY2IIa0a7#T5-K6m%5TIVs@Eb%{4YQ9#_`tU$QG>q-f!S`^tmu(67l_q-o9E z;6KmrGU7vaQlaShkDKhF4CN2$PXcupA4JuKv;Wq~4#=%Zb@L%GZ(!#oYZ}*9&n8Z6 z)G6$0?F>cFnFND6x(taBGFW<(T&=1c4P>X*;_^bcFo;VF=cA}*m+t3H--9Y@y=%2K zlj*TZHpn#5UuNHCIJsdjA}YHwul}msY4%z9ibEOWONL(QN{v={SaYCB(VVr4Zy5@b z{#S6N^q_;=CUPQ}$)}O-Z`P$j?2QwIw|+U4vHA83F7uVMeN@@gwe@v|x~mCx4x~E;=aP)7nGxz;@BJQv zihG|vrl!`z=3c~(2D;gtj0r3;@uQZ`F2fRaF?Lgbr<|8@fi~W;vTRi`nV%wENi90m^-T}Zksw`_sC44>k1V=+K)MzogUM=nlCRyFFe)deR;7L=75`9M_pwOvpc;+ zMr%Npqq(=cx8d(HHxe2c16P|o6_MSjAZd*cPyM-nVNE233UR@nSZ|fMIIF5E=#+am zz_Ad^gQEGJ= z>Xu{nsazWd50n#Gsn&+k-Mg3;yOfLabt^5a;2B>Ao|}y@odb`{GY4_b{#Yb*u(!#J zM^dm!DHj4$vx4y|1dOuy`jJpwY^G_qXLDB;?b z_Y709#uu8ZAK3#(z8xF=@r`8jaX*BdzyOWV&|U9AG^;OgV02{G#@h&$Vnmq2vGsaHg1%B?ylASXT8#YD^?*dfCt z*X51Fe$e-*B^GrrKP zdz$x*UtKS6A4%mrV#BB)0RwA?ZK3@AUHk}q=t^yA80xKZE%OYsv~vx@r!-P^>|kDNUNM{eF9 zJ(>-8QVM3f3lTBPFmzSph z3}WjwQYa_r8hcSDZmuhEs+~`{>|L33TP75ppQV?*9!ltaGPjPol>#>W^v8Y^CG@gI zE9RgsQexw%jlJmkgS=+Ckh9I(M;7X?Cy27x8KVz~|G?UK&yX5GI&dfW5S!27^Bx{k zji7?KnX)(#Ji%ob;ja#9sShl~ZBF&7sj2m^XhW3Fe?){dXvM?>bPPn-Jr_MEGTwO} zrfq{t;H!RmGX+L|%K^)@v$Lbv+sXhzpT6Ec zN;r=6-{uJ3PAx806D&RD zekDgxlHF=?KT^y%+isFWp5??ZsZIuhM@QOhW27hzuLqn|LmXG&c@2jP>=bf|@+^o% zPr_l^tAG93@;3+?H@Ki-=~Gf)$&6juWK}VGlnSshlt#(a@$hw>bhWYk1m?_{aSCh& zIvAhBxIx4fl5p!rQBtyA5=#eHD$c0IdCxUDBHFJ^`TAoI9G~-^5>HnyMmxkM>`jX{ zh)(0Za`-q`TR`YU=%lER!jm@R=(q>mFnQz7xzRn5Jqxvm?^mc1MaiZGDo+-1Lx1Ak 
zo-eYDv(l3cZgoE)p8~#FYrJ;C*lUaXVX;R!XHN4Ds+8_*hZ$XCEMXg`iE&aOEppQy z^sfEx+84~~t4&((u;Fz9mX)9lTrqwDpgPrYKyQQwy=et`4dxL z?_W6+5dg!E_?gIf7x^2G63boM>Xu}PxOPtRmK~KH4=>}MNj@Zn}?2&|Mfl8gq z>Eex-=~>w|E_V3C5Z^yf{c$C|877t&doN%^Mz;^^boX38Im`=va*r+xEU8iVu13caswt!b_55&cw(H%6=RssUEe zKAbzzwMSIHC7wQPyxCXbe#6`i4u@}6kZN6rL%NRQH?cLe&C5nzmVsSOqbFXqr6;z;XJ^;=H6%`GjOxP2 z!{oDb&Zt#XzET+r1L;T~Yg=}@Rz`z}5D^8_Y`KZcgbJL2@)GZqine#UYdpzn%R>h5 zZC_2!sZV$j`-2l}KZh_&XC*>0{T1wwZVgM!37x322-a5vt3T+2q%xy zvFj{ouw+fG<{ZrL>8%pp5*hunb$ZZb)Pn;J9~wXg@{cB2lBR+ds5B z`)lNNvaLmf_7uqD%c<^oDUCYw@l zz6-&Ve>b)cw|NpkS<7$Sb=bj`QB0H%~fK%C2;c3QP?o;Cuzs{B{h_{W&S^pR)3h}y}BT(*YG*DS^qh-fvp+M` zJ$`%g2Ob<4U`~2Y@opUd*H&~}$$Z2b6BcoD9IX9+m#={q|D4zG%^TC}HQwj_`xI>a zd1aVe@f1i4Y5A}cIY0`fM3e;V6%tzm!=*W+un1335dygP?`^rQ{O$@?@5FzoU}q(C z{Q6_P(36fmr*hJ38(+#4`P<%)Jta94Rz{zir3~|rRe}zlj;VPrHzo%N-mGxuy^b{6 zRo)vLb$2i5U&6}Wm|s${t(YcqO_1}{IW(#|GxJ05FpbkUnle0x)_|>e>3v;VX~>8U z>pb8;`N&h+F~FyeZPFjHTG>bl14lcVTC~-7od7-t4D@2QgT^0C&O`XIyX09ithg}t zAPkhsn_k=~XJ{QX;q8y~ZS&7j(+2B|?D_p)0!g(e^06kqHWh;!01W{O+qvgZ&T|~4 zerk~W7nt31eQ?Si@VAT$wQl8}W}q90&aOLRO`tWS7JOBt4E%h=aKBd(w-(LB(AopN z>xNl=IsnL)?I4^}pH;-GHTONeeAGjse~=H-M63zlnNwCoUSMD2XvW*6I$h9(OM%Lo zbxkG`KN$(e)O2r~;=QkhVAQm+btvibY3uJZni=ZSM4Dw)Fnzn9qEb6=2j8d*?YD-Q z6E~n$7OkfKSWxR>lUJC{oB>XuFnhiL#A=aUNNLb(;MQIoua3!eyHQwEz*k^z7y2}6 z5H}T+{>9g6dal7-mvqO|90DH5d~S#sUSC>4k*;R+)yVJ3Nzn|_snOG_T!O+hYbWR( z%l3PuQ#I!{yAl%&pHv`8^OYpxy>9t*m_OznZLG@hFZXRE4XKhisnW~&R(YE-f_GoeIDMQtrZE4dTuJs*8Hy=`#{b2fEhw@Om3 zi8Ub20YJI%^>f4Pi;OPiy#h(G2YC3OQF&D_!e-wv0sE&;OFQNWdwi`%On;wri=umeiCJ zY@Q2%C;`&#Sl&c$*!sqCveWh(EFq;&}R1XA1mkx9X;E>&PBniRwfFm^JEydDEDt9&zYYmrnB7 z@WGCq6`H44DpvPmmsFM7jk*(@7mm?aB&U#$i9OZ{y-CCU=41U$){Q}+#-{s|+y!6m zZr@Om7NuMj;uG=$fzhCkUAMx?(ZOc42gG{%%> zhiDKBq!sxAu20S9A&{l7Rg7si*Z-iL_)bo`qtT}>kcy|S_l#riMAvSNCv8q4Qq!gb zbs$QeC+4oHu9ZXkAKaW12eSQet_Cp+v=OlBIQ<}zPU*RbM7r;!PC4Yen? 
zx3)sh_Exx13w`niXJ}UQdOiL>aB4@wV3~{I(IQ<%6a7(8;IiE#Kr|*1!0NUgo18W_ zF)`^m1xeNgX&;Te+{)w3kGC@9X{ay7dtgU@&DaBRd@zw+c6QY|0bAF2%jRj{xDeW% zR1C#?I39Hj_*H!8gN{llz9bf!k~Vf9N-_Z)Im+lw&q^PgxVMjI`I&EnHb1>l59=!O z*(3pc(}1ikUrXNa1veJjX*IQ58qhd!1*IpcJSZULE*+N9+nZWZwgP?&xiIO;r04vfM(uF-rnBgGMXk5_UPFPWB-W` zBFN&U#*GB@8iN)dy^in%gKSIX1+=%X@7Hcahehl*7!erUm2}S*^?44gp|9!q6;B7O z`0>$^M#Hk~TQ-qB=3zf6-CVzj|yK_Ur3 zPz{TK|LwH_P7%;k+0Zya(d|G&&NIK75vgofj zLMt?RV4&X2!8n+By1S>?21H%d^c#C$N2OA8pz`EXSqBarcy4jBv%Q0pn%?{I<3~_r z=r%+LEtda{(3l&UXJC$=aLoa^!;7c7xbdr0$LsY_Ce2|<(UZWysbkDh#sUt95oKyBKNYhL& z*5DecC(W6b0YIW-wlWL>qJbsce)%pGaY9r7PN&IY{qX44~V@;04q*N zOHFlH|038Jl7lIcgz01ZfBi;INY0}{_(*Sp7iD>MJ?E4{F0Hq`q+;IJ#1^Yv;l^CQ zvul6s1L@18%narlKkJp~B=RJZ!d_ipNSB52i&ANcW_j)=q?MH-En7n;4Shp@rV&@T zTJs4@Fs_dZ#e4Z4eG#~PjE+$wRtXF3J{F&{R(se{^=jf+Zh-_$%mADs6y*K7HY2+w zX3IO$p_^D|7a(3E2@01MlK}3NdTWT?@oXK7GjQ?NfN~+_mUQSSLFoUI6R#G;8EdR4 ze=>&687)@bg>JLQKL4&gp3IJRk}PQ1&h2&cY3qIzvD1w~*vNHA*YIyC+vxnb@Rb`! z->6kP`mJHczJ)jCCC955#?`#$W)+bhK4MM)Z0^^>JdK)&-VA#uu4T}Rs^BhAKDR%9 zEK{DP;EvY{wo~);qHz9d#Whf7cXe{;x}|xT8$2htzJ6G%0<6+fX6;rIUqTRa6EdS{J5?EEVAd^iH~Ja9n(*{%Z>H1 zD;5T5esI!F3TW)w5=qGch}G7{t}ame7kkqTUO(n?LElk$_3?DXZ^+Fo1h2C_X1u)0 z`2~rykxbb5B!TR_9m@u3ze%k6>CtcAOmYR!k8q}71EYsNEdK@4%5Nlx#AeL){ulB05$$9$n*Q^5okSXiV*@y37#mAcKGGMxtPjXEI z9h1}2kwQLB*kuXrN=crcp0L=b3+kvwVY(W>^L(S^iiwZf2=SPyT}>8W*P&o~Z+8Q* zEdLn(bJwWx&gNLXlj?l$hEhO{-e|+UL&;W}@$1nRJ_+0_*v)TtcjVO6r`NmK$i{Vl zo6r^;cMl&K&90wA(JN!NG=ijCp>Fvk&r{@G5-T5K9Ll^0Z@(+D7ZsI_={h}9$yWi z`%{^F_#{a0*}O9ePlGT=DcN707%eQi z`&hYIr+ZPlrH>-j{s}2m>I!&ue_CO(ZI{K$Uv@tncxNY0LuL})f)zWk5OSew($6Py8VKfi&Sd`pshYO!ZY9(M>}8)@N8x!<>LlGH(tW7 z?8Vhe5FbM^2eqXEL3=~b?5${;=}c_$91>(OG+w&ntWy#yZmbNe7pJ9f4fG*Fs;SAE-uuw(VV*d|iYaqz}g|Htn!uqhkWRRR#T+6GI zZvF`3Qar!nE$SS0PgXiJ_wXSgyF(LN1)%Kizjmj*=9fgR9HSq#`4ni6?K{;mOrHEL z>uV}h)G0amY0tTRW@xTO$?-#w%j5d~YqZU2Y#l)}NQxZ>3K$p3(6&Axpk0q`dtU%; zgQy>0XtWrDwcQ3S|A*p`hh&Ws=GV5^BlWUJgDQZX9Vt{oeYR*EkJ~BLbtA?U#Msid z2IiX3x@nA^J$%S{uZV%4e$VeMpK#B1J$QZo0^bR1kcnFF_ZLL{2_Fw>Ufj@+`co_d 
zH`Zv;nh8Z1fd;sn3C{DoT|k)A@&Cw`cX=ksadMi0yNN%xU+@R?Ziw()5u6(rJs;lO ziQs%G*z~Xk5jD;Tt5I`PL(L(A8l_ja=#p(#j|0|ECQuW$yY8nrzj}8f;`Hw_pZ@ZO zh2KcR6qhUz^T+mu3`B$iyh!UJZJ z>Pg?m&^GVu%p3VikR{gse;d2JCG#~kr-)Nkm0k8;9in`mx7nn?W&?mwIGy{W!BXs{ z-7dGTl1$2ce{GGBB?e2K*sl~0HuiGcp&gFTRi$cPY9>mt7Z@S#mm>}irjFr$Hs0c# zZLH?>Hj#fV#O>8NJl>KLYHK z0l}}2T1QJk_jI55=dp4tP_}j8E9n}kC;x{}lD09WuZqGNblkVpnbJlN&!YVPinMWW zeJ#oq8{`f=iw@Eq;#F$8-UUaQY5QP19;&5YPXNgiXH3sMjrF{EbEXd(p=Y{H>3pDN zAfx0YP@L%y?;MB~H^}~Uv%SFB%jc(6wpB2@^ zJ`eo6l^@Ffy4!V!aMsFNo&_icbuI8aIju^u+QOd_&aIT^;X@>gf)MQlBM6CQ?{WR! zF9ZKN*rq}%+VRfFclJbVy#%q**y5KLqYTsIz;iMIe?PqjU?3# z)pyO(*v{A6XL@-t6`CZ?=KN6h4uLe+t&Y%H5le2!k&k-aOPBop&3kS_ZlnWMsW+DL z^F{LqexrY9N3InMy?Z&IR{HB%2{Mpb4u~C10sa@U{XAVpg;wzdx{2hVsUwKSM%tX; z3iEb2#90C+{O?k_Tgb4}eMRsmXI2JyEAphIWig6scWK{^^qdEE)o!*YV@#TzZ8FB5 z9TMX_gjSh29({WApLr43Q@}*Tw$0ZFTD1ZQHA&*xb08l8;<^o5tmGdEm5~I#*H)hH zKMzg5@37F3gtTc~7qU4WK}~&Det`A0*tiIRUZo`-XXpW|8eV_R3Fz<}$!49*oIlOZ zj@qR=lYffip2fcY8N_0WMc=)Q(7XmL-h!>H)SZ!26PD70wA8WIC>=%1=no(nDsu0( zgjVhi%p0j&8-$i9{;1oGU?h#{`&@W>#kJwbOQ3yGyKfb;A$K;aWc~9Zw1BqM-1zck zjZMj}rURz6)js~e%Sge@{rzZ8Rf_Jyqa`o79D!GJv%?L)!Ni8bSYOz)%&3-lw`rWd z7Hh>Dab<|6MQ_kYii46e9n7#`g5h|MG>#AfTDmm!fupQEm#zSWFFW;+m|F7K`NKRj zKJ@Qaos>4t;;pI7l{I(5rotY_ptPjt{(rZs!u2(XmM{gQ#irfkz$!_$+zP!kbtp>{ zAA(vWb_EoF$}awv`#@9)>3fg)+gcPMJm!}K^atayEVNOQLiq%g$EytrhQ z=%wb>cx@JUS$c@TjjE_^sVvDO+~g!yNMt#omsu4~{go=bHwPO$hlfO}D5d=zwu z7y}F+vQs`DdHpw%(Zzf|3%Z3Hyrjgz)}G+_1W-pd{WULoRk}D+7@n6g|Ywi3wFo{2mkBs8+C>5pX1=Po90Lvs5^odWS%n*-w#=+#`%NK>{C z5kRdF$&ck^iq~K7&3UNB%5fW3nQm>V@)`(ye=gdqo@E~~vshnKl3#*nXK-6vTJ-0_ z#&k;RMosm-sZ4a}J8I4W7w=^!f4f?*%|~*!!3E)t{L?;#8$P#eBiSm;9oK6FUhj&` zFEJ8}xJt>ow?j9sBsP6~G`9G1@kU{&`C-mwAec(itd{On@}_n_!-~?dh9Fno+Dc{n zZ*nhQ%Jl2+&5LB;46AVGpWet?8`)XaWFg&olOj?eD|mW$a#y~ z2^2P{N5|Q~g^WN7WNGvPa9{w5d@}tdlc&?ayH;@vGX1kV-ubEazdoU8+r2ND8b`WG zkv;lqr}a<&+|=Nd6{ai{QIR!7?%15Kzjo>wxP#Rn0z-hOn-&8&4Ab-a-`x}ZspJOu zC+>jR4yKdB9aDvTYEAd=O7Dgk=lrc%FBu(se0EIGJ~DIj=x|7>*+WaMv}0eG4GeR& 
zn%NPC^7+5OU<-G9MuUV-P@gz#p5K_?Qs9)oKf3b=K|ay@m{08GbnE2`AH#+hYM1#5Lj(JEr(?S{EY zVMEI1VQ}k}J}JyFh9Uo4jek3O?e|o5|s)>}<+c(j8x$R-~?u>60>prKpACTKBz>B9FQdU<(BqYwmA#x%P&;QABXv$v_Y1L!D$rasP zoem0`?KxjH@=>cx+htR4t*TMouYX`wl(dO(YxEBI82`r(tN-PZ2A2-v#u?8w3Q^;? z;Z3aJMgmRF)91y;wL@1U?}7mkJ+DI?!|R_Bo`zxq2TNtZvtyb+@f_D5F6%!l=ZAWh zhbs&7`VN4()d{EhVFDiQ8_@F1S3AS}l;>GMQoPK+u2xk*)oD-d%`Z5bxySn0H;#m$ z=TylhY771}BDpmm4xi5t;vWDWQR&ldM$e}2AT#sz;r#+5fY|?CvVAdnD_E_Dsig?b ziA28RWfKXx4Rc>I(Gcr}mRvi$``4f4ENk-{!5aCr43eFqV=k^!Dlw zp6X9bWWf;yL6*aNeSX*l5XX%NCN;XJF98JeMU`(HQ0nGmgKn5Pa%PY37U zIa&=^%<(mlir{XB?R|`UWNraXq?+eR{&z*Pl~uowou{`M4M(!X3w^7O|L1mKbLfr}be@7;O!|k6RE^v<3PY8gm0xA4 z+45#?Qq;M!g%K)7q^EUU#|smyyjIxRYr{zhZ-z zSf_C(WP)~=EN)pm=)Q#?HBu+IRp`TcCPPfcFAqE0Y{)3q^~Xi92(mw6dL?ovxcxu7J7$`|AOjAIwh$Lr1-$Rb zLUr>zpriOqo(=HtRk}OfuZu4>q|*%%+;I(T=XM}yj=l!N?FXs{QW|Eg6OqcZFAg{r zt~GV@NP@l|u)Q?bVoSud1O5?&-)?Ex_3`0&KhPfA-`(bH$bhUEFWeHzW(tz$Ed(ut z3yQaf_3t>q|0JEkEM!9kUOetuTe1-+kl(+@Vb&AnZTlQ5+3U6zm<`WdA6hUTle58{ zO+?y-Zsf?SA{uB1C6eYjQ;Ko1^PLz=ARO3ys=Cxc!M-DExQHg)NbBVL)cQF{q-6`3 znYAc?G1%6Dyb~_Raoe74{rku8)uKEfbFTAp-!=d5h6Mh!{xio%uF>_;{dcDwqD4dA z9z<(ayWxAL`)A8n-ppD?dugt>a9P9164i-DFPdi`(g8g>#8o?9JzKgfuKF5suE3)) zP^9W1xQ@Ttyrwp{z-CX zj5K^XiJg}36Yz`*MJ zLtw`;P4&epP{?@!NOF80-2@#N4pb^kBtI7}L59V*3Xx?C`r)?GhCmWpJv+jE$^!(I zEAu9s2#9{qwGcU^UfA$xC1t*n)!>1N4s)M83Yj413j-0VM7k&)m&W4|D;%OQHhHmO zG#_rM_+kUGe4lau!W`J^Rxk1z|9=hxOZoY7G-Zmwvmq41y9Goen!DxID!lt+AIl=^ zTsyPV6m9ha_CuH+fT2A1=7CkjvJN%i9w-B_7uobO1ae=2>D!<8^i(=+n zzvW)c=3$;253b}p5o^(`<=4Rxev=RCE4`;zG7mdm6wsQ3J$&%sK*mdWajh$}`)BaU znFcL*T(Pbdx3vjS!hcbgg?dwk55emr2L~(k1t@;z^V-Y=Zulkw5mL)gNYb6Zt@bch zVV=g>^;O4QEwPFZpu3G22hTZg3P7w;Vm4fZUNGFZ+nu#Nx)F_HLolFPQ1|nCQ#N!MPbhLs|tKtA1br zf!V%_E$qs>nBVX0%d6}$Y6ZI@Qb&zav#mhcwS{X;MmAZ|l9V6YkW!Ild|-`Dl-j$R zZin$ZB)kI>dF1VusjZ~8Y0Y)0FbN#xO&DBF`^s;Qwout!!3{waLf~+ePYTku%10En z+AOXc&~jUIZ8A%IbwaI?jm^}W=zahG{h9|iFBeC6wqZ9qI-248=4Z?LY5V!LwsU-< zXH|Nc=DQie8meynOkx6(92D9$u6H z7|}X_mfv7asa<=_+b|O|8oBEWJtgd!={eNEj_4UpZmjq;l$v<3 
z+e-bsi#B`~L(VSGNbOyEU{oIDyjY1Z)Fr?DglAce#+U)fNIB2FEZ#&%?5uQ(HLD1Y z1H5PqSc&XGLZGKO`@R0g#x()-06iK^U5y$tO2~@ST|1&S;@i9`Sy2`V!1A_p0g~*> zJX>Q$`HuU&|9x|CA8Mi3+s8l6sjt|clee(202iQ~#<#S##)$|Cu_@pUw}ZSlT3RmV ztHJGq23;ZoKHcJTPy-~YSz{EK)BzX=_H)mY`W0+vni@hmfD|?a&}FZA_7&EU59uFY z$OQN3r;HG)8kUQK*M|BVf`$*9XfG7<>3+S{?($1lRzX&4Yb!!2;CrkplryZ?yCqk3HBiTGxj~0H zx4f%UDp(~O6`{7v0~hkZvLve9*@}nF>1uCTB|yENo*j6Z6%ShP(0J{4f(0mLzwiK? zHkSql1|+4^ecvFhB$B{XjnX=g+i*RWqZlGF|5q)z!NzZ*IDmDC0-@b^bkzge!`22` zb%TtThk0MJ1%!8_iHtB`;8)*D =Y zuG|H0@5sHxG<$C&YSAZRu&(q2-N!UD+EG~=p>iQ)?-5J*&j>#{6)Gau)m zQ$TpvS%}%Z&TCWFK0MhCULd7^kN5y(8tF2##z@VSfF$1PB*%hRh@+$O!P&gEY=%jJ ze*s4^zOxI?Lv6 zk`>|N`5U_Di8~@C=a~34KS3NTBU3psofN_7=<_)ys_6u)W(*HE1hW?niy86a8|#rO zhV8}nPPE#vHFtzNH@tUtyi(I1#b$9&1T@KsFESXtWUYJ=dco-FP1i&MsN}!8yQ#3!&ff-(Sb9j=sGe5f^zHmu z`sV`+zJsDW$od5)_NXZ`AHQhXh&^hMEYe?usGVwSFLtO6%IHj5X<6>~Vb8tmhv&kP zg}Q3izZk9F@fa>FNRBBYuL9GlV=i)+>dIKIZdO4F!D}+Xoif#pUMY{`KG9ve#X6Aj zW|8%X@s`o;8h&bd$i|1y3NMn+B3+|M6hy0>Ist%^UjeLHG41K34X??+B}ZWF0T%6qP9(DZK{)0<9IklWbV93Fn$EPw)(`$nm#8hCm=MN-PwMab`LafX5T> znmep?Tt*ElM|U~83|iZ(1nds$ulHGgk$Kqa0a|p!T>l`Y^tQI|;#a&E=GeP{qVflqr)Psi8cSLX13}3ZJWLVlOHxnKjAJD82(T0>J zAGnHt?TO$FWgbp^u1wM{2S5Ik4LajUhqoka|B$G7-a}C9O`Ku$?9$Vnu4lrtQGYKT_6>${+=!v zJ}CO~L4Dnt-!dcfyj5f?Jc9goR9(DCOvkIMY(<$&g2#zyI`2_0)6CK=S<7Sy=9BoZ zz6z_)TZ?8$oR2PqG+x0JtbOeMUACz=U*dGN8xm7Lg@EjZ$US(>_?ax4-<+SVN8V>` zfV1O<1{T=#>Abv0BWI|h*thS3M$v$X2R7USb3vHIl5R|%{3A>(-aYd7be#Z5=_~Ha zF8s~8(fMV)VoGUvzbP*Y;?{_2iBb#uoKZDU=G_{lroEzq7ZMc}()>A+{qp;LnmYFl zQLRhckr%@`TxlW(k-Mg-&OTzlmcitFjn=E~X9f1xQj65qnS2wO_m4Q&5ke>b=i+4> zcz%PQ-b^WwC(GwYbkL@#17@@DX`jOe&ahWrFp{NIm4#|kDkR9v0s#v=?i$4-&?(S8 zaDzvLjQr`vU=8u&{1p;yoGL61Lg!W~+_cr2QF=i?{X&1!+;Gp^vjZvG&#%HMRrl~9 zW%_8l;+a!Vm->K5UGSz-Dr%+f#$#QYR)L^ZT&Jhqt#>!WmkTo`BC}^fWTcY&XqPIr zwA?e8ADDl1d(`Kl;0#pTIr2HOTh!bRN2(EPvon>aB!GE}_PPe+YV#~Ih@2koBQIOc zV_}j}K(+yoDy0L{V0gd7Y9ptyWR>G{^S$@VYIt$o>h$bTO_uHPkZ$e19UW;g`Hcls z`wZRoAv7G(Ht|-jcrd(}5WHMR95u-olnVjQTdA;jad_6+tVgq)r?^=L9OIOk6?ue? 
zFDQynLZ)A%h^8K!V?JiwzZ<*r(}d3Fx3&Bk5a#$2;}(cc$~{`xm_FqK9JS9Bw?%);n3Pp1-yG?n7gLIViDy+|BBH16b!Y4Zs>q*Avt01K( zj{#o;IP_Y`p^C{PfUb*Gf|4XE$ zYvDZ_OgT8h)?R1Zb7?xMMrm2}JPLpy9Z{)1zNCOVhju_Fns=p!*te}Q^2lnZn}Eu- zf~GkF4OS|=ZnZqe5F@*R8-sByDw4(3ZpLyNZrM8KE5ajAwWk8bkgrfEM)<^YUObk2 zq{1gcn#XpO{u2K3-nn5?QBX(eTsYY_A5p~b@V7!F`I82De{`(1)JE}Ir$-pi^#wx5 z(g#GSdcAplby&PWs$6!@Z!D%EvsDLgslh_mm0d$>MF zchEC`SoPG|@rVD9syBgZ>gwKyqo~xPQ0rS!nY6ajz9MK5QO35Y)GDF^2?${j1!YDE zNFV_xP!X`TG7m+RA%TQJri3AN0F@yqUSd4s@tz)gLH29+-4isC;eK^d3uG+%))wj{5&Scr zv$6`QZgY!HB;?m)s|vs6(g!j-Uo~H-F*21qHQ<_DA3vBVGWNqoRXganNH8RF%QGTf zaiH0kr!hEx6ZKE);(jPSio(KeiNY2NyYLk*g;eU?;%k%_-EwVaDGhytCZlxyLN(v> zyM9Sq4ic-} zLmvzoS7&!Q2MFch8cbi7p8^?&tK$Xpt>nfwS%sIMiSMr^dvBH;Jn31FrXZbkbGl#2 z4^Ym>shN$!!ndY=o}G1S&2lShisAK~KNp^&x1g()hJK;()wwAplMry`*+huL?sAMy7d`%m$=L>Ee6qJc`mLA<%0RCrxgzCMLl8hO((%&t<( zd{|{)@ZdjXOlOl$f418~=>*fM-n;RMVxRPOj-g7w0;eX$58{hh=aS!!b-q_TCCTfJ zyEon4$SWR+F6Y&a*4&_jpTd6y4Qq4NA`W=+mODZZI#XKgB+h_iYGj18h_PgozjKf< zJ1#EaUc?kjz^fWkauj_uJ7(E&Z^l;O2jiH3Hm_X(K(~o|QD|B0VEe zLtJe2=zdO*kHI_YVTuC>`Jf~#liQ@D`u@K{Hih0@xjP}nqzsZw^>)?;45zUll5?20 zE_4Q?PL6*Tm-;eSD2WCMw3Py}tzZEy1Z9eU(oR=7Ao;u43Q0IHTRRbbbPS{cR{IaC zc+ApS_$tS`r=-j~L6bF-I}k365lN+yln=giV`p)V(!c>Dqo0Dm#?8C4;zkIQW~bfV zNxU4bh}W+)R}4cCgutvHeU#x%i)Rlj==N91R>I5o%Tm?qpHikO{QXSE{=QK$AY!Q0 z1Opm{4$U8AYH6vdRnPy?NSwV<$?i`yWj%QN7M*IG^i{$Ye*M4MNerTC<0GA=fr2qF zR^UrTf;K@`My3i_=@7?q{;W8!C07r~A^IIf=1MKgV-Nxn`&LbYZ$9-9ze8tl%ccG=@t%9ig(7|98+@1jdj@qwk)Qo@f-PP{vW}E~?9v%_Td!(VZ8o?a5 z-h;k9RfM#@=}PY23V(aHES}Qo-xv64uQIO!5Uz4=k2K^S2!>DJ%;3~~U?h%H`2Wm; z=po976q$Jel`*cksO|u6^aVlrQ98&#NUx?@ItKHccoSY)0*Q*rI0GchZieZcrA;e4 zKs0Tdsy`hsFdY^!i25>XKDT!Jq#~|ZwUwr6(_coOX(DzZ-HxoJ}A6_e;w8^3?SjOwH8uo zMs?&s!Qg9=e^I9S>CLd5YRBX$#;9%TwzV`PHj9s;L7W%vCnD`@pNbbbdjo8St(N<) zCZN@2g|#l5t;y;R81K{7E*vlT6YcY=heYqgiDl62i<$ZlV(`|?fioFZ{~B~0=N@JQ zx`HrI;zajlWIbX*Yo`-zdD|ft!X<42~B3GpX<6NF?2FYw_B@(XrQ+Y+Ay`ZSwsHfS6=?1QZZp z8zxqv10Mx$IL_Jm52zUuooxXtSG`RG+(Za81AZ4YpeGQ0c`9l5x{N*=*+-B7?>O>| 
z&+#*^63@PF;(t844IcQc%SppQ>LZBsaA0$v@o@`^0Q_eaFjCz5Gib52PJ#%`G@|~Pn2iT!94oWv_THiziZ@L~qm7sU|{0?oMwFEm6UDJl*l~^8wC*X&bup@>Ff9s`3 zraWh<9GLxfj$pX|2L;z`C#Tv_Oo7MeQwzw?|5f1oa%6#%yDCh?U0E!j|ATM=?NGx& zTtl_oFrM6{!N$`p59AVAEf|V`V^6sud$kjGDf(S2IF)7MG0)h&B3*G8EU(29Ie5BO zp{9MDE#EUhWnT@Q#pO%5-AH+LKy$5NXWTgkiZ<&|2qc?DtX3`HG(U-?5#W( zZ;y1?)*~-G24Jq=%*LVq^gT4gjeV=bhi;>m^jlhkPS z`O~#q&lIaQ(9M~xMFVD7+N}6WhNJY#=I0jXKQ;BtFcj=JIC@k|cRuh}p(VcGohTpO zp4XL(>Z>i5bt|u}vg$sSRcc_#bBJ_toJ_e77nkW+n`h*Oxp?*k@;t8V4+LBGZoRDg zaSbrG?|TfIrVSHxBW%6oS^lV<8ARz7(#y7; zAVfQ;Wh{TeJ@n6Q9Ysw_n;cU895W2VmZtu7aARp%Mb1R21XLY+c=3O95ZCqCGhDgy zm)>njuudJ7Vn)-2p;?v;dZ+ko(P!0xz0K78gT|AZj-Vp*z|VRK`!Oiki)2e;qRkE+ zD{g`hx0JDUcszyFpVZVp%phL`Z)-174!-ohw0c^mc??NnCp$x_xijNdCawF*mF7-M zvh|Kuc6io6=L2wLbn~!U0kwwp9LN*2;##98tl&-N{32QmUZOL^UxQ~V(NzuxIzya! z;vWpRbd+12*RXqC4VF9vkI-%_nR=`-Ugng6m!1-gzl20j)WetZ-M!__UYe?Hr)}g< za>1>Uo+bSTE2#yN-o7-XQ>oSY;=#8rwi)%G4#Q;rSI~rj?QtnIGxkyDq)Y8L*P9<^ z7#ZIPkLnLfLM^O(R=Ca7Dzd#GCEFtT0Rdlc7QCjI=H%R;!@9J|tP?`g-JzepjnIvn zfvmYG8q{9Y@pfMw^Ur9fIml8Jav?~Nhwuz%jRe}ZyV?tl>%j98qAE0{E!q%MTH%Z5{#Ux|-9R3)Cut03U23)|XRg#mko6$R z+@z_eS2sYZxaOX%7Pl?4nsszDcf2QZ>?@D*T=HDoCJrsgUJw+!P_3B(-x><<0hJs+ zWM-+!mRf6y*{afk1ZjuoG0VK(7?r4tZ(t6iOhM&As7Ilb&&v${_B7fBye+1!HX&_` z_C^@*I{5~%|3NP~BE0o)(EXg>KM|}ptf4&9nKDSf#t0a%*mZezMJ!4l7JDW3itRYPuV^T3_lTHtp$_{C2_uH|lH zUwD+)oil{_{;#joW3Kyc^?f%LqLp!7n?p?&6hln}mYR`*GV8^RAUTMIiFPqxuI$RV zz>k7>lW%JcAw@Srplcht5gG#e z#wX%hnk%DjUzo|&XoE%q1hf#OPy+s;yI!A3f+(^v5#3W(7>xQHEc8bJF^_RvrjQ&HJjuZ*IJDZ=-ZukZM zHaxt!{#~8le0f~l)b^@Nm%BWl*G-kV@n|8FT?2_~(tHt7(a&fxJc-GR@MSNjEoBC^ z9;)13sajx`huFvo(2bxe1YTq+(56ioymvHK|Sl`frK$K=jIBmOg?BPrZt|blm3GmD}=zG_H>GOQ-$_6(x1V5Y-&0T zA&yFc8CaP-+bm|?UvA~aG?Tf&`i?2;n68DV&Nc#(dgmY0Juh79KZI$7Np?lI+JC(= zyCfrR|CG#QMJlRSaRIt-+$N*`;<7vzX4n5A`FVJo?2yOcqbI`~WfJ$PzWpVHUapCh zsuA3zuuk`!QRbXJcl^f8%Ez{C(y{fT$r1mhc1RA~)He_N#bxo;i_W>JnUJe0)Thjm z8S-$fwQr;Fj+?3Z{KByDSfWJ5>FI&=`ELKvo*!N4=qqE5p5qC|e%R9{JYCw3hf1Dg z>qlMu=oY#>yXO>ZSQ3`BwehGr);^uquybq_;;B5b8tT#c_ 
zae}QSzTQsSqT`(x{%o7hTG?r7m|z9nqhC0GNDsb}*N}R3e}^%h0KP{tyOohi?5DFX zMUyK1hQj+)>Ge*ZJwDeIV;N0JDE+vURrfMQHN*YVYFz9;|ED21k)aL{K_;WVhGnCn zKRpFg=W)RXYm+K`p<>5mOrOyp5{-NyEdAMF_Hk)ai(7vIpl#l%UHMxR@9INDkZvoj z-<@f)REioaZNr?tAB^dt)|&d{j$ z3^pfWzhV5*ZRWANx5D3njs3DWq_0ZIYUwQOO_R`iN`fX*!HfDs&98>qpE(S|{Ojj#7}KfC1LlGvtF(jDeQBAp#f8f;PWx8ktYRP=-4SCt=%y;?(a16? z?~Byvkx!JYgz} z-^3Y|ycSs7@iZ}FoNH*r6LB|s8yzkyr2I}wTdQ}yUo?a&BfC5tAm8;79RP3%1oJ>% zEp`-CpVrC;t?9x6qoy7!a6F4~egq<9Aa<1%n!5gv24hdm_1jl}UJ=~yW_@fRqpfX_ zKT7_r61!|V<$TRzRfgC3AI4h_SCB~nuoWKt@p^ygl1`=UcfllDh7@fv1A($Xu57;i z$ybFkW)W&fXf5_}7fhiFRvT2-+MhCe|6=Ck^xI~OI!js*UC++i<@2VEq6uBXA#|3!c zZ%OBF*tRIMS zgc^@WXVvvS*K_$Fk=BrOW2rz54o}`^DmM?fs4&dafQmT{o7z0_KVDgA^^FerdBgA$ zav6EPoA>OBzO#m7e`UAJt8nJ)N*AZLZ(+^87oQWvqmA@R@ac+A10S1VR3QEYIRj@( zIm`808YX2?VNa=3X6_Ou&%d!@qeLZRH3xN*$0DD3v$*8YoX->R(aApUq%+kva}olb z!c)dH5uSVLJnl`)(uds#hn90td~DlV_Cyep=ixA~ji}Pf=;fb<;=3%GCK2-ShR<0y zfO@2;ckDkL6N#7LJ*wQY9G*x*%n5t~lenmZ4al|5M7A?>5t+{n633X$ruoZSffpoL zVP>*s?l2>oHQ+!3t?(CJT_>$Cu5sP=*!BY^-y0AY1 zkma1D7O;W@p46eyDv8XZvc8phU_!;x6!%6bb9PJW7HtJc|E-eGETqK8Dkn8L zGS8XT=Q`C&WLdp6H}%5qufG2J*Du`GuU@tDm;0|0*WCW%haIKWQRGdv&YO;o@I$J< zc%;+0w#sRXsuOOLLVnfGbsnp)bc-^6j=F#U*2#+F9=ji;O!KtzuDMN~)Yhl4SmDl0 zenB2>B7!Y$AD=#+5k#wlzL{N>CjgMxi!`olT`h6<9zI^CBq>C*a0+3U)_5qyjm=~b5L z>`UHOsOM5#i`D5#ETJTaGt^^W`}mxTX`Xl%jkJ~RzwiU>$IrD>%BOO=1Ko4ONM7N6FT?Rv#@`@KFlKbY%~!rf|9ZcYS9M|PTr6>>PC|o{Am#((!SrT3u8+w-XbBx9+j20 z1m)zl!p?m1@!>xK`M|0SA#=bo0`2VAw;In-AynqK<`4YD?FynMc ztz?3pU$J$05OkTuMa}&NXB}-L@wIH0yn) z{GDCflNKkNeuQwPNn7Jf*Pjjjo&H`PL^bB`)q0YCDZltG8yiv7%OJ5vy?>*N$#_D2 z=`ibUQf}@si^f&1Zd0cJET=Yf@>)0CUkeoo*u3mZF9UtWE68-Vni4Fc;Giei*4x&G z?SH-s$GIl2We)h1atiAp%{d!vFL;1N1Igo;=MC4SSJg$&UK9M%lT8(FF46G@rO-y{r*C#7y)*th;>o1zU;Fq-b zcQ9fwrEZ$xqDH5Hp}qdRO)(5}zcHO~BiUUgLcMI+TD{J&p0@Y(;so@t!6rB+_ze59m|R z8(>2W?q2+r){0M)T5jt9vyxmeV8~5=Kr>J8r#cJL@9f4VlInF}d#&r( zR}c1WkE3at2ge`Z8h1?QIkw@!K7IZ6O9#Y*ZHI*m@AJ&cu|pfBl6qOVb$CyECd_JP zu#H(HLbK!b)b;jH3x(ahriY_pId8|pEMk&xj*pMG?3wZuw#;}INQ?H~H17MLthzDd 
z!;U~JC=vj7qFG;9fH2TW^x0=+9d&WotdTc?hHFH=JIZFEx3kM8$1wnnT5mn(}gUHS0lmLN{bw+zFkOvZ}ohnjw5b z-5@y8w8*iN5o7Xkr|HL?!{HSmETih70)&vDDXMC$6}MTa^e^O?i*sba|1%5hH55TZ z%i53M>s3pky-#}6mLJ8Bl2cQ~%L@w&L9}`eqo&oyVX#i&US4PiJ9K8iC^-4n0e$^> zM;jX(mGqR9u*EiqEj`}O^Qji}6Va`KUsdIjrISBp@fdzClp3S3KjqrQ_3+Zd_gTi1 zN=cN_{J0nfQO4=zc{a9%;ST+P?ZT?@Q6m=?^{ovSHtHGH`7g(8S6%E2iQn(<{2q*M zv6j5#u~4TTS=@Lp6tAZyujYI(yXYmI6uz`rcTM4wA`*F!sxfRT*=k6ecBWj}|LZE8 z&RS8OsFvx!cHl?^u#+ZJQ&Or|?&-Hyd5q~O`2uCErMcP1~!KbZStDY%ERTp#MO`)M!gu(uBw`4>+b=TjRu8uJ0-6bHuKIRj3S^ z>hZQV6}Ljd0X&T4gQU_?HM3Kd0!D*P& z!Otxsic5Xy(}&q`PuGj&_DS(9!c*C{MwxR`FwlL`WtvF&Tl}ja76ED~m}C3H)A*@N zPE-l&S{Y1@D)T4;o$BlwJW}m z8s_1dUJ+S~TswBm22j|BnBfK)x0I{)Hz7v_d1Xy?>crY?aazMgbAr_&)sxRQdvx;b zDDJOz;EHX1YUhS3A}DoX9Wg+aGt6d#P2e{04t6g6 zIe`aXEw>+G$0&m?G?cfEj2ansLwE->w~lxt-qF}0U7W8&j|{7+gy3Z+0b9CUv{Hzp zLB|Ad(86v(r}nllanVo7Ei63K!7zA3)WeTIB0cDSgsB2P-H%;oGaNL|wuBBX-R&-s zoiQaptHy>g_;*lqgX44^*Vno(MAAA@sSJ%B)>gX5Q(|<;caQGdju7-#o<+Z6g|Ka0YW(UF`m;NZ^Urz4{FCZdI`(*+AJ&u%l5>robi1-%^S>~ezzL_Sp`=N4 ziD$S&y}RloKQ?yLm%BgC0--|YP-}*LffFZ6K;9zRh$^_+#eex^5&d4F=+lPHj|yG> zf=v(oFYkway2Gm+-l=UC(9{#Z+|}lQvkwTWRVn+47nDL{ItNdGnDPwbR`~Yf{cES^ zENHrm4Pk1XzQjt&&qF@d;y<5PIEOCnZ0`+Wh~K>AYf^VK(pX~F-k9c-+3@jasF6@3wW~B*&M&v{1bt8f3mIf}q<*qxlA)L} z_llX2E1a3H^r`ll`o@J#wL=O*&F3FISK(Dn{5)4!Jx%E>s$Qvio?tTfN+Yj-!cA4< zRS8YkNQ9QRx9jJG_k9Sj56lDDD{5l>%Ftg5Kxb%CkDf zL(4BF#cWz^%!bv9gQ0Xjp)%F8?=oURFQZwXG=dFST-a<_!qum1<7Ih9*{mx=cW{Xn zQ-~~sI66N8x$3>J#tUnLU_*d^D{NLYlh$~eeDY*;A$HmuGcH>DOBcFh%w$t~Y=NMi zeQNQ&zWG9ejZIGWu2sbWqp7Y`#vP1vQ~^We|Mv*MBsg5vak)9r*=v)M4!?kqWlvmF zxGt1$rmt0gadMb9Oj+_rY-nJFY$z1Wp^f^+jVLJYo>F=8RBMABcai|JyV z-&5~UKQ)RJjqSvPhqan>JC z$)mpU^cutN5YhA+QKZLgz)rD>y78tPv9_xZ+tpxJ&zd;W&iQzc7T)4lulGbQ^PxnN z1m)9YrrfT#ABS-|dyHwu2D-M?m*8eri(}D7n%(ABP+Fi;<{D?HOB&O%Ijt($W|>$u zej7AsP>ieIot)+UEg1+-zKloDtG@&$o&8}vL1s(I_vdNXnd=q|{5j_X@5MpoOL!VG zy^1u0IPD(~S6Az?Fon1iz?k;9tj1HIR{UrwXeJ^ukcE%%OxY_Rk6<;-P0$f5tM7K8 ze_;onSukMq_6Th6wQ)*eqw|lUo8fc1;mOa9ktaXA^VTKGp6qq0>qa_>wc+52s~>=6 
z8+Z)UE;&een_Ufd1^8+7I?oVEEs#)EZP|W9fx7wA_MrLST!)57t}m122tLk^QWWUM zJy~9T&OqL0eDPG{WQo-9$0@g^RK|c!21LMo+pJ4YeP!52f(00Ug}yCo;zGRVTD|1) zb}m?;701l3-l=v)PcA9A;+htNe`>IhiUJlDJM5kF2HaCw&+u?L6!)qSL^XFBQwz)p zrq$x^wZ|2KcQfQ+0JnZJTuE(|-klCSB|pDT_DYoJDE+TfZmo)92l z)4Hp4rkoMs9c@4;;j+IPRNI)1!OcAIzOGC`zU2*Md>xRA!>xl*Ug$u0q3RbL!dozMe38SZrQ)eTy2tDlzHg|80b+L3UNq@{GbjMKFh z+QU*DdK>dgsKqgYj%&)6Me-&PtCCS8ossqtMMr zkece!$(iA+S^e&kgJq4Tk-xyH)y0!mz)FW@+%da|w#R7)l9Dj_*aP5GkVhvI(2A2i zn~)oHHeR$%j$G6%A1S99=@^Es>a*1cp>3ymLf9~ljC!58iv>D3`)I$wv6kF=(c**3 ztiVP^q8Z>QTl$2Dg+(ryU((bxz(3JRU`Kzk8pnBL#5BqgkD#o>n&ywmZlqJBMv6MW zESwS{LqFd70%m@$k}Ovxig&UK{Two!rN0Sse`$y3a4tC6oBD@zCn3llT4eq}rrLM* znH@kR&?MA`Vcmy8Rp%iOksNv?ctD)rLq;dXH7L6C27DJIrN5}9PSSbdy$_hYt7ARA z5JW)0@G;hVWki3*1ntr2%v!h}V?Lt~uBYoSQ_1eUotV4l8rCfV|1${uPkin;6vK5R zE_hLQ}hGzGCYJgF}>fcc-Soz{ObVjii6EutP zO8_Oh&^$po3pI6^V*|^-TxPtyQ zK3>aDRB%n{b(k9P=83D>%rZrTqRPnH1x7yuSupafRwo17aMl9%2vm{PFIQXmwB;sb0(CHk&QG zFhlsQNi}(4a_$rA3$d_$h?xPqnSUHfu&cM%up=72mOG6)rcm7%9=|xaGy~m`)eJR- zIeD-Q+4H!7wyrZ86^)JI)<-}|h4!$=+SGv(rI<%VVrh}|6`E04{C@McMWZ1X=E{Wx zX%>BYf8(N|tq2I1ZI5BlYM&vxL$|92K?~p-GF_k#NC8ThY^}^SI=_GQKWj4Vi6(4m zu3R>nT1(>me_-c}6CfA>hDsVSvX}*j>*1};0P%%cTCC)*NJ{kj7T&Sz7c-bOFTwdb zls`$~%?@mS3k6;oLS3UiGiaQIOT24s3tkW-ggd*~LYLY3cDeIB{Ri!}@ywQpDR_mw z+SlQZO2e86b0bCphT5%pdVQv6v&zBD^u-n=V|V5kyNA=!Mf4Y>7ZidLavZQ9cqP^P zw#B6v4^kz&OZ_1bkQkuT>eZ3CBiEKXal7q&F;?EHaaI=kjY@t>w%2~$hEN5Feo%snF* z1{Yev(5GvkDfcQ)$b%BC1G<+6!oRIGg``H#Z2epLP;iD9X6}rtY=a%ZnJW=3y{lvJ zKpzg4)r#X9f>!x4G7TD&^=#R~zQqzY4Z3^L9~KK;(;_3E^l{U{AW!G z*`DWc>8B)pdLF^j%4E~YRxLuFUF2!7&|SvXW{{07K3T}1G6V0t@V4Bgu?e?%!5A@B zbJ_J&&$__OnP{7F=)*~DT!pTUyydQbmFji{#!B7hI?-}=88k)=w?0r$y^pEUMo^>S zsykQXenyPx`m~}TlMtd{B+N{98^}P}N)XpZeeWZXGXU|oRpm));DYwgo2)b;ySAH~ zo9D7ZFksq>Fp~<{;zu+G$)kJXW&1zCBy-+{Pz3z4rhM@@JGkbq-wHD?t=#|REJ&(Wc$VH2@g`V9 z!wW=GQuoEtWB2MU1t|LZ0R+mNcbJ`ppxpskey|4Hov|}*==<)eGMk>~Z$vk{1^qO7 z^aj`(I)^2!FdsX?b+iMVHi_=<-$vbNFQbcWy=g?r*3jPt7ICpD=7+fU*yQaVTikW3 
z3hO2+Ep^nO{`G7PZc96C0*UM^KtLV)BQuQUrx%C!c2%YGAES)2rsK$z8jlcYvwUP& zqPN2Rb=?|ZOUe=2ef3&}EBWLjf|sBW;No3&B&DHekXsP->bx}5iIknzaPDW%-ekZZ z4I5q~Z_9dRCKtYny1@zeOjcV&@T)@l&WKwDXPI$sx= zRaz#k6pJqjvXmlV*%Ci@I+O_iMzcIBWsk|f8R$EKxZ+%ud+CM)w*zIi8fT=J1$Xc4 zzW#cN18|h0EwixIYuPyRt=u6wF%YAs%3HA)B!)FGXd%`nn{c8OqRUyz6+kJli?gF* zMlj3rqNQr?XnJuk7v0RSn|UQHi}af?AQV+c^xjgVnMp=e`p-*u#;CVRx>fy_p0()m zLSDPE@h-K(VVA4CR)xcOWTaQO4^9l!7r*N?FZb?UxHuu4*zslwL?M~}F<@pMUKn3j zfWAib>EXK1Ivs7KM1g5FK@I16G)?XkbG{Za&@8$J(jBIWsi!8W&$*1eMijuh#(tPZ z`hn|KxmE779B?`dRCR=M&%q{xP|INP&fi>p=(g#jS*sdrx`%N8v|F4~7xYu8|3VHM zC>wE?5_t7Z8ZH5EQdG!G!|`fZeDP2tXfnN0 zHNC|@|I`v#^VSyog6qsJlqXpG&5o2;~hMg-QEOqTW)1}6bomtdQvDSw#=lTwy1=r(Pd z%+NKk8{_|;^o9?zxUJ0wi-S?8HTrqr2eEVMr&ba`W+(pWMoUyx#4fylmL^X2zDW?mtU+6xG(8PZ=!Yc_rk9Z zcV0Im^oL_H7+g^b6%-7j_znD)zO^wy8P=dq&9+^C_WCdlr}9oZL=!A^*_gH+WqsNU5@m$~wo z{4=Q&twvN;J|$1k4GESSgMsj%^LRN?!u97Iv~hd=7c2Tm6Ww}4D(La3TL)+!(aQgn z&i{$Z>=FtNw&;hD%Dusslun5nWxZi0L2sCo8R|cJ={=a}48(KaKu(I>q}}SkqD6 zL+RL(;zs9|&`^)0nS#u-(o3W!>7kDGhy%bb{Yslva8cQx!x`6~I|=4?#M7qxF>nFL z=~ATeyqwNnv6vpXI@gp$2t9Z2^uC>J`aEr(zfDw6t9Y31 zaGxp6JYyP?8K^>c##3^|*~eqB-RUYTaxC{4eP!WSCrRFDIin8xVROm!m~21G^rI!Z^f>Ga*2QE)>R7*`>YPb%xKk*)JH}%@h#M!78Dbwuo-GK zK28el!uA5?GylZ)Bw;iE4)k=YXSDh`X@1EmZcQf6&;~#Z;JZ;zhEi0PU+S$a)>1#x zNZz4YhalfUnS91Aq|7(Ue4lqrgk;V|YDpWlGE%p1)L0*N3+o&`*x+d1Sf?Qeu^)b9 zq0hRfjPag$Yli08{$#_8nRb-F$@53vfrpl#3%pNQ%QeZD!1KUW6|@$aIZz77GaLuk zcMPT`v9^@lV&dMv4?Zy;?FvEUC=_b z<8;l${?LJt-d*v%ScZloe5c1)idCtb%{QbB%~)^8W_3RR?b)9ruR}iZlwK^(h49o4 zaWT}LIb1?MHN3P&c+nOZcEks9;3S@x=c5aQRhU!{c{wBJ?mQncrP!-thT6 z)IA;bS(5l1Dt}6^W|+>CkiTER>EwEkb{TfwTmaael6jzW38qOsf1buS(EV5)4vgt{ zYFoq0&?4kR>}CKjj?2%jEW8kDEAZv6VS0eSPZ`_*FeIBkRD(yJ(4%)Prl#cV!|II)Qa?` zmuWJQ5i*7UusE7DY@Hb{_!TsrdrKJ>R7 zMBLLY@$972B^$S4`t{HcaWu%%MAotbI6B2!m3<@J#-GF~VK_If?kt6xxu5ClM3z`? 
zwpQT8!3*c30H{FRS{?Y$l7yze7XTJYGb3FL3Tlan+;u;7K5xVtzI zPiNRs+%WJTQX3(NR84hI)4=CHIAJMUJZa+I>a%>VjY6M*s0wZa4n}ulAZmf>Y* zrS`*j?z$cBB~JA<4p$aZbDT0YqfX-s> zs#u5&mySY^)dBPWlOe*UZ{c^jO2BXs(dsJ2nP}TD06-SIeAXRT*EGGS zQmH5NT&>6XJJ<;Dx7nJMA%Qt1OWV0Zz{*I%(j|A2y&r{PS&5+D;?72Wq+;iF(5D+0 zUrhph^OO8=HhV@cEvtZ9#PTP-EC$;FIgZ{l2%8>5U;-qd6CG+-$v8!aOm|maYrJ`= z9H4wMMR$>U^;Y`J${RY@*E|)ET-&uOpxq=mVM=9VqFl#rJG@l&ASCljpDDw>_YW=1 z2a{`qfz9uL!B7$U&j+Cw`SuR%d7L-$wT}nT& z6HLcb0lg?5tT3$0yn<*-gqimNZ7r!)QZ}`gvAYgW!cJ(p>d2lyr!0@|Fnn)SjidzSAGzIwsvK0|;*mNe0Dbz!2i zf?0jVl24f%TGg(z7xExL9T@CuVhw#AR7GiLl~2UPpGH^)6VldJ*7A_QjOrYOgvf}( zPcE{=%4zq7e}+q$F4CeYUq|M=aaqhB-Ts5YYg|*p-kC;zeALLB|HC+hgdE4F$2@X$ z(jrsP!RJ-5+P~+nyaC>TiQyW1HxHUG(qJl_GQ~U-_i$BpXJZUJftfifkH2&XodoMb zU8b%)kqH|9YA~;Eo2YN+4bHb=4uaPZiW!e?3zwixa%^M+sv`TOQXhgA-1 z-~gdR45GSQb3@dvU`fM>_XMqszilFOtMc|N%FIz>WdB;2jw=JQgL&kj>V@lc7W@|- zU-BJgY*KHdhTHMOlWlSHD(BXysj$QNH6!iVr|Sa$K{A4_$#jF3+PYoTjh_AabmURd zN}@2}RYN&+L`Y2W4sm9qsS`|l{d48+=;(yuDU}RG$bytpJOjD^3b82coxzRA*vmpe z&%ZEX6c=(a@QXuNYYGCxxV#qK;&)SiCWj~Vx2uNn?z8Rq;H+q}Fgprb?nk7Bo|QpV zD6Xk_#5y|M874H?CE0{rjFq1fYd=e!(a>Yfhsyqua!rP1W*@4sSfH&t{Mph*?7{Y8 zp_w%+1Va>H8{K{S8ElmVcnPw-I^`rPTw+uP;^>(~sSYbI2z<+RA-JRk=x*(S>~)@P zFJ!+!O|92rnmK9UDXsv6OlJ9F>oI@dYxvs>*S`PYVX5)%(o?UottXxt>QhsirOXO9HTqlzy_TPO%hYWTjV zbEc9~hX46Gg=ltwWGhn4y4ogtNv$Q{-VS)R18~$ii1NEzAh(BI$`$5dz;grr* zp&z?PuA@vyPj41gKo8|QBwP;OZkW;YOiJVGPwPT!lt^at$1j1>SFrMg{LwHJST&)S znv%@lGWO`H>mpy5QF_ZY!an}t3rByA{hP-{0Y1Q&McM{`+%;z6w?}C?)Beghdh0j9d>IYLbMbTWVo}5*9 zYMarTJ+0XmiuNB)(Vc$^{=$A0WUr({UY+-Uab3GK8aAR&)7tja7r4Ht2cta&7`9`7 zx5GA};288mFfY>8k|)OiXUk<(1sen-|+R31gw+=i6 zcjz9R}PH+3hMdCw<@U1$1;w^MBlY>a0FveLS%mjjy z&G;o~7w0a!GEpF1UiN0xdbGTBtiO5$k(f(`#EH%#o+*z%*%YQq5q#R~sAs&8IvyKZ zC)ve5rg){07Rxjf-5yUlmUU=GToB7#S^gjix>xY_ee<ZT$`|$SPFq4(sIgJj_-&-m{N021w3!If1{KM9+ za^xbUUog5b&hS!uu9XVV*gTap+_QmOln!JyKX%ysC?E1JtMNb+GQS9{BaxQTtJTJV zK+Y2UzMPUl6_4B13d@;NSv<61lcpZ*2#i;;Ld5mEb2up-!T)byH;PH^lZPjb6NdMQ%Af@MaXlO-ekVf 
zbbc@N=Kr9E@kErkY@dE)qN1pJDcf@T&m0F;N%&+(|=|6_I7GZgDg^Z5~5} zf~wp}uLg|eT!a%Ta*2p~9X2;9POKihbK^Nw5Eq?F_Uxn36b*G%MU|^KkJRFhH4;E1 z8-Im=S;CyF4}7^iKzj=W!fCWB`-aZrx0ixr;YZ+pIh-*sOG)wPogls1857~MXNEFf zju2_PqLoYUAIo{$kdl%D|C7=cQg4o@OqWb@*b}}z?wGc%VJ{(i#A~XjoUkEd)uAeC!(CVPPYj5pM}{Y?-TpZlcWk97v#^sO^B()E zxY@KyoZv-U%d&Rs$iI~(q7dHPvNy}6*&(Co{^!S5MIDx&AdUVzX%<`U%pHlgwIn%G zk5h#1y5?u57!`o4Y-wVkCqR71p~r(NzLH*N1Q7fujZ@~*1w&i^tuTYe{Pds!VD|;h zZu3l^UC1uec?wLo8TGhhT%)O{?E_}PiMO5t4^<0eY6bwgEo*;p;@X3>Tsp=slm-am z+=g`uu?CDl;hdLvrUy1mM7NgShh}M^{oWv#&nG&WksqPMAX)G&b=+dROK(6~5D@d| z$aR^iPtViwP+&ZsGKXj3dwA!N1~dkkRO%*X^C{fmX5y)-o5raYAJcSY4KIl>!xoJ& znNi=*RBFvlWXYZ=N+&Po*cnfoK#VQrCz)5RSo0*&jy}EBLBr-IBQpc01_s-WzQ!rd zq#hP(;C&q?cK}+{}Fk%SO1X{Kl{do z9!Y6=_7OlVo;+NQ_00`*;6A5W(8F!{a-0g74NEg|F3-bGl6=a8x(@9N6cuVZ6{UCr z6w!}_fwNc~)6WhGkwQOGjOV6N8VEeM;`?={`}2$nSOJ+O+E9BMVuhhXa4Ajro3D(z z^Mp&48CRP>SoTeZnSY{#SYtk&(@WEsy%>Gri7#V5r@A!{{O3iw$94dYYXv;Q9xjn5wIX*!kCQ8qOpFKf_hP0qStqb0PQvYN}M~*jB@_ zj?k)3=qu>@wV6J5SPk+d1Om6n*#N85fJ-MiAyPVQx!mN*qxL%b9lBBpTW(9Y)8;5Y zIeG9zFT9b&^bY{0C!6*+YLZ?{>V=8sy*zH@tA6UxYd&>!Pw2T*fqZ{)A8#4? 
zRde%}H_YYt^E0AcSyrvFO4-7Wi5giEy7z>3#>1P%Yc9m>Fg|wri=E1`cfbAVhx4vq zsix18wyOWDyx71}dBhaC><3sK%C6$8i_`u)6SY@BKBUp?{uYj!)A>2{yl; zO%EoqCf?gL@q>t5QfnAG{f22w5|hR63nU`Gf8+wR6dVkYdv(#pm11j)80=MDf9WLf z&h+Q>lyN4vAxDw;<1yDSx&i`QEitV4G_TPk7yOTgTS)y`ZOUCD&L z=yRyGYO>(mkC4ryMmj>*-q~X^Gr7%yJ8$i6OjoN+hNAktY6PEr%&3C1bN~aTqh5ww z5SY_cY}Mr84`j)%ULqwNGylQFIADwYmlCzTqf~s(T&6;puiyiESdLw781+UXnVR;# z#+<(Ozzg6Jt@i$+D<2$?7St*OsD07ShR>nRKSQg7L|MK0 zFR)QW?|ZmG1YF)W_%54}vr4atRz$U1%zNl4A z?k+`mADzZX5kzyT}_+vC2x%3w>c5?!kx zgdjU4hjws@4Fco@KVBh)=ImITLw?@Jyn*-xLF=qL$P!S9U~r7T$gM4gS=QRcTof#D)e@SN+mnHEL6P^Q<_f;v1ybpEZnZm!ddg>xWD)=ji zfBMxEvJ6$z6?6aon%VsBd9&G$9NC9QDq8#fMhn?y%RPDNv+h}kpYqn@_He3h%@1#B zhkg|ohZzYC-|WB?9!_0O?&O?j*NF$vd4%yfIapj{@`(JVZ`K!4!h(j-FhLkFNKQYwCXgKx3^^Md908MMc2Y2v$H)mTaw68>Y&X83ma#WXc9@ zwG{yk$`BcffFy(sVTBhHep{rurLIp=fEXFbpR*)7ZS z^L450ppQ?ZLb{iduQ$*J>>~Tz8x{1G2kHgJ*qf^F$h!8UeqagRYQsJMZ4H#t$Ni~VjoL#MiVwy5^y+Sk`zD>zHmT>Z4&rSDoos$0jNjJ}b7 zuE|jt9`!kH4s#|6T}s|{ne5UAX|AtQcDKjOEyZc-xcV+VSxLf} z9{oRi?EfC7Q`dZ0Q>c#^aC5p##jYHEA!@Su1{K6 zH_mEK&&gb$pVO#vzko4*a*cnaUOFAJi!=;di%pVU_bL5=%%=Mxlx0!sL?Eww6glbf zCIWJ~$G@AXBrWTy$0}xWA!DalA_AiWNCv_W&A7GS73h;zD>-leLd<8irY6)%6{rHz zCC95|ANnfKbEJ8D%UVU^4aZ)8%&LgO$zAmb(>v85Aj37GKDpc!!lw{ zhV5pgjoO=>_ZcNB_^yS>o8^{;SJ;SlT*VCVKx60 z@XBhSuZ*XPe9?JGqyEn185z2pAxy+Wo~R=}FccB|zokKD!$$kCp}~LY^F7=s^uU|` zRK9C*F`N`&Bt~q09JmjXa!!-8i5idC-@$6hZZVjI8W~bQT8~%vcl9-e2Sn~U3_fJX zc!e@CanHy8J`3OIIviqbZ)CmuI7)L#9h<=0ZD3bi`B|tWkp#G)p|$~(-My7IVV!N_ zVsO*$);$qg0YnX1u(udzX*w9L`qZGx5z`_tlemojPxexWQm+AEvaIV$jR9Fe%s&nn zHwIL=tSeTC=ogd}*%3Q=JMjegTJjV1jU)hn#=MB2cN3GdeZ=rnQ}i_OnDva=4`#94 zc@6sWk`W~U8IL!GJ{)nSk9o2+kE1pp<2?1CC;o-=UYDu@7M9N7cg2H^ES1zY> zUiVzvZvf848OBl1E1h{?6jAJZ#e1mrDE{rzm^OZ<(ORwQ%Dta=6o&ZNIdT5ScvT)^ zxAwlV$2^qXGL;N6P`t`xDY*hU1G%QVu7Y!(MoseF{lS%)V`H7QP#1~xVc4m+!gDTM z5Cs4N;m9~XTD#d>x@{~0grEY+y#ZdqBNaWu=RsJ|{ zs3a{#12Zz+!IRYqCK5@YIn{4G%bnJGxW3Y|<#}b-@=*;s#yD;1mAWwL0I_$Y?M1J) zapvT7NN-}gK;EzR9nFd3GNn@u^(K~!dDJ#xrK>70x2D|Jmf>~{lRGQ}2HS^<@jg6gJW|`+u?P?<> 
zJDQ9W{pxHgeHw?GmW4P9e6#(XQIFkcDsx0VSN;W$@4=hRm7m--Oz7?5Y`vDoMxEa2 zBZ4KI?x__QRe`bSMnmeD#_a^iKaeSH)BHuZheWYL&`>rx*pHXoH1{7n;hjdBWPdss zk4ihOEjvbUw`-u`3;2AE(&~CKBPxlb#o9qHN33i+hfWN+DR7pa6%cJm+Tuwldc%9~ zpOw#B)IpJ+g=S3F!=c2E$~R`y={4Z^%I zH?MPgCKHL;e2p8b-xQXfzs3XgB40aL)|{8;*WBI#Rb&j0bOy zHQa)8I(a;e!#(p#ouSFYXQoaCA|-Ky3v_*tx8a9MvpH-}c89d=vN`z7YMQVUc-~Nr3Nr!Ohx(S)aeNHZ;xy- z5zNsz&kM`~x0uu%<2C27_;UHfb-~&o3`7C_mD(6UT=V^(XVnGu0?vulUzaQqA3BUw zsW;tnWU5LPd>wlH1Hif^!oSqXR3$c$+QcDl3K||iu&X7?{2U~f+7%%wZ^%mqB6S!Y zdU0UjDEfuDFVx8@`J8@c6v!P7v|9kW4O7{vhtNs34`HP+YY-?aVYmzyRG9XEn692| z#>FOxFh^hVu1-2hjFN(gG^phUtlB8OH&s_OjC#K6I=UaZE%#1Pg};e+6!QkehIrgOT1aF5d9Wmv9;!O+%A4BA^F}aui8#s3)@;9y;vvSXD(v;DLzVqE_vQ@*{hFxo}JfJW1Ps+Y zC`^Wl1b}YS%?m~$29R8@|GR>mtkox+Qq;d=^R>&Y5~jc@z76=(-6qs3PR6ET(GH;V zj>vw4W}H4=|F}DU5MWeH#!|=WWn$u~?|Hw-pi`>y_*_r`_;9>McqvT3^oOGAdI)t5 zX0e!k5R4#Mcb?8xGE3*YM8$^aqn2D_?=gTBrYuFn8h~Ph#zn>imH#B~0art-;Nil_3y{1*)2bprC@*xn9Uufu|>A2WP$= z*Ri(1FJwWbOjS~jjy&X6gHKknwCXbJa_e`~ih1eFF>t;$Lw4|zJ zEe>o)RmMRs9}dBDxtDc}3P1yKJk*;WWsHQqRLPwa2(ixJmw0~C)v(8Nl?St~Uj$FD z-{{?N;;Na72Grmw)7SdO`Ywj~!M}Ug^1{O~E2Jg<(Gi*FAW4vhh4v!8ThMm(&Li%< zp__V(I65F_3ZeoG(1Ro!a1_cWTPXu}cU`T7WNTr}CV=UsWxZfWfm9Bk--6*gf-g>) zTJ{{GF6u!Fs(!TaPic3E4ic~mU+-1iguvLCsI|7Rs*I+Z(tLl&e+YqHHr_8QJW6JHKU&+6B+E*wp=z{ke7_f(~=%;#+HgfU3ztNpZ z;UtFX9dP=uCa6-%48Wte50tg|?IrdsYQ)w7fk6o(#c(aJON|#zT1`}13-U-HdBX{- ze8n)-G@TSxdA-%g2*`EjlbaaH<`NOrIsSEA*3Cg^Zy(m# z9l9uO$=Wm*9+1BGA-hpiaNB?jy^6eA_2g#OiDi=YpN@j=4s|=(B^u}%*@ecqo9?MR zN4?I&9o74r9Syq-SWm!geQe&IekEdSU|(_d1dVPKEdaesm1$;KnI-l)JZX_Gwe;?0 z9?<@AYp8;oeJUZDZBi~EOG)ceQbLcWyLNGeGUZ_S*d^tqgwlHvhf3M+T77X{Zn}V+d<;IU)0i z8Gg7ElKtwNKy|8LG3{kyE`}wm)4Sr94ZQSLGCpz)-dE+q<*?@?qSG!#q=dD9|59!GRcBWsw_7{Aq9R$*GNCHJj9E| z+E4&&xW_ceB`kg#(uA{fvQgCnkk7Qel+aZkt*^K%Ikm^+Iw~r-n@ar({Qr`!$%ql? 
zb`7bOoX;t6v|%0|GWkfrI~PHUp@CCO{%jOd@XlnK=R><_M-t0!CLhVP10@ZH@tMf% zI-{+`7Qs&?(f34Hm3I|=57u?)X6Q=KcbseCOp~O{vf;9ZNC!@9@RE0Tc6Y8lV`00I zc}~@*e}wIe#YT(TH%;~gP1dO|7rT8ZwXbc2m_MJF{ejuU-esSwaf$KqUBv^BbWN;% z=YP4908UQsAQ>NCmrG55LJkz(RE}OC%uQhllXA8_LV>0!ZNV@(d#e38cHd$;yHTV~ zwjoU9pQhn z75or+p1hAIVCUvjFHMzr@*NTGypU)~Ejr&Rn01h!>%JUKnod&Hc!WT7Q-rU`vK3`_DE0(Ryj3^)bn-alS)?%a7XI}2`)f{fDx9~ZhOk$ShOeKJ%m+1dmUDTi6JLTUo zd7I-GOzvQ?-3wqjZ8|ilc$lu=UEg212?2rlofL-Y4u?{ublEe zm3r_8F~rRgle;+eUP=3^wK84+6F1FAIt$r`3V1*g6Vq3+X?|cA9n^;h2kvZJk0uUx zMK#E|n;}-pgGV7So=Gac0e`SEQ3@eRcYzG_a(r>BsT@JNOYE;d!)qKW4TH>6*|Qaa zVV&JbG)Z9X5S>JdT|aKS|AMIubyJw~cldv;$3=#DP#K3LHHVB1A_+2#)-wIC1>?8W zby@~_gvHw%R+(6u^#;8QLuiPCIh9g20VQ)tQ6|ZmNtHi8K_A1Iwq=WWPW^>qu8PJg z(MOJf|7W(My}Nqy(l~#w!w^GDD+s5x2Ktc+ZmnXDg}J*I)9agigEKiR_fEo#{=FXP zeF5^?4z53d$>msfJN(GiMWna(vK=GpqP|T$RQG|0IgCp+GlO#>_d02Pc1sjiExqkn zuOXYVF!;?jyg3Er`Kl>?N`T&X=5GLBko7s9=?g$U2aR7u20~)2I+=U-=r%V&wo6QI zd!x*;9+h=oPESCc@v}S%=xn2^2Wc_L1J!TxGRuhP`(Fza6CpRx6?+p_I+MlU2n7L) zm@k;mx7Y*G-_!FhqNrGj>Z7Bkn@NUL9Xhx`zi~~ZB+df zbyba4Qbi~>OozFnKKO+fGOLcrE~k_#e;jT%(tJP4RjyCRz#cPx8ox@5Xv=@&cpO4S=PMW0Fjn`2^C^+}9JnOXbgJo)PPlPprl)Zn)LO<5GMSwb zNx~s{)miM%$SCJOly+0;s;Ey7++Tgix2d8Ys&XP*_v}1*IdEM(+mVOb4RUX&6aAf2 z-R7;&tNWr!r@(AD)kohrJhi;xP)TVJT^RHp6;D;D5H3W(-9jb_!Z|=QVq)nX3PQ{3 z!FAvLYAuDeptuB*E~(Z?xjn7QySgC<3`JL0car+~1%H+2 zsMLww;mOD zDfXWR?UX(#IM4q|s`KjjI=%)LW#98FX9uw%2113akGa2y1MQN)24taj{f&^s<6h-N zdq9pOXwojA^gk#WhbZ_;4K_hko{5Q&{;{94WFcfUr;>ue`|4w|kk@n!KC@$6qD>wQ zH_^QJ9My4DD%{1#XCb()t!+A6L(?nfmAtBIKnvWbYD8j=UvqMDBKqU$ks_+#8c&mO z6FdIy>7&F%t_+JjMO`bkl>|= zV#aDx=Nsz*omNhI7Ks1QLTBovpPGQEnPnsqsf=Pp@D=$1UC!%$h=UAr)6S1Ibiqau z38NMfMFA@rW7yD1A*t z<>fJ1km?}~y*k|+P3;7Oz{l^U{1e>s0#TxT{OjmQ_QhEs*KmR!AvtR>0u!T7_6odA zS}@=V0q~1)A(|knUy>z%V4x?f0-%`YIdurggBf`x+x@(HL5MT}_}Rhx6swaLbO5jE z+(Y@t3PiqK;|K%=dGS!t2jo~*O9$Ap>gj8lRw_JAP3Xf3T{=VFzn5jh7neW;0d3&I zUpICR2h$sSxO2C+y~y>M?rCp!G1)FMu**exW&z|HjA{0Q5x?bYZZfkU>L7Kl%VJo-E)+?EP3fiy#98RRWe_hiwB6kO 
z-`v8x`%9kkRAjZN-lVR)(%%ZaJChYwBO=k;rLr@cdi#pXF2CNNt23F^Y>gj&$=psi z1gG>4@yGuj^cfr;Q1zIbL7XJ;GeM>Pwva_vhG%GnX?!RA^=&=g_qdu!r`3NTno+Pd zN`OwN1?9G;KN--^pSo>Y({3U>XJ4{gu~B4IN{~_KWVtFORQn?&qies*a5l^e&S4sW z72_fEHfK2CMj(=hz&Nt=%(Rg|<78uHF|aZs^Ak!N;XQcTUGqP3V9S`2^d7UXTc;*N z95^&nG}F(=m?pa?BEU}$)*Gtc63E-SGW}#@)~}kHUZF-WJp*PM1w`-4Es(zzT)j83pYo!C zn4!pdLUPMFvR6^hsADK-x0IkNX%5W5r9rGAW)MAu)TfSyrG6^$^!9^1YHt8B z7`Qsim`BD}O}p-{7x5(IbSf(a90_^rxs>j@U=hsKq_`%Ic+it9As@`zY>w54^RRe& zM>m*)tMACSIsC%KDJfIbRyV*T%?}Ihc16q^&~n91cSEXMGw>l@%gQEBnNRNK`H|lF zsPLecQq`5H+3+eir|f7*On85tlbhst%WPq3HNA73}3aw zno6*0w^C7>>gz>N^-oNEUfao5x783n^_kyE>9-~ty;L&6g#WM z*3T$-dY%<6K%8JO?$3*T<~}7Y@s9%#yR)?ER^<$+=3pP7F6-@(p|Q!czLc(wKy_%< zXuK2YHudl zn(Jj=rQ$^xLN!^?(;W}M9FmnK_D;mrAwo#sF?DZ;s#T6lx1t4QnV3qf4D7UzRA-hQ z5F`z)+!>eY4D6jEMLwclYAI{6UTHBG81XcMEJwst;8+=zKJ?hg_}-Gadyfbi8R_JnJcM#%-ttHTdCAAmfMp;HxWkK#T_?&cmS_ zr8bt{DW^C4=2%^zR-S(wrCSAye(bTT@A5sTdH}S?1MLIZlPWWV1CVr3s$#q-d9jW= z?f-(R@@X^z+?_R!aOQUgjU~(EFdwueYy1-&2Ay8Qr&%cVEXN_^x18X z9hMef=Tx|j{$S+Nea&TLWt0o<=u&}0awZF}^ev3(zxk?VFKhI8E?HZ;_epeDwRrkf zYHNgeug$*AG4g&l89BKcRWb+uyit-z%AI%%I$Wjc{7Q%|j|i=^uLbf->_-+R$JrCz zl~0_;izTCa{091WvL`-xPp0#PSV|`exf8Kg1T0K`0M9i!Z*Dd@LD8ye>fYAtirMDV zTr$Sp^Q_67^$!agKFCJEUq)C$uJL51s3MO$VZ?_&Fqer3NU9L;=}P-udmHHU)qMji zK?;OBSE$)t7QGpq*PsnMt|O6DT&mi$^bFa>6zbd-7Y)5@r|9jH#UTT$Z|koz1$^18 z*_`wYXS*q*Rs(&_>;bm|zYixGUPlD5w~0L2JM>vW)noD@Bk+jWP-!5zxf(DQZ`EaH zh%sg;%Koit4X+l)eW0)VXWix%Ja?3{U91sJLQID9uzH`Nh|%P}{S8m!5;LdO1|CB? 
zLBYumYtJUSaL&q`=Ius;mGu`*_61BFT^5jRT@ai~?BfnmC6e4fP2KKgKW-2l_cRg; z873Y;Hr`ZF{cVdN3RFXtT|GMct`>X=NVF|B%}rH$C_Ux*`5-ff*aAaXJxoW`7AV;{ zd`4TW*kl@wb#VWzArE+O1ARc4+r9J&cHx$C$kp8)I4h#+KX>%h*^7#g>Dbj|VCCD#w=(?DxKlLB(IV#6P2q2n9G7oyz{~nB} z0d0z02v6{vAZ+O=Gbv(go-0kAbXL({)DqQ&gua@I1{ZF|bI2g*_~AIfaqgJDFOynX z&^GPS@+4JC*gU7KqCuuAaCA5FLd?F3Yp>KL3}dCd|Az_PkSIIcqwwFi@HF&0KJTFa zO>ZM5?@>f;6qtq{T*MvRY9VJ1F$@%?az?+C8!?t4AAy7(sI}#CoReYoMOMqY79O*^ ztK{!_i6egCcXtT|MD8eL$UVdp7R5i3=`4C4-Em!3#QO!f7gcqgqq=^S$|VF%_e>t7 zkD$WNzQRr>6g=|$Ux6@5SV9<`ObotBm~t<;=M#WB`c;c-{UtYez~eOH(H%-3&4+4v zLv+4%Gb%;J{j9Eq%WXqR6a*fRvEMG zD_qKbWGM$uLtnqS1IPR3-x0%vWRsz{D=Uq8^veQFUJpUw4s#MgL{4S?u23iIA^M|0 z-$St9Q^>2J*$^Y@A^Jz8h0-0z;}CUoXwmSZUliL&Y1@xH&7v@Gja}0MjAh)sq?g|@ z?>g1n=G8XzCTow>c*4%vCpqFDG@ky)Ucg$*JnC-j`WPP9U42#)F$| zB6G%*l0Ag-G7O|We>I=64XiZy<8wJ7$ccxxqX-q&-GA^giAC#7!Zm@9ZS>WRfh*6? zK#F+01@&{7UXTVb$M6@B-;-a__fAk3uNBwR)M; zZD>HBz<`b=rt7U|y{@IkA4oJZ*o;jb@&FNS>wiM;MH4>4#oPVPV1Y4#LN-;M;AOaI z+)Z-fTUI}{7RS(=xFof{_v!v;M4~Aa(1CiNcBFd*w zr6Ry6^vuO1tep!gd3)iOFE6+s@|IjOVOc}2Eg3WX!dX(i{9!6zC4D=XOh^KM&DUjUeI2xyFwlAXJICXFIw?EBaq70y6+jhw;Um~~Hxm>$$JQmqzi zA}FH5+wD?eoo5beP;4#18L^liKst+P=l>2)={oMarVVB>aK?G)Ny=naUaUsB2qsGe zJ&I^1L}1>7UL7Rl2!f%F9wTrD| zGES7)4hH9vQZBu*^_N|pE<{Oq=f~inQOvffXrN-7y=&35HyGo4g2uj?!CC(F?+S$X z?u^DTn75b;p7mjm%`6QqO}RM`&{+o~+x2+?$is-SgnZdBTl6%#X!tb5s+VsC+n&&>_TVrP$ z=*9`1P9v?q>~1`NiVxpelGml=^#fiuX$|R%(tMg8zALWNG^iA6a0?RIVBpWXpihi^ zE^1D_`cdlhT7fV7nnwqV%4W!wck1Tj_Wu#~L;snnzlou{{-Dav|KLR0Ig>4|L{&B! 
zKRrpef-;JFQBez*0@Tuq)kjq5P61tJQD;+%{C_Sgy7Ao>vsVE>mTq~YyZ@IhrGllM zw!Ze+ns#qxb$kY z+f8+BG`hLjEqF0tPMiJUTu#T_8g=N|n@HWLVG@J7wrZfSI&fl49PhzqgiNc;Xs%6A zVx}B!>f-dIL{1*9bqw^xd#@~+?(KLoALEDXiy5pD9cm=E9H!$UwP%O1 zjDKp>WsRH(fhsiZ=)g*(5SjaRz1LBKSC_7D;~DTV!`Gy`gGY_e$^^Sj=5^X%2m0>A!BX7Q4Bm9M*5&O>a% zLY5`L>@Zui-K0Ah-}@r7;!H>pV{PEOP%-DqsZPUZ=(ZXCjmxLGXWzc-dW5;HMb&rB z3vFj=^d-|&?80lpYCN@+Ft_c;emv_c(~EJrpl^fMR~Psb*&>+G_ znT@4=dU8NOhxIX;W17~-P&)nydulFf_a3ucyqAjf9N(_UdG_{5fqCiOmK7iW`L|Mk zBDaww5J{J$y{I{Tz#xd~R5TKG`n%AZBHi;nP@@7`$|MJ>C1NmgM4vbD62nYq8JXUb zw|lr`g{7BgSBtz5^A$Dor`nqv1a4Q!j8qy8P_E#L^>W=#5}XMu1_T$EoExRi`L#Rx z9t_e4C@h?-V*sq8?0qgy<0vgji%|ZqDW|f&9k%!Fp zeIeuR(i;Vvt*Z{<^sHwLRAZzrs*!8dyOpMPUE*9H%e!eJ?TyCQR3*{TcT|MAj za|&9c1*Ym z4+S}(ov_!DFO1`v9zkb*YH)nxQN-YiZsF7uwj z$d| zSWTq?VMU9OmiP}$wRoXAZ98s3e957I9Y)X!RZ>>tB26$DFJscU$a+O zXZX#;7MwXq6O5ktbGCCP0Po}jvquor&7ao9cE$}_6Qmq{e}?xyp-A}I5ZK~=xd^d< z)>@8KWAZ=$|2eCn8SkSjq-nF*Ds@_4EY>f$J3OV_=-u(31NYM$aXt^5K);qm(WI^E z%?3nIEO%mku}f^+t|C>Qm@}%3e)01q|BNJC?SE0sltP)0KnV_!Xd0p2S2u{cV>71Ew;r@!NU z1l--IEGXgUND#y>Ex(Y4|F=4>*%O9veTuZowt@eF!UY=M>5(AOP2)A}ux@$l>8nym zZ*89!d@VhfKd$&K+cR04H;+uNHLZEET;y1g0jRLu8p4ryEmV1oG(e%PI1?0b8iYYr z7il;kq0N3CD~k6u^=FybWbN5oSewa6xD2iA7M$=CGV#f_xHhvn7EF1gf9%>P$&N&y zx;t#m*^dL^cAr?rk1GOO_^krlo8zy;XA3m(NFCBa*H9KVXKz)L{%mo+GW@^wXp7#! 
zc>gMvzPk(6fMHf_VLW1ct0sSI{SMRWT3>9697Fid$8#+`-pxz)Q-^yYn694`^WQqB zIEzFN_RZ8QJ=g|g4;AQ+A0*>65(jIM1%?({nn6FwYEXrO!>xPBo7rk_XLyWO;!80v zAWG#~S{^=xIYOWxff{>okAp9=pS)g&VUu-cIWx=uu3RL9BoF@MbC$O{o^_mE&h_3E z-(gbZBHi+evjz(%$`oQlV>{DaOD!d#c-vJntc{qzYrh3oX;5OW4wgr(nXHxqGy>D~T_?`K#2 z{EREfqwr3bE|M?8%PemJYd!73qF-iy23u(TNLS+TuGBgRe5Kfpl(kmAYopwt;e6tCLMugudlLDsp{5eZ~C(?QvXlgoEM#sZ> zosp)asF(&Cyo!Pj=SmQPQ|V?XUyvCxLb-0&l14(5dHy;uSk zGUNy6Tjflb3=*{4AGpRll8nhjiHjXBRg`YGD()bb1v}?prW)&G*?9JHzjucvrPQ`j z$Gpgi?vQq&GuLD2O*Y4~d>~TwmQ#`0t|JzUH~D2M+)c)k;=Pt9DQ;xL!vW==J*9EG zVg*8v%)OfFK0(OavY@U+Yr!sECy0oQ@~B*weGp+zz`s%rXDz-OvZ+8TO=jtnl4j{; z;&hcSqi@w)Rm?O7OPVIW9}yJL8)sMg^xA||CVR@`7>fsR$uQ`&?X&L1$#Qn2Pj_iSMVZ(WfO_ln-+xNUq)+vAUIlTR8Z zI*XLMmDDX%2JS^#%~ZStjL|xNM68MtH&{tP2a2gKEK;?3|8vC@#G{GH3`b7r_>@@_L>X5f$M zH1oDrZpI(enhLbnj)$jOkA6Kdt-5ET0q>Crf!Xf-jU{0Mexzt6d!Mr=hR`gYPmEF%7XGxcPucpAr%2{QuJ{A<$_>FLeMO5iUkT`5;X* zW*xBz^ylqzmheK@5!JZR)VyPz!`T~sMTI9e*dOdvUtQ!7erY|0EKK}hAF?p51Cd%D z_JoZ90G?SX^0?RWzs~aTwO7K9grsTR{L*!@neZRXJdt4dDfeV(ZU3~kKfQynFLp2= z>#LK`p2rG;*i^>Il?>h4KL`ngeVtQ>V3jYK%gvlf3X{)n5YzXtJbMT2Bh;>muR}@fsvt%- z)`+l=QCM8OJ~$Mt^4=4ADnNTL-_GQuI>AlfO|sLmbI}$1Y0U`7ZfyuY5)oEB7k%|N zsCi$=z!L_v&LU5>v!|5fY+TP7(Y&!5L}PQAwgyg(i)2l|p;(1mIsPK~qMt3`oPGJ#9j@z8gZP zu3Zj?0;EWXp^;yA4@O{e6|&rscAGy(yHKzZ5~oT3gQRO*W_y_%_qG@D*si`+4ssN^1m&Zy~-q^U;l>j*mnosJqO%r)lJaD9CM z%*hL4d+ymj`|@r?rs1{F=A~CyZ9;`OEgRX1&D#0bS7z+Bknb)Pr{O_#&M!b$Yx&*o z*K?ZUoxT?dCgMwNSjjFCgqT@oB@Mn!M|NST>sz&wjHNP^dVo&j8>$mEsy%Ch3;vCk zh&$6Td_sTG3`CeC`m;;8g-UsvCC&##Uf3V+8EG>`pNFxkTPYaKBxQfZh6NUuft&$I znb+Hewcj_t@)AqO799h*LGJQ3g1rKnEd3+O1?P*M#&{!wKSzJG!D-wZe2SfZY&&g) z%ht<(-g~}vV=;EA8ylsYYkvt2o7(+BK^2r9jn-jf(`OW>|FBA|Ywi9UVXIwZy||cz zrLvnlVU4Q2)Zfns1`|de`~X#jPi6qk&iz8Ozv>4tDsESAP>cvGmk3eVr8D*(DI?sg zgeO>j!qvmb2zs*>Y2A#6gB+};&O65HK439tqI;}WkgqSRu9a?2>IQ-x8c%2Xr_;T8 z8{hn^aDpjl*n^9S(yYw1{bBxIH(|w}AY(h*QXYor15UK4h|`t^qN?uIT|yP2z;;EQ zQ2r&`QB;=X=-rF%wN~S={IPn%BV_N*fM9hRLqFg5{IX|?#8ur{3hsz*zT=G`ZDf9S 
z;H>(Ptu(3t)HHUm3mc+Lv(nswc6B1h!8?>o6UhNJ!bL=&`moM!Mp$2^Z%opqvwKhd zA^t9Quu@*&%3?S+`Ks^F_mcHJ{vb#adL9gFU-5)(EeYG&8XA{tO#-(cWWB&OL{;&aw#diN0_xrS|x!cvL% zp1tk2DZVrkL)G-J=op!8d+)>&Q-`|(e5OB`+5dXLhL3Z``2hWGR~Yki zSWr-#9~rxc9AOB0uq$h$?hKs@QOtbY@OV5I*B%S|1e$-sSDYlPWJwrYoarV*&0dvD zr$9_8Jw&tThkbrRCctWM$588zsY)BmGYdJ=iATo+m*m(F_2M2Xl6&v#I{Q9oj(a$@ z((~xumV!4=pBzlvLBML?iWv@H16+IeqJ3McYW~>yW359xx~$c*75*m<>%gV0JnnZ z8)2Uu|H|R;u3e7@>#&;1gSUXLNVKl@v*n;9U>RhX#umD1sUCt z|4GN1up^;e^e~Tdtg9rE0W|6YcF=a*XI$ZSBWR9LAxG0ZxO53Pr;v`%>$;t|zT?5= zYBXJhLWK6slCe2j#}?kdI4xxP;P&*2j_C66$Qz@kV?|x2dMabjpfuKjG=sSG(`N|88L^N8nAlP1?4%;@p2S;cWo@i+$)Vw%dP+ub^7an`$PwS8( zeAC3Sh1xutl6Up!2ZzYOg>$ghm#$C$d5gX8Pvnt?R>IQX;dW7p<_qaoITE;-eMS4} z8p#c~{1Ar2!en1EHeRVU_Q`?3MzypqYnenC-{faJK}ckNB@P>YY>A!sa|c7yl%zAG zUApSGu}CZJ+I6k~#(W^`XrDDH!l;Xs1xT$4MSpX&h0%&mxE3KNyKal1{p&VQ?B?uk7s8mx~ ziDk)eUct0clxafZodqR3P{4)U*61f#Y^qv>Fd?~t2fLitsru=!ayF)y95dLCh>`aO zTcUgJ4OU6`J#Df)k^daglY51iPnAhleO}Of{kFm6fnGf1pDI<4S{qA_X-4|T;O$mY zUUiM3)P0Y=yn0RsE9H0OhtO{{;+Q`PlF^Uv4JP7NpSttU->HvQqj&+-^|}dZ;jZ+* zvniM;G0@6r-1{&rqO;uBZEPFT1IDhd3>!e$CQ^t9ZL%f`*7a)MWFC9E_Ko7RI_onT z8-*=Zrlsfq)+$%0h0Yezfvm*)=#^Z@$-9W`+-9ijq1D#)h$ly5{{2dFPt=MX^m8SU zBs$so*c_dA9qTGCq>H3?fhhpfwGo)EQ~NJ&Sc$BD?pGA1?_aN~j@-7G=TC5no><~y zDbHqV?{mLM-0;(W$DkChBRWAza%Nxb5I-XY$Jd*CNz%hX!c09w@0eLGPwsbq`EJ&M1jSUUh=)?43e|hpH_+ z&$P{PaiZ%O>fSf|r?Fy3bj2~;LE4{Ru_6YtU*i-HDi5a=4d0yJfUzqduDPkHbIf>s z=5XJhh3o=(8WEIl9kIG!uqI}7uk)j%j+0^QE2sa>yvI2*bUko{ zXhM$)txqZAC@L5rpAK;@3W8G?AK0D-U4Q6r#}BlqZ7f*gpaEVfso#8Q*H9kTW=93L z5ety}7g~|XZzxJcx`->Ea>?`Ysh7lH(uXt5vU=Ni$R+gV$ZNF+Uwi?+nTiLq;REcO zGaF-(r_mPe$Oyw)1*b@xp<8E#Tqcrn;`n{|qaSL$4lth;__i?q5 zoqt);WoVV!rSvM~I7XKJIQrR=bs!tT+R%>2eLc|CSJ?r$RoRHzgW?_@geUR>>Bt>u z8$Cl7DA80caCg^Fd(^Ww>Kplp>F_8hU}0f;U*GWuSV3}?{TQfCKX~~ox`bSvY3`Ti zulJcw@?GnizN<4*1T5oo`V)V$@kRE_kE4}cF*Tt4vM$Qe=!GD(1I?~qR6*w+fAcvO zvX2KVaSJQ11cuIRFIF;o!U3^>@!E;oMJgXxA7hipyk6TfZDb01hnYLJXJ$gzzM%r94kQOe}Et}Y(MUW8f>ad21l2Mz?P)de8 
z%;g!YT1pl8GHHw~U*X=KIqtfGUOxWQS~E(&kk`i<4IlJtSRBy$Vxos@Wd@{;Q!Qg) zDy-HA%}UFk1hFr3{C}2xkL)9m*@58nuA3&14kz;uf@E4RTDWlIcTj)a9{*dnr30(H zPBK5@zls%@-jt+4Mx9%+(eiGBo*bOgos-lqE!~FpXVmKtHL~5k1pl4+PLHf;X$|ov(N_ zxP+Vlf!b2(_H{a7_xCpj5R8bI$6mgl#qJMsUPyo5M(OqG=I3xYM-KJA-7H~p_~8}M zJg>6e{eqY-Ork67SWLRaPQCc^V6K?J2K6r389=@& zTIAy2g4_!&okTy$9uZ&C$~c9L!SCxBHhqQEp;jK&gaSgvA==I833VAzeVqXrTz_nQ zygbD!rz+L!tgw{+F|(YGC`plbo9SSrsKW+m-)_wP8^o|Udy*J z_cYGRAg9CHO2t;(=>gp^<*MlSTSpoTuYjH9Bq4j*LU`}qt_x)wLSuq8X~H&f zi2V_m277Xs^N0@+zmK2gt-*fVW5lEXDeH^~x$A@sXBg{d{diE@aoeZGT5L$?!modN z%)Am>()_b(gILC_nuNk@c+R&SaZZcrFFeaOMr(DQfVZIWw4%7zA1){I7B3{Vs znZILF@o66H4q1em8W|IdYp?#sDaJO%Uj*BLnMqE^+Q6HKlI6_(tEy8a0~IiA^3vQ$ zvTy(2LSQG9i*~KzF8G>|=q46{Gp}+E^}gF29_>X>7CI5Th$pah@-t*;U;}?J*zU#n zi0Iitk@49bG(0h)g+5kKc40zk@`6)-ha#N!pg(^4hRI>a&`1ke#mIkgXE!efVL4AT+`>;4N9!yd; zJ&Y}*H|v`SCp)e>+$<}I!c(+hZhs@CRliSsU@K2^CnR)n!;KEp=UIY<%dzl{p|xOK zN+ncCe=$&awLijwKI~d+m-f~oK;Lgp>$ZsL=I8O$U?(KMftwi54?8og_To6_=dCe**-ahv&PW`rCia$;hJ;Oqv>p|QWP^$O&mG+vL@PrjLS zS@i@$j~Xq>8kc{BHu7%wJc4mOd-(_zvNU1Pqn#vZN6#1Wyo*xUfkk6}3bb%-pnrv* z3A>z|I%JS3!JPi00y;qxDLg0{5oF*!JbDe8{bN+y?!Gxw0vGKh#UN70qjPbfe8BlK@_cB# zk3GSRz&weH7X_#1ykW}A*9r8#J6{1p;_Lbf9`= z6$p-@Q*bIZwLBWI%d1Cee9;!IJUkyd^-k?L_cb;H0Wrs4FX1*nYOgfoQ6&7Y0zT-= zkw4-@z+PgvGE)ykt{DD*LPA}FCLFzDjq5wlbIrhKJ9=YQwGk(Q&t5`Brp}_rQA6>! 
zTpv?VdrMy6aIhsoa2I024b}74Bb3*Rt>qwg6lk8UCT3Y=#3iS?O~E?qeH#a} z&{s;1v5UHoP1em%N3|YatxD0469$)X8k)2|{{Qv_eyT)?R+nC+O)%VAh{0fKCkU}h z#V#`@*ICo(yXe9 z^TTc1TC@pO!7cX9TGR{4#yV?ZtMgID2Bh}2hUt}-e>c;_l9mGa2!lCiH552&6yK5a zgg$R-Jwsi4*Pq|zM`M0JaUzWL!|tDd-nKvdP38ChJI;6H5MQ>{l^^@9wk3oL+5T8# zdH$YJOz3{?@3&n#yX%*~a!>vK(@%eXcTqyFjU7e2);00ZxqlG{4($~Eetb5bm;trvd zZg*$*zn`PXt5?%*PK=pbs}49VbE8I_+7_iQs#21bI9q0ZYIMmi)ulIgmD)H4(2sXp zG)+HW;$}~72=yL|WnT|C3%N;dpxx^fIF5dK5ASceMLX66Q!heqqA)JOD8!2UVyo?U z$WNx*E3JZTqHOAo6HH`luQYi#=l?%loq0eL*V@MEXRWnXXfK~4vh~_qxhjf9AS?o< zSg2QlA|ycs5{iI`fEZ#Rn5Amf76a{dVGXFrmIM-Hi2(u;Dkxh3JA^f;VRI5f5keq9 z_)Zf2^xpFaPJ}r#XXc!D&UrJ>?~yxToUJWFZU+te7hI&%Gpv21eW9-Df>r%*qJ=f* z7x)pQTl7#Zw-`e+39EotLh-x?-`4G}hjkmd>y(rpZm|u#kPM2z>dm!#TdDNv-WJhliK~qk zp$L??RZ$7P5FcgxsK%#^>^SbS9D|M)4qXMj^&=cE+s!9bKbDy-d4+LstGV)rm^Jy zA@(}m0ySbb%|)EYB*$L+LQildF`sWbG$+*_jEa0C$@~AkdrX7*4*(huK23h3_HTcA znQvOB{Iyt5&z|AZDDOYc$AOA8rSc5{O#1=+IAd#s6qd#X@srwQ3BWXix6Vv^#jw9L z1kh_*E5e!08=m}+-We-Vd0bsOLA<~PlV)X|3CRz#)A9&DU(E+KvgG~uy(47Dj1 z>|t?5>(BSuqd0lFA;<50*>d-k6qSn`Lh75)tC z@(bFoLnUdQiQcOs0mC9UI^mi+?gFm_BP0Uz%MaS$ruGkPNqxdawQAbkzs(%kIh`Q0 z&?p~3;jn?$_FW}l3rnlR6g1`Kphru5fN);>N00cmMA*F5{P^lCAj_aCX2MeH`P?0# zj|D2aoFutYQ&KlQ-#$@4a1WT(^OC1N1^#^RjgkGAz0ekn273Tr1G{|W%y`ByLB(&= z$+NyG4p{SBm0b+ymc<<>h5dF&Pe23&-J%pz=L%JqMh1dCJTW;rS%YaHNdbvlFYx&L z{ZDXZP67e%n9?*0ZGBd$hyVmL4oPbxQc6x9BaN%jW5C{U-ka{d1 z^yrZLd!2_>m4V7<%F%pEn=IIlA}|DzOVF0|=c1|vHqipGGaS}b5)aMuPiO``?>8tN(GL$24~S| zgler`{>Ob}V+jBLp#Aqd^ssz~dZNs{^?ZP}*%zBQGd?A8A&T&;UEan&1CWUpnYV1Css?%yc#w$NiuAl!KdoCtJ(S6%fz*DKg<4h)B zBsKry0=v6r5RNbME-2S#OS)W_Wj`5~>9?BVjwGAx`~q5vlplvq zEI2GILrjzW>!0XhJ-Mgn2S|l4B#Z7i3HN$-@7|~ z4|wohiQ+M>P4f;16+bl~Brj0P2p~&<{WmlYcq#DM4d3$9WHR}=Ewg=7=wBDtf>OFT zgQUe~3prqPd_Rs$_I1rr$|V+SN%<>dmV1O?k&4|JUnV}W+IBJgDo}jNoi+V+FR*Bg zMjuQcmojdkT=h905=^u&01HlQ6_~ebUWHq%bgX@mkWf_cW{Si4?KUb*&V+QY=Dqk=@tDja=PpZ-FR7piEx>#;x6 zdwL&lU}z}ycwk3m|1x2^lK0fIZ_u_OFaO-oCS~sJgco@?;!;F;l^xSI(=i#Cm3Ex5rbSy6lwegsb6KY_m%cZrEvzA+nLiISFt-O=v5q^ 
z1~^A1O%Y|Xr}N~1)o<~a78Xs=)OBMr_@3@!yv~KykoQ(o=ZC};eEv<46i89VK+?LG z$yC=9{FCDnZAY=n-wUO+xaqCVTe_Yf^`*QFZw3r!JM{F!#f!L0ir<$J6TJ6or|6BqWh!|Jiivwxjmu#GY9X3b#Ahw`dFK81Zyw9A+2VYx*0&I9s$o{kFBO|_lUlLXYzR6*#GFE*lsL! zJlbk{DXf6Kr>0D&sij6#6$0i7d_Mnsv%VWkx$|a$L;D!y_^Hp1%>ND}A<0$!rsp*V zW>^c^kzJid>v1dyMn;OBFNFbzGw|LLknCEN>4Ol?LWs`H-@6hE z8SLyB0i&yDQgYaTq0Xyd#?YtSCujfiAMg|S1cW)7SqIh@v#R6J@zf5pVE&Kwt`$Vd zWfu!Hpj_NnO>e{@*m<_0Yq5%_8!IV)PN^xSKwOMdmYDAkUCZ|zJu=;Si-RXme<*sU>6D168!h=mm~-nGw9+(U#19&W~!^QO)To}5XIPEfM^;vFu65v z>O@XSDVQs)kKr8BRHfH`s(n$lP6Av2e%e1eHMSyvl`F=RYGMmd6wnAw-H)6^WmwyA z3;hy09zhwvz_}kdJfmppA)8&Tu}tTH6DF&OQ@zTALQ@O{^Z-#TAwd|!cxIGU=Tn!O z;%XNmOhLPimQbU6huPLomRK;|Ff#u@*V%6iUROOnSm*bFA5_hp*=s{s^|a(TvwKow zF$cqhlL(?r$T0spOpc<3-o&{pu6U-e{wN=way)llk#9JmZzfzl)uj6xR$!|vFSiNJ zx6>`)G>vkgs##9Os%1|~@p6+V#SqL5x?e$nC}ZbT-0Hy~80e_-SmW3%4VsfPy3Y(# zG$1ZH8M&2BPmuf5K1qm7xHPMH)*>;wtlHoTYNwvFx9GxjpikkQPfFw9-U+(=Un6MZnVmA2kPc{v2W)9IrYbX+>Nk-dF3Jv}t_MD?fgv~NGd1#6d1lVF_CvLLy12>fv!iB5} zFKAO#eW%-YOi{8IzS>w>Kb^EtzY7=7Y!H`RJ5}-T93Y~S!n>he{tE}A>L(G!9eb@d zXIEoS9pn7~^{tm4uDpuo8ncUCij?fM==qD1!;f7E(evA&lKNA}5`<_ImkGcTjcXd7 zV@McF@J_>txL}vYp4M0K3U&VE9t=g%lNRs%GjuKO;RWod+3(=2WS z3r1hm^ZXtOc^e#zG%eH=(~mK6^N13l1?PQ%(Ipe5TCFryF=--cx3w_A6ge`9vg=&Z z@O2!UEgmU36cuFL8y|r4wqEB{;TI;nl@mOSOIEy^Rdwu2k?&rYT^)Q%q)r{^Evbqc z42JN`f~d?ol;X~;%ENn~e5n!I%e(=0nd^olB#OO3^_*ATu2z1sdwBq}Eo#vRKX{pUDJQf;`v}!r22U1xYep=D*=!7^7ynWE5!#vJ zwx$|5cQzs$A$&MFlpX9@*`C1|>UuaCyM0UZ(Pi}#9YVKA**;mMpn`LEeSNZ)&neVJ zMqM4qNxR+dnp+bDo+RHL+CYovp3gAdI}ydGLrwZ-IoGNRW>r$qmSk^y0n7(v_YRj* zqBv<)0&&6E986+QL-}ZmTbOXvbaWwotBjBELJ_vjBp<`eFAqpj8TP~u>)id?b+iCJ zs{r~Ea;UoKtp@*HQ7;+icFewAWJCuLl@AH+|?BHO^A^a+w`X(>)t`T~fH0^A*-@zv^ z6j&&AKAe<9<)%>Zl35kYc%l9rUVM|6j;1(<`tII4c$`%0*CTZFUl?|3$g2)6iL~C9 zZBLknr)G&{5V@IN*G0v4yoZy!AUeYu0XFiWm=-C9;gh{T#k0WXz6g-qn}!V;b@sKR z2Jy4Yluoz@WFPEzI|X%2w}_1FmglrWB`hWQ7%8c(BLTt$SG=)^l7+E%@H_Rn;3 ze$Sr69vceguYp% zF#Ih~%edSzDeaRq{nC9o`?=98>PtC%>?z08Yvs^XlrK8lhL8$3NE9BH9U*Muqa>31 
zjntITfvNDNS34<3Eksi%vyg5>FVk0xU_(Gg+6c8<%9aGPv2s=(2zW45LHo7Xr?kO2 zY~k8zTs8mcxIa@oJX}J<9I4cYSUW_BO` zb`qxMjLSo*c&aZ}Twk?}8!=HQo~SW@aNnhzJd$IJ;6@+TtU2^J=*F)b8lv&4n*)2$ zZs)O*PbKwpUCVziOb=O~G7~o*vF6WrV$`oQtk2L#xc)wl<4rlkV>{#H=p*b31dVMF zO&@7IvX3{0(Xjq-K~YNSC5x5kY2j&(6W*;Bl$BfCSH&ks9F}F3jpY8YtFh5^Qb&uv zZt?N%UI_l7TL3PMjMF|r7dggU$a*{ndJ(i}(^*RMu}7qOA)JiI(sto20D3L9tids2 zp-duO%d{bo6|1wv-f8PIc3GNn&FgO3ac|l`ZrNU<00DVL0sjsaYJ$^Km?lwt$)x1& zU#^&4X+#sU_bW5uXxgtd^B+htzwC4{#3Z2l9{jo1H8C@^*cz`w^@5 z(%Zr3{DCoM-&aQRj?UPiqSlkICiz3e?z>0Cn#C`buet<(P9xVOzqmOdGGBCPemFxS z;2$d6{*E*K7#J72oIdBcnmvEn@?e-wEPAVKkpa&9lG>MZmj{XvdcB8g;$K=J&()~q7mlo;#)gi01ddKs`tJ% z5)8IAY}0U&=lVlvdYCX|@4m|0o2J`kRXgs*1R#vr2;a=QV|Bj$%1b+qlm{F9Hg)CV;K*M0PhALLz zX~>RXwjwtkeYIYgJb3*C=aE^k88SA0ez@*Tzk`2lsoW{?nw{*4XGD~FbD?vslWqF= zxy4d=Fxe`jxw=(k-dB}`j#+SG{l9wwW zjBn-|7r<79NM-(hf)f6k)<(?>qBlGlo&bE^O-yT)qKtX2)_R9SDY8Fh23g-V-`d3Ladcmn^V+dc4rPnr6D zZO_H<4BsV$h>=#_m1{$ogWV+xoTX+qo>WhcPO+AQ0123zf1Ym0HgS4FcEV5T#r%=} zUwLnE7!zI8*TZHj1C-v$29=K0PBnfX2yX-MmOC_Ayi#XCTkD)G>X7k>{>=Hkw+CA@HA%BkDiYbW$AOGvm0V`m6?H6kEYTgU=ispBRzw + ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ DEPENDS ${NAME} ) diff --git a/test/unit/common/cutlass_unit_test.h b/test/unit/common/cutlass_unit_test.h index ddbd186b6..81908265f 100644 --- a/test/unit/common/cutlass_unit_test.h +++ b/test/unit/common/cutlass_unit_test.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/common/filter_architecture.cpp b/test/unit/common/filter_architecture.cpp index 3bc2823c6..0c548bdf8 100644 --- a/test/unit/common/filter_architecture.cpp +++ b/test/unit/common/filter_architecture.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,6 +71,7 @@ void FilterArchitecture() { { "SM61*", 61, kMaxDevice}, { "SM70*", 70, 75}, { "SM75*", 75, kMaxDevice}, + { "SM80*", 80, kMaxDevice}, { 0, 0, false } }; diff --git a/test/unit/core/CMakeLists.txt b/test/unit/core/CMakeLists.txt index a7d0e2116..d72f42fb0 100644 --- a/test/unit/core/CMakeLists.txt +++ b/test/unit/core/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -24,6 +24,8 @@ cutlass_test_unit_add_executable( cutlass_test_unit_core array.cu half.cu + bfloat16.cu + tfloat32.cu complex.cu predicate_vector.cu tensor_ref.cu diff --git a/test/unit/core/array.cu b/test/unit/core/array.cu index 72f5b5a83..5a8cc855b 100644 --- a/test/unit/core/array.cu +++ b/test/unit/core/array.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -228,6 +228,14 @@ TEST(Array, Float16x8) { } #endif +TEST(Array, FloatBF16x8) { + TestArray().run(); +} + +TEST(Array, FloatTF32x4) { + TestArray().run(); +} + TEST(Array, Float32x4) { TestArray().run(); } diff --git a/test/unit/core/bfloat16.cu b/test/unit/core/bfloat16.cu new file mode 100644 index 000000000..9fa99ebb7 --- /dev/null +++ b/test/unit/core/bfloat16.cu @@ -0,0 +1,209 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types + and is safe to use in a union. +*/ + +#include "../common/cutlass_unit_test.h" + +#include "cutlass/array.h" +#include "cutlass/core_io.h" +#include "cutlass/numeric_types.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/layout/matrix.h" + +#include "cutlass/util/device_memory.h" +#include "cutlass/util/host_tensor.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void convert_bf16_f32(cutlass::bfloat16_t *output, float const *input, int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + output[tid] = static_cast(input[tid]); + } +} + +__global__ void convert_and_pack_bf16(cutlass::bfloat16_t *output, float const *input, int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid * 2 < N) { + + cutlass::NumericArrayConverter convert; + + cutlass::Array *dst_ptr = + reinterpret_cast *>(output + tid * 2); + + cutlass::Array const *src_ptr = + reinterpret_cast const *>(input + tid * 2); + + *dst_ptr = convert(*src_ptr); + } +} + +TEST(bfloat16_t, device_conversion) { + using T = cutlass::bfloat16_t; + using S = float; + + int const N = 256; + + cutlass::HostTensor destination({N, 1}); + cutlass::HostTensor source({N, 1}); + + for (int 
i = 0; i < N; ++i) { + source.at({i, 0}) = float(i - 128); + destination.at({i, 0}) = T(0); + } + + source.sync_device(); + destination.sync_device(); + + convert_bf16_f32<<< dim3(1,1), dim3(N, 1) >>>(destination.device_data(), source.device_data(), N); + + ASSERT_EQ(cudaGetLastError(), cudaSuccess) << "Kernel launch error."; + + destination.sync_host(); + + int errors = 0; + for (int i = 0; i < N; ++i) { + T got = destination.at({i, 0}); + S expected = source.at({i, 0}); + + if (S(got) != expected) { + ++errors; + if (errors < 10) { + std::cerr << "Basic conversion error - [" << i << "] - got " << got << ", expected " << expected << "\n"; + } + } + + destination.at({i, 0}) = T(0); + } + + destination.sync_device(); + + convert_and_pack_bf16<<< dim3(1,1), dim3(N, 1) >>>(destination.device_data(), source.device_data(), N); + + ASSERT_EQ(cudaGetLastError(), cudaSuccess) << "Kernel launch error."; + + destination.sync_host(); + + for (int i = 0; i < N; ++i) { + T got = destination.at({i, 0}); + S expected = source.at({i, 0}); + + if (S(got) != expected) { + ++errors; + if (errors < 10) { + std::cerr << "Convert and pack error - [" << i << "] - got " << got << ", expected " << expected << "\n"; + } + } + } + + EXPECT_EQ(errors, 0); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Host +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(bfloat16_t, host_conversion) { + for (int i = -128; i < 128; ++i) { + float f = static_cast(i); + + cutlass::bfloat16_t x = static_cast(i); + cutlass::bfloat16_t y = static_cast(f); + + EXPECT_TRUE(static_cast(x) == i); + EXPECT_TRUE(static_cast(y) == f); + } + + // Try out user-defined literals + EXPECT_TRUE(cutlass::bfloat16_t(7) == 7_bf16); + EXPECT_TRUE(7 == static_cast(7_bf16)); +} + +TEST(bfloat16_t, host_arithmetic) { + + for (int i = -100; i < 100; ++i) { + for (int j = -100; j < 100; ++j) { + + 
 cutlass::bfloat16_t x = static_cast(i); + cutlass::bfloat16_t y = static_cast(j); + + EXPECT_TRUE(static_cast(x + y) == (i + j)); + } + } +} + +TEST(bfloat16_t, host_round) { + + struct { + uint32_t f32_bits; + uint16_t expected; + } tests[] = { + {0x40040000, 0x4004}, // M=0, R=0, S=0 => rtz + {0x40048000, 0x4004}, // M=0, R=1, S=0 => rtz + {0x40040001, 0x4004}, // M=0, R=0, S=1 => rtz + {0x4004c000, 0x4005}, // M=0, R=1, S=1 => +inf + {0x4004a000, 0x4005}, // M=0, R=1, S=1 => +inf + {0x40050000, 0x4005}, // M=1, R=0, S=0 => rtz + {0x40054000, 0x4005}, // M=1, R=0, S=1 => rtz + {0x40058000, 0x4006}, // M=1, R=1, S=0 => +inf + {0x40058001, 0x4006}, // M=1, R=1, S=1 => +inf + {0x7f800000, 0x7f80}, // +inf + {0xff800000, 0xff80}, // -inf + {0x7fffffff, 0x7fff}, // canonical NaN + {0x7ff00001, 0x7fff}, // NaN -> canonical NaN + {0xfff00010, 0x7fff}, // NaN -> canonical NaN + {0, 0} + }; + + bool running = true; + for (int i = 0; running; ++i) { + + float f32 = reinterpret_cast(tests[i].f32_bits); + + cutlass::bfloat16_t bf16 = cutlass::bfloat16_t(f32); + + bool passed = (tests[i].expected == bf16.raw()); + + EXPECT_TRUE(passed) + << "Error - convert(f32: 0x" << std::hex << tests[i].f32_bits + << ") -> 0x" << std::hex << tests[i].expected << "\ngot: 0x" << std::hex << bf16.raw(); + + if (!tests[i].f32_bits) { + running = false; + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Device +// +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/core/complex.cu b/test/unit/core/complex.cu index 946e2f262..9f70708d3 100644 --- a/test/unit/core/complex.cu +++ b/test/unit/core/complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/functional.cu b/test/unit/core/functional.cu index ba7966553..ab843154e 100644 --- a/test/unit/core/functional.cu +++ b/test/unit/core/functional.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -411,3 +411,13 @@ TEST(Functional, multiply_add_f16x17) { ///////////////////////////////////////////////////////////////////////////////////////////////// +TEST(Functional, multiply_add_bf16x16) { + Functional_multiply_add_TxN(); +} + +TEST(Functional, multiply_add_bf16x17) { + Functional_multiply_add_TxN(); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/core/half.cu b/test/unit/core/half.cu index a0dcd9669..be5e9b433 100644 --- a/test/unit/core/half.cu +++ b/test/unit/core/half.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/matrix_coord.cu b/test/unit/core/matrix_coord.cu index 676bd2c03..841d4cb72 100644 --- a/test/unit/core/matrix_coord.cu +++ b/test/unit/core/matrix_coord.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/numeric_conversion.cu b/test/unit/core/numeric_conversion.cu index ea062b737..5f8f38398 100644 --- a/test/unit/core/numeric_conversion.cu +++ b/test/unit/core/numeric_conversion.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/predicate_vector.cu b/test/unit/core/predicate_vector.cu index 17de2cd2d..f9a0675c0 100644 --- a/test/unit/core/predicate_vector.cu +++ b/test/unit/core/predicate_vector.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/tensor_ref.cu b/test/unit/core/tensor_ref.cu index aa8a5633e..6bedddc57 100644 --- a/test/unit/core/tensor_ref.cu +++ b/test/unit/core/tensor_ref.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/tensor_view.cu b/test/unit/core/tensor_view.cu index b660b3d67..b35fc426b 100644 --- a/test/unit/core/tensor_view.cu +++ b/test/unit/core/tensor_view.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/test_unit_core.cpp b/test/unit/core/test_unit_core.cpp index 3823bd76e..a6dfbf4bb 100644 --- a/test/unit/core/test_unit_core.cpp +++ b/test/unit/core/test_unit_core.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/tfloat32.cu b/test/unit/core/tfloat32.cu new file mode 100644 index 000000000..32155df7c --- /dev/null +++ b/test/unit/core/tfloat32.cu @@ -0,0 +1,197 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types + and is safe to use in a union. +*/ + +#include "../common/cutlass_unit_test.h" + +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/util/device_memory.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Host +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(tfloat32_t, host_conversion) { + for (int i = -1024; i < 1024; ++i) { + float f = static_cast<float>(i); + + cutlass::tfloat32_t x = static_cast<cutlass::tfloat32_t>(i); + cutlass::tfloat32_t y = static_cast<cutlass::tfloat32_t>(f); + + EXPECT_TRUE(static_cast<int>(x) == i); + EXPECT_TRUE(static_cast<float>(y) == f); + } + + // Try out user-defined literals + EXPECT_TRUE(cutlass::tfloat32_t(7) == 7_tf32); + EXPECT_TRUE(7 == static_cast<int>(7_tf32)); +} + +TEST(tfloat32_t, host_arithmetic) { + + for (int i = -100; i < 100; ++i) { + for (int j = -100; j < 100; ++j) { + + cutlass::tfloat32_t x = static_cast<cutlass::tfloat32_t>(i); + cutlass::tfloat32_t y = static_cast<cutlass::tfloat32_t>(j); + + EXPECT_TRUE(static_cast<int>(x + y) == (i + j)); + } + } +} + +TEST(tfloat32_t, host_round_nearest) { + + struct { + uint32_t f32_bits; + uint32_t expected; + } tests[] = { + {0x40000000, 0x40000000}, // M=0, R=0, S=0 => rtz + {0x40001000, 0x40000000}, // M=0, R=1, S=0 => rtz + {0x40000001, 0x40000000}, // M=0, R=0, S=1 => rtz + {0x40001001, 0x40002000}, // M=0, R=1, S=1 => +inf + {0x40002000, 0x40002000}, // M=1, R=0, S=0 => rtz + {0x40002001, 0x40002000}, // M=1, R=0, S=1 => rtz + {0x40003000, 0x40004000}, // M=1, R=1, S=0 => +inf + {0x40003001, 0x40004000}, // M=1, R=1, S=1 => +inf + {0x7f800000, 0x7f800000}, // +inf + {0xff800000, 0xff800000}, // -inf + {0x7fffffff, 0x7fffffff}, // canonical NaN to canonical NaN + 
{0x7f800001, 0x7fffffff}, // NaN to canonical NaN + {0xff800001, 0x7fffffff}, // NaN to canonical NaN + {0, 0} + }; + + bool running = true; + for (int i = 0; running; ++i) { + + float f32 = reinterpret_cast<float const &>(tests[i].f32_bits); + + cutlass::NumericConverter< + cutlass::tfloat32_t, + float, + cutlass::FloatRoundStyle::round_to_nearest> converter; + + cutlass::tfloat32_t tf32 = converter(f32); + + // note, we must explicitly truncate the low-order bits since they are not defined in TF32. + if (cutlass::isfinite(tf32)) { + tf32.storage &= 0xffffe000; + } + + bool passed = (tests[i].expected == tf32.raw()); + + EXPECT_TRUE(passed) + << "Error - convert(f32: 0x" << std::hex << tests[i].f32_bits + << ") -> 0x" << std::hex << tests[i].expected << "\ngot: 0x" << std::hex << tf32.raw(); + + if (!tests[i].f32_bits) { + running = false; + } + } +} + +namespace test { +namespace core { + +__global__ void convert_tf32_half_ulp(cutlass::tfloat32_t *out, float const *in) { + + cutlass::NumericConverter< + cutlass::tfloat32_t, + float, + cutlass::FloatRoundStyle::round_half_ulp_truncate> convert; + + *out = convert(*in); +} + +} +} + + +TEST(tfloat32_t, host_round_half_ulp) { + + struct { + uint32_t f32_bits; + uint32_t expected; + } tests[] = { + {0x40001fff, 0x40002000}, + {0x40000000, 0x40000000}, // M=0, R=0, S=0 => rtz + {0x40001000, 0x40002000}, // M=0, R=1, S=0 => rtz - this differs from RNE + {0x40000001, 0x40000000}, // M=0, R=0, S=1 => rtz + {0x40001001, 0x40002000}, // M=0, R=1, S=1 => +inf + {0x40002000, 0x40002000}, // M=1, R=0, S=0 => rtz + {0x40002001, 0x40002000}, // M=1, R=0, S=1 => rtz + {0x40003000, 0x40004000}, // M=1, R=1, S=0 => +inf + {0x40003001, 0x40004000}, // M=1, R=1, S=1 => +inf + {0x7f800000, 0x7f800000}, // +inf + {0xff800000, 0xff800000}, // -inf + {0x7fffffff, 0x7fffffff}, // canonical NaN to canonical NaN + {0x7f800001, 0x7f800001}, // NaN to NaN + {0xff800001, 0xff800001}, // NaN to NaN + {0, 0} + }; + + cutlass::NumericConverter< + 
cutlass::tfloat32_t, + float, + cutlass::FloatRoundStyle::round_half_ulp_truncate> convert; + + bool running = true; + for (int i = 0; running; ++i) { + + float f32 = reinterpret_cast<float const &>(tests[i].f32_bits); + + cutlass::tfloat32_t tf32 = convert(f32); + + // note, for this test, we must explicitly truncate the low-order bits since they are not + // defined in TF32. + if (cutlass::isfinite(tf32)) { + tf32.storage &= 0xffffe000; + } + + bool passed = (tests[i].expected == tf32.raw()); + + EXPECT_TRUE(passed) + << "Error - convert(f32: 0x" << std::hex << tests[i].f32_bits + << ") -> 0x" << std::hex << tests[i].expected << "\ngot: 0x" << std::hex << tf32.raw(); + + if (!tests[i].f32_bits) { + running = false; + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Device +// +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/epilogue/CMakeLists.txt b/test/unit/epilogue/CMakeLists.txt index 8597a79f6..9de2d56ed 100755 --- a/test/unit/epilogue/CMakeLists.txt +++ b/test/unit/epilogue/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/thread/CMakeLists.txt b/test/unit/epilogue/thread/CMakeLists.txt index 81b168a23..9b04f7752 100644 --- a/test/unit/epilogue/thread/CMakeLists.txt +++ b/test/unit/epilogue/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/thread/linear_combination.cu b/test/unit/epilogue/thread/linear_combination.cu index cf0d1ea56..6518e9873 100644 --- a/test/unit/epilogue/thread/linear_combination.cu +++ b/test/unit/epilogue/thread/linear_combination.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/thread/linear_combination_planar_complex.cu b/test/unit/epilogue/thread/linear_combination_planar_complex.cu index c90b8ad07..89d1be5e0 100644 --- a/test/unit/epilogue/thread/linear_combination_planar_complex.cu +++ b/test/unit/epilogue/thread/linear_combination_planar_complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/CMakeLists.txt b/test/unit/epilogue/threadblock/CMakeLists.txt index 6e10e15ca..cb8b7a62d 100755 --- a/test/unit/epilogue/threadblock/CMakeLists.txt +++ b/test/unit/epilogue/threadblock/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu index de2f86967..76b70f506 100644 --- a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu +++ b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_simt.cu b/test/unit/epilogue/threadblock/epilogue_simt.cu index 0d4f9ae5b..935a81242 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu index 3dd0fdd6c..25cd8933c 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu index 0151f1d8e..fcc8426ca 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu index 530ca8f47..db8e68a3a 100644 --- a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -758,6 +758,65 @@ TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_128x128_64x64x16) { EXPECT_TRUE(passed); } +TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_64x128_64x64x16) { + + // + // Define the warp-level matrix multiply + // + + using ElementOutput = int8_t; + using ElementAccumulator = int; + using ElementCompute = float; + int const kElementsPerAccess = 128 / cutlass::sizeof_bits::value; + int const kPartitionsK = 1; + + using Shape = cutlass::gemm::GemmShape<128, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + using Element = ElementOutput; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementAccumulator, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + // + // Output operator + // + + using OutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + kElementsPerAccess, + ElementAccumulator, + ElementCompute + >; + + // + // Define the epilogue + // + + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputOp, + kElementsPerAccess + >::Epilogue; + + // + // Instantiate epilogue + // + + EpilogueTestbed testbed; + + bool passed = testbed.run_all(); + + EXPECT_TRUE(passed); +} + TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_128x64_64x32x16) { // @@ -2516,6 +2575,249 @@ TEST(SM75_Epilogue_threadblock_epilogue, f16_tensor_op_128x64_64x32x8) { } 
///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Epilogue_threadblock_epilogue, f64_tensor_op_64x64_32x32x4) { + + // + // Define the warp-level matrix multiply + // + + using ElementOutput = double; + using ElementAccumulator = double; + using ElementCompute = double; + int const kElementsPerAccess = 1; + int const kPartitionsK = 1; + + using Shape = cutlass::gemm::GemmShape<64, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = ElementAccumulator; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + using LayoutC = cutlass::layout::RowMajor; + + using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + LayoutC>::Type; + + // + // Output operator + // + + using OutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + kElementsPerAccess, + ElementAccumulator, + ElementCompute + >; + + // + // Define the epilogue + // + + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputOp, + kElementsPerAccess + >::Epilogue; + + // + // Instantiate epilogue + // + + EpilogueTestbed testbed; + + bool passed = testbed.run_all(); + + EXPECT_TRUE(passed); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Epilogue_threadblock_epilogue, f64_tensor_op_128x64_64x32x4) { + + // + // Define the warp-level matrix multiply + // + + using ElementOutput = double; + using ElementAccumulator = double; + using ElementCompute = double; + int const kElementsPerAccess = 1; + int const kPartitionsK = 1; + + using Shape = cutlass::gemm::GemmShape<64, 64, 
16>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = ElementAccumulator; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + using LayoutC = cutlass::layout::RowMajor; + + using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + LayoutC>::Type; + + // + // Output operator + // + + using OutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + kElementsPerAccess, + ElementAccumulator, + ElementCompute + >; + + // + // Define the epilogue + // + + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputOp, + kElementsPerAccess + >::Epilogue; + + // + // Instantiate epilogue + // + + EpilogueTestbed testbed; + + bool passed = testbed.run_all(); + + EXPECT_TRUE(passed); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Epilogue_threadblock_epilogue, f64_tensor_op_64x128_32x64x4) { + + // + // Define the warp-level matrix multiply + // + + using ElementOutput = double; + using ElementAccumulator = double; + using ElementCompute = double; + int const kElementsPerAccess = 1; + int const kPartitionsK = 1; + + using Shape = cutlass::gemm::GemmShape<64, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = ElementAccumulator; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + using LayoutC = cutlass::layout::RowMajor; + + using WarpMmaTensorOp = typename 
cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + LayoutC>::Type; + + // + // Output operator + // + + using OutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + kElementsPerAccess, + ElementAccumulator, + ElementCompute + >; + + // + // Define the epilogue + // + + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputOp, + kElementsPerAccess + >::Epilogue; + + // + // Instantiate epilogue + // + + EpilogueTestbed testbed; + + bool passed = testbed.run_all(); + + EXPECT_TRUE(passed); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Epilogue_threadblock_epilogue, f64_tensor_op_128x128_32x64x4) { + + // + // Define the warp-level matrix multiply + // + + using ElementOutput = double; + using ElementAccumulator = double; + using ElementCompute = double; + int const kElementsPerAccess = 1; + int const kPartitionsK = 1; + + using Shape = cutlass::gemm::GemmShape<128, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = ElementAccumulator; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + using LayoutC = cutlass::layout::RowMajor; + + using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + LayoutC>::Type; + + // + // Output operator + // + + using OutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + kElementsPerAccess, + ElementAccumulator, + ElementCompute + >; + + // + // Define the epilogue + // + + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + 
Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputOp, + kElementsPerAccess + >::Epilogue; + + // + // Instantiate epilogue + // + + EpilogueTestbed testbed; + + bool passed = testbed.run_all(); + + EXPECT_TRUE(passed); +} + ///////////////////////////////////////////////////////////////////////////////////////////////// TEST(SM75_Epilogue_threadblock_epilogue, vec1_mixed_f16_f32_tensor_op_128x128_64x64x8) { diff --git a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu index 99b7ae117..88fa98cf0 100644 --- a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu index 3d1fdf0dd..24752a1df 100644 --- a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu +++ b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/output_tile_threadmap.cu b/test/unit/epilogue/threadblock/output_tile_threadmap.cu index 549e6e4d4..6e6e96e71 100644 --- a/test/unit/epilogue/threadblock/output_tile_threadmap.cu +++ b/test/unit/epilogue/threadblock/output_tile_threadmap.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu index 7fcdd8e46..40874f7bf 100644 --- a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu +++ b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/testbed.h b/test/unit/epilogue/threadblock/testbed.h index c888b9a2d..1dc9baa31 100644 --- a/test/unit/epilogue/threadblock/testbed.h +++ b/test/unit/epilogue/threadblock/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/testbed_planar_complex.h b/test/unit/epilogue/threadblock/testbed_planar_complex.h index fca543ae7..6afa60329 100644 --- a/test/unit/epilogue/threadblock/testbed_planar_complex.h +++ b/test/unit/epilogue/threadblock/testbed_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/CMakeLists.txt b/test/unit/epilogue/warp/CMakeLists.txt index 89d693e3e..dbd7ee65b 100644 --- a/test/unit/epilogue/warp/CMakeLists.txt +++ b/test/unit/epilogue/warp/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu index 4881e5cc9..9e94616f7 100644 --- a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu +++ b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu index a89ec49c8..3522c9e92 100644 --- a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu +++ b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu index a3a406dc7..4931d9371 100644 --- a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu +++ b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/CMakeLists.txt b/test/unit/gemm/CMakeLists.txt index 4d42c000f..4ac245716 100644 --- a/test/unit/gemm/CMakeLists.txt +++ b/test/unit/gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt index 750a497bb..f536b1136 100644 --- a/test/unit/gemm/device/CMakeLists.txt +++ b/test/unit/gemm/device/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -26,6 +26,64 @@ cutlass_test_unit_add_executable( BATCH_SOURCES ON BATCH_SIZE 4 + gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu + gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu + gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu + + gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu + gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu + gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu + gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu + + gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu + gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu + + gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu + gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu + + gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu + gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu + + gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu + gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu + gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu + gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu + gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu + gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu + gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu + gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu + gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu + gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu + gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu + gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu + 
gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu + gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu + gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu + + gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu + gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu + + simt_sgemm_nt_sm80.cu + simt_sgemm_tn_sm80.cu + + gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu + gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu + gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu + gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu + gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu + gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu + gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu + + gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu + gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu + + gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu + gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu + + gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu + gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu + + gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu @@ -149,4 +207,5 @@ cutlass_test_unit_add_executable( gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu + ) diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu index fb7fe985d..fc887bce3 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -62,7 +62,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 128x256x512_64x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -84,7 +84,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 256x128x512_64x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -106,7 +106,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 128x128x512_64x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -128,7 +128,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 64x128x512_32x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ 
-150,7 +150,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 128x64x512_64x32x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -172,7 +172,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 64x64x512_32x32x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu new file mode 100644 index 000000000..d8b907273 --- /dev/null +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu @@ -0,0 +1,373 @@ +/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. 
+ * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 128x256x1024_64x64x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, + cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 256x128x1024_64x64x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + 
ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 128x128x1024_64x64x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, + cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 256x64x1024_64x64x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 64x256x1024_64x64x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 64x128x1024_32x64x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 1024>, + cutlass::gemm::GemmShape<32, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, 
cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 128x64x1024_64x32x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 1024>, + cutlass::gemm::GemmShape<64, 32, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 64x64x1024_32x32x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 1024>, + cutlass::gemm::GemmShape<32, 32, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 128x256x512_64x64x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; 
+ using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 256x128x512_64x64x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 128x128x512_64x64x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + 
ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 256x64x512_64x64x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 64x256x512_64x64x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 64x128x512_32x64x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 512>, + cutlass::gemm::GemmShape<32, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 128x64x512_64x32x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 512>, + cutlass::gemm::GemmShape<64, 32, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4, 128, 128, + false, 
cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 64x64x512_32x32x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 512>, + cutlass::gemm::GemmShape<32, 32, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu index 099c46398..03f0b7525 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_wmma_tensor_op_s32, 128x256x512_64x64x512_8x8 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc >; @@ -104,7 +104,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_wmma_tensor_op_s32, 256x128x512_64x64x512_8x8 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_wmma_tensor_op_s32, 128x128x512_64x64x512_8x8 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -166,7 +166,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_wmma_tensor_op_s32, 64x128x512_32x64x512_8x8x 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -197,7 +197,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_wmma_tensor_op_s32, 128x64x512_64x32x512_8x8x 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -228,7 +228,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_wmma_tensor_op_s32, 64x64x512_32x32x512_8x8x1 128 / cutlass::sizeof_bits::value, 
ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu index f88a73d9a..77777a66f 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -62,7 +62,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 128x256x512_64x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -84,7 +84,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 256x128x512_64x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -106,7 +106,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 128x128x512_64x64x512) { cutlass::epilogue::thread::LinearCombination< 
ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -128,7 +128,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 64x128x512_32x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -150,7 +150,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 128x64x512_64x32x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -172,7 +172,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 64x64x512_32x32x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu new file mode 100644 index 000000000..f6862b0d2 --- /dev/null +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu @@ -0,0 +1,374 @@ 
+/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface + +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 128x256x1024_64x64x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, + cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 256x128x1024_64x64x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, 
cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 128x128x1024_64x64x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, + cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 256x64x1024_64x64x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, 
cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 64x256x1024_64x64x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 64x128x1024_32x64x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 1024>, + cutlass::gemm::GemmShape<32, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 128x64x1024_64x32x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 1024>, + cutlass::gemm::GemmShape<64, 32, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 64x64x1024_32x32x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 1024>, + cutlass::gemm::GemmShape<32, 32, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} ) + 
+CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 128x256x512_64x64x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 256x128x512_64x64x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 128x128x512_64x64x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = 
cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 256x64x512_64x64x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 64x256x512_64x64x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 64x128x512_32x64x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 512>, + cutlass::gemm::GemmShape<32, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 128x64x512_64x32x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 512>, + cutlass::gemm::GemmShape<64, 32, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< 
+ ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 64x64x512_32x32x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 512>, + cutlass::gemm::GemmShape<32, 32, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu index 1254a19b3..b4fb7eba0 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_wmma_tensor_op_s32, 128x256x512_64x64x512_8x8 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc >; @@ -104,7 +104,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_wmma_tensor_op_s32, 256x128x512_64x64x512_8x8 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_wmma_tensor_op_s32, 128x128x512_64x64x512_8x8 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -166,7 +166,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_wmma_tensor_op_s32, 64x128x512_32x64x512_8x8x 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -197,7 +197,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_wmma_tensor_op_s32, 128x64x512_64x32x512_8x8x 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -228,7 +228,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_wmma_tensor_op_s32, 64x64x512_32x32x512_8x8x1 128 / cutlass::sizeof_bits::value, 
ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; diff --git a/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..3da9cdbb5 --- /dev/null +++ b/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,353 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 128x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 256x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 128x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 256x64x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, 
cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, 
ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 64x64x64_32x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + 
cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + 
ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, 
cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / 
cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..b0dbbdc85 --- /dev/null +++ b/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu @@ -0,0 +1,337 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 128x256x64_64x64x64) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 256x128x64_64x64x64) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 128x128x64_64x64x64) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 256x64x64_64x64x64) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, 
cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / 
cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 64x64x64_32x32x64) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = 
cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = 
cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + 
cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu b/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu new file mode 100644 index 000000000..b15af1076 --- /dev/null +++ b/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu @@ -0,0 +1,253 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_complex.h" + + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_complex.h" + +///////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////// +// Operands data type: complex<float> +// Rounding: float -> tfloat32_t (half_ulp_truncate) +// Instruction operand data type: tfloat32_t (real part) and tfloat32_t (imaginary part) +// Math instruction: MMA.1688.F32.TF32 +// Instruction output/accumulation data type: f32 (real part) and f32 (imaginary part) +// Output data type: complex<float> +///////////////////////////////////////////////////////////////////////////////////////////// + + +TEST(SM80_Device_Gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32, 32x32x16_16x16x16) { + + using Element = cutlass::complex<float>; + + using Gemm = 
cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32, 64x64x16_16x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32, 64x64x16_32x32x16) { + + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + 
cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32, 128x64x16_64x32x16) { + + using Element = cutlass::complex;; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32, 64x128x16_32x64x16) { + + using Element = cutlass::complex;; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM80_Device_Gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32, 128x128x16_32x64x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu b/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu new file mode 100644 index 000000000..cec5ce60a --- /dev/null +++ b/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu @@ -0,0 +1,252 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_complex.h" + + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_complex.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Operands data type: complex +// Rounding: float -> tfloat32_t (round to nearest) +// Instruction operand data type: tfloat32_t (real part) and tfloat32_t (imaginary part) +// Math instruction: MMA.1688.F32.TF32 +// Instruction output/accumulation data type: f32 (real part) and f32 (imaginary part) +// Output data type: complex +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32, 32x32x16_16x16x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + + 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32, 64x64x16_16x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32, 64x64x16_32x32x16) { + + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32, 128x64x16_64x32x16) { + + using Element = cutlass::complex;; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + 
cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32, 64x128x16_32x64x16) { + + using Element = cutlass::complex;; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32, 128x128x16_32x64x16) { + + using Element = cutlass::complex;; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 
+ >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu new file mode 100644 index 000000000..c7df15d14 --- /dev/null +++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu @@ -0,0 +1,192 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_complex.h" + + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_complex.h" + +///////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian, 32x32x16_16x16x16) { + + using Element = cutlass::complex<double>; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian, 32x32x8_16x16x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<16, 16, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian, 64x64x16_16x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + 
cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian, 64x64x8_16x32x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu new file mode 100644 index 000000000..5113d2f80 --- /dev/null +++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu @@ -0,0 +1,246 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_complex.h" + + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_complex.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64, 32x32x16_16x16x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64, 32x32x8_16x16x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 8>, + 
cutlass::gemm::GemmShape<16, 16, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64, 64x64x16_16x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64, 64x64x8_16x32x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64, 64x64x16_32x32x16) { + + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64, 64x64x8_32x32x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu 
b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu new file mode 100644 index 000000000..427c1e0e1 --- /dev/null +++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu @@ -0,0 +1,191 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_complex.h" + + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_complex.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian, 32x32x8_16x16x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<16, 16, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian, 64x64x8_32x16x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + 
cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 16, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian, 32x32x16_16x16x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian, 64x64x16_32x16x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + 
cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu new file mode 100644 index 000000000..74fbc1f54 --- /dev/null +++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu @@ -0,0 +1,299 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_complex.h" + + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_complex.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 32x32x8_16x16x8) { + + using Element = cutlass::complex<double>; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + 
cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<16, 16, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 64x64x8_32x32x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 64x128x8_32x32x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 128x64x8_32x32x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + 
cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex<Gemm>()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 32x32x16_16x16x16) { + + using Element = cutlass::complex<double>; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex<Gemm>()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 64x64x16_32x32x16) { + + using Element = cutlass::complex<double>; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex<Gemm>()); +} + 
+TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 64x128x16_32x32x16) { + + using Element = cutlass::complex<double>; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex<Gemm>()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 128x64x16_32x32x16) { + + using Element = cutlass::complex<double>; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex<Gemm>()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu index b40f29453..ea3da85d5 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu +++ 
b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -107,7 +107,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -141,7 +141,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu index 479004e51..167949d8c 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16n_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -105,7 +105,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16n_wmma_tensor_op_f32, 64x64x32_64x64x32_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -139,7 +139,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16n_wmma_tensor_op_f32, 64x64x32_64x64x32_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu index 6e42c5de2..ae72cade2 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -122,7 +122,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -153,7 +153,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -205,7 +205,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..858fd301f --- /dev/null +++ 
b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu @@ -0,0 +1,338 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
 \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x256x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 256x128x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x128x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 256x64x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using 
ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 
16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x64x64_32x32x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using 
ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 
8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = cutlass::half_t; + using 
ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu index 1ea87c43f..2dc224ab2 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 64x64x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -256,7 +256,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu index 67f959874..71f21444c 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 
128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -321,7 +321,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -355,7 +355,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -389,7 +389,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu index 6e07cc8c3..bb1665062 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -133,7 +133,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -164,7 +164,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -195,7 +195,7 @@ 
TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -226,7 +226,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -257,7 +257,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -288,7 +288,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -320,7 +320,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -354,7 +354,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -388,7 +388,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu 
b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu index 6b6d66f55..3e8b96584 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -122,7 +122,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -153,7 +153,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -205,7 +205,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 
@@ TEST(SM75_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..cd6e48a3a --- /dev/null +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu @@ -0,0 +1,337 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; 
+ + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 256x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 256x64x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x64x64_32x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu index c42771b98..a9f9ea997 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32n_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -108,7 +108,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -142,7 +142,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu index d94a7f0df..d797ed557 100644 --- 
a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -122,7 +122,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -153,7 +153,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -205,7 +205,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, 
ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..7cf1fad24 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,340 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x256x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 256x128x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x128x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 256x64x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + 
cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x256x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x128x64_32x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, 
ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x64x64_64x32x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x64x64_32x32x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x256x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, 
cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 256x128x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x128x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + 
ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 256x64x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x256x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x128x32_32x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, 
cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x64x32_64x32x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x64x32_32x32x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / 
cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu index abe553224..be764f528 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 
128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 64x64x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -256,7 +256,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu index ab15f1c59..25d3e5bee 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 64x128x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -293,7 +293,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -327,7 +327,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu index 5dd4e2f87..f7c8fb23f 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -107,7 +107,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -141,7 +141,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu index 81ee6d714..279800769 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16n_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -105,7 +105,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16n_wmma_tensor_op_f32, 64x64x32_64x64x32_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -139,7 +139,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16n_wmma_tensor_op_f32, 64x64x32_64x64x32_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu index 30ddd06a9..b4114ffe5 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16_sliced_k, 64x64x64_64x32x32) ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu new file mode 100644 index 000000000..6ca8ada8a --- /dev/null +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu @@ -0,0 +1,82 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16_sliced_k, 128x64x64_64x64x32) { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, + ElementAccumulator + >, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu index 3f96597bc..64b697af8 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu new file mode 100644 index 000000000..cff507059 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu @@ -0,0 +1,338 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x256x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + 
cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 256x128x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x128x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + 
+TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 256x64x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x256x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64> , + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x128x64_32x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x64x64_64x32x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x64x64_32x32x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x256x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 256x128x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x128x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, 
cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 256x64x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x256x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x128x32_32x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x64x32_64x32x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x64x32_32x32x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, 
cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..8a760b02a --- /dev/null +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu @@ -0,0 +1,77 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_f16n_f16t_f32t_tensor_op_f32, 64x64x32_32x32x32) { + + /* + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); + */ +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu index dbf02b24b..9f2c2c542 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -63,7 +63,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -94,7 +94,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -125,7 +125,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -156,7 +156,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -187,7 +187,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -218,7 +218,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 64x64x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -249,7 +249,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu index 031e22683..aa9260616 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 
128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -321,7 +321,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -355,7 +355,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -389,7 +389,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu index 
235c13969..dac3675b8 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu index 41824839b..74434cc9f 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -108,7 +108,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -142,7 +142,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu index 38337c642..176112d10 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..47e927d45 --- /dev/null +++ 
b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,339 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x256x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 256x128x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x128x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 256x64x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x256x64_64x64x64, { + using ElementOutput = float; + using 
ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x128x64_32x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x64x64_64x32x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 
16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x64x64_32x32x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x256x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 256x128x32_64x64x32, { + using ElementOutput = float; + using 
ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x128x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 256x64x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 
16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x256x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x128x32_32x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x64x32_64x32x32, { + using ElementOutput = float; + using 
ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x64x32_32x32x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +//////////////////////////////////////////////////////////////////////////////// + +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu index d2f58b1ca..de19ca004 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -63,7 +63,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -94,7 +94,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -125,7 +125,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -156,7 +156,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -187,7 +187,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -218,7 +218,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 64x64x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -249,7 +249,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu index b5ff3b993..0b83c6cbb 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 
256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 64x128x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -293,7 +293,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -327,7 +327,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu index 3bfe6d8fe..a81684241 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 128x256x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -105,7 +105,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 128x64x32_6 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -137,7 +137,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 64x128x32_6 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -170,7 +170,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 64x64x32_32 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -202,7 +202,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 64x64x64_32 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -234,7 +234,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 128x128x64_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -270,7 +270,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 128x128x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -305,7 +305,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 128x128x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu index 7455a1bdd..585b1df17 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -107,7 +107,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -141,7 +141,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu index a2374a618..ab030e5a9 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_wmma_tensor_op_f32, 128x128x32_64x64x16_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -106,7 +106,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_wmma_tensor_op_f32, 64x64x32_64x64x16_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -140,7 +140,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_wmma_tensor_op_f32, 64x64x32_64x64x16_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu index 5629dc98c..b8fa4dad8 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 128x256x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -105,7 +105,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 128x64x32_6 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -137,7 +137,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 64x128x32_6 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -170,7 +170,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 64x64x32_32 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -202,7 +202,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 64x64x64_32 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -234,7 +234,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 128x128x64_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -270,7 +270,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 128x128x32_ ElementAccumulator, ElementAccumulator >, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -305,7 +305,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 128x128x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu index d78d34e68..358aacecd 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16_sliced_k, 64x64x64_64x32x32) ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu new file mode 100644 index 000000000..957bcd2ab --- /dev/null +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu @@ -0,0 +1,83 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface + +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16_sliced_k, 128x64x64_64x64x32) { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu index 8463e9e31..7c0f3b406 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff 
--git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu new file mode 100644 index 000000000..972756bba --- /dev/null +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu @@ -0,0 +1,339 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x256x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 256x128x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + 
cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x128x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 256x64x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} 
+ +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x256x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x128x64_32x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x64x64_64x32x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x64x64_32x32x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x256x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 256x128x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x128x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 256x64x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, 
cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x256x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x128x32_32x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x64x32_64x32x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x64x32_32x32x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu 
b/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu index 68d551a1c..14030b1d4 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 64x64x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -256,7 +256,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu index 6a66888f2..9a1918db4 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -321,7 +321,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -355,7 +355,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -389,7 +389,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu index a7c61a1a4..51a09194e 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -133,7 +133,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -164,7 +164,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -195,7 +195,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -226,7 +226,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -257,7 +257,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -288,7 +288,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -319,7 +319,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -353,7 +353,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -387,7 +387,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu index 34859eddc..74d64af70 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -107,7 +107,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -141,7 +141,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu index ca63f26df..d4bc720bc 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -74,7 +74,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32, 128x64x32_6 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -106,7 +106,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32, 64x128x32_6 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -138,7 +138,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32, 64x64x32_32 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -174,7 +174,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32, 128x128x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -209,7 +209,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32, 128x128x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu index f941832da..dd0976d9f 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 
2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu 
b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..83c5cd147 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,338 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 256x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + 
ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 256x64x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, 
cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + 
ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x64x64_32x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + 
cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, 
cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu index 90e44ee51..6d78dc9a9 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 64x64x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -256,7 +256,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu index 05374010b..5ea2f9ce0 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 
128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 64x128x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -293,7 +293,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -327,7 +327,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu index 3f922ebad..0f773de4f 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -107,7 +107,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -141,7 +141,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu index c4ab9f4df..54d6229a0 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16n_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -106,7 +106,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16n_wmma_tensor_op_f32, 64x64x32_64x64x32_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -140,7 +140,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16n_wmma_tensor_op_f32, 64x64x32_64x64x32_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu index 748f64d19..d123931e1 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -321,7 +321,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -355,7 +355,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -389,7 +389,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu index 037efb823..b1286accd 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -133,7 +133,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -164,7 +164,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -195,7 +195,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -226,7 +226,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -257,7 +257,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -288,7 +288,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -320,7 +320,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -354,7 +354,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -388,7 +388,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu index d7474d87a..5a511540f 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..26f41ac2b --- /dev/null +++ 
b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu @@ -0,0 +1,338 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 256x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + 
ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 256x64x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, 
cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + 
ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x64x64_32x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + 
cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, 
cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu index da55acbda..06498afb9 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -105,7 +105,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -139,7 +139,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu index 30bb55833..e377980bb 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..96f5dcc94 --- /dev/null +++ 
b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,338 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 256x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + 
ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 256x64x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, 
cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, 
ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x64x64_32x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, 
ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, 
cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu index 8418381c7..0f94d589c 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_volta_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_volta_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_volta_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_volta_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_volta_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_volta_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu index 2d9d41678..2163711b8 100644 --- 
a/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 
64x128x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -293,7 +293,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -327,7 +327,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu new file mode 100644 index 000000000..91095a945 --- /dev/null +++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu @@ -0,0 +1,87 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface using BF16. 
+*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/arch/mma.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32, 128x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 4, + 4, + false, + cutlass::arch::OpMultiplyAddFastBF16 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..2108eeb4e --- /dev/null +++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,82 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_f32, 128x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu new file mode 100644 index 000000000..64fe313c5 --- /dev/null +++ b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu @@ -0,0 +1,212 @@ +/*************************************************************************************************** + * Copyright 
(c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64, 32x32x16_16x16x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::ColumnMajor, + double, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64, 64x64x16_32x32x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::ColumnMajor, + double, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + 
cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64, 128x64x16_64x32x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::ColumnMajor, + double, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64, 64x128x16_32x64x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::ColumnMajor, + double, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + 
EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64, 128x128x16_32x64x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::ColumnMajor, + double, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu new file mode 100644 index 000000000..63c765c55 --- /dev/null +++ b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu @@ -0,0 +1,212 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64, 32x32x16_16x16x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64, 64x64x16_32x32x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + 
cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64, 64x128x16_32x64x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64, 128x64x16_64x32x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + 
EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64, 128x128x16_32x64x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu index aecee047b..99303712e 100644 --- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu +++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -63,7 +63,7 @@ using gemm_planar_complex_s884_tn_base = typename cutlass::gemm::kernel::Default float, float >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, cutlass::arch::OpMultiplyAdd >::GemmKernel; @@ -107,7 +107,7 @@ using gemm_planar_complex_s884_nt_base = typename cutlass::gemm::kernel::Default float, float >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, cutlass::arch::OpMultiplyAdd >::GemmKernel; diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu new file mode 100644 index 000000000..993b0b9d5 --- /dev/null +++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu @@ -0,0 +1,217 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-level GEMM API for Planar Complex. +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/kernel/default_gemm_planar_complex_universal.h" +#include "cutlass/gemm/device/gemm_universal_base.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + +#include "testbed_planar_complex.h" + + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s1688_tn_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + 8, + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kNone, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + 
float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s1688_tn : gemm_planar_complex_s1688_tn_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_GemmPlanarComplex_f16t_f16n_f32n_tensor_op_f32_1688, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s1688_hc_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kConjugate, + 8, + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kConjugate, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s1688_hc : gemm_planar_complex_s1688_hc_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_GemmPlanarComplex_f16h_f16c_f32n_tensor_op_f32_1688, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s1688_nt_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::ColumnMajor, + 
cutlass::ComplexTransform::kNone, + 8, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s1688_nt : gemm_planar_complex_s1688_nt_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_GemmPlanarComplex_f16n_f16t_f32n_tensor_op_f32_1688, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s1688_ch_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kConjugate, + 8, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kConjugate, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s1688_ch : gemm_planar_complex_s1688_ch_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM75_Device_GemmPlanarComplex_f16c_f16h_f32n_tensor_op_f32_1688, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu new file mode 100644 index 000000000..25fd50cfc --- /dev/null +++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu @@ -0,0 +1,216 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-level GEMM API for Planar Complex. +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/kernel/default_gemm_planar_complex_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + +#include "testbed_planar_complex.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s16816_tn_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + 8, + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kNone, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s16816_tn : gemm_planar_complex_s16816_tn_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM80_Device_GemmPlanarComplex_f16t_f16n_f32n_tensor_op_f32_16816, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s16816_hc_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kConjugate, + 8, + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kConjugate, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s16816_hc : gemm_planar_complex_s16816_hc_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmPlanarComplex_f16h_f16c_f32n_tensor_op_f32_16816, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s16816_nt_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kNone, + 8, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 
32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s16816_nt : gemm_planar_complex_s16816_nt_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmPlanarComplex_f16n_f16t_f32n_tensor_op_f32_16816, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s16816_ch_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kConjugate, + 8, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kConjugate, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s16816_ch : gemm_planar_complex_s16816_ch_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmPlanarComplex_f16c_f16h_f32n_tensor_op_f32_16816, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if 
defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu index 832981f9c..4cc406817 100644 --- a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 64x128x128_32x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -107,7 +107,7 @@ TEST(SM75_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 128x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -143,7 +143,7 @@ TEST(SM75_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 256x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -179,7 +179,7 @@ TEST(SM75_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 128x256x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu new file mode 100644 index 000000000..d53e3c076 --- /dev/null 
+++ b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu @@ -0,0 +1,213 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "multistage_testbed_interleaved.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 64x128x128_32x64x128) { + + using ElementOutput = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, + cutlass::layout::ColumnMajorInterleaved<64>, + cutlass::int4b_t, + cutlass::layout::RowMajorInterleaved<64>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + 32, + 32, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 128x128x128_64x64x128) { + + using ElementOutput = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = 
cutlass::gemm::device::Gemm< + cutlass::int4b_t, + cutlass::layout::ColumnMajorInterleaved<64>, + cutlass::int4b_t, + cutlass::layout::RowMajorInterleaved<64>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 32, + 32, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 256x128x128_64x64x128) { + + using ElementOutput = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, + cutlass::layout::ColumnMajorInterleaved<64>, + cutlass::int4b_t, + cutlass::layout::RowMajorInterleaved<64>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 32, + 32, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 128x256x128_64x64x128) { + + using ElementOutput = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, + cutlass::layout::ColumnMajorInterleaved<64>, + cutlass::int4b_t, + cutlass::layout::RowMajorInterleaved<64>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 32, + 32, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu index feb248d26..983dff337 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x256x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 256x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -167,7 +167,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x128x128_32x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -199,7 +199,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x64x128_64x32x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -231,7 +231,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x64x128_32x32x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu new file mode 100644 index 000000000..8dd541838 --- /dev/null +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu @@ 
-0,0 +1,354 @@ +/************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x256x256_64x64x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 256x128x256_64x64x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, 
cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x128x256_64x64x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 256x64x256_64x64x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + 
+TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x256x256_64x64x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x128x256_32x64x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 256>, + cutlass::gemm::GemmShape<32, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x64x256_64x32x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, 
ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 256>, + cutlass::gemm::GemmShape<64, 32, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x64x256_32x32x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 256>, + cutlass::gemm::GemmShape<32, 32, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x256x128_64x64x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / 
cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 256x128x128_64x64x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x128x128_64x64x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 256x64x128_64x64x128) { + using ElementOutput = int32_t; + using ElementAccumulator = 
int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x256x128_64x64x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x128x128_32x64x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + 
cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x64x128_64x32x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 128>, + cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x64x128_32x32x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<32, 32, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif //#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu index 22a6d7f45..01a65b32a 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_wmma_tensor_op_s32, 128x256x128_64x64x128_8x8 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -104,7 +104,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_wmma_tensor_op_s32, 256x128x128_64x64x128_8x8 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -136,7 +136,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_wmma_tensor_op_s32, 128x128x128_64x64x128_8x8 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -168,7 +168,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_wmma_tensor_op_s32, 64x128x128_32x64x128_8x8x ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2 >; @@ -200,7 +200,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_wmma_tensor_op_s32, 128x64x128_64x32x128_8x8x ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -232,7 +232,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_wmma_tensor_op_s32, 64x64x128_32x32x128_8x8x3 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu index a59789338..33f3b07a2 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x256x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 256x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -167,7 +167,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x128x128_32x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -199,7 +199,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x64x128_64x32x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -231,7 +231,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x64x128_32x32x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu new file mode 100644 index 000000000..1a3f7dba8 --- /dev/null +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu @@ 
-0,0 +1,357 @@ +/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x256x256_64x64x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 256x128x256_64x64x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 256>, + cutlass::gemm::GemmShape<64,
64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x128x256_64x64x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 256x64x256_64x64x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); 
+} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x256x256_64x64x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x128x256_32x64x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 256>, + cutlass::gemm::GemmShape<32, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x64x256_64x32x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + 
cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 256>, + cutlass::gemm::GemmShape<64, 32, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x64x256_32x32x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 256>, + cutlass::gemm::GemmShape<32, 32, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x256x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + 
cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 256x128x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x128x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + 
+CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 256x64x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x256x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x128x128_32x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, +
cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x64x128_64x32x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 128>, + cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x64x128_32x32x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<32, 32, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + 
cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu index 47f959e0c..857df472a 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_wmma_tensor_op_s32, 128x256x128_64x64x128_8x8 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -104,7 +104,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_wmma_tensor_op_s32, 256x128x128_64x64x128_8x8 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -136,7 +136,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_wmma_tensor_op_s32, 128x128x128_64x64x128_8x8 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -168,7 +168,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_wmma_tensor_op_s32, 64x128x128_32x64x128_8x8x ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -200,7 +200,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_wmma_tensor_op_s32, 128x64x128_64x32x128_8x8x ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -232,7 +232,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_wmma_tensor_op_s32, 64x64x128_32x32x128_8x8x3 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu index 3766c11e8..51d182cd6 100644 --- 
a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x256x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 256x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -167,7 +167,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x128x128_32x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -199,7 +199,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x64x128_64x32x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -231,7 +231,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x64x128_32x32x128) { ElementAccumulator, ElementCompute >, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu index 5def3a2b8..90fe6bcfd 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu index 12b4effe4..393e68bfd 100644 --- a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -69,7 +69,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 32x64x64_16x32x64) { ElementOutput, 64 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x64x64_32x32x64) { ElementOutput, 64 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -137,7 +137,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x64x64_64x32x64) { ElementOutput, 64 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -171,7 +171,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x128x64_32x64x64) { ElementOutput, 64 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -205,7 +205,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x128x64_64x64x64) { ElementOutput, 64 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -239,7 +239,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 256x128x64_64x64x64) { ElementOutput, 64 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -273,7 +273,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x256x64_64x64x64) { ElementOutput, 64 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu new file mode 100644 index 000000000..c4900e489 --- /dev/null +++ b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu @@ -0,0 +1,361 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "multistage_testbed_interleaved.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x64x64_32x32x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value + >, +
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x64x64_64x32x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x128x64_32x64x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + 
cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x128x64_64x64x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 256x128x64_64x64x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 
64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x256x64_64x64x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 256x64x64_64x64x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + 
cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x256x64_64x64x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu index d30a644eb..6ac9b71bf 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu +++ 
b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_tensor_op_s32, 128x256x64_64x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_tensor_op_s32, 256x128x64_64x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_tensor_op_s32, 128x128x64_64x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -167,7 +167,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_tensor_op_s32, 64x128x64_32x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -199,7 +199,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_tensor_op_s32, 128x64x64_64x32x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -231,7 +231,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_tensor_op_s32, 64x64x64_32x32x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu index 53fcbd238..cc6e4c3a5 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_wmma_tensor_op_s32, 128x128x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_wmma_tensor_op_s32, 64x128x64_32x32x64_16x16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -136,7 +136,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_wmma_tensor_op_s32, 64x128x64_32x64x64_8x32x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu index 15bdacc0d..86a678d22 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x256x64_64x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 256x128x64_64x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x128x64_64x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -167,7 +167,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x128x64_32x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -199,7 +199,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x64x64_64x32x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -231,7 +231,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x64x64_32x32x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git 
a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu new file mode 100644 index 000000000..a86dc2442 --- /dev/null +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu @@ -0,0 +1,355 @@ +/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x256x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 256x128x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, 
cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x128x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 256x64x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x256x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x128x128_32x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x64x128_64x32x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + 
cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 128>, + cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x64x128_32x32x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<32, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x256x64_64x64x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / 
cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 256x128x64_64x64x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x128x64_64x64x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 256x64x64_64x64x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using 
ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x256x64_64x64x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x128x64_32x64x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, 
cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x64x64_64x32x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x64x64_32x32x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + 
+//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu index dd88e87fd..d53571a2d 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_wmma_tensor_op_s32, 128x128x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_wmma_tensor_op_s32, 64x128x64_32x32x64_16x16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -137,7 +137,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_wmma_tensor_op_s32, 64x128x64_32x64x64_32x8x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -171,7 +171,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_wmma_tensor_op_s32, 64x128x64_32x64x64_8x32x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git 
a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu index 4aa799e5f..024cba0a4 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -59,7 +59,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x256x64_64x64x64, cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -78,7 +78,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 256x128x64_64x64x64, cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); } ) @@ -96,7 +96,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x128x64_64x64x64, cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -115,7 +115,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x128x64_32x64x64, cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -146,7 +146,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x64x64_64x32x64, ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -179,7 +179,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x64x64_32x32x64, { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu new file mode 100644 index 000000000..2d6db336f --- /dev/null +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu @@ -0,0 +1,368 @@ +/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "multistage_testbed.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x256x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 256x128x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, 
cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x128x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 256x64x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + 
+CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x256x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x128x128_32x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x64x128_64x32x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, 
ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 128>, + cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x64x128_32x32x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<32, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x256x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 256x128x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x128x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 256x64x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm 
= cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x256x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x128x64_32x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + 
cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x64x64_64x32x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x64x64_32x32x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +//////////////////////////////////////////////////////////////////////////////// +#endif // if 
(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu index 34a1f3beb..ac5757e0e 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -69,7 +69,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8n_wmma_tensor_op_s32, 128x128x32_64x64x32_16x16x ElementOutput, 128 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -98,7 +98,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8n_wmma_tensor_op_s32, 64x128x64_32x32x64_16x16x1 ElementOutput, 128 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -130,7 +130,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8n_wmma_tensor_op_s32, 64x128x64_32x64x64_32x8x16 ElementOutput, 128 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -162,7 +162,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8n_wmma_tensor_op_s32, 64x128x64_32x64x64_8x32x16 ElementOutput, 128 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu 
b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu index a881ca27d..93642e64b 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -59,7 +59,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x256x64_64x64x64, cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); } ) @@ -77,7 +77,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 256x128x64_64x64x64, cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); } ) @@ -95,7 +95,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x128x64_64x64x64, cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -114,7 +114,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x128x64_32x64x64, cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); } ) @@ -133,7 +133,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x64x64_64x32x64, cutlass::epilogue::thread::LinearCombinationClamp< ElementOutput, 32 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; test::gemm::device::Testbed testbed; @@ -154,7 +154,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x64x64_32x32x64, { cutlass::epilogue::thread::LinearCombinationClamp< ElementOutput, 32 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; test::gemm::device::Testbed testbed; diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu new file mode 100644 index 000000000..197e69b71 --- /dev/null +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu @@ -0,0 +1,368 @@ +/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "multistage_testbed.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x256x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 256x128x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 
8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x128x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 256x64x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 
64x256x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x128x128_32x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x64x128_64x32x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, 
cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 128>, + cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x64x128_32x32x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<32, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x256x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed 
testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 256x128x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x128x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 256x64x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + 
ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x256x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x128x64_32x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x64x64_64x32x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x64x64_32x32x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +//////////////////////////////////////////////////////////////////////////////// +#endif // #if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu 
b/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu index d2078582d..719e2ac76 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -69,7 +69,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8t_wmma_tensor_op_s32, 128x128x32_64x64x32_16x16x ElementOutput, 128 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -99,7 +99,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8t_wmma_tensor_op_s32, 64x128x64_32x32x64_16x16x1 ElementOutput, 128 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -131,7 +131,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8t_wmma_tensor_op_s32, 64x128x64_32x64x64_32x8x16 ElementOutput, 128 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8t_wmma_tensor_op_s32, 64x128x64_32x64x64_8x32x16 ElementOutput, 128 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu index 224c8fbdc..e7a01bed6 100644 --- 
a/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu +++ b/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -92,7 +92,7 @@ TEST(SM75_Device_GemmSplitKSerial_f16n_f16n_f16t_tensor_op_f32, 128x256x32_64x64 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages, kAlignmentA, kAlignmentB, diff --git a/test/unit/gemm/device/gemm_splitk_simt_sm50.cu b/test/unit/gemm/device/gemm_splitk_simt_sm50.cu index c35535dd6..39b5f10a7 100644 --- a/test/unit/gemm/device/gemm_splitk_simt_sm50.cu +++ b/test/unit/gemm/device/gemm_splitk_simt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu b/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu index 725b5feb5..42e991ed0 100644 --- a/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu +++ b/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -42,7 +42,7 @@ #include "testbed_splitk.h" -// These tests cannot run unless CUDA 10.1 Toolkit or later is used. +// These operators are assert(0) unless extended PTX is used. #if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu index 71b606da9..3381f1703 100644 --- a/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu +++ b/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -42,7 +42,7 @@ #include "testbed_splitk.h" -// These tests cannot run unless CUDA 10.2 Toolkit or later is used. +// These operators are assert(0) unless extended PTX is used. #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..78c6e8657 --- /dev/null +++ b/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,549 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 128x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 256x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + 
cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 64x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 256x64x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + 
+TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 128x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 64x128x32_32x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 128x64x32_64x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + 
cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 64x64x32_32x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 128x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 256x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 64x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 
256x64x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 128x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 64x128x16_32x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + 
cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 128x64x16_64x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 64x64x16_32x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 10 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..11af88897 --- /dev/null +++ b/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,549 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 128x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 256x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 64x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 256x64x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + 
cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 128x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 64x128x32_32x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 128x64x32_64x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 64x64x32_32x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 128x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 256x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 64x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + 
cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 256x64x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 128x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + 
ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 64x128x16_32x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 128x64x16_64x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 64x64x16_32x32x16) { + + using ElementOutput = float; + using 
ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 10 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..a28101f3d --- /dev/null +++ b/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,487 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 128x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 256x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + 
cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 256x64x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 128x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + 
+TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 64x128x32_32x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 128x64x32_64x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 64x64x32_32x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + 
ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 128x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 256x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + 
ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 256x64x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 128x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 64x128x16_32x64x16) { + + using ElementOutput = float; + using 
ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 128x64x16_64x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 64x64x16_32x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + 
cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 10 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +#endif // #if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..a1a0fd7e3 --- /dev/null +++ b/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,550 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 128x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, + ElementAccumulator + >, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 256x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 64x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 256x64x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + 
cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 128x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 64x128x32_32x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 128x64x32_64x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 64x64x32_32x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 128x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 256x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 64x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + 
cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 256x64x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 128x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + 
ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 64x128x16_32x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 128x64x16_64x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 64x64x16_32x32x16) { + + using ElementOutput = float; + using 
ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 10 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +#endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu index 4d31c0896..a63163680 100644 --- a/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_u8t_u8n_s32t_wmma_tensor_op_s32, 128x128x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM75_Device_Gemm_u8t_u8n_s32t_wmma_tensor_op_s32, 64x128x64_32x32x64_16x16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -136,7 +136,7 @@ TEST(SM75_Device_Gemm_u8t_u8n_s32t_wmma_tensor_op_s32, 64x128x64_32x64x64_32x8x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -170,7 +170,7 @@ TEST(SM75_Device_Gemm_u8t_u8n_s32t_wmma_tensor_op_s32, 64x128x64_32x64x64_8x32x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..e32441941 --- /dev/null +++ b/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu @@ -0,0 +1,193 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_universal.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf32n_cf32t_cf32n_tensor_op_f32, 64x64x16_32x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf32n_cf32h_cf32n_tensor_op_f32, 64x64x16_32x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kConjugate + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf32h_cf32t_cf32n_tensor_op_f32, 64x64x16_32x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kNone + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf32h_cf32c_cf32n_tensor_op_f32, 64x64x16_32x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + 
cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kConjugate + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu new file mode 100644 index 000000000..301cce785 --- /dev/null +++ b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu @@ -0,0 +1,194 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_universal.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64n_cf64t_cf64n_tensor_op_f64_gaussian, 64x64x32_32x32x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, 
cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddGaussianComplex, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64n_cf64h_cf64n_tensor_op_f64_gaussian, 64x64x32_32x32x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddGaussianComplex, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kConjugate + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64h_cf64t_cf64n_tensor_op_f64_gaussian, 64x32x32_32x16x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<32, 16, 16>, + 
cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddGaussianComplex, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kNone + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64h_cf64c_cf64n_tensor_op_f64_gaussian, 64x64x32_32x16x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddGaussianComplex, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kConjugate + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu new file mode 100644 index 000000000..df28110a3 --- /dev/null +++ b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu @@ -0,0 +1,194 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_universal.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64n_cf64t_cf64n_tensor_op_f64, 64x64x32_32x32x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64n_cf64h_cf64n_tensor_op_f64, 64x64x32_32x32x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kConjugate + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64h_cf64t_cf64n_tensor_op_f64, 64x64x32_32x32x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kNone + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64h_cf64c_cf64n_tensor_op_f64, 64x64x32_32x32x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + 
cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kConjugate + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 000000000..e7b4405a0 --- /dev/null +++ b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,111 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface + +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_universal.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::GemmUniversal< + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + ElementOutput, cutlass::layout::ColumnMajor, + 
ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 2>; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + + +TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32_updated_batch_count) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::GemmUniversal< + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 2, + 1, + 1>; + + EXPECT_TRUE(test::gemm::device::TestGemmUniversal( + {128, 128, 2}, + cutlass::gemm::GemmUniversalMode::kGemm, + 15)); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/multistage_testbed.h b/test/unit/gemm/device/multistage_testbed.h new file mode 100644 index 000000000..bdc4b7708 --- /dev/null +++ b/test/unit/gemm/device/multistage_testbed.h @@ -0,0 +1,251 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA 
CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#pragma once + +#include +#include +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_utils.h" + +namespace test { +namespace gemm { +namespace device { + +//////////////////////////////////////////////////////////////////////////////// + +template +struct MultistageTestbed { + using ElementAccumulator = typename Gemm::ElementAccumulator; + using ElementCompute = + typename Gemm::GemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + // + // Methods + // + + MultistageTestbed( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080) + : init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {} + + /// Helper to initialize a tensor view + template + bool initialize_tensor(cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, uint64_t seed) { + if (dist_kind == cutlass::Distribution::Uniform) { + int scope = (cutlass::sizeof_bits::value == 8) ? 
2 : 8; + cutlass::reference::host::TensorFillRandomUniform(view, seed, scope, + -scope, 0); + } else if (dist_kind == cutlass::Distribution::Gaussian) { + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5, -1); + } else if (dist_kind == cutlass::Distribution::Identity) { + cutlass::reference::host::TensorFillIdentity(view); + } else if (dist_kind == cutlass::Distribution::Sequential) { + cutlass::reference::host::BlockFillSequential(view.data(), + view.capacity()); + } else { + // TODO: Implement the rest + EXPECT_TRUE(false) << "Not implemented"; + return false; + } + + return true; + } + + /// Executes one test + bool run(cutlass::gemm::GemmCoord problem_size, + ElementCompute alpha = ElementCompute(1), + ElementCompute beta = ElementCompute(0)) { + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor + tensor_A(problem_size.mk()); + + cutlass::HostTensor + tensor_B(problem_size.kn()); + + cutlass::HostTensor + tensor_C(problem_size.mn()); + + cutlass::HostTensor + tensor_D(problem_size.mn()); + + cutlass::HostTensor + reference_D(problem_size.mn(), false); + + EXPECT_TRUE(initialize_tensor(tensor_A.host_view(), init_A, seed + 2019)); + EXPECT_TRUE(initialize_tensor(tensor_B.host_view(), init_B, seed + 2018)); + EXPECT_TRUE(initialize_tensor(tensor_C.host_view(), init_C, seed + 2017)); + + cutlass::reference::host::TensorCopy(reference_D.host_view(), + tensor_C.host_view()); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_C.sync_device(); + tensor_D.sync_device(); + + // + // Initialize the GEMM operator + // + + typename Gemm::Arguments arguments{ + problem_size, tensor_A.device_ref(), tensor_B.device_ref(), + tensor_C.device_ref(), tensor_D.device_ref(), {alpha, beta}}; + + Gemm gemm_op; + + cutlass::Status status = gemm_op.initialize(arguments); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + + // + // Run the GEMM + // + + status = gemm_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + + // + 
// Verify + // + + cutlass::reference::host::Gemm< + typename Gemm::ElementA, typename Gemm::LayoutA, + typename Gemm::ElementB, typename Gemm::LayoutB, + typename Gemm::ElementC, typename Gemm::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm::Operator> + reference_gemm; + + reference_gemm( + problem_size, alpha, tensor_A.host_ref(), tensor_B.host_ref(), beta, + reference_D.host_ref(), ElementAccumulator(0)); + + tensor_D.sync_host(); + + EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0); + EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + reference_D.host_view(), tensor_D.host_view()); + + EXPECT_TRUE(passed); + if (!passed) { + std::stringstream fname; + + fname << "error_Gemm_device_" << problem_size.m() << "x" + << problem_size.n() << "x" << problem_size.k() << "_" + << Gemm::ThreadblockShape::kM << "x" << Gemm::ThreadblockShape::kN + << "x" << Gemm::ThreadblockShape::kK << "_" << Gemm::WarpShape::kM + << "x" << Gemm::WarpShape::kN << "x" << Gemm::WarpShape::kK + << ".txt"; + + std::ofstream file(fname.str()); + + file << "problem: " << problem_size << ", alpha: " << alpha + << ", beta: " << beta << "\n\n"; + + file << "A =\n" + << tensor_A.host_view() << "\nB =\n" + << tensor_B.host_view() << "\nC =\n" + << tensor_C.host_view() << "\n\nReference =\n" + << reference_D.host_view() << "\nComputed =\n" + << tensor_D.host_view(); + } + + return passed; + } + + /// Runs a set of problem sizes + bool run_all() { + bool passed = true; + + int problem_size_m[] = {16, 528}; + + int problem_size_n[] = {16, 528}; + + int problem_size_k[] = {Gemm::InstructionShape::kK, + Gemm::ThreadblockShape::kK * Gemm::kStages + + Gemm::InstructionShape::kK}; + + double problem_alpha[] = {1.0}; + + // TODO Try non zero beta value after multistaged epilogue is implemented + double problem_beta[] = {0.0}; + + for (int m : problem_size_m) { + for (int n : 
problem_size_n) { + for (int k : problem_size_k) { + for (double alpha : problem_alpha) { + for (double beta : problem_beta) { + passed = + run({m, n, k}, ElementCompute(alpha), ElementCompute(beta)); + + if (!passed) { + return false; + } + } + } + } + } + } + + return true; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace test + +//////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/multistage_testbed_interleaved.h b/test/unit/gemm/device/multistage_testbed_interleaved.h new file mode 100644 index 000000000..c98264de0 --- /dev/null +++ b/test/unit/gemm/device/multistage_testbed_interleaved.h @@ -0,0 +1,303 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#pragma once + +#include +#include +#include + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/host_reorder.h" + +namespace test { +namespace gemm { +namespace device { + +//////////////////////////////////////////////////////////////////////////////// + +template +struct MultistageInterleavedTestbed { + + using ElementAccumulator = typename Gemm::ElementAccumulator; + using ElementCompute = typename Gemm::GemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + // + // Methods + // + + MultistageInterleavedTestbed( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ 
= 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, 2, -2, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + // TODO: Implement the rest + EXPECT_TRUE(false) << "Not implemented"; + return false; + } + + return true; + } + + /// Executes one test + bool run( + cutlass::gemm::GemmCoord problem_size, + ElementCompute alpha = ElementCompute(1), + ElementCompute beta = ElementCompute(0)) { + + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor< + typename Gemm::ElementA, + typename Gemm::LayoutA> tensor_A(problem_size.mk()); + + cutlass::HostTensor< + typename Gemm::ElementB, + typename Gemm::LayoutB> tensor_B(problem_size.kn()); + + cutlass::HostTensor< + typename Gemm::ElementB, + typename Gemm::LayoutB> tensor_B_reordered(problem_size.kn()); + + cutlass::HostTensor< + typename Gemm::ElementC, + typename Gemm::LayoutC> tensor_C(problem_size.mn()); + + cutlass::HostTensor< + typename Gemm::ElementC, + typename Gemm::LayoutC> tensor_D(problem_size.mn()); + + cutlass::HostTensor< + typename Gemm::ElementC, + typename Gemm::LayoutC> reference_D(problem_size.mn(), false); + + EXPECT_TRUE(initialize_tensor(tensor_A.host_view(), init_A, seed + 2019)); + EXPECT_TRUE(initialize_tensor(tensor_B.host_view(), init_B, seed + 2018)); + EXPECT_TRUE(initialize_tensor(tensor_C.host_view(), init_C, seed + 2017)); + + cutlass::reorder_column( + tensor_B_reordered.host_ref(), tensor_B.host_ref(), problem_size); + + 
cutlass::reference::host::TensorCopy( + reference_D.host_view(), + tensor_C.host_view()); + + tensor_A.sync_device(); + tensor_B_reordered.sync_device(); + tensor_C.sync_device(); + tensor_D.sync_device(); + + // + // Initialize the GEMM operator + // + + typename Gemm::Arguments arguments{ + problem_size, + tensor_A.device_ref(), + tensor_B_reordered.device_ref(), + tensor_C.device_ref(), + tensor_D.device_ref(), + {alpha, beta} + }; + + Gemm gemm_op; + + cutlass::Status status = gemm_op.initialize(arguments); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + + // + // Run the GEMM + // + + status = gemm_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + + // + // Verify + // + + cutlass::reference::host::Gemm< + typename Gemm::ElementA, typename Gemm::LayoutA, + typename Gemm::ElementB, typename Gemm::LayoutB, + typename Gemm::ElementC, typename Gemm::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm::Operator> + reference_gemm; + + reference_gemm( + problem_size, + alpha, + tensor_A.host_ref(), + tensor_B.host_ref(), + beta, + reference_D.host_ref(), + ElementAccumulator(0) + ); + + tensor_D.sync_host(); + + EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0); + EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + reference_D.host_view(), + tensor_D.host_view()); + + EXPECT_TRUE(passed); + if (!passed) { + + std::stringstream fname; + + fname << "error_Gemm_device_" + << problem_size.m() << "x" + << problem_size.n() << "x" + << problem_size.k() << "_" + << Gemm::ThreadblockShape::kM << "x" + << Gemm::ThreadblockShape::kN << "x" + << Gemm::ThreadblockShape::kK << "_" + << Gemm::WarpShape::kM << "x" + << Gemm::WarpShape::kN << "x" + << Gemm::WarpShape::kK << ".txt"; + + std::ofstream file(fname.str()); + + file + << "problem: " << problem_size + << ", alpha: " << alpha << ", beta: " << beta << "\n\n"; + + file + << "A =\n" << 
tensor_A.host_view() + << "\nB =\n" << tensor_B.host_view() + << "\nB_reordered =\n" << tensor_B_reordered.host_view() + << "\nC =\n" << tensor_C.host_view() + << "\n\nReference =\n" << reference_D.host_view() + << "\nComputed =\n" << tensor_D.host_view(); + } + + return passed; + } + + /// Runs a set of problem sizes + bool run_all() { + bool passed = true; + + int problem_size_m[] = { + InterleavedK, 512 + InterleavedK + }; + + int problem_size_n[] = { + InterleavedK, 512 + InterleavedK + }; + + int problem_size_k[] = { + InterleavedK, Gemm::ThreadblockShape::kK * Gemm::kStages + InterleavedK + }; + + double problem_alpha[] = { + 1.0 + }; + + double problem_beta[] = { + 0.0 + }; + + for (int m : problem_size_m) { + for (int n : problem_size_n) { + for (int k : problem_size_k) { + for (double alpha : problem_alpha) { + for (double beta : problem_beta) { + + passed = run( + {m, n, k}, + ElementCompute(alpha), + ElementCompute(beta) + ); + + if (!passed) { + return false; + } + } + } + } + } + } + + return true; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace test + +//////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/simt_cgemm_nn_sm50.cu b/test/unit/gemm/device/simt_cgemm_nn_sm50.cu index d399b766a..5aabfca58 100644 --- a/test/unit/gemm/device/simt_cgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ 
CUTLASS_TEST_L1(SM50_device_cgemm_nn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_nn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_nn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_cgemm_nt_sm50.cu b/test/unit/gemm/device/simt_cgemm_nt_sm50.cu index 7c1922416..c5265ce2b 100644 --- a/test/unit/gemm/device/simt_cgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_nt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ 
CUTLASS_TEST_L2(SM50_device_cgemm_nt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_nt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_cgemm_tn_sm50.cu b/test/unit/gemm/device/simt_cgemm_tn_sm50.cu index 89728ba20..9db96c996 100644 --- a/test/unit/gemm/device/simt_cgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_tn_sm50.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_tn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_tn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ 
CUTLASS_TEST_L2(SM50_device_cgemm_tn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_cgemm_tt_sm50.cu b/test/unit/gemm/device/simt_cgemm_tt_sm50.cu index 8d4c9fddc..0ac7b4c9f 100644 --- a/test/unit/gemm/device/simt_cgemm_tt_sm50.cu 
+++ b/test/unit/gemm/device/simt_cgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 8x32x8_8x16x1_2x2_4x8_1x2, { 
cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_tt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ 
CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_tt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git 
a/test/unit/gemm/device/simt_dgemm_nn_sm50.cu b/test/unit/gemm/device/simt_dgemm_nn_sm50.cu index 3d5c52ed9..1efa9d044 100644 --- a/test/unit/gemm/device/simt_dgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L0(SM50_device_dgemm_nn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 
32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L0(SM50_device_dgemm_nn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ 
CUTLASS_TEST_L1(SM50_device_dgemm_nn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_dgemm_nt_sm50.cu b/test/unit/gemm/device/simt_dgemm_nt_sm50.cu index 
05fa3c94a..886c0f9c7 100644 --- a/test/unit/gemm/device/simt_dgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L0(SM50_device_dgemm_nt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ 
CUTLASS_TEST_L2(SM50_device_dgemm_nt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L0(SM50_device_dgemm_nt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_dgemm_tn_sm50.cu b/test/unit/gemm/device/simt_dgemm_tn_sm50.cu index f0f253007..a43d0afd5 100644 --- a/test/unit/gemm/device/simt_dgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_tn_sm50.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L0(SM50_device_dgemm_tn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L0(SM50_device_dgemm_tn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_dgemm_tt_sm50.cu b/test/unit/gemm/device/simt_dgemm_tt_sm50.cu index 38066b946..0175978d0 100644 --- a/test/unit/gemm/device/simt_dgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L0(SM50_device_dgemm_tt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L0(SM50_device_dgemm_tt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ 
CUTLASS_TEST_L2(SM50_device_dgemm_tt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_hgemm_nn_sm50.cu b/test/unit/gemm/device/simt_hgemm_nn_sm50.cu index 79af9b47d..a3aa5ce84 100644 --- a/test/unit/gemm/device/simt_hgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 
+217,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 32x128x8_32x128x1_8x16_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_nn, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 64x64x8_64x64x1_16x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 128x32x8_128x32x1_16x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 32x256x8_32x128x1_8x16_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_nn, 64x128x8_64x64x1_16x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 128x64x8_64x64x1_16x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 256x32x8_128x32x1_16x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ 
CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 64x256x8_32x128x1_8x16_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 128x128x8_64x64x1_16x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 256x64x8_128x32x1_16x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 128x256x8_64x64x1_16x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1777,7 +1777,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1807,7 +1807,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_nn, 256x128x8_64x64x1_16x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1837,7 +1837,7 @@ 
CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1867,7 +1867,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1897,7 +1897,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1927,7 +1927,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1957,7 +1957,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1987,7 +1987,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2017,7 +2017,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2047,7 +2047,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2077,7 +2077,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2107,7 +2107,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x256x8_32x64x1_8x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2137,7 +2137,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2167,7 +2167,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 256x128x8_64x32x1_8x8_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_hgemm_nt_sm50.cu b/test/unit/gemm/device/simt_hgemm_nt_sm50.cu index 1401d2fa9..d5541939e 100644 --- a/test/unit/gemm/device/simt_hgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 32x128x8_32x128x1_8x16_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_nt, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 64x64x8_64x64x1_16x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 128x32x8_128x32x1_16x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 32x256x8_32x128x1_8x16_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 
// Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_nt, 64x128x8_64x64x1_16x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 128x64x8_64x64x1_16x8_4x8_2x1, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 256x32x8_128x32x1_16x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 64x256x8_32x128x1_8x16_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ 
CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 128x128x8_64x64x1_16x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 256x64x8_128x32x1_16x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 128x256x8_64x64x1_16x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1777,7 +1777,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1807,7 +1807,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_nt, 256x128x8_64x64x1_16x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1837,7 +1837,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1867,7 +1867,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1897,7 +1897,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1927,7 +1927,7 @@ 
CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1957,7 +1957,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1987,7 +1987,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2017,7 +2017,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2047,7 +2047,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2077,7 +2077,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2107,7 +2107,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x256x8_32x64x1_8x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2137,7 +2137,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2167,7 +2167,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 256x128x8_64x32x1_8x8_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_hgemm_tn_sm50.cu b/test/unit/gemm/device/simt_hgemm_tn_sm50.cu index f1b7a043f..526bc01a4 100644 --- a/test/unit/gemm/device/simt_hgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ 
CUTLASS_TEST_L1(SM50_device_hgemm_tn, 32x128x8_32x128x1_8x16_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_tn, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 64x64x8_64x64x1_16x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 128x32x8_128x32x1_16x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 32x256x8_32x128x1_8x16_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_tn, 64x128x8_64x64x1_16x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 128x64x8_64x64x1_16x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 256x32x8_128x32x1_16x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ 
CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 64x256x8_32x128x1_8x16_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 128x128x8_64x64x1_16x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 256x64x8_128x32x1_16x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 128x256x8_64x64x1_16x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1777,7 +1777,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1807,7 +1807,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_tn, 256x128x8_64x64x1_16x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1837,7 +1837,7 @@ 
CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1867,7 +1867,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1897,7 +1897,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1927,7 +1927,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1957,7 +1957,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1987,7 +1987,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2017,7 +2017,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2047,7 +2047,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2077,7 +2077,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2107,7 +2107,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x256x8_32x64x1_8x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2137,7 +2137,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2167,7 +2167,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 256x128x8_64x32x1_8x8_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_hgemm_tt_sm50.cu b/test/unit/gemm/device/simt_hgemm_tt_sm50.cu index 4c1b59136..ad464b301 100644 --- a/test/unit/gemm/device/simt_hgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 32x128x8_32x128x1_8x16_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_tt, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 64x64x8_64x64x1_16x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 128x32x8_128x32x1_16x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 32x256x8_32x128x1_8x16_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 
// Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_tt, 64x128x8_64x64x1_16x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 128x64x8_64x64x1_16x8_4x8_2x1, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 256x32x8_128x32x1_16x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 64x256x8_32x128x1_8x16_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ 
CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 128x128x8_64x64x1_16x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 256x64x8_128x32x1_16x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 128x256x8_64x64x1_16x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1777,7 +1777,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1807,7 +1807,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_tt, 256x128x8_64x64x1_16x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1837,7 +1837,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1867,7 +1867,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1897,7 +1897,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1927,7 +1927,7 @@ 
CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1957,7 +1957,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1987,7 +1987,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2017,7 +2017,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2047,7 +2047,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2077,7 +2077,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2107,7 +2107,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x256x8_32x64x1_8x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2137,7 +2137,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2167,7 +2167,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 256x128x8_64x32x1_8x8_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_igemm_nn_sm50.cu b/test/unit/gemm/device/simt_igemm_nn_sm50.cu index 59a8dbfe1..3db133ebf 100644 --- a/test/unit/gemm/device/simt_igemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_nn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ 
CUTLASS_TEST_L1(SM50_device_igemm_nn, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_nn, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ 
CUTLASS_TEST_L1(SM50_device_igemm_nn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_igemm_nt_sm50.cu b/test/unit/gemm/device/simt_igemm_nt_sm50.cu index 7ff0c5cd2..01f56ea03 100644 --- a/test/unit/gemm/device/simt_igemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_nt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ 
CUTLASS_TEST_L1(SM50_device_igemm_nt, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_nt, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ 
CUTLASS_TEST_L1(SM50_device_igemm_nt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_igemm_tn_sm50.cu b/test/unit/gemm/device/simt_igemm_tn_sm50.cu index 392db59e2..3692ec2c3 100644 --- a/test/unit/gemm/device/simt_igemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_tn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ 
CUTLASS_TEST_L1(SM50_device_igemm_tn, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_tn, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ 
CUTLASS_TEST_L1(SM50_device_igemm_tn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_igemm_tt_sm50.cu b/test/unit/gemm/device/simt_igemm_tt_sm50.cu index 3fdc8e271..2254669b3 100644 --- a/test/unit/gemm/device/simt_igemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_tt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ 
CUTLASS_TEST_L1(SM50_device_igemm_tt, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_tt, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ 
CUTLASS_TEST_L1(SM50_device_igemm_tt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61.cu b/test/unit/gemm/device/simt_int8_igemm_sm61.cu index d1a8821ac..1364a38cf 100644 --- a/test/unit/gemm/device/simt_int8_igemm_sm61.cu +++ b/test/unit/gemm/device/simt_int8_igemm_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -69,7 +69,7 @@ ElementAccumulator, \ ElementCompute \ >, \ - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, \ + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, \ 2 \ >; \ EXPECT_TRUE(test::gemm::device::TestAllGemm()); \ diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu b/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu index 0c1449e0d..4e4308ff3 100644 --- a/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu +++ b/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ TEST(SM61_Device_Gemm_s8n_s8t_simt_op_dp4a_perf, 128x256x32_64x64x8) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -109,7 +109,7 @@ TEST(SM61_Device_Gemm_s8t_s8t_simt_op_dp4a_perf, 128x256x32_64x64x8) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -145,7 +145,7 @@ TEST(SM61_Device_Gemm_s8n_s8n_simt_op_dp4a_perf, 128x256x32_64x64x8) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -181,7 +181,7 @@ TEST(SM61_Device_Gemm_s8t_s8n_simt_op_dp4a_perf, 128x256x32_64x64x8) { ElementAccumulator, ElementCompute >, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu b/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu index 9e1c21e9e..88c72aee4 100644 --- a/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu +++ b/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -69,7 +69,7 @@ TEST(SM61_Device_Gemm_s8n_s8t_simt_op_dp4a_sliced_k, 32x32x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM61_Device_Gemm_s8n_s8t_simt_op_dp4a_sliced_k, 32x64x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -133,7 +133,7 @@ TEST(SM61_Device_Gemm_s8t_s8n_simt_op_dp4a_sliced_k, 32x32x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM61_Device_Gemm_s8t_s8n_simt_op_dp4a_sliced_k, 32x64x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -197,7 +197,7 @@ TEST(SM61_Device_Gemm_s8t_s8t_simt_op_dp4a_sliced_k, 32x32x128_32x32x4) { ElementAccumulator, ElementCompute >, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -229,7 +229,7 @@ TEST(SM61_Device_Gemm_s8t_s8t_simt_op_dp4a_sliced_k, 32x64x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -261,7 +261,7 @@ TEST(SM61_Device_Gemm_s8n_s8n_simt_op_dp4a_sliced_k, 32x32x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -293,7 +293,7 @@ TEST(SM61_Device_Gemm_s8n_s8n_simt_op_dp4a_sliced_k, 32x64x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/simt_sgemm_nn_sm50.cu b/test/unit/gemm/device/simt_sgemm_nn_sm50.cu index a81dd4dbd..0412d751c 100644 --- a/test/unit/gemm/device/simt_sgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ 
CUTLASS_TEST_L1(SM50_device_sgemm_nn, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_nn, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_nn, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ 
CUTLASS_TEST_L2(SM50_device_sgemm_nn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm50.cu b/test/unit/gemm/device/simt_sgemm_nt_sm50.cu index 81c21edaf..1adb9b5ae 100644 --- a/test/unit/gemm/device/simt_sgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ 
CUTLASS_TEST_L1(SM50_device_sgemm_nt, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_nt, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_nt, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ 
CUTLASS_TEST_L2(SM50_device_sgemm_nt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu new file mode 100644 index 000000000..7d2ab45b6 --- /dev/null +++ b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu @@ -0,0 +1,249 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 32x64x8_32x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 64x64x8_32x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 128x128x8_32x64x1) { + + using Element =
float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 64x128x8_32x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 128x64x8_32x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 128x128x8_64x64x1) { + + using Element = float; + + 
using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 128x256x8_64x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 8>, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm50.cu b/test/unit/gemm/device/simt_sgemm_tn_sm50.cu index 20a2eddbe..0c00e5608 100644 --- a/test/unit/gemm/device/simt_sgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ 
CUTLASS_TEST_L1(SM50_device_sgemm_tn, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_tn, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_tn, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ 
CUTLASS_TEST_L2(SM50_device_sgemm_tn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu new file mode 100644 index 000000000..00461d2e0 --- /dev/null +++ b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu @@ -0,0 +1,249 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide GEMM interface + +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 32x64x8_32x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 64x64x8_32x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 128x128x8_32x64x1) { + + using Element = 
float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 64x128x8_32x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 128x64x8_64x32x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 128x128x8_64x64x1) { + + using Element = float; + + 
using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 128x256x8_64x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 8>, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/simt_sgemm_tt_sm50.cu b/test/unit/gemm/device/simt_sgemm_tt_sm50.cu index 22e846b97..ce7ab9a7e 100644 --- a/test/unit/gemm/device/simt_sgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ 
CUTLASS_TEST_L1(SM50_device_sgemm_tt, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_tt, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_tt, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ 
CUTLASS_TEST_L2(SM50_device_sgemm_tt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_sm50.py b/test/unit/gemm/device/simt_sm50.py index ba6ec3c29..f53dae271 100644 --- a/test/unit/gemm/device/simt_sm50.py +++ b/test/unit/gemm/device/simt_sm50.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -123,7 +123,7 @@ for precision in precisions: # write file header out.write("/***************************************************************************************************\n" -" * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.\n" +" * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved.\n" " *\n" " * Redistribution and use in source and binary forms, with or without modification, are permitted\n" " * provided that the following conditions are met:\n" diff --git a/test/unit/gemm/device/simt_zgemm_nn_sm50.cu b/test/unit/gemm/device/simt_zgemm_nn_sm50.cu index 7145b3953..7731559a8 100644 --- a/test/unit/gemm/device/simt_zgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_nn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 8x64x8_8x32x1_2x4_4x8_1x2, { 
cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_nn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ 
CUTLASS_TEST_L2(SM50_device_zgemm_nn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_zgemm_nt_sm50.cu b/test/unit/gemm/device/simt_zgemm_nt_sm50.cu index ffe8c0dda..17ea98203 100644 --- a/test/unit/gemm/device/simt_zgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_nt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_nt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_zgemm_tn_sm50.cu b/test/unit/gemm/device/simt_zgemm_tn_sm50.cu index 
2d4799eb9..175c31286 100644 --- a/test/unit/gemm/device/simt_zgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_tn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ 
CUTLASS_TEST_L1(SM50_device_zgemm_tn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_tn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_zgemm_tt_sm50.cu b/test/unit/gemm/device/simt_zgemm_tt_sm50.cu index ba2447bce..544e626c5 100644 --- a/test/unit/gemm/device/simt_zgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_tt_sm50.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_tt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_tt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/testbed.h b/test/unit/gemm/device/testbed.h index 57108530b..b8c739a7e 100644 --- a/test/unit/gemm/device/testbed.h +++ b/test/unit/gemm/device/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_complex.h b/test/unit/gemm/device/testbed_complex.h index 1eff58a2d..65c0fdfb4 100644 --- a/test/unit/gemm/device/testbed_complex.h +++ b/test/unit/gemm/device/testbed_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_interleaved.h b/test/unit/gemm/device/testbed_interleaved.h index 34d61383b..3cbd720bd 100644 --- a/test/unit/gemm/device/testbed_interleaved.h +++ b/test/unit/gemm/device/testbed_interleaved.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_planar_complex.h b/test/unit/gemm/device/testbed_planar_complex.h index 5642020ba..0e4e561e4 100644 --- a/test/unit/gemm/device/testbed_planar_complex.h +++ b/test/unit/gemm/device/testbed_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_sanity.h b/test/unit/gemm/device/testbed_sanity.h new file mode 100644 index 000000000..025fb3874 --- /dev/null +++ b/test/unit/gemm/device/testbed_sanity.h @@ -0,0 +1,233 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include +#include +#include + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/core_io.h" + +#include "testbed.h" + + +namespace test { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// List of Gemm internal parameters this testbed supports user verification +// +enum class ParameterID { + + // Threadblock-level parameters + kSmemASize, + kSmemBSize, + + // Warp-level parameters + kWarpFragmentASize, + kWarpFragmentBSize, + kWarpFragmentCSize, + kInvalid +}; + +struct Reference { + ParameterID parameter_id; + + union { + int value; + + struct { + int m, n, k; + } gemm_shape; + + struct { + int row, column; + } matrix_shape; + }; + + std::string error_msg; + + Reference( + ParameterID parameter_id_, + int value_=-1, + std::string const &error_msg_="") : parameter_id(parameter_id_), value(value_), 
error_msg(error_msg_) {} +}; + + +template +struct TestbedSanity { + + // + // Type definitions (All Gemm types top down) + // + + // Unpacking Gemm types in the following order + // Kernel-level > Threadblock-level > Warp-level > Instruction-level + + // kernel-level cutlass Gemm + using GemmKernel = typename Gemm::GemmKernel; + + // + // Threadblock-level gemm types + // + using MmaThreadBlock = typename GemmKernel::Mma; + + // Threadblock-level gemm shape covering one stage + using ThreadblockShape = typename MmaThreadBlock::Shape; + + // Shared memory size covering all stages + using SmemShapeA = typename MmaThreadBlock::Base::SharedStorage::ShapeA; + using SmemPaddingA = typename MmaThreadBlock::Policy::SmemPaddingA; + using SmemShapeB = typename MmaThreadBlock::Base::SharedStorage::ShapeB; + using SmemPaddingB = typename MmaThreadBlock::Policy::SmemPaddingB; + + + /// Number of stages + static int const kStages = MmaThreadBlock::Base::kStages; + + /// Number of warp-level GEMM operations + static int const kWarpGemmIterations = MmaThreadBlock::kWarpGemmIterations; + + + // + // Warp-level gemm types + // + + // Warp-level gemm operator + using MmaWarp = typename MmaThreadBlock::Operator; + + // Warp-level gemm shape covering all kgroups + using WarpShape = typename MmaWarp::Shape; + + // Warp-level fragments holding operands A & B operand and destination C + using WarpFragmentA = typename MmaWarp::FragmentA; + using WarpFragmentB = typename MmaWarp::FragmentB; + using WarpFragmentC = typename MmaWarp::FragmentC; + + // + // Instruction-level gemm types + // + + // Instruction-level gemm operator + using MmaInstruction = typename MmaWarp::Policy::Operator; + + // Instruction shape + using InstructionShape = typename MmaInstruction::Shape; + + // Instruction-level fragments holding operands A & B operand and destination C + using InstructionFragmentA = typename MmaInstruction::FragmentA; + using InstructionFragmentB = typename MmaInstruction::FragmentB; + using 
InstructionFragmentC = typename MmaInstruction::FragmentC; + + // + // Testbed types + // + + // Vector of values holding user provided reference + using ReferenceVector = std::vector; + + // + // Data members + // + ReferenceVector references; + + // + // Methods + // + + TestbedSanity(ReferenceVector const &references_ = ReferenceVector()) : references(references_){ } + + // verify all parameter in ReferenceVector + bool verify() { + for(auto ref : references) + verify_parameter(ref); + return true; + } + + // verify parameter of type Reference + void verify_parameter(Reference const& ref) { + switch(ref.parameter_id) { + case ParameterID::kWarpFragmentASize : EXPECT_TRUE(WarpFragmentA::kElements == ref.value) << *this; break; + case ParameterID::kWarpFragmentBSize : EXPECT_TRUE(WarpFragmentB::kElements == ref.value) << *this; break; + case ParameterID::kWarpFragmentCSize : EXPECT_TRUE(WarpFragmentC::kElements == ref.value) << *this; break; + } + } + +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Overload output operators for TestbedSanity +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +template +std::ostream & operator<<(std::ostream &out, TestbedSanity const &test) { + + + out << "Gemm internal parameters" << std::endl + << " Threadblock-level parameters:" << std::endl + << " ThreadblockShape = " << typename TestbedSanity::ThreadblockShape() << std::endl + << " kStages = " << TestbedSanity::kStages << std::endl + << " kWarpGemmIterations = "<< TestbedSanity::kWarpGemmIterations << std::endl + <<" Shared memory sizes:" << std::endl + <<" SmemPaddingA = " << typename TestbedSanity::SmemPaddingA() << std::endl + <<" SmemPaddingB = " << typename TestbedSanity::SmemPaddingB() << std::endl + <<" SmemShapeA = " << typename TestbedSanity::SmemShapeA() << std::endl + <<" SmemShapeB = " << typename 
TestbedSanity::SmemShapeB() << std::endl + <<" Warp-level parameters" << std::endl + <<" WarpShape = " << typename TestbedSanity::WarpShape() << std::endl + <<" Fragment sizes:" << std::endl + <<" WarpFragmentA::kElements = " << TestbedSanity::WarpFragmentA::kElements << std::endl + <<" WarpFragmentB::kElements = " << TestbedSanity::WarpFragmentB::kElements << std::endl + <<" WarpFragmentC::kElements = " << TestbedSanity::WarpFragmentC::kElements << std::endl + <<" Instruction-level parameters" << std::endl + <<" InstructionShape = " << typename TestbedSanity::InstructionShape() << std::endl + <<" Fragment sizes:" << std::endl + <<" InstructionFragmentA::kElements = " << TestbedSanity::InstructionFragmentA::kElements << std::endl + <<" InstructionFragmentB::kElements = " << TestbedSanity::InstructionFragmentB::kElements << std::endl + <<" InstructionFragmentC::kElements = " << TestbedSanity::InstructionFragmentC::kElements << std::endl; + + return out; +} + +} // namespace device +} // namespace gemm +} // namespace test + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/testbed_splitk.h b/test/unit/gemm/device/testbed_splitk.h index c8ae4b4ab..792d73923 100644 --- a/test/unit/gemm/device/testbed_splitk.h +++ b/test/unit/gemm/device/testbed_splitk.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_universal.h b/test/unit/gemm/device/testbed_universal.h index 44503e0a3..a83c27cda 100644 --- a/test/unit/gemm/device/testbed_universal.h +++ b/test/unit/gemm/device/testbed_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_utils.h b/test/unit/gemm/device/testbed_utils.h index 5a76c3be7..9325b40fe 100644 --- a/test/unit/gemm/device/testbed_utils.h +++ b/test/unit/gemm/device/testbed_utils.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -41,6 +41,7 @@ inline char const *to_string(cutlass::Status status) { case cutlass::Status::kErrorWorkspaceNull: return "kErrorWorkspaceNull"; case cutlass::Status::kErrorInternal: return "kErrorInternal"; case cutlass::Status::kInvalid: return "kInvalid"; + default: break; } return "invalid"; } diff --git a/test/unit/gemm/thread/CMakeLists.txt b/test/unit/gemm/thread/CMakeLists.txt index 11d450c7d..48ca11572 100644 --- a/test/unit/gemm/thread/CMakeLists.txt +++ b/test/unit/gemm/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/thread/gemm_sm50.cu b/test/unit/gemm/thread/gemm_sm50.cu index 969580f58..426592284 100644 --- a/test/unit/gemm/thread/gemm_sm50.cu +++ b/test/unit/gemm/thread/gemm_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/gemm_sm60.cu b/test/unit/gemm/thread/gemm_sm60.cu index 19b846192..b0b9fdb5b 100644 --- a/test/unit/gemm/thread/gemm_sm60.cu +++ b/test/unit/gemm/thread/gemm_sm60.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/gemm_sm61.cu b/test/unit/gemm/thread/gemm_sm61.cu index f8cbf2b81..f6e7724dd 100644 --- a/test/unit/gemm/thread/gemm_sm61.cu +++ b/test/unit/gemm/thread/gemm_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/host/CMakeLists.txt b/test/unit/gemm/thread/host/CMakeLists.txt index 75f76c928..c58540264 100644 --- a/test/unit/gemm/thread/host/CMakeLists.txt +++ b/test/unit/gemm/thread/host/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/thread/host/gemm_sm60_host.cu b/test/unit/gemm/thread/host/gemm_sm60_host.cu index df2d233a6..346b80cbe 100644 --- a/test/unit/gemm/thread/host/gemm_sm60_host.cu +++ b/test/unit/gemm/thread/host/gemm_sm60_host.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/host/testbed_host.h b/test/unit/gemm/thread/host/testbed_host.h index d2835efec..4d5e441dd 100644 --- a/test/unit/gemm/thread/host/testbed_host.h +++ b/test/unit/gemm/thread/host/testbed_host.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/testbed.h b/test/unit/gemm/thread/testbed.h index 1b1082a5d..bdfb8278f 100644 --- a/test/unit/gemm/thread/testbed.h +++ b/test/unit/gemm/thread/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/CMakeLists.txt b/test/unit/gemm/threadblock/CMakeLists.txt index 7ec75510a..f208b9ef1 100644 --- a/test/unit/gemm/threadblock/CMakeLists.txt +++ b/test/unit/gemm/threadblock/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/batched_gemv.cu b/test/unit/gemm/threadblock/batched_gemv.cu index 79b5ac4e5..94ae947bd 100644 --- a/test/unit/gemm/threadblock/batched_gemv.cu +++ b/test/unit/gemm/threadblock/batched_gemv.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/epilogue_workspace.cu b/test/unit/gemm/threadblock/epilogue_workspace.cu index c1967e43f..1301aeb4d 100644 --- a/test/unit/gemm/threadblock/epilogue_workspace.cu +++ b/test/unit/gemm/threadblock/epilogue_workspace.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_simt.cu b/test/unit/gemm/threadblock/mma_pipelined_simt.cu index b5c1a58b7..522b029ad 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_simt.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_simt.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm70.cu b/test/unit/gemm/threadblock/mma_pipelined_sm70.cu index b9302ef33..c9c714bcf 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_sm70.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm75.cu b/test/unit/gemm/threadblock/mma_pipelined_sm75.cu index 5585f23f6..e4125eb4f 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_sm75.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -231,6 +231,7 @@ TEST(SM75_gemm_threadblock_congruous, } //////////////////////////////////////////////////////////////////////////////// + TEST(SM75_gemm_threadblock_crosswise, tensor_op_64x64x32_64x64x32_16x8x8) { using ElementA = cutlass::half_t; using LayoutA = cutlass::layout::RowMajor; @@ -562,6 +563,7 @@ TEST(SM75_gemm_threadblock_crosswise, } //////////////////////////////////////////////////////////////////////////////// + TEST(SM75_gemm_threadblock_interleaved, tensor_op_32x32x64_16x16x64_8x8x16) { using ElementA = uint8_t; using LayoutA = cutlass::layout::ColumnMajorInterleaved<32>; @@ -1785,4 +1787,337 @@ TEST(SM75_gemm_threadblock_interleaved, } //////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_64x64x512_64x64x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(64, 64, 2048); + + using ThreadBlockShape = 
cutlass::gemm::GemmShape<64, 64, 512>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 1, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_32x32x512_16x16x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(32, 32, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 512>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_64x32x512_32x16x512_8x8x128) { + using ElementA 
= cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(64, 32, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 512>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_32x64x512_16x32x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(32, 64, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 512>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 
1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_64x64x512_32x32x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(64, 64, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 512>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_128x64x512_64x32x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(128, 64, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 512>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + 
// Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_64x128x512_32x64x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(64, 128, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 512>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_128x128x512_64x64x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = 
cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(128, 128, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 512>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, + multicta_256x256x1536_128x128x512_64x64x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(256, 256, 1536); + + using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 512>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(2, 2); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, + multicta_512x256x6144_256x128x512_64x64x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(512, 256, 6144); + + using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 512>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(2, 2); + dim3 block(32, 8, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + #endif diff --git a/test/unit/gemm/threadblock/mma_pipelined_testbed.h b/test/unit/gemm/threadblock/mma_pipelined_testbed.h index 498ca4967..8190c50a4 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_testbed.h +++ b/test/unit/gemm/threadblock/mma_pipelined_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu index 3c1720a1a..4fb964c1a 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu index e3d900d53..fd2ae356f 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h index 5838e4f3b..148e34d95 100644 --- a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h +++ b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu index ba54249d9..8c687f881 100644 --- a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu +++ b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu index d1c060834..262269b75 100644 --- a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu +++ b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/CMakeLists.txt b/test/unit/gemm/warp/CMakeLists.txt index 600d1d8ed..695508fa5 100644 --- a/test/unit/gemm/warp/CMakeLists.txt +++ b/test/unit/gemm/warp/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -27,6 +27,9 @@ cutlass_test_unit_add_executable( gemm_sm61.cu gemm_sm70.cu gemm_sm75.cu + gemm_sm80.cu + gemm_complex_sm80.cu + gemm_gaussian_complex_sm80.cu wmma_sm70.cu wmma_sm72.cu wmma_sm75.cu diff --git a/test/unit/gemm/warp/gemm_complex_sm80.cu b/test/unit/gemm/warp/gemm_complex_sm80.cu new file mode 100644 index 000000000..3fcd70c8d --- /dev/null +++ b/test/unit/gemm/warp/gemm_complex_sm80.cu @@ -0,0 +1,635 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + + \brief Unit tests for thread-level GEMM +*/ + +#include "cutlass/cutlass.h" +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/half.h" + +#include "cutlass/gemm/warp/default_mma_complex_tensor_op.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" + +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// complex * complex => complex +// Input data type: complex<double> +// Math instruction: MMA.884.F64.F64 +// Output data type: complex<double> +/////////////////////////////////////////////////////////////////////////////////////////////////// +TEST(SM80_warp_gemm_complex_tensor_op_f64, 8x8x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<8, 8, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex<double>; + using ElementC = cutlass::complex<double>; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using 
MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 16x16x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 16x32x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<16, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 32x16x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<32, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = 
cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 32x32x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 32x32x4_8x8x4_nh) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kConjugate + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 32x32x4_8x8x4_ct) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = 
cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kNone + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 8x8x4_8x8x4_tn) { + + using Shape = cutlass::gemm::GemmShape<8, 8, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 16x16x4_8x8x4_tn) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + 
+/////////////////////////////////////////////////////////////////////////////////////////////////// +// complex * complex => complex +// Input data type: complex<float> +// Math instruction: MMA.1688.F32.TF32 +// Output data type: complex<float> +// Shared memory layout: Congruous +//////////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 16x16x8_16x8x8_nt) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex<float>; + using ElementC = cutlass::complex<float>; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<16, 16, 8> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 16x16x16_16x8x8_nt) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex<float>; + using ElementC = cutlass::complex<float>; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<16, 16, 16> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 16x32x8_16x8x8_nt) { + + using Shape = cutlass::gemm::GemmShape<16, 32, 8>; + using 
InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<16, 32, 8> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 32x16x8_16x16x8_nt) { + + using Shape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<32, 16, 8> >() + .run(); +} + + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 32x32x8_16x8x8_nt) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + 
test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<32, 32, 8> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 32x32x8_16x8x8_nh) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kConjugate + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<32, 32, 8> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 32x32x8_16x8x8_ct) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kNone + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<32, 32, 8> >() + .run(); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// complex * complex => complex +// Input data type: complex +// Math instruction: MMA.1688.F32.TF32 +// Output data type: complex +// Shared 
memory layout: Crosswise +//////////////////////////////////////////////////////////////////////////////////////////////////// +TEST(SM80_warp_gemm_complex_tensor_op_f32, 16x16x8_16x8x8_tn) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<16, 16, 8> >() + .run(); +} + +// TEST FAILS crosswise complex TN MMA.1688.F32.TF32 test fails for k = 2*8 = 16 +TEST(SM80_warp_gemm_complex_tensor_op_f32, 16x16x16_16x8x8_tn) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<16, 16, 16> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 32x32x8_16x8x8_tn) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = 
cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<32, 32, 8> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 32x64x8_16x8x8_tn) { + + using Shape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<32, 64, 8> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 64x32x8_16x8x8_tn) { + + using Shape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<64, 32, 8> >() + .run(); +} + 
+/////////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu b/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu new file mode 100644 index 000000000..43ad2dfd8 --- /dev/null +++ b/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu @@ -0,0 +1,281 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + + \brief Unit tests for thread-level GEMM +*/ + +#include "cutlass/cutlass.h" +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/half.h" + +#include "cutlass/gemm/warp/default_mma_complex_tensor_op.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" + +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 8x8x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<8, 8, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex<double>; + using ElementC = cutlass::complex<double>; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + 
cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 16x16x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + + +TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 16x32x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<16, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 32x16x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<32, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = 
cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 32x32x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 32x32x4_8x8x4_nh) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + 
cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kConjugate, + cutlass::arch::OpMultiplyAddGaussianComplex + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 32x32x4_8x8x4_ct) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 16x16x4_8x8x4_tn) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} +/////////////////////////////////////////////////////////////////////////////////////////////////// + 
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/warp/gemm_sm50.cu b/test/unit/gemm/warp/gemm_sm50.cu index f6410d1d4..bb4ba5be5 100644 --- a/test/unit/gemm/warp/gemm_sm50.cu +++ b/test/unit/gemm/warp/gemm_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm60.cu b/test/unit/gemm/warp/gemm_sm60.cu index cf59d442e..4f2f3f158 100644 --- a/test/unit/gemm/warp/gemm_sm60.cu +++ b/test/unit/gemm/warp/gemm_sm60.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm61.cu b/test/unit/gemm/warp/gemm_sm61.cu index 98a16046e..63e07165b 100644 --- a/test/unit/gemm/warp/gemm_sm61.cu +++ b/test/unit/gemm/warp/gemm_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm70.cu b/test/unit/gemm/warp/gemm_sm70.cu index d97effeab..16f1427e5 100644 --- a/test/unit/gemm/warp/gemm_sm70.cu +++ b/test/unit/gemm/warp/gemm_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm75.cu b/test/unit/gemm/warp/gemm_sm75.cu index 7c32de4ac..144475cae 100644 --- a/test/unit/gemm/warp/gemm_sm75.cu +++ b/test/unit/gemm/warp/gemm_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -109,6 +109,8 @@ TEST(SM75_warp_gemm_tensor_op_congruous_f16, 128x128x32_32x32x32_16x8x8) { .run(); } +//////////////////////////////////////////////////////////////////////////////// + TEST(SM75_warp_gemm_tensor_op_crosswise_f16, 128x128x32_64x64x32_16x8x8) { using Shape = cutlass::gemm::GemmShape<64, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; @@ -317,6 +319,8 @@ TEST(SM75_warp_gemm_tensor_op_crosswise_f16, 128x128x64_16x16x64_16x8x8) { .run(); } +//////////////////////////////////////////////////////////////////////////////// + TEST(SM75_warp_gemm_tensor_op_crosswise_i8, 128x128x64_64x64x64_8x8x16) { using Shape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; diff --git a/test/unit/gemm/warp/gemm_sm80.cu b/test/unit/gemm/warp/gemm_sm80.cu new file mode 100644 index 000000000..377e760c6 --- /dev/null +++ b/test/unit/gemm/warp/gemm_sm80.cu @@ -0,0 +1,1782 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + + \brief Unit tests for thread-level GEMM +*/ + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/half.h" + +#include "cutlass/gemm/warp/default_mma_tensor_op.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" + +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x32_64x64x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + 
cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x32_64x32x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x32_32x32x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 
128x128x32_32x16x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 16, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x32_16x16x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<16, 16, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x64_64x64x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + 
cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x64_64x32x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x64_32x32x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 
128x128x64_32x16x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x64_16x16x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<16, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x16_64x64x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<64, 64, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + 
cutlass::sizeof_bits::value, 16>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x16_64x32x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<64, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x16_32x32x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<32, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 
128x128x16_32x16x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<32, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x16_16x16x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<16, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x32_64x64x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + 
cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x32_64x32x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<64, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x32_32x32x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<32, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 
128x128x32_32x16x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<32, 16, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x32_16x16x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<16, 16, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f16, 128x128x32_64x64x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + 
cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f16, 128x128x32_32x32x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f16, 128x128x64_64x64x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f16, 
128x128x64_32x32x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_tf32, 128x128x16_64x64x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<64, 64, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_tf32, 128x128x16_32x32x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<32, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + 
cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_tf32, 128x128x32_64x64x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_tf32, 128x128x32_32x32x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<32, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_tn, 
tf32_round_128x128x32_16x16x32_16x8x8) { + + using Shape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = float; + using ElementC = float; + + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::TransformTestbed >() + .run(); +} + +TEST(SM80_warp_gemm_tensor_op_nt, tf32_round_128x128x32_16x16x32_16x8x8) { + + using Shape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = float; + using ElementC = float; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::TransformTestbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_16x16x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<16, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 
32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_32x16x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_32x32x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_64x32x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_64x64x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_16x16x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<16, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + 
using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_32x16x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<32, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_32x32x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<32, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + 
test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_64x32x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<64, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_64x64x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x64_64x64x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using 
ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x64_64x32x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<64, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x64_32x32x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<32, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, 
Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x64_32x16x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<32, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x64_16x16x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<16, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x128_64x64x128_16x8x32) { + using Shape = cutlass::gemm::GemmShape<64, 
64, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x128_64x32x128_16x8x32) { + using Shape = cutlass::gemm::GemmShape<64, 32, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x128_32x32x128_16x8x32) { + using Shape = cutlass::gemm::GemmShape<32, 32, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 
128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x128_32x16x128_16x8x32) { + using Shape = cutlass::gemm::GemmShape<32, 16, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x128_16x16x128_16x8x32) { + using Shape = cutlass::gemm::GemmShape<16, 16, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x128_64x64x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<64, 64, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x128_64x32x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<64, 32, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x128_32x32x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<32, 32, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using 
ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x128_32x16x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<32, 16, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x128_16x16x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<16, 16, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, 
InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x256_64x64x256_16x8x64) { + using Shape = cutlass::gemm::GemmShape<64, 64, 256>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x256_64x32x256_16x8x64) { + using Shape = cutlass::gemm::GemmShape<64, 32, 256>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 
128x128x256_32x32x256_16x8x64) { + using Shape = cutlass::gemm::GemmShape<32, 32, 256>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x256_32x16x256_16x8x64) { + using Shape = cutlass::gemm::GemmShape<32, 16, 256>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x256_16x16x256_16x8x64) { + using Shape = cutlass::gemm::GemmShape<16, 16, 256>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + 
using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x512_64x64x512_16x8x256) { + using Shape = cutlass::gemm::GemmShape<64, 64, 512>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x512_64x32x512_16x8x256) { + using Shape = cutlass::gemm::GemmShape<64, 32, 512>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + 
test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x512_32x32x512_16x8x256) { + using Shape = cutlass::gemm::GemmShape<32, 32, 512>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x512_32x16x512_16x8x256) { + using Shape = cutlass::gemm::GemmShape<32, 16, 512>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x512_16x16x512_16x8x256) { + using Shape = cutlass::gemm::GemmShape<16, 16, 512>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using 
Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x1024_64x64x1024_16x8x256) { + using Shape = cutlass::gemm::GemmShape<64, 64, 1024>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x1024_64x32x1024_16x8x256) { + using Shape = cutlass::gemm::GemmShape<64, 32, 1024>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + + using MmaTensorOp = typename 
cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x1024_32x32x1024_16x8x256) { + using Shape = cutlass::gemm::GemmShape<32, 32, 1024>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x1024_32x16x1024_16x8x256) { + using Shape = cutlass::gemm::GemmShape<32, 16, 1024>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x1024_16x16x1024_16x8x256) { + using Shape = cutlass::gemm::GemmShape<16, 16, 1024>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f64, 16x16x4_16x16x4_8x8x4) { + using Shape = cutlass::gemm::GemmShape<16, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f64, 32x16x4_32x16x4_8x8x4) { + using Shape = cutlass::gemm::GemmShape<32, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = 
typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f64, 32x32x4_32x32x4_8x8x4) { + using Shape = cutlass::gemm::GemmShape<32, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f64, 32x64x4_32x64x4_8x8x4) { + using Shape = cutlass::gemm::GemmShape<32, 64, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f64, 16x16x16_16x16x16_8x8x4) { + using Shape = cutlass::gemm::GemmShape<16, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + 
using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f64, 32x32x16_32x32x16_8x8x4) { + using Shape = cutlass::gemm::GemmShape<32, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f64, 64x32x16_64x32x16_8x8x4) { + using Shape = cutlass::gemm::GemmShape<64, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f64, 32x64x16_32x64x16_8x8x4) { + using Shape = cutlass::gemm::GemmShape<32, 64, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x128_16x16x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<16, 16, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x128_32x16x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<32, 16, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + 
cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x128_32x32x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<32, 32, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x128_64x32x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<64, 32, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, 
cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x128_64x64x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<64, 64, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/warp/testbed.h b/test/unit/gemm/warp/testbed.h index 9560b9103..8a565fd9f 100644 --- a/test/unit/gemm/warp/testbed.h +++ b/test/unit/gemm/warp/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -996,7 +996,6 @@ struct TransformedTestbedComplex { ///////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace warp } // namespace gemm } // namespace test diff --git a/test/unit/gemm/warp/wmma_sm70.cu b/test/unit/gemm/warp/wmma_sm70.cu index d5e1107c1..5b9ce63db 100644 --- a/test/unit/gemm/warp/wmma_sm70.cu +++ b/test/unit/gemm/warp/wmma_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/wmma_sm72.cu b/test/unit/gemm/warp/wmma_sm72.cu index 4f81bbe26..89bfbb594 100644 --- a/test/unit/gemm/warp/wmma_sm72.cu +++ b/test/unit/gemm/warp/wmma_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/wmma_sm75.cu b/test/unit/gemm/warp/wmma_sm75.cu index a041610db..3818793e8 100644 --- a/test/unit/gemm/warp/wmma_sm75.cu +++ b/test/unit/gemm/warp/wmma_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/layout/CMakeLists.txt b/test/unit/layout/CMakeLists.txt index ab34df0ca..29ebdbdd3 100644 --- a/test/unit/layout/CMakeLists.txt +++ b/test/unit/layout/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/layout/matrix.cu b/test/unit/layout/matrix.cu index 0adddb891..2f8d0ea2b 100644 --- a/test/unit/layout/matrix.cu +++ b/test/unit/layout/matrix.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/layout/tensor.cu b/test/unit/layout/tensor.cu index a6b3f7cff..b4a43fb3a 100644 --- a/test/unit/layout/tensor.cu +++ b/test/unit/layout/tensor.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/layout/tensor_nhwc.cu b/test/unit/layout/tensor_nhwc.cu index 697f753da..46482b2b2 100644 --- a/test/unit/layout/tensor_nhwc.cu +++ b/test/unit/layout/tensor_nhwc.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/CMakeLists.txt b/test/unit/nvrtc/CMakeLists.txt index 7261da968..668ea35eb 100644 --- a/test/unit/nvrtc/CMakeLists.txt +++ b/test/unit/nvrtc/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/nvrtc/cutlass/nvrtc/environment.h b/test/unit/nvrtc/cutlass/nvrtc/environment.h index e3d493ab9..27e999348 100644 --- a/test/unit/nvrtc/cutlass/nvrtc/environment.h +++ b/test/unit/nvrtc/cutlass/nvrtc/environment.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/kernel/thread/testbed_kernel.h b/test/unit/nvrtc/kernel/thread/testbed_kernel.h index c75823516..500870581 100644 --- a/test/unit/nvrtc/kernel/thread/testbed_kernel.h +++ b/test/unit/nvrtc/kernel/thread/testbed_kernel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/stdlib/stdint.h b/test/unit/nvrtc/stdlib/stdint.h index 50ed027d9..380216811 100644 --- a/test/unit/nvrtc/stdlib/stdint.h +++ b/test/unit/nvrtc/stdlib/stdint.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/thread/CMakeLists.txt b/test/unit/nvrtc/thread/CMakeLists.txt index f1d2b7a12..2e12ccfa8 100644 --- a/test/unit/nvrtc/thread/CMakeLists.txt +++ b/test/unit/nvrtc/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/nvrtc/thread/gemm_nvrtc.cu b/test/unit/nvrtc/thread/gemm_nvrtc.cu index bf57f1d3d..785ebcb2c 100644 --- a/test/unit/nvrtc/thread/gemm_nvrtc.cu +++ b/test/unit/nvrtc/thread/gemm_nvrtc.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/thread/testbed.h b/test/unit/nvrtc/thread/testbed.h index 69bf81f47..41ba503ad 100644 --- a/test/unit/nvrtc/thread/testbed.h +++ b/test/unit/nvrtc/thread/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/CMakeLists.txt b/test/unit/reduction/CMakeLists.txt index ba1b2a99e..7b4f26706 100644 --- a/test/unit/reduction/CMakeLists.txt +++ b/test/unit/reduction/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/reduction/kernel/CMakeLists.txt b/test/unit/reduction/kernel/CMakeLists.txt index 9ef27c84e..e1983153d 100644 --- a/test/unit/reduction/kernel/CMakeLists.txt +++ b/test/unit/reduction/kernel/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/reduction/kernel/reduce_splitk.cu b/test/unit/reduction/kernel/reduce_splitk.cu index f4a7f07db..b169cb60f 100644 --- a/test/unit/reduction/kernel/reduce_splitk.cu +++ b/test/unit/reduction/kernel/reduce_splitk.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/kernel/reduce_splitk_testbed.h b/test/unit/reduction/kernel/reduce_splitk_testbed.h index c5cbbd58d..8e7040706 100644 --- a/test/unit/reduction/kernel/reduce_splitk_testbed.h +++ b/test/unit/reduction/kernel/reduce_splitk_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/thread/CMakeLists.txt b/test/unit/reduction/thread/CMakeLists.txt index f42276f76..0641590e8 100644 --- a/test/unit/reduction/thread/CMakeLists.txt +++ b/test/unit/reduction/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/reduction/thread/reduction_thread.cu b/test/unit/reduction/thread/reduction_thread.cu index ece493459..f71e30f53 100644 --- a/test/unit/reduction/thread/reduction_thread.cu +++ b/test/unit/reduction/thread/reduction_thread.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/thread/testbed.h b/test/unit/reduction/thread/testbed.h index 3646e5bf0..919839b3d 100644 --- a/test/unit/reduction/thread/testbed.h +++ b/test/unit/reduction/thread/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/test_unit.cpp b/test/unit/test_unit.cpp index fc386250c..3bb8ac138 100644 --- a/test/unit/test_unit.cpp +++ b/test/unit/test_unit.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/transform/CMakeLists.txt b/test/unit/transform/CMakeLists.txt index ee865cd4a..a7b881ae2 100644 --- a/test/unit/transform/CMakeLists.txt +++ b/test/unit/transform/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/transform/threadblock/CMakeLists.txt b/test/unit/transform/threadblock/CMakeLists.txt index e849dc8a4..0d5e5c44a 100644 --- a/test/unit/transform/threadblock/CMakeLists.txt +++ b/test/unit/transform/threadblock/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/transform/threadblock/predicated_tile_iterator.cu b/test/unit/transform/threadblock/predicated_tile_iterator.cu index 70502f73a..562c7888a 100644 --- a/test/unit/transform/threadblock/predicated_tile_iterator.cu +++ b/test/unit/transform/threadblock/predicated_tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu index e032383ee..e52af8edf 100644 --- a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu +++ b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/util/complex.cu b/test/unit/util/complex.cu index e4867e19e..319bbb2aa 100644 --- a/test/unit/util/complex.cu +++ b/test/unit/util/complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 0aa594b0c..5c140a9a7 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt index 8c8c5c47d..37bb89901 100644 --- a/tools/library/CMakeLists.txt +++ b/tools/library/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -22,7 +22,7 @@ include(GNUInstallDirs) -find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED) +find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED) add_library(cutlass_library_includes INTERFACE) add_library(nvidia::cutlass::library::includes ALIAS cutlass_library_includes) @@ -59,7 +59,7 @@ cutlass_add_library( src/operation_table.cu src/singleton.cu src/util.cu - + ) file(GLOB_RECURSE GENERATOR_PYTHON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/scripts/*.py) diff --git a/tools/library/include/cutlass/library/handle.h b/tools/library/include/cutlass/library/handle.h index 1b60eb7df..58c6b30c7 100644 --- a/tools/library/include/cutlass/library/handle.h +++ b/tools/library/include/cutlass/library/handle.h @@ -45,6 +45,9 @@ private: /// Host workspace static int const kHostWorkspaceSize = (4 << 10); + /// Provider of operations + Provider provider_; + /// CUDA device 
properties cudaDeviceProp device_; @@ -90,6 +93,12 @@ public: /// Gets the current CUDA stream cudaStream_t get_stream() const; + /// Gets the current provider + Provider get_provider() const; + + /// Sets the provider of operations + void set_provider(Provider provider); + /// Gets the device workspace size size_t get_workspace_size() const; @@ -149,6 +158,56 @@ public: void * ptr_D, /// Pointer to D matrix int ldd /// Leading dimension of D matrix ); + + /// Executes a GEMM computation: D <= alpha * A*B + beta * C. + // + // Supports batched-strided, batched array or split-K serial or split-K parallel. + // + Status gemm_universal( + + GemmUniversalMode mode, /// indicates the mode in which the kUniversal GEMM is launched + + int M, /// GEMM M dimension + int N, /// GEMM N dimension + int K, /// GEMM K dimension + + NumericTypeID element_compute, /// Data type of internal accumulation + + NumericTypeID element_scalar, /// Data type of alpha/beta scalars + + void const *alpha, /// Pointer to alpha scalar + + NumericTypeID element_A, /// Data type of A matrix elements + LayoutTypeID layout_A, /// Layout of A matrix + ComplexTransform transform_A, /// Complex transformation applied to A matrix - ignored for real-valued matrices + + void const * ptr_A, /// Pointer to A matrix in Global Memory + int lda, /// Leading dimension of A matrix + + NumericTypeID element_B, /// Data type of B matrix elements + LayoutTypeID layout_B, /// Layout of B matrix + ComplexTransform transform_B, /// Complex transformation applied to B matrix - ignored for real-valued matrices + + void const * ptr_B, /// Pointer to B matrix in Global Memory + int ldb, /// Leading dimension of B matrix + + void const * beta, /// Pointer to beta scalar + + NumericTypeID element_C, /// Data type of C and D matrices + + void const * ptr_C, /// Pointer to C matrix + int ldc, /// Leading dimension of C matrix + + void * ptr_D, /// Pointer to D matrix + int ldd, /// Leading dimension of D matrix + + int 
batch_count = 1, /// Batch count or number of split-K slices + + int64_t batch_stride_A = 0, /// Batch stride of A operand + int64_t batch_stride_B = 0, /// Batch stride of B operand + int64_t batch_stride_C = 0, /// Batch stride of C operand + int64_t batch_stride_D = 0 /// Batch stride of D operand + ); /// Planar complex GEMM /// @@ -276,7 +335,6 @@ public: using HandlePtr = std::unique_ptr; ///////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace library } // namespace cutlass diff --git a/tools/library/include/cutlass/library/library.h b/tools/library/include/cutlass/library/library.h index f58e3a45e..d093b6118 100644 --- a/tools/library/include/cutlass/library/library.h +++ b/tools/library/include/cutlass/library/library.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "cutlass/cutlass.h" @@ -93,10 +94,14 @@ enum class NumericTypeID { kS32, kS64, kF16, + kBF16, + kTF32, kF32, kF64, kCF16, + kCBF16, kCF32, + kCTF32, kCF64, kCS4, kCS8, @@ -120,6 +125,7 @@ enum class ComplexTransform { /// Providers enum class Provider { + kNone, kCUTLASS, kReferenceHost, kReferenceDevice, @@ -132,6 +138,8 @@ enum class Provider { /// Enumeration indicating the kind of operation enum class OperationKind { kGemm, + kEqGemm, + kReduction, kInvalid }; @@ -160,9 +168,11 @@ enum class OpcodeClassID { }; enum class MathOperationID { + kAdd, kMultiplyAdd, kMultiplyAddSaturate, kMultiplyAddComplex, + kMultiplyAddGaussianComplex, kXorPopc, kInvalid }; @@ -180,12 +190,17 @@ enum class GemmKind { kInvalid }; -/// Mode of GEMM -enum class GemmUniversalMode { - kGemm, - kGemmSplitKParallel, - kBatched, - kArray, +/// Mode of Universal GEMM +using GemmUniversalMode = cutlass::gemm::GemmUniversalMode; + +enum class EpilogueKind { + kUnknown, + kConversion, + kLinearCombination, + kLinearCombinationClamp, + kLinearCombinationPlanarComplex, + kLinearCombinationRelu, + kLinearCombinationSigmoid, kInvalid }; @@ 
-220,6 +235,22 @@ struct MathInstructionDescription { opcode_class(opcode_class), math_operation(math_operation) {} + // Equality operator + inline + bool operator==(MathInstructionDescription const& rhs) const{ + return ( + (instruction_shape == rhs.instruction_shape) && + (element_accumulator == rhs.element_accumulator) && + (opcode_class == rhs.opcode_class) && + (math_operation == rhs.math_operation)); + } + + // Inequality operator + inline + bool operator!=(MathInstructionDescription const& rhs) const { + return !(*this == rhs); + } + }; /// Structure describing the tiled structure of a GEMM-like computation @@ -261,6 +292,24 @@ struct TileDescription { math_instruction(math_instruction), minimum_compute_capability(minimum_compute_capability), maximum_compute_capability(maximum_compute_capability) { } + + // Equality operator + inline + bool operator==(TileDescription const& rhs) const{ + return ( + (threadblock_shape == rhs.threadblock_shape) && + (threadblock_stages == rhs.threadblock_stages) && + (warp_count == rhs.warp_count) && + (math_instruction == rhs.math_instruction) && + (minimum_compute_capability == rhs.minimum_compute_capability) && + (maximum_compute_capability == rhs.maximum_compute_capability)); + } + + // Inequality operator + inline + bool operator!=(TileDescription const& rhs) const { + return !(*this == rhs); + } }; /// High-level description of an operation @@ -379,6 +428,20 @@ struct GemmDescription : public OperationDescription { transform_B(transform_B) {} }; + +/// Description of all Reduction operations +struct ReductionDescription : public OperationDescription { + + /// Describes the data type of workspace + NumericTypeID element_workspace; + + /// Describes the data type of final output + NumericTypeID element_output; + + /// Describes the data type of the scalars passed to the epilogue + NumericTypeID element_epilogue; +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// 
///////////////////////////////////////////////////////////////////////////////////////////////// @@ -549,6 +612,42 @@ struct GemmArrayArguments { ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Universal GEMM supporting multiple split-K modes, multiple batched modes, real and complex +// +// OperationKind: Gemm +// GemmKind: Universal + +struct GemmUniversalConfiguration { + + GemmUniversalMode mode; + gemm::GemmCoord problem_size; + int batch_count; + + int64_t lda; + int64_t ldb; + int64_t ldc; + int64_t ldd; +}; + +struct GemmUniversalArguments { + + void const *A; + void const *B; + void const *C; + void *D; + + void const *alpha; + void const *beta; + ScalarPointerMode pointer_mode; + + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_C; + int64_t batch_stride_D; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Complex valued GEMM in which real and imaginary parts are separated by a stride // // OperationKind: Gemm @@ -648,7 +747,6 @@ struct GemmPlanarComplexArrayArguments { ScalarPointerMode pointer_mode; }; - ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace library diff --git a/tools/library/include/cutlass/library/manifest.h b/tools/library/include/cutlass/library/manifest.h index eaa90b3ac..54e51c1fd 100644 --- a/tools/library/include/cutlass/library/manifest.h +++ b/tools/library/include/cutlass/library/manifest.h @@ -45,6 +45,13 @@ namespace cutlass { namespace library { /////////////////////////////////////////////////////////////////////////////////////////////////// +// Forward declaration +class Manifest; + +// init and insert all cutlass gemm and conv2d op in manifest object (procedurally generated using generator.py) +void initialize_all(Manifest &manifest); + 
+///////////////////////////////////////////////////////////////////////////////////////////////////////// /// List of operations using OperationVector = std::vector>; diff --git a/tools/library/include/cutlass/library/operation_table.h b/tools/library/include/cutlass/library/operation_table.h index 80ce1e15e..3821f65ac 100644 --- a/tools/library/include/cutlass/library/operation_table.h +++ b/tools/library/include/cutlass/library/operation_table.h @@ -29,24 +29,28 @@ */ #pragma once - +#include #include #include #include #include "cutlass/library/library.h" #include "cutlass/library/manifest.h" - +#include "cutlass/library/util.h" ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { namespace library { +///////////////////////////////////////////////////////////////////////////////////////////////// +// Data Structures for Gemm Functional Maps ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Tuple uniquely identifying functional behavior +/// Tuple uniquely identifying Gemm functional behavior struct GemmFunctionalKey { + Provider provider; + GemmKind gemm_kind; NumericTypeID element_compute; NumericTypeID element_scalar; NumericTypeID element_A; @@ -63,6 +67,8 @@ struct GemmFunctionalKey { inline GemmFunctionalKey( + Provider provider, + GemmKind gemm_kind = GemmKind::kGemm, NumericTypeID element_compute = NumericTypeID::kF32, NumericTypeID element_scalar = NumericTypeID::kF32, NumericTypeID element_A = NumericTypeID::kF16, @@ -73,6 +79,8 @@ struct GemmFunctionalKey { ComplexTransform transform_B = ComplexTransform::kNone, NumericTypeID element_C = NumericTypeID::kF16 ): + provider(provider), + gemm_kind(gemm_kind), element_compute(element_compute), element_scalar(element_scalar), element_A(element_A), @@ -87,6 +95,8 @@ struct GemmFunctionalKey { inline bool operator==(GemmFunctionalKey const &rhs) const { return + (provider == rhs.provider) && + 
(gemm_kind == rhs.gemm_kind) && (element_compute == rhs.element_compute) && (element_scalar == rhs.element_scalar) && (element_A == rhs.element_A) && @@ -104,6 +114,28 @@ struct GemmFunctionalKey { } }; + +///////////////////////////////////////////////////////////////////////////////////////////////// +inline +std::ostream & operator<<(std::ostream &out, cutlass::library::GemmFunctionalKey const &k) { + + out << "{\n" + << " provider: " << to_string(k.provider) << "\n" + << " gemm_kind: " << to_string(k.gemm_kind) << "\n" + << " element_compute: " << to_string(k.element_compute) << "\n" + << " element_scalar: " << to_string(k.element_scalar) << "\n" + << " element_A: " << to_string(k.element_A) << "\n" + << " layout_A: " << to_string(k.layout_A) << "\n" + << " transform_A: " << to_string(k.transform_A) << "\n" + << " element_B: " << to_string(k.element_B) << "\n" + << " layout_B: " << to_string(k.layout_B) << "\n" + << " transform_B: " << to_string(k.transform_B) << "\n" + << " element_C: " << to_string(k.element_C) << "\n" + << "}"; + + return out; +} + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Hash function for GemmFunctionalKey @@ -120,15 +152,17 @@ struct GemmFunctionalKeyHasher { IntHash hash; return - rotl(hash(int(key.element_compute)), 2) ^ - rotl(hash(int(key.element_scalar)), 3) ^ - rotl(hash(int(key.element_A)), 4) ^ - rotl(hash(int(key.layout_A)), 5) ^ - rotl(hash(int(key.transform_A)), 6) ^ - rotl(hash(int(key.element_B)), 7) ^ - rotl(hash(int(key.layout_B)), 8) ^ - rotl(hash(int(key.transform_B)), 9) ^ - rotl(hash(int(key.element_C)), 10); + rotl(hash(int(key.provider)), 1) ^ + rotl(hash(int(key.gemm_kind)), 2) ^ + rotl(hash(int(key.element_compute)), 3) ^ + rotl(hash(int(key.element_scalar)), 4) ^ + rotl(hash(int(key.element_A)), 5) ^ + rotl(hash(int(key.layout_A)), 6) ^ + rotl(hash(int(key.transform_A)), 7) ^ + rotl(hash(int(key.element_B)), 8) ^ + rotl(hash(int(key.layout_B)), 9) ^ + 
rotl(hash(int(key.transform_B)), 10) ^ + rotl(hash(int(key.element_C)), 11); } }; @@ -172,6 +206,7 @@ using GemmOperationFunctionalMap = std::unordered_map< GemmOperationVectorMap, GemmFunctionalKeyHasher >; +///////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -179,15 +214,10 @@ using GemmOperationFunctionalMap = std::unordered_map< class OperationTable { public: - /// Map of all operations of type kGemm and gemm_kind of type kGemm + /// Map of all operations of type kGemm + // provider (kCUTLASS) GemmOperationFunctionalMap gemm_operations; - /// Map of all operations of type kGemm and gemm_kind of type kPlanarComplex - GemmOperationFunctionalMap gemm_planar_complex_operations; - - /// Map of all operations of type kGemm and gemm_kind of type kPlanarComplexArray - GemmOperationFunctionalMap gemm_planar_complex_array_operations; - public: void append(Manifest const &manifest); @@ -202,4 +232,3 @@ public: ///////////////////////////////////////////////////////////////////////////////////////////////// std::ostream & operator<<(std::ostream &out, cutlass::library::GemmFunctionalKey const &k); - diff --git a/tools/library/include/cutlass/library/util.h b/tools/library/include/cutlass/library/util.h index 5ff678e88..526f836b2 100644 --- a/tools/library/include/cutlass/library/util.h +++ b/tools/library/include/cutlass/library/util.h @@ -49,6 +49,9 @@ char const *to_string(Provider provider, bool pretty = false); /// Parses a Provider enumerant from a string template <> Provider from_string(std::string const &str); +/// Converts a GemmKind enumerant to a string +char const *to_string(GemmKind type, bool pretty = false); + /// Converts a NumericType enumerant to a string char const *to_string(OperationKind type, bool pretty = false); @@ -111,6 +114,14 @@ char const *to_string(ComplexTransform type, bool pretty = false); 
template <> ComplexTransform from_string(std::string const &str); + +/// Converts a SplitKMode enumerant to a string +char const *to_string(SplitKMode split_k_mode, bool pretty = false); + +/// Converts a SplitKMode enumerant from a string +template <> +SplitKMode from_string(std::string const &str); + /// Lexical cast from int64_t to string std::string lexical_cast(int64_t int_value); diff --git a/tools/library/scripts/gemm_operation.py b/tools/library/scripts/gemm_operation.py index cc7d35d25..66ecc05e6 100644 --- a/tools/library/scripts/gemm_operation.py +++ b/tools/library/scripts/gemm_operation.py @@ -23,7 +23,7 @@ from library import * class GemmOperation: # def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \ - epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Cohort): + epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8): self.operation_kind = OperationKind.Gemm self.arch = arch @@ -40,6 +40,7 @@ class GemmOperation: def is_complex(self): complex_operators = [ MathOperation.multiply_add_complex, + MathOperation.multiply_add_complex_gaussian ] return self.tile_description.math_instruction.math_operation in complex_operators @@ -58,6 +59,8 @@ class GemmOperation: # def short_math_name(self): + if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian: + return "g%s" % ShortDataTypeNames[self.accumulator_type()] return ShortDataTypeNames[self.accumulator_type()] @@ -259,6 +262,135 @@ class EmitGemmInstance: ################################################################################################### +# +class EmitGemmUniversalInstance: + ''' Responsible for emitting a CUTLASS template definition''' + + def __init__(self): + self.gemm_template = """ +// Gemm operator ${operation_name} +using ${operation_name}_base = + typename cutlass::gemm::kernel::DefaultGemmUniversal< + ${element_b}, 
${layout_b}, ${transform_b}, ${align_b}, // transposed B operand + ${element_a}, ${layout_a}, ${transform_a}, ${align_a}, // transposed A operand + ${element_c}, ${layout_c}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, + ${stages}, + ${math_operation} +>::GemmKernel; + +// Define named type +struct ${operation_name} : + public ${operation_name}_base { }; +""" + self.gemm_template_interleaved = """ +// Gemm operator ${operation_name} +using ${operation_name}_base = + typename cutlass::gemm::kernel::DefaultGemmUniversal< + ${element_a}, ${layout_a}, ${transform_a}, ${align_a}, + ${element_b}, ${layout_b}, ${transform_b}, ${align_b}, + ${element_c}, ${layout_c}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, + ${stages}, + ${math_operation} +>::GemmKernel; + +// Define named type +struct ${operation_name} : + public ${operation_name}_base { }; +""" + + def emit(self, operation): + + threadblock_shape = operation.tile_description.threadblock_shape + warp_count = operation.tile_description.warp_count + + warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)] + warp_shape[2] = 
operation.tile_description.threadblock_shape[2] + + epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element]) + + transpose_layouts = { + LayoutType.ColumnMajor: LayoutType.RowMajor, + LayoutType.RowMajor: LayoutType.ColumnMajor + } + + if operation.A.layout in transpose_layouts.keys() and \ + operation.B.layout in transpose_layouts.keys() and \ + operation.C.layout in transpose_layouts.keys(): + + instance_layout_A = transpose_layouts[operation.A.layout] + instance_layout_B = transpose_layouts[operation.B.layout] + instance_layout_C = transpose_layouts[operation.C.layout] + + gemm_template = self.gemm_template + else: + instance_layout_A, instance_layout_B, instance_layout_C = \ + (operation.A.layout, operation.B.layout, operation.C.layout) + + gemm_template = self.gemm_template_interleaved + # + + values = { + 'operation_name': operation.procedural_name(), + 'element_a': DataTypeTag[operation.A.element], + 'layout_a': LayoutTag[instance_layout_A], + 'element_b': DataTypeTag[operation.B.element], + 'layout_b': LayoutTag[instance_layout_B], + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': LayoutTag[instance_layout_C], + 'element_accumulator': DataTypeTag[operation.accumulator_type()], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 
'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'epilogue_vector_length': str(epilogue_vector_length), + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], + 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], + 'stages': str(operation.tile_description.stages), + 'align_a': str(operation.A.alignment), + 'align_b': str(operation.B.alignment), + 'transform_a': ComplexTransformTag[operation.A.complex_transform], + 'transform_b': ComplexTransformTag[operation.B.complex_transform], + 'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation] + } + + return SubstituteTemplate(gemm_template, values) + +################################################################################################### + # class EmitGemmPlanarComplexInstance: ''' Responsible for emitting a CUTLASS template definition''' @@ -282,12 +414,13 @@ class EmitGemmPlanarComplexInstance: ${element_accumulator}, ${element_epilogue} >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, ${stages}, ${math_operator} >::GemmKernel; - struct ${operation_name} : public Operation_${operation_name} { }; + struct ${operation_name} : + public Operation_${operation_name} { }; """ def emit(self, operation): @@ -355,7 +488,7 @@ class EmitGemmPlanarComplexArrayInstance: ${element_accumulator}, ${element_epilogue} >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, ${stages}, ${math_operator} >::GemmArrayKernel; @@ -419,12 +552,14 @@ class EmitGemmConfigurationLibrary: self.instance_emitter = { GemmKind.Gemm: EmitGemmInstance, + GemmKind.Universal: EmitGemmUniversalInstance, GemmKind.PlanarComplex: EmitGemmPlanarComplexInstance, GemmKind.PlanarComplexArray: 
EmitGemmPlanarComplexArrayInstance } self.gemm_kind_wrappers = { GemmKind.Gemm: 'GemmOperation', + GemmKind.Universal: 'GemmUniversalOperation', GemmKind.PlanarComplex: 'GemmPlanarComplexOperation', GemmKind.PlanarComplexArray: 'GemmPlanarComplexArrayOperation' } @@ -436,6 +571,13 @@ class EmitGemmConfigurationLibrary: ${compile_guard_start} manifest.append(new ${gemm_kind}("${operation_name}")); ${compile_guard_end} +""", + GemmKind.Universal: """ +${compile_guard_start} + manifest.append(new ${gemm_kind}< + cutlass::gemm::device::GemmUniversalAdapter<${operation_name}> + >("${operation_name}")); +${compile_guard_end} """, GemmKind.PlanarComplex: """ ${compile_guard_start} @@ -542,3 +684,4 @@ void initialize_${configuration_name}(Manifest &manifest) { ################################################################################################### ################################################################################################### + diff --git a/tools/library/scripts/generator.py b/tools/library/scripts/generator.py index 4b1a483e6..295786456 100644 --- a/tools/library/scripts/generator.py +++ b/tools/library/scripts/generator.py @@ -18,7 +18,7 @@ from gemm_operation import * def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0): # by default, use the latest CUDA Toolkit version - cuda_version = [10, 2, 82] + cuda_version = [11, 0, 132] # Update cuda_version based on parsed string if semantic_ver_string != '': @@ -36,7 +36,7 @@ def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0): # def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \ alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \ - swizzling_functor = SwizzlingFunctor.Cohort): + swizzling_functor = SwizzlingFunctor.Identity8): if complex_transforms is None: complex_transforms = [(ComplexTransform.none, ComplexTransform.none),] @@ -61,7 +61,7 @@ def 
CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \ B = TensorDescription(element_b, layout[1], alignment, complex_transform[1]) C = TensorDescription(element_c, layout[2], alignment_c) - new_operation = GemmOperation(GemmKind.Gemm, tile_description.minimum_compute_capability, \ + new_operation = GemmOperation(GemmKind.Universal, tile_description.minimum_compute_capability, \ tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor) manifest.append(new_operation) @@ -466,6 +466,9 @@ def GenerateSM70_WmmaTensorOp_161616(manifest, args): def GenerateSM70(manifest, args): GenerateSM70_TensorOp_884(manifest, args) GenerateSM70_PlanarComplexTensorOp_884(manifest, args) + + # To limit build size, WMMA GEMMs are disabled for now. + # #GenerateSM70_WmmaTensorOp_161616(manifest, args) ################################################################################################### @@ -621,6 +624,11 @@ def GenerateSM75_TensorOp_8816_TN(manifest, args): DataType.s8, DataType.s8, DataType.s32, \ OpcodeClass.TensorOp, \ MathOperation.multiply_add_saturate), + MathInstruction( \ + [8, 8, 16], \ + DataType.u8, DataType.u8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), ] min_cc = 75 @@ -654,7 +662,7 @@ def GenerateSM75_TensorOp_8816_TN(manifest, args): data_type_mixed = [ math_inst.element_a, math_inst.element_b, - math_inst.element_a, + DataType.s8, DataType.f32, ] @@ -687,6 +695,11 @@ def GenerateSM75_TensorOp_8816_Interleaved(manifest, args): DataType.s8, DataType.s8, DataType.s32, \ OpcodeClass.TensorOp, \ MathOperation.multiply_add_saturate), + MathInstruction( \ + [8, 8, 16], \ + DataType.u8, DataType.u8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), ] min_cc = 75 @@ -712,8 +725,7 @@ def GenerateSM75_TensorOp_8816_Interleaved(manifest, args): ] operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ - data_type_mixed, alignment_constraints, 
None, EpilogueFunctor.LinearCombinationClamp, \ - SwizzlingFunctor.Identity) + data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) for op in operations: op.C.alignment = 8 @@ -736,6 +748,11 @@ def GenerateSM75_TensorOp_8832_TN(manifest, args): DataType.s4, DataType.s4, DataType.s32, \ OpcodeClass.TensorOp, \ MathOperation.multiply_add_saturate), + MathInstruction( \ + [8, 8, 32], \ + DataType.u4, DataType.u4, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), ] min_cc = 75 @@ -769,7 +786,7 @@ def GenerateSM75_TensorOp_8832_TN(manifest, args): data_type_mixed = [ math_inst.element_a, math_inst.element_b, - math_inst.element_a, + DataType.s4, DataType.f32, ] @@ -804,6 +821,11 @@ def GenerateSM75_TensorOp_8832_Interleaved(manifest, args): DataType.s4, DataType.s4, DataType.s32, \ OpcodeClass.TensorOp, \ MathOperation.multiply_add_saturate), + MathInstruction( \ + [8, 8, 32], \ + DataType.u4, DataType.u4, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), ] min_cc = 75 @@ -832,8 +854,7 @@ def GenerateSM75_TensorOp_8832_Interleaved(manifest, args): ] operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ - data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp, \ - SwizzlingFunctor.Identity) + data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) for op in operations: op.C.alignment = 16 @@ -911,6 +932,831 @@ def GenerateSM75(manifest, args): ################################################################################################### ################################################################################################### +# +def GenerateSM80_TensorOp_16816(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, 
LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 16], \ + DataType.f16, DataType.f16, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add), + MathInstruction( \ + [16, 8, 16], \ + DataType.f16, DataType.f16, DataType.f16, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add), + MathInstruction( \ + [16, 8, 16], \ + DataType.bf16, DataType.bf16, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [8, 4, 2] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 64], 3, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 3, [2, 1, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 64], 4, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 4, [2, 1, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 10, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 64], 4, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 64], 5, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc), + 
TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 64], 3, [1, 4, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_accumulator, + math_inst.element_accumulator, + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) + + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation) + if math_inst.element_a != math_inst.element_accumulator: + + data_type_mixed = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_a, + math_inst.element_accumulator, + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints) + +# + +# +def GenerateSM80_PlanarComplexTensorOp_16816(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + complex_transforms = [ + (ComplexTransform.none, ComplexTransform.none), + (ComplexTransform.conj, ComplexTransform.none), + (ComplexTransform.none, ComplexTransform.conj), + (ComplexTransform.conj, ComplexTransform.conj) + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 16], \ + DataType.f16, DataType.f16, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add), + MathInstruction( \ + [16, 8, 16], \ + DataType.bf16, DataType.bf16, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add), + MathInstruction( \ + [16, 8, 16], \ + DataType.f16, DataType.f16, DataType.f16, \ + OpcodeClass.TensorOp, \ + 
MathOperation.multiply_add), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [8, ] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([ 64, 128, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_accumulator, + math_inst.element_accumulator, + ] + + CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, complex_transforms) + + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation) + if math_inst.element_a != math_inst.element_accumulator: + + data_type_mixed = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_a, + math_inst.element_accumulator, + ] + + CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints, complex_transforms) + +# +def GenerateSM80_TensorOp_16832_TN(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 32], \ + DataType.s8, DataType.s8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + MathInstruction( \ + [16, 8, 32], \ + DataType.u8, DataType.u8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [16,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, 
min_cc, max_cc), + TileDescription([ 64, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([64, 256, 128], 3, [1, 4, 1], math_inst, min_cc, max_cc), + ] + + data_type = [math_inst.element_a, math_inst.element_b, DataType.s32, DataType.s32] + data_type_mixed = [math_inst.element_a, math_inst.element_b, DataType.s8, DataType.f32] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + + operations = [] + + operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + + for op in operations: + if op.tile_description.threadblock_shape[1] >= 128: + op.C.alignment = 16 + else: + op.C.alignment = 8 + +# + +# +def GenerateSM80_TensorOp_16832_Interleaved(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajorInterleaved32, LayoutType.RowMajorInterleaved32, LayoutType.ColumnMajorInterleaved32), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 32], \ + 
DataType.s8, DataType.s8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + MathInstruction( \ + [16, 8, 32], \ + DataType.u8, DataType.u8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [16,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type_mixed = [math_inst.element_a, math_inst.element_b, DataType.s8, DataType.f32] + + operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + + for op in operations: + op.C.alignment = 8 + +# + +# +def GenerateSM80_TensorOp_16864_TN(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 64], \ + DataType.s4, DataType.s4, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + MathInstruction( \ + [16, 8, 64], \ + DataType.u4, DataType.u4, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [32,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc), + 
TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 256], 5, [2, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [math_inst.element_a, math_inst.element_b, DataType.s32, DataType.s32] + data_type_mixed = [math_inst.element_a, math_inst.element_b, DataType.s4, DataType.f32] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + + operations = [] + + operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + + for op in operations: + if op.tile_description.threadblock_shape[1] >= 128: + op.C.alignment = 8 + elif op.tile_description.threadblock_shape[1] == 64: + op.C.alignment = 8 + else: + op.C.alignment = 4 +# + +# +def GenerateSM80_TensorOp_16864_Interleaved(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajorInterleaved64, LayoutType.RowMajorInterleaved64, LayoutType.ColumnMajorInterleaved64), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 64], \ + DataType.s4, DataType.s4, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + MathInstruction( \ + [16, 8, 64], \ + DataType.u4, DataType.u4, 
DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [32,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type_mixed = [math_inst.element_a, math_inst.element_b, DataType.s4, DataType.f32] + + operations = [] + + operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + + for op in operations: + op.C.alignment = 16 +# + +# +def GenerateSM80_TensorOp_168256(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 256], \ + DataType.b1, DataType.b1, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.xor_popc), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [128,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 512], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 512], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 512], 5, [2, 2, 1], math_inst, min_cc, max_cc), + 
TileDescription([256, 128, 1024], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 1024], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 1024], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 1024], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 1024], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 1024], 5, [2, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [DataType.b1, DataType.b1, DataType.s32, DataType.s32] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + +# + +# +def GenerateSM80_TensorOp_1688(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 8], \ + DataType.tf32, DataType.tf32, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add) + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [4, 2, 1] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 32], 
3, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 3, [2, 1, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 32], 4, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 4, [2, 1, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 4, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 5, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 32], 3, [1, 4, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_accumulator, + math_inst.element_accumulator, + ] + + data_type_mixed = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_a, + math_inst.element_accumulator, + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints) + +# + +# +def GenerateSM80_TensorOp_1688_fast_math(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 8], \ + DataType.tf32, DataType.tf32, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add), + MathInstruction( \ + [16, 8, 
8], \ + DataType.f16, DataType.f16, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_fast_f16), + MathInstruction( \ + [16, 8, 8], \ + DataType.bf16, DataType.bf16, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_fast_bf16) + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [4, 2, 1] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 32], 3, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 3, [2, 1, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 32], 4, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 4, [2, 1, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 4, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 5, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 32], 3, [1, 4, 1], math_inst, min_cc, max_cc), + ] + + data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, 
alignment_constraints) + +# + +# +def GenerateSM80_TensorOp_1688_complex(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_inst = MathInstruction( \ + [16, 8, 8], \ + DataType.f32, DataType.f32, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_complex) + + min_cc = 80 + max_cc = 1024 + + tile_descriptions = [ + TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + DataType.cf32, DataType.cf32, DataType.cf32, DataType.cf32 + ] + + alignment_constraints = [1,] + + complex_transforms = [ + (ComplexTransform.none, ComplexTransform.none), + (ComplexTransform.conj, ComplexTransform.none), + (ComplexTransform.none, ComplexTransform.conj), + (ComplexTransform.conj, ComplexTransform.conj) + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, complex_transforms) +# + +# +def GenerateSM80_TensorOp_884(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + 
(LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_inst = \ + MathInstruction( \ + [8, 8, 4], \ + DataType.f64, DataType.f64, DataType.f64, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add) + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [1,] + + tile_descriptions = [ + TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc), + ] + + data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) +# + +# +def GenerateSM80_TensorOp_884_complex(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_inst = \ + MathInstruction( \ + [8, 8, 4], \ + DataType.f64, DataType.f64, DataType.f64, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_complex) + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [1,] + + tile_descriptions = [ + TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 128, 8], 3, [2, 4, 1], 
math_inst, min_cc, max_cc), + TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc), + ] + + data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64] + + complex_transforms = [ + (ComplexTransform.none, ComplexTransform.none), + (ComplexTransform.conj, ComplexTransform.none), + (ComplexTransform.none, ComplexTransform.conj), + (ComplexTransform.conj, ComplexTransform.conj) + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, complex_transforms) + +# +def GenerateSM80_TensorOp_884_complex_gaussian(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_inst = \ + MathInstruction( \ + [8, 8, 4], \ + DataType.f64, DataType.f64, DataType.f64, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_complex_gaussian) + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [1,] + + tile_descriptions = [ + TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc), + 
TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc), + ] + + data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64] + + complex_transforms = [ + (ComplexTransform.none, ComplexTransform.none), + (ComplexTransform.conj, ComplexTransform.none), + (ComplexTransform.none, ComplexTransform.conj), + (ComplexTransform.conj, ComplexTransform.conj) + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, complex_transforms) +# + +################################################################################################### + +# +def GenerateSM80_Simt(manifest, args): + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [1, 1, 1], \ + DataType.f32, DataType.f32, DataType.f32, \ + OpcodeClass.Simt, \ + MathOperation.multiply_add), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [1,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 8], 5, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 8], 4, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 8], 4, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 8], 4, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 8], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 8], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([128, 32, 8], 5, [2, 1, 1], math_inst, min_cc, 
max_cc), + TileDescription([ 32, 128, 8], 5, [1, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_accumulator, + math_inst.element_accumulator, + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) +# + +################################################################################################### + +# +def GenerateSM80(manifest, args): + + GenerateSM80_TensorOp_16816(manifest, args) + GenerateSM80_PlanarComplexTensorOp_16816(manifest, args) + GenerateSM80_TensorOp_1688(manifest, args) + GenerateSM80_TensorOp_1688_fast_math(manifest, args) + GenerateSM80_TensorOp_1688_complex(manifest, args) + GenerateSM80_TensorOp_884(manifest, args) + GenerateSM80_TensorOp_884_complex(manifest, args) + GenerateSM80_TensorOp_884_complex_gaussian(manifest, args) + GenerateSM80_TensorOp_16832_TN(manifest, args) + GenerateSM80_TensorOp_16832_Interleaved(manifest, args) + GenerateSM80_TensorOp_16864_TN(manifest, args) + GenerateSM80_TensorOp_16864_Interleaved(manifest, args) + GenerateSM80_TensorOp_168256(manifest, args) + GenerateSM80_Simt(manifest, args) +# + ################################################################################################### if __name__ == "__main__": @@ -920,7 +1766,7 @@ if __name__ == "__main__": parser.add_argument("--build-dir", default=".", required=False, help="CUTLASS top-level build directory") parser.add_argument("--curr-build-dir", default=".", help="CUTLASS current build directory. 
cmake files will be emitted in this directory") parser.add_argument("--generator-target", default='library', help="Target of CUTLASS Library Generator.") - parser.add_argument("--architectures", default='50;60;61;75', help="Target compute architectures") + parser.add_argument("--architectures", default='53;60;61;70;75;80', help="Target compute architectures") parser.add_argument("--kernels", default='', help='Comma delimited list to filter kernels by name.') parser.add_argument("--cuda-version", default="11.0.0", help="Semantic version string of CUDA Toolkit") @@ -933,6 +1779,8 @@ if __name__ == "__main__": GenerateSM61(manifest, args) GenerateSM70(manifest, args) GenerateSM75(manifest, args) + GenerateSM80(manifest, args) + if 'library' in args.generator_target.split(','): manifest.emit(GeneratorTarget.Library) diff --git a/tools/library/scripts/library.py b/tools/library/scripts/library.py index 71f521e6e..bdc434830 100644 --- a/tools/library/scripts/library.py +++ b/tools/library/scripts/library.py @@ -4,14 +4,32 @@ # \brief Generates the CUTLASS Library's instances # -import enum import re ################################################################################################### +import enum + +# The following block implements enum.auto() for Python 3.5 variants that don't include it such +# as the default 3.5.2 on Ubuntu 16.04. 
+# +# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility + +try: + from enum import auto as enum_auto +except ImportError: + __cutlass_library_auto_enum = 0 + def enum_auto() -> int: + global __cutlass_library_auto_enum + i = __cutlass_library_auto_enum + __cutlass_library_auto_enum += 1 + return i + +################################################################################################### + # class GeneratorTarget(enum.Enum): - Library = enum.auto() + Library = enum_auto() # GeneratorTargetNames = { GeneratorTarget.Library: 'library' @@ -22,33 +40,37 @@ GeneratorTargetNames = { # class DataType(enum.Enum): - b1 = enum.auto() - u4 = enum.auto() - u8 = enum.auto() - u16 = enum.auto() - u32 = enum.auto() - u64 = enum.auto() - s4 = enum.auto() - s8 = enum.auto() - s16 = enum.auto() - s32 = enum.auto() - s64 = enum.auto() - f16 = enum.auto() - f32 = enum.auto() - f64 = enum.auto() - cf16 = enum.auto() - cf32 = enum.auto() - cf64 = enum.auto() - cs4 = enum.auto() - cs8 = enum.auto() - cs16 = enum.auto() - cs32 = enum.auto() - cs64 = enum.auto() - cu4 = enum.auto() - cu8 = enum.auto() - cu16 = enum.auto() - cu32 = enum.auto() - cu64 = enum.auto() + b1 = enum_auto() + u4 = enum_auto() + u8 = enum_auto() + u16 = enum_auto() + u32 = enum_auto() + u64 = enum_auto() + s4 = enum_auto() + s8 = enum_auto() + s16 = enum_auto() + s32 = enum_auto() + s64 = enum_auto() + f16 = enum_auto() + bf16 = enum_auto() + f32 = enum_auto() + tf32 = enum_auto() + f64 = enum_auto() + cf16 = enum_auto() + cbf16 = enum_auto() + cf32 = enum_auto() + ctf32 = enum_auto() + cf64 = enum_auto() + cs4 = enum_auto() + cs8 = enum_auto() + cs16 = enum_auto() + cs32 = enum_auto() + cs64 = enum_auto() + cu4 = enum_auto() + cu8 = enum_auto() + cu16 = enum_auto() + cu32 = enum_auto() + cu64 = enum_auto() # ShortDataTypeNames = { @@ -74,10 +96,14 @@ DataTypeNames = { DataType.s32: "s32", DataType.s64: "s64", DataType.f16: "f16", + DataType.bf16: 
"bf16", DataType.f32: "f32", + DataType.tf32: "tf32", DataType.f64: "f64", DataType.cf16: "cf16", + DataType.cbf16: "cbf16", DataType.cf32: "cf32", + DataType.ctf32: "ctf32", DataType.cf64: "cf64", DataType.cu4: "cu4", DataType.cu8: "cu8", @@ -104,10 +130,14 @@ DataTypeTag = { DataType.s32: "int32_t", DataType.s64: "int64_t", DataType.f16: "cutlass::half_t", + DataType.bf16: "cutlass::bfloat16_t", DataType.f32: "float", + DataType.tf32: "cutlass::tfloat32_t", DataType.f64: "double", DataType.cf16: "cutlass::complex", + DataType.cbf16: "cutlass::complex", DataType.cf32: "cutlass::complex", + DataType.ctf32: "cutlass::complex", DataType.cf64: "cutlass::complex", DataType.cu4: "cutlass::complex", DataType.cu8: "cutlass::complex", @@ -134,10 +164,14 @@ DataTypeSize = { DataType.s32: 32, DataType.s64: 64, DataType.f16: 16, + DataType.bf16: 16, DataType.f32: 32, + DataType.tf32: 32, DataType.f64: 64, DataType.cf16: 32, + DataType.cbf16: 32, DataType.cf32: 64, + DataType.ctf32: 32, DataType.cf64: 128, DataType.cu4: 8, DataType.cu8: 16, @@ -155,8 +189,8 @@ DataTypeSize = { # class ComplexTransform(enum.Enum): - none = enum.auto() - conj = enum.auto() + none = enum_auto() + conj = enum_auto() # ComplexTransformTag = { @@ -194,40 +228,47 @@ def get_real_from_complex(complex_type): # class ComplexMultiplyOp(enum.Enum): - multiply_add = enum.auto() - gaussian = enum.auto() + multiply_add = enum_auto() + gaussian = enum_auto() ################################################################################################### # class MathOperation(enum.Enum): - multiply_add = enum.auto() - multiply_add_saturate = enum.auto() - xor_popc = enum.auto() - multiply_add_complex = enum.auto() + multiply_add = enum_auto() + multiply_add_saturate = enum_auto() + xor_popc = enum_auto() + multiply_add_fast_bf16 = enum_auto() + multiply_add_fast_f16 = enum_auto() + multiply_add_complex = enum_auto() + multiply_add_complex_gaussian = enum_auto() + # MathOperationTag = { 
MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd', MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate', MathOperation.xor_popc: 'cutlass::arch::OpXorPopc', + MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16', + MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16', MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex', + MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex', } ################################################################################################### # class LayoutType(enum.Enum): - ColumnMajor = enum.auto() - RowMajor = enum.auto() - ColumnMajorInterleaved32 = enum.auto() - RowMajorInterleaved32 = enum.auto() - ColumnMajorInterleaved64 = enum.auto() - RowMajorInterleaved64 = enum.auto() - TensorNHWC = enum.auto() - TensorNCHW = enum.auto() - TensorNGHWC = enum.auto() - TensorNCxHW32 = enum.auto() - TensorNCxHW64 = enum.auto() + ColumnMajor = enum_auto() + RowMajor = enum_auto() + ColumnMajorInterleaved32 = enum_auto() + RowMajorInterleaved32 = enum_auto() + ColumnMajorInterleaved64 = enum_auto() + RowMajorInterleaved64 = enum_auto() + TensorNHWC = enum_auto() + TensorNCHW = enum_auto() + TensorNGHWC = enum_auto() + TensorNCxHW32 = enum_auto() + TensorNCxHW64 = enum_auto() # LayoutTag = { @@ -282,9 +323,9 @@ ShortComplexLayoutNames = { # class OpcodeClass(enum.Enum): - Simt = enum.auto() - TensorOp = enum.auto() - WmmaTensorOp = enum.auto() + Simt = enum_auto() + TensorOp = enum_auto() + WmmaTensorOp = enum_auto() OpcodeClassNames = { OpcodeClass.Simt: 'simt', @@ -302,7 +343,7 @@ OpcodeClassTag = { # class OperationKind(enum.Enum): - Gemm = enum.auto() + Gemm = enum_auto() # OperationKindNames = { OperationKind.Gemm: 'gemm' @@ -310,7 +351,7 @@ OperationKindNames = { # class Target(enum.Enum): - library = enum.auto() + library = enum_auto() ArchitectureNames = { 50: 'maxwell', @@ -318,6 +359,7 @@ 
ArchitectureNames = { 61: 'pascal', 70: 'volta', 75: 'turing', + 80: 'ampere', } ################################################################################################### @@ -340,27 +382,27 @@ def SubstituteTemplate(template, values): # class GemmKind(enum.Enum): - Gemm = enum.auto() - Batched = enum.auto() - Array = enum.auto() - Universal = enum.auto() - PlanarComplex = enum.auto() - PlanarComplexArray = enum.auto() + Gemm = enum_auto() + Batched = enum_auto() + Array = enum_auto() + Universal = enum_auto() + PlanarComplex = enum_auto() + PlanarComplexArray = enum_auto() # GemmKindNames = { GemmKind.Gemm: "gemm", GemmKind.Batched: "gemm_batched", GemmKind.Array: "gemm_array", - GemmKind.Universal: "gemm_universal", + GemmKind.Universal: "gemm", GemmKind.PlanarComplex: "gemm_planar_complex", GemmKind.PlanarComplexArray: "gemm_planar_complex_array", } # class EpilogueFunctor(enum.Enum): - LinearCombination = enum.auto() - LinearCombinationClamp = enum.auto() + LinearCombination = enum_auto() + LinearCombinationClamp = enum_auto() # EpilogueFunctorTag = { @@ -370,13 +412,17 @@ EpilogueFunctorTag = { # class SwizzlingFunctor(enum.Enum): - Cohort = enum.auto() - Identity = enum.auto() + Identity1 = enum_auto() + Identity2 = enum_auto() + Identity4 = enum_auto() + Identity8 = enum_auto() # SwizzlingFunctorTag = { - SwizzlingFunctor.Cohort: 'cutlass::gemm::threadblock::GemmCohortThreadblockSwizzle<${layout_a}, ${layout_b}>', - SwizzlingFunctor.Identity: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle', + SwizzlingFunctor.Identity1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>', + SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>', + SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>', + SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>', } 
################################################################################################### diff --git a/tools/library/scripts/manifest.py b/tools/library/scripts/manifest.py index 38182a1bb..756ddc726 100644 --- a/tools/library/scripts/manifest.py +++ b/tools/library/scripts/manifest.py @@ -127,7 +127,7 @@ class Manifest: if args.kernels == 'all': self.kernel_names = [] else: - self.kernel_names = args.kernels.split(',') + self.kernel_names = [x for x in args.kernels.split(',') if x != ''] self.operation_count = 0 self.operations_by_name = {} diff --git a/tools/library/src/gemm_operation.h b/tools/library/src/gemm_operation.h index 102c549a0..23781b25e 100644 --- a/tools/library/src/gemm_operation.h +++ b/tools/library/src/gemm_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -29,13 +29,14 @@ #pragma once #include "cutlass/cutlass.h" -#include "cutlass/gemm/kernel/default_gemm_planar_complex_universal.h" #include "cutlass/gemm/device/gemm.h" #include "cutlass/gemm/device/gemm_complex.h" #include "cutlass/gemm/device/gemm_batched.h" #include "cutlass/gemm/device/gemm_array.h" #include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/default_gemm_universal.h" +#include "cutlass/gemm/kernel/default_gemm_planar_complex_universal.h" #include "cutlass/library/library.h" #include "library_internal.h" @@ -104,10 +105,10 @@ public: MathOperationMap::kId; description_.tile_description.minimum_compute_capability = - ArchMap::kMin; + ArchMap::kMin; description_.tile_description.maximum_compute_capability = - ArchMap::kMax; + ArchMap::kMax; description_.A = make_TensorDescription(Operator::kAlignmentA); description_.B = make_TensorDescription(Operator::kAlignmentB); @@ -698,6 +699,201 @@ public: } }; +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class GemmUniversalOperation : public GemmOperationBase { +public: + + using Operator = Operator_; + using ElementA = typename Operator::ElementA; + using LayoutA = typename Operator::LayoutA; + using ElementB = typename Operator::ElementB; + using LayoutB = typename Operator::LayoutB; + using ElementC = typename Operator::ElementC; + using LayoutC = typename Operator::LayoutC; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; + + using OperatorArguments = typename Operator::Arguments; + +public: + + /// Constructor + GemmUniversalOperation(char const *name = "unknown_gemm"): + GemmOperationBase(name) { + + this->description_.gemm_kind = GemmKind::kUniversal; + } + 
+protected: + + /// Constructs the arguments structure given the configuration and arguments + static Status construct_arguments_( + OperatorArguments &operator_args, + GemmUniversalConfiguration const *configuration) { + + operator_args.mode = configuration->mode; + + operator_args.problem_size = configuration->problem_size; + operator_args.batch_count = configuration->batch_count; + + operator_args.lda = int(configuration->lda); + operator_args.ldb = int(configuration->ldb); + operator_args.ldc = int(configuration->ldc); + operator_args.ldd = int(configuration->ldd); + + return Status::kSuccess; + } + + /// Constructs the arguments structure given the configuration and arguments + static Status update_arguments_( + OperatorArguments &operator_args, + GemmUniversalArguments const *arguments) { + + if (arguments->pointer_mode == ScalarPointerMode::kHost) { + typename Operator::EpilogueOutputOp::Params params( + *static_cast(arguments->alpha), + *static_cast(arguments->beta) + ); + operator_args.epilogue = params; + } + else if (arguments->pointer_mode == ScalarPointerMode::kDevice){ + typename Operator::EpilogueOutputOp::Params params( + static_cast(arguments->alpha), + static_cast(arguments->beta) + ); + operator_args.epilogue = params; + } + else { + return Status::kErrorInvalidProblem; + } + + // update arguments + operator_args.ptr_A = arguments->A; + operator_args.ptr_B = arguments->B; + operator_args.ptr_C = arguments->C; + operator_args.ptr_D = arguments->D; + + operator_args.batch_stride_A = arguments->batch_stride_A; + operator_args.batch_stride_B = arguments->batch_stride_B; + operator_args.batch_stride_C = arguments->batch_stride_C; + operator_args.batch_stride_D = arguments->batch_stride_D; + + return Status::kSuccess; + } + +public: + + /// Returns success if the operation can proceed + virtual Status can_implement( + void const *configuration_ptr, + void const *arguments_ptr) const { + + GemmUniversalConfiguration const *configuration = + 
static_cast(configuration_ptr); + + GemmUniversalArguments const *arguments = + static_cast(arguments_ptr); + + OperatorArguments args; + + Status status = construct_arguments_(args, configuration); + + if (status != Status::kSuccess) { + return status; + } + + status = update_arguments_(args, arguments); + + if (status != Status::kSuccess) { + return status; + } + + return Operator::can_implement(args); + } + + /// Gets the host-side workspace + virtual uint64_t get_host_workspace_size( + void const *configuration) const { + + return sizeof(Operator); + } + + /// Gets the device-side workspace + virtual uint64_t get_device_workspace_size( + void const *configuration_ptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return 0; + } + + uint64_t size = Operator::get_workspace_size(args); + + return size; + } + + /// Initializes the workspace + virtual Status initialize( + void const *configuration_ptr, + void *host_workspace, + void *device_workspace, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = new (host_workspace) Operator; + + status = op->initialize(args, device_workspace, stream); + + return status; + } + + /// Runs the kernel + virtual Status run( + void const *arguments_ptr, + void *host_workspace, + void *device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = update_arguments_( + args, + static_cast(arguments_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = static_cast(host_workspace); + + status = op->update(args, device_workspace); + + if (status != Status::kSuccess) { + return status; + } + + status = op->run(stream); + + return status; + } +}; + 
/////////////////////////////////////////////////////////////////////////////////////////////////// template diff --git a/tools/library/src/handle.cu b/tools/library/src/handle.cu index b2345932c..bdddf2d7c 100644 --- a/tools/library/src/handle.cu +++ b/tools/library/src/handle.cu @@ -26,7 +26,7 @@ /*! \file \brief CUTLASS Library handle. */ - +#include #include #include @@ -43,7 +43,8 @@ namespace library { Handle::Handle( cudaStream_t stream, size_t workspace_size -): +): + provider_(Provider::kCUTLASS), stream_(stream), workspace_(nullptr), workspace_size_(0), @@ -95,6 +96,7 @@ Handle::Handle(Handle && handle) { /// Move assignment operator Handle & Handle::operator=(Handle && handle) { + provider_ = handle.provider_; device_ = handle.device_; workspace_size_ = handle.workspace_size_; workspace_ = handle.workspace_; @@ -121,6 +123,16 @@ cudaStream_t Handle::get_stream() const { return stream_; } +/// Gets the current provider +Provider Handle::get_provider() const { + return provider_; +} + +/// Sets the provider of operations +void Handle::set_provider(Provider provider) { + provider_ = provider; +} + /// Gets the device workspace size size_t Handle::get_workspace_size() const { return workspace_size_; @@ -351,6 +363,8 @@ Status Handle::gemm( // GemmFunctionalKey key( + provider_, + GemmKind::kGemm, element_compute, element_scalar, element_A, @@ -457,6 +471,188 @@ Status Handle::gemm( /////////////////////////////////////////////////////////////////////////////////////////////////// +/// Executes a GEMM computation: D <= alpha * A*B + beta * C. +// +// Supports batched-strided, batched array or split-K serial or split-K parallel. 
+// +Status Handle::gemm_universal( + + GemmUniversalMode mode, /// indicates the mode in which the kUniversal GEMM is launched + + int M, /// GEMM M dimension + int N, /// GEMM N dimension + int K, /// GEMM K dimension + + NumericTypeID element_compute, /// Data type of internal accumulation + + NumericTypeID element_scalar, /// Data type of alpha/beta scalars + + void const *alpha, /// Pointer to alpha scalar + + NumericTypeID element_A, /// Data type of A matrix elements + LayoutTypeID layout_A, /// Layout of A matrix + ComplexTransform transform_A, /// Complex transformation applied to A matrix - ignored for real-valued matrices + + void const * ptr_A, /// Pointer to A matrix in Global Memory + int lda, /// Leading dimension of A matrix + + NumericTypeID element_B, /// Data type of B matrix elements + LayoutTypeID layout_B, /// Layout of B matrix + ComplexTransform transform_B, /// Complex transformation applied to B matrix - ignored for real-valued matrices + + void const * ptr_B, /// Pointer to B matrix in Global Memory + int ldb, /// Leading dimension of B matrix + + void const * beta, /// Pointer to beta scalar + + NumericTypeID element_C, /// Data type of C and D matrices + + void const * ptr_C, /// Pointer to C matrix + int ldc, /// Leading dimension of C matrix + + void * ptr_D, /// Pointer to D matrix + int ldd, /// Leading dimension of D matrix + + int batch_count, /// Batch count or number of split-K slices + + int64_t batch_stride_A, /// Batch stride of A operand + int64_t batch_stride_B, /// Batch stride of B operand + int64_t batch_stride_C, /// Batch stride of C operand + int64_t batch_stride_D /// Batch stride of D operand +) { + + // + // Find the operation + // + + GemmFunctionalKey key( + provider_, + GemmKind::kUniversal, + element_compute, + element_scalar, + element_A, + layout_A, + transform_A, + element_B, + layout_B, + transform_B, + element_C + ); + + auto operators_it = Singleton::get().operation_table.gemm_operations.find(key); + + if 
(operators_it == Singleton::get().operation_table.gemm_operations.end()) { + return cutlass::Status::kErrorNotSupported; + } + + if (operators_it->second.empty()) { + return cutlass::Status::kErrorNotSupported; + } + + // + // Compute the largest alignment restriction the kernel can satisfy. + // + + // Maximum alignment expectation among all kernels (in units of bytes) + int const kMaximumAlignmentSize = 16; + + void const *ptr_A_check = ptr_A; + void const *ptr_B_check = ptr_B; + void const *ptr_C_check = ptr_C; + void * ptr_D_check = ptr_D; + + // Ignore alignment of pointers to pointers. We can't check this from the host, + // as each batch index has its own pointer in device memory. + if (mode == GemmUniversalMode::kArray) { + ptr_A_check = nullptr; + ptr_B_check = nullptr; + ptr_C_check = nullptr; + ptr_D_check = nullptr; + } + + int alignment = gemm_problem_alignment( + M, N, K, + element_A, ptr_A_check, lda, 0, + element_B, ptr_B_check, ldb, 0, + element_C, ptr_C_check, ldc, 0, + ptr_D_check, ldd, 0, kMaximumAlignmentSize + ); + + // + // Find the best kernel in descending order of preference. 
+ // + + GemmPreferenceKey preference_key(compute_capability(), alignment); + + Operation const *operation = find_gemm_operation(operators_it, preference_key); + + if (!operation) { + return cutlass::Status::kErrorNotSupported; + } + + last_operation_ = operation; + + // + // Configure operation + // + + GemmUniversalConfiguration configuration{ + mode, + {M, N, K}, + batch_count, + lda, + ldb, + ldc, + ldd + }; + + // Query host work space size + uint64_t host_workspace_size_needed = operation->get_host_workspace_size(&configuration); + + if (uint64_t(kHostWorkspaceSize) < host_workspace_size_needed) { + return cutlass::Status::kErrorNotSupported; + } + + char host_workspace[kHostWorkspaceSize]; + + // Query device workspace size + uint64_t device_workspace_size_needed = operation->get_device_workspace_size(&configuration); + + if (uint64_t(workspace_size_) < device_workspace_size_needed) { + return cutlass::Status::kErrorNotSupported; + } + + // Initialize host and device workspaces + Status status = operation->initialize( + &configuration, + host_workspace, + workspace_, + stream_); + + if (status != cutlass::Status::kSuccess) { + return status; + } + + // Run the operator + GemmUniversalArguments arguments{ + ptr_A, + ptr_B, + ptr_C, + ptr_D, + alpha, + beta, + scalar_pointer_mode_, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + }; + + return operation->run(&arguments, host_workspace, workspace_, stream_); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + /// Planar complex GEMM Status Handle::gemm_planar_complex( @@ -522,6 +718,8 @@ Status Handle::gemm_planar_complex( // GemmFunctionalKey key( + provider_, + GemmKind::kPlanarComplex, element_compute, element_scalar, element_A, @@ -533,9 +731,9 @@ Status Handle::gemm_planar_complex( element_C ); - auto operators_it = Singleton::get().operation_table.gemm_planar_complex_operations.find(key); + auto operators_it = 
Singleton::get().operation_table.gemm_operations.find(key); - if (operators_it == Singleton::get().operation_table.gemm_planar_complex_operations.end()) { + if (operators_it == Singleton::get().operation_table.gemm_operations.end()) { return cutlass::Status::kErrorNotSupported; } @@ -714,6 +912,8 @@ Status Handle::gemm_planar_complex_array( // GemmFunctionalKey key( + provider_, + GemmKind::kPlanarComplexArray, element_compute, element_scalar, element_A, @@ -725,9 +925,9 @@ Status Handle::gemm_planar_complex_array( element_C ); - auto operators_it = Singleton::get().operation_table.gemm_planar_complex_array_operations.find(key); + auto operators_it = Singleton::get().operation_table.gemm_operations.find(key); - if (operators_it == Singleton::get().operation_table.gemm_planar_complex_array_operations.end()) { + if (operators_it == Singleton::get().operation_table.gemm_operations.end()) { return cutlass::Status::kErrorNotSupported; } @@ -837,7 +1037,6 @@ Status Handle::gemm_planar_complex_array( } ///////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace library } // namespace cutlass diff --git a/tools/library/src/library_internal.h b/tools/library/src/library_internal.h index 252d474e2..73847b117 100644 --- a/tools/library/src/library_internal.h +++ b/tools/library/src/library_internal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -125,6 +125,14 @@ template <> struct NumericTypeMap > { static NumericTypeID const kId = NumericTypeID::kCF64; }; +template <> struct NumericTypeMap { + static NumericTypeID const kId = NumericTypeID::kBF16; +}; + +template <> struct NumericTypeMap { + static NumericTypeID const kId = NumericTypeID::kTF32; +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// template struct MathOperationMap { @@ -143,6 +151,10 @@ template <> struct MathOperationMap { static MathOperationID const kId = MathOperationID::kMultiplyAddComplex; }; +template <> struct MathOperationMap { + static MathOperationID const kId = MathOperationID::kMultiplyAddGaussianComplex; +}; + template <> struct MathOperationMap { static MathOperationID const kId = MathOperationID::kXorPopc; }; @@ -217,33 +229,43 @@ template <> struct ComplexTransformMap { ///////////////////////////////////////////////////////////////////////////////////////////////// -template struct ArchMap; +template struct ArchMap; -template <> struct ArchMap { +template <> struct ArchMap { static int const kMin = 50; static int const kMax = 1024; }; -template <> struct ArchMap { +template <> struct ArchMap { static int const kMin = 60; static int const kMax = 1024; }; -template <> struct ArchMap { +template <> struct ArchMap { static int const kMin = 61; static int const kMax = 1024; }; -template <> struct ArchMap { +template <> struct ArchMap { + static int const kMin = 70; + static int const kMax = 1024; +}; + +template <> struct ArchMap { static int const kMin = 70; static int const kMax = 75; }; -template <> struct ArchMap { +template struct ArchMap { static int const kMin = 75; static int const kMax = 1024; }; +template struct ArchMap { + static int const kMin = 80; + static int const kMax = 1024; +}; + 
///////////////////////////////////////////////////////////////////////////////////////////////// template diff --git a/tools/library/src/manifest.cpp b/tools/library/src/manifest.cpp index ca6d1781e..d4e8a884b 100644 --- a/tools/library/src/manifest.cpp +++ b/tools/library/src/manifest.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -37,11 +37,6 @@ namespace library { ////////////////////////////////////////////////////////////////////////////////////////////////////////// -// init and insert all cutlass op in manifest object (procedurally generated using generator.py) -void initialize_all(Manifest &manifest); - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - /// Top-level initialization Status Manifest::initialize() { @@ -49,13 +44,8 @@ Status Manifest::initialize() { operations_.clear(); } - switch(provider_) { - case Provider::kCUTLASS: - initialize_all(*this); break; - - default: - break; - } + // initialize procedurally generated cutlass op in manifest object + initialize_all(*this); return Status::kSuccess; } diff --git a/tools/library/src/operation_table.cu b/tools/library/src/operation_table.cu index 8fb0fe633..64e4f264c 100644 --- a/tools/library/src/operation_table.cu +++ b/tools/library/src/operation_table.cu @@ -28,30 +28,7 @@ instances may be queried. 
*/ -#include - -#include "cutlass/library/library.h" #include "cutlass/library/operation_table.h" -#include "cutlass/library/util.h" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -std::ostream & operator<<(std::ostream &out, cutlass::library::GemmFunctionalKey const &k) { - - out << "{\n" - << " element_compute: " << to_string(k.element_compute) << "\n" - << " element_scalar: " << to_string(k.element_scalar) << "\n" - << " element_A: " << to_string(k.element_A) << "\n" - << " layout_A: " << to_string(k.layout_A) << "\n" - << " transform_A: " << to_string(k.transform_A) << "\n" - << " element_B: " << to_string(k.element_B) << "\n" - << " layout_B: " << to_string(k.layout_B) << "\n" - << " transform_B: " << to_string(k.transform_B) << "\n" - << " element_C: " << to_string(k.element_C) << "\n" - << "}"; - - return out; -} ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -67,85 +44,38 @@ void OperationTable::append(Manifest const &manifest) { OperationDescription const &desc = operation->description(); + // insert all gemm operation into operation table if (desc.kind == OperationKind::kGemm) { GemmDescription const &gemm_desc = static_cast(desc); - if (gemm_desc.gemm_kind == GemmKind::kGemm) { - GemmFunctionalKey functional_key( - gemm_desc.tile_description.math_instruction.element_accumulator, - gemm_desc.element_epilogue, - gemm_desc.A.element, - gemm_desc.A.layout, - gemm_desc.transform_A, - gemm_desc.B.element, - gemm_desc.B.layout, - gemm_desc.transform_B, - gemm_desc.C.element - ); + GemmFunctionalKey functional_key( + gemm_desc.provider, + gemm_desc.gemm_kind, + gemm_desc.tile_description.math_instruction.element_accumulator, + gemm_desc.element_epilogue, + gemm_desc.A.element, + gemm_desc.A.layout, + gemm_desc.transform_A, + gemm_desc.B.element, + gemm_desc.B.layout, + gemm_desc.transform_B, + gemm_desc.C.element + ); - Operation const *op = 
operation.get(); + Operation const *op = operation.get(); - int cc = gemm_desc.tile_description.minimum_compute_capability; + int cc = gemm_desc.tile_description.minimum_compute_capability; - int alignment = std::max(std::max( - gemm_desc.A.alignment, gemm_desc.B.alignment), gemm_desc.C.alignment); + int alignment = std::max(std::max( + gemm_desc.A.alignment, gemm_desc.B.alignment), gemm_desc.C.alignment); - GemmPreferenceKey preference_key(cc, alignment); + GemmPreferenceKey preference_key(cc, alignment); - gemm_operations[functional_key][preference_key].push_back(op); - } - else if (gemm_desc.gemm_kind == GemmKind::kPlanarComplex) { - - GemmFunctionalKey functional_key( - gemm_desc.tile_description.math_instruction.element_accumulator, - gemm_desc.element_epilogue, - gemm_desc.A.element, - gemm_desc.A.layout, - gemm_desc.transform_A, - gemm_desc.B.element, - gemm_desc.B.layout, - gemm_desc.transform_B, - gemm_desc.C.element - ); - - Operation const *op = operation.get(); - - int cc = gemm_desc.tile_description.minimum_compute_capability; - - int alignment = std::max(std::max( - gemm_desc.A.alignment, gemm_desc.B.alignment), gemm_desc.C.alignment); - - GemmPreferenceKey preference_key(cc, alignment); - - gemm_planar_complex_operations[functional_key][preference_key].push_back(op); - } - else if (gemm_desc.gemm_kind == GemmKind::kPlanarComplexArray) { - - GemmFunctionalKey functional_key( - gemm_desc.tile_description.math_instruction.element_accumulator, - gemm_desc.element_epilogue, - gemm_desc.A.element, - gemm_desc.A.layout, - gemm_desc.transform_A, - gemm_desc.B.element, - gemm_desc.B.layout, - gemm_desc.transform_B, - gemm_desc.C.element - ); - - Operation const *op = operation.get(); - - int cc = gemm_desc.tile_description.minimum_compute_capability; - - int alignment = std::max(std::max( - gemm_desc.A.alignment, gemm_desc.B.alignment), gemm_desc.C.alignment); - - GemmPreferenceKey preference_key(cc, alignment); - - 
gemm_planar_complex_array_operations[functional_key][preference_key].push_back(op); - } + gemm_operations[functional_key][preference_key].push_back(op); } + + } } diff --git a/tools/library/src/util.cu b/tools/library/src/util.cu index 127572920..427f0a2c5 100644 --- a/tools/library/src/util.cu +++ b/tools/library/src/util.cu @@ -45,6 +45,7 @@ static struct { Provider enumerant; } Provider_enumerants[] = { + {"none", "None", Provider::kNone}, {"cutlass", "CUTLASS", Provider::kCUTLASS}, {"host", "reference_host", Provider::kReferenceHost}, {"device", "reference_device", Provider::kReferenceDevice}, @@ -83,6 +84,38 @@ Provider from_string(std::string const &str) { } +/////////////////////////////////////////////////////////////////////////////////////////////////// + +static struct { + char const *text; + char const *pretty; + GemmKind enumerant; +} +GemmKind_enumerants[] = { + {"gemm", "", GemmKind::kGemm}, + {"batched", "", GemmKind::kBatched}, + {"array", "", GemmKind::kArray}, + {"universal", "", GemmKind::kUniversal}, + {"planar_complex", "", GemmKind::kPlanarComplex}, + {"planar_complex_array", "", GemmKind::kPlanarComplexArray}, +}; + +/// Converts a ConvKind enumerant to a string +char const *to_string(GemmKind type, bool pretty) { + + for (auto const & possible : GemmKind_enumerants) { + if (type == possible.enumerant) { + if (pretty) { + return possible.pretty; + } + else { + return possible.text; + } + } + } + + return pretty ? 
"Invalid" : "invalid"; +} ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -92,6 +125,7 @@ static struct { OperationKind enumerant; } OperationKind_enumerants[] = { + {"eq_gemm", "EqGemm", OperationKind::kEqGemm}, {"gemm", "Gemm", OperationKind::kGemm}, }; @@ -194,10 +228,14 @@ NumericTypeID_enumerants[] = { {"s32", "S32", NumericTypeID::kS32}, {"s64", "S64", NumericTypeID::kS64}, {"f16", "F16", NumericTypeID::kF16}, + {"bf16", "BF16", NumericTypeID::kBF16}, {"f32", "F32", NumericTypeID::kF32}, + {"tf32", "TF32", NumericTypeID::kTF32}, {"f64", "F64", NumericTypeID::kF64}, {"cf16", "CF16", NumericTypeID::kCF16}, + {"cbf16", "CBF16", NumericTypeID::kCBF16}, {"cf32", "CF32", NumericTypeID::kCF32}, + {"ctf32", "CTF32", NumericTypeID::kCTF32}, {"cf64", "CF64", NumericTypeID::kCF64}, {"cu4", "CU4", NumericTypeID::kCU4}, {"cu8", "CU8", NumericTypeID::kCU8}, @@ -249,10 +287,14 @@ NumericTypeID from_string(std::string const &str) { int sizeof_bits(NumericTypeID type) { switch (type) { case NumericTypeID::kF16: return 16; + case NumericTypeID::kBF16: return 16; + case NumericTypeID::kTF32: return 32; case NumericTypeID::kF32: return 32; case NumericTypeID::kF64: return 64; case NumericTypeID::kCF16: return 32; + case NumericTypeID::kCBF16: return 32; case NumericTypeID::kCF32: return 64; + case NumericTypeID::kCTF32: return 64; case NumericTypeID::kCF64: return 128; case NumericTypeID::kS4: return 4; case NumericTypeID::kS8: return 8; @@ -276,6 +318,8 @@ bool is_complex_type(NumericTypeID type) { case NumericTypeID::kCF16: return true; case NumericTypeID::kCF32: return true; case NumericTypeID::kCF64: return true; + case NumericTypeID::kCBF16: return true; + case NumericTypeID::kCTF32: return true; default: break; } return false; @@ -287,6 +331,8 @@ NumericTypeID get_real_type(NumericTypeID type) { case NumericTypeID::kCF16: return NumericTypeID::kF16; case NumericTypeID::kCF32: return NumericTypeID::kF32; case 
NumericTypeID::kCF64: return NumericTypeID::kF64; + case NumericTypeID::kCBF16: return NumericTypeID::kBF16; + case NumericTypeID::kCTF32: return NumericTypeID::kTF32; default: break; } return type; @@ -314,6 +360,8 @@ bool is_integer_type(NumericTypeID type) { bool is_signed_type(NumericTypeID type) { switch (type) { case NumericTypeID::kF16: return true; + case NumericTypeID::kBF16: return true; + case NumericTypeID::kTF32: return true; case NumericTypeID::kF32: return true; case NumericTypeID::kF64: return true; case NumericTypeID::kS4: return true; @@ -340,9 +388,13 @@ bool is_unsigned_integer(NumericTypeID type) { bool is_float_type(NumericTypeID type) { switch (type) { case NumericTypeID::kF16: return true; + case NumericTypeID::kBF16: return true; + case NumericTypeID::kTF32: return true; case NumericTypeID::kF32: return true; case NumericTypeID::kF64: return true; case NumericTypeID::kCF16: return true; + case NumericTypeID::kCBF16: return true; + case NumericTypeID::kCTF32: return true; case NumericTypeID::kCF32: return true; case NumericTypeID::kCF64: return true; default: break; @@ -431,7 +483,7 @@ OpcodeClassID_enumerants[] = { {"simt", "", OpcodeClassID::kSimt}, {"tensorop", "", OpcodeClassID::kTensorOp}, {"wmmatensorop", "", OpcodeClassID::kWmmaTensorOp}, - {"wmma", "", OpcodeClassID::kWmmaTensorOp} + {"wmma", "", OpcodeClassID::kWmmaTensorOp}, }; /// Converts a OpcodeClassID enumerant to a string @@ -509,6 +561,47 @@ ComplexTransform from_string(std::string const &str) { } +static struct { + char const *text; + char const *pretty; + SplitKMode enumerant; +} +SplitKMode_enumerants[] = { + {"serial", "", SplitKMode::kSerial}, + {"parallel", "", SplitKMode::kParallel}, +}; + +/// Converts a SplitKMode enumerant to a string +char const *to_string(SplitKMode type, bool pretty) { + + for (auto const & possible : SplitKMode_enumerants) { + if (type == possible.enumerant) { + if (pretty) { + return possible.pretty; + } + else { + return possible.text; + } + 
} + } + + return pretty ? "Invalid" : "invalid"; +} + +/// Converts a SplitKMode enumerant from a string +template <> +SplitKMode from_string(std::string const &str) { + + for (auto const & possible : SplitKMode_enumerants) { + if ((str.compare(possible.text) == 0) || + (str.compare(possible.pretty) == 0)) { + return possible.enumerant; + } + } + + return SplitKMode::kInvalid; +} + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Lexical cast a string to a byte array. Returns true if cast is successful or false if invalid. bool lexical_cast(std::vector &bytes, NumericTypeID type, std::string const &str) { @@ -570,6 +663,20 @@ bool lexical_cast(std::vector &bytes, NumericTypeID type, std::string c *reinterpret_cast(bytes.data()) = static_cast(tmp); } break; + case NumericTypeID::kBF16: + { + float tmp; + ss >> tmp; + *reinterpret_cast(bytes.data()) = static_cast(tmp); + } + break; + case NumericTypeID::kTF32: + { + float tmp; + ss >> tmp; + *reinterpret_cast(bytes.data()) = static_cast(tmp); + } + break; case NumericTypeID::kF32: { ss >> *reinterpret_cast(bytes.data()); @@ -589,11 +696,29 @@ bool lexical_cast(std::vector &bytes, NumericTypeID type, std::string c x->imag() = static_cast(std::imag(tmp)); } break; + case NumericTypeID::kCBF16: + { + std::complex tmp; + ss >> tmp; + cutlass::complex *x = reinterpret_cast *>(bytes.data()); + x->real() = static_cast(std::real(tmp)); + x->imag() = static_cast(std::imag(tmp)); + } + break; case NumericTypeID::kCF32: { ss >> *reinterpret_cast*>(bytes.data()); } break; + case NumericTypeID::kCTF32: + { + std::complex tmp; + ss >> tmp; + cutlass::complex *x = reinterpret_cast *>(bytes.data()); + x->real() = static_cast(std::real(tmp)); + x->imag() = static_cast(std::imag(tmp)); + } + break; case NumericTypeID::kCF64: { ss >> *reinterpret_cast*>(bytes.data()); @@ -674,6 +799,18 @@ std::string lexical_cast(std::vector &bytes, NumericTypeID type) { ss << tmp; } break; + case 
NumericTypeID::kBF16: + { + float tmp = *reinterpret_cast(bytes.data());; + ss << tmp; + } + break; + case NumericTypeID::kTF32: + { + float tmp = *reinterpret_cast(bytes.data());; + ss << tmp; + } + break; case NumericTypeID::kF32: { ss << *reinterpret_cast(bytes.data()); @@ -696,6 +833,18 @@ std::string lexical_cast(std::vector &bytes, NumericTypeID type) { } } break; + case NumericTypeID::kCBF16: + { + cutlass::complex const *x = + reinterpret_cast const *>(bytes.data()); + + ss << float(x->real()); + + if (x->imag() != cutlass::bfloat16_t()) { + ss << "+i" << float(x->imag()); + } + } + break; case NumericTypeID::kCF32: { cutlass::complex const * x = reinterpret_cast const *>(bytes.data()); @@ -707,6 +856,17 @@ std::string lexical_cast(std::vector &bytes, NumericTypeID type) { } } break; + case NumericTypeID::kCTF32: + { + cutlass::complex const * x = reinterpret_cast const *>(bytes.data()); + + ss << float(x->real()); + + if (x->imag() != tfloat32_t()) { + ss << "+i" << float(x->imag()); + } + } + break; case NumericTypeID::kCF64: { cutlass::complex const * x = reinterpret_cast const *>(bytes.data()); @@ -780,6 +940,16 @@ bool cast_from_int64(std::vector &bytes, NumericTypeID type, int64_t sr *reinterpret_cast(bytes.data()) = static_cast(float(src)); } break; + case NumericTypeID::kBF16: + { + *reinterpret_cast(bytes.data()) = static_cast(float(src)); + } + break; + case NumericTypeID::kTF32: + { + *reinterpret_cast(bytes.data()) = static_cast(float(src)); + } + break; case NumericTypeID::kF32: { *reinterpret_cast(bytes.data()) = static_cast(src); @@ -870,6 +1040,16 @@ bool cast_from_uint64(std::vector &bytes, NumericTypeID type, uint64_t *reinterpret_cast(bytes.data()) = static_cast(float(src)); } break; + case NumericTypeID::kBF16: + { + *reinterpret_cast(bytes.data()) = static_cast(float(src)); + } + break; + case NumericTypeID::kTF32: + { + *reinterpret_cast(bytes.data()) = static_cast(float(src)); + } + break; case NumericTypeID::kF32: { 
*reinterpret_cast(bytes.data()) = static_cast(src); @@ -961,6 +1141,16 @@ bool cast_from_double(std::vector &bytes, NumericTypeID type, double sr *reinterpret_cast(bytes.data()) = static_cast(float(src)); } break; + case NumericTypeID::kBF16: + { + *reinterpret_cast(bytes.data()) = static_cast(float(src)); + } + break; + case NumericTypeID::kTF32: + { + *reinterpret_cast(bytes.data()) = static_cast(float(src)); + } + break; case NumericTypeID::kF32: { *reinterpret_cast(bytes.data()) = static_cast(src); @@ -978,11 +1168,23 @@ bool cast_from_double(std::vector &bytes, NumericTypeID type, double sr x->imag() = static_cast(float(0)); } break; + case NumericTypeID::kCBF16: + { + cutlass::complex *x = reinterpret_cast *>(bytes.data()); + x->real() = static_cast(bfloat16_t(src)); + x->imag() = static_cast(bfloat16_t(0)); + } + break; case NumericTypeID::kCF32: { *reinterpret_cast*>(bytes.data()) = std::complex(float(src), float(0)); } break; + case NumericTypeID::kCTF32: + { + *reinterpret_cast*>(bytes.data()) = std::complex(tfloat32_t(src), tfloat32_t(0)); + } + break; case NumericTypeID::kCF64: { *reinterpret_cast*>(bytes.data()) = std::complex(src, double(0)); diff --git a/tools/profiler/CMakeLists.txt b/tools/profiler/CMakeLists.txt index 6e822c68f..a47c83141 100644 --- a/tools/profiler/CMakeLists.txt +++ b/tools/profiler/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/tools/profiler/src/cublas_helpers.cpp b/tools/profiler/src/cublas_helpers.cpp index 5e5e2cb05..05262a22d 100644 --- a/tools/profiler/src/cublas_helpers.cpp +++ b/tools/profiler/src/cublas_helpers.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -52,15 +52,35 @@ Status get_cutlass_status(cublasStatus_t cublas) { } /// Maps a CUTLASS tensor layout to a cuBLAS transpose operation -cublasOperation_t get_cublas_transpose_operation(library::LayoutTypeID layout) { +bool get_cublas_transpose_operation( + cublasOperation_t &operation, + library::LayoutTypeID layout, + library::ComplexTransform transform) { + switch (layout) { case library::LayoutTypeID::kColumnMajor: - return CUBLAS_OP_N; + if (transform == library::ComplexTransform::kNone) { + operation = CUBLAS_OP_N; + return true; + } + else { + return false; + } + break; case library::LayoutTypeID::kRowMajor: - return CUBLAS_OP_T; + if (transform == library::ComplexTransform::kNone) { + operation = CUBLAS_OP_T; + return true; + } + else if (transform == library::ComplexTransform::kConjugate) { + operation = CUBLAS_OP_C; + return true; + } + break; default: break; } - throw std::runtime_error("CUTLASS layout type does not correspond to cublas type"); + + return false; } /// Maps a CUTLASS numeric type to a cuBLAS data type enumeration @@ -114,6 +134,14 @@ bool get_cublas_datatype(cublasDataType_t &data_type, library::NumericTypeID ele case library::NumericTypeID::kB1: break; + + case 
library::NumericTypeID::kCF32: + data_type = CUDA_C_32F; + return true; + + case library::NumericTypeID::kCF64: + data_type = CUDA_C_64F; + return true; case library::NumericTypeID::kInvalid: @@ -157,6 +185,104 @@ Status cublas_satisfies(library::GemmDescription const &desc) { ///////////////////////////////////////////////////////////////////////////////////////////////// +namespace detail { + +cublasGemmExDispatcher::cublasGemmExDispatcher( + library::GemmDescription const &op_desc, + library::GemmUniversalConfiguration configuration_, + library::GemmUniversalArguments arguments_, + cublasGemmAlgo_t algorithm +): + configuration(configuration_), arguments(arguments_), algo(algorithm), status(Status::kSuccess) { + + bool good = true; + + good = (good && get_cublas_transpose_operation(trans_A, op_desc.A.layout, op_desc.transform_A)); + good = (good && get_cublas_transpose_operation(trans_B, op_desc.B.layout, op_desc.transform_B)); + good = (good && get_cublas_datatype(data_type_A, op_desc.A.element)); + good = (good && get_cublas_datatype(data_type_B, op_desc.B.element)); + good = (good && get_cublas_datatype(data_type_C, op_desc.C.element)); + + good = (good && get_cublas_datatype( + compute_data_type, + op_desc.tile_description.math_instruction.element_accumulator)); + + // cuBLAS introduces a separate cublasComputeType enumerant to more precisely describe + // internal numerical data types used in the computation. +#if (__CUDA_VER_MAJOR__ >= 11) + library::OpcodeClassID const & opcode_class = + op_desc.tile_description.math_instruction.opcode_class; + + if (good && + op_desc.A.element == library::NumericTypeID::kF32 && + op_desc.B.element == library::NumericTypeID::kF32 && + opcode_class == library::OpcodeClassID::kTensorOp) { + + compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; + } + else if (good) { + bool const isPedantic = false; + switch (compute_data_type) { + case CUDA_R_32F: + case CUDA_C_32F: + compute_type = isPedantic ? 
CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F; + break; + case CUDA_R_64F: + case CUDA_C_64F: + compute_type = isPedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F; + break; + case CUDA_R_16F: + compute_type = isPedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F; + break; + case CUDA_R_32I: + compute_type = isPedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I; + break; + default: + good = false; + break; + } + } +#endif // __CUDA_VER_MAJOR__ >= 11 + + if (!good) { + status = Status::kErrorNotSupported; + } +} + +/// Executes GEMM using these arguments +cublasStatus_t cublasGemmExDispatcher::operator()(cublasHandle_t handle) { + + return cublasGemmEx( + handle, + trans_A, + trans_B, + configuration.problem_size.m(), + configuration.problem_size.n(), + configuration.problem_size.k(), + arguments.alpha, + arguments.A, + data_type_A, + int(configuration.lda), + arguments.B, + data_type_B, + int(configuration.ldb), + arguments.beta, + arguments.D, + data_type_C, + int(configuration.ldc), +#if (__CUDA_VER_MAJOR__ >= 11) + compute_type, +#else + compute_data_type, +#endif + algo + ); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// +} // namespace detail + } // namespace profiler } // namespace cutlass diff --git a/tools/profiler/src/cublas_helpers.h b/tools/profiler/src/cublas_helpers.h index 0ade09617..9c8078466 100644 --- a/tools/profiler/src/cublas_helpers.h +++ b/tools/profiler/src/cublas_helpers.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -33,7 +33,10 @@ #include "cutlass/cutlass.h" #include "cutlass/library/library.h" +#include "cutlass/library/util.h" + #include "options.h" + ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { @@ -45,7 +48,10 @@ namespace profiler { Status get_cutlass_status(cublasStatus_t cublas); /// Maps a CUTLASS tensor layout to a cuBLAS transpose operation -cublasOperation_t get_cublas_transpose_operation(library::LayoutTypeID layout); +bool get_cublas_transpose_operation( + cublasOperation_t &operation, + library::LayoutTypeID layout, + library::ComplexTransform transform = library::ComplexTransform::kNone); /// Maps a CUTLASS numeric type to a cuBLAS data type enumeration bool get_cublas_datatype(cublasDataType_t &data_type, library::NumericTypeID element_type); @@ -168,8 +174,8 @@ struct cublasGemmExDispatcher { // // Data members // - library::GemmConfiguration configuration; - library::GemmArguments arguments; + library::GemmUniversalConfiguration configuration; + library::GemmUniversalArguments arguments; // cublass-specific data structures to fill cublas API call arguments cublasOperation_t trans_A; @@ -177,7 +183,12 @@ struct cublasGemmExDispatcher { cudaDataType_t data_type_A; cudaDataType_t data_type_B; cudaDataType_t data_type_C; - cudaDataType_t compute_type; + cudaDataType_t compute_data_type; + +#if (__CUDA_VER_MAJOR__ >= 11) + cublasComputeType_t compute_type; +#endif + cublasGemmAlgo_t algo; Status status; @@ -187,54 +198,13 @@ struct cublasGemmExDispatcher { cublasGemmExDispatcher( library::GemmDescription const &op_desc, - library::GemmConfiguration configuration_, - library::GemmArguments arguments_, + library::GemmUniversalConfiguration configuration_, + library::GemmUniversalArguments arguments_, cublasGemmAlgo_t algorithm = 
CUBLAS_GEMM_DFALT - ): - configuration(configuration_), arguments(arguments_), algo(algorithm), status(Status::kSuccess) { - - trans_A = get_cublas_transpose_operation(op_desc.A.layout); - trans_B = get_cublas_transpose_operation(op_desc.B.layout); - - bool good = true; - good = (good && get_cublas_datatype(data_type_A, op_desc.A.element)); - good = (good && get_cublas_datatype(data_type_B, op_desc.B.element)); - good = (good && get_cublas_datatype(data_type_C, op_desc.C.element)); - - good = (good && get_cublas_datatype( - compute_type, - op_desc.tile_description.math_instruction.element_accumulator)); - - if (!good) { - status = Status::kErrorNotSupported; - } - } + ); /// Executes GEMM using these arguments - cublasStatus_t operator()(cublasHandle_t handle) { - - return cublasGemmEx( - handle, - trans_A, - trans_B, - configuration.problem_size.m(), - configuration.problem_size.n(), - configuration.problem_size.k(), - arguments.alpha, - arguments.A, - data_type_A, - int(configuration.lda), - arguments.B, - data_type_B, - int(configuration.ldb), - arguments.beta, - arguments.D, - data_type_C, - int(configuration.ldc), - compute_type, - algo - ); - } + cublasStatus_t operator()(cublasHandle_t handle); }; /////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/cutlass_profiler.cu b/tools/profiler/src/cutlass_profiler.cu index b36f897b5..90f4a9597 100644 --- a/tools/profiler/src/cutlass_profiler.cu +++ b/tools/profiler/src/cutlass_profiler.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -44,7 +44,7 @@ CutlassProfiler::CutlassProfiler( ): options_(options) { - operation_profilers_.emplace_back(new GemmOperationProfiler); + operation_profilers_.emplace_back(new GemmOperationProfiler(options)); } @@ -108,13 +108,6 @@ void CutlassProfiler::enumerate_() { /// Profiles all operations int CutlassProfiler::profile_() { - library::Manifest manifest(library::Provider::kCUTLASS); - Status status = manifest.initialize(); - - if (status != Status::kSuccess) { - return -1; - } - int result = 0; DeviceContext device_context; @@ -124,7 +117,7 @@ int CutlassProfiler::profile_() { if (options_.operation_kind == library::OperationKind::kInvalid || options_.operation_kind == profiler->kind()) { - result = profiler->profile_all(options_, manifest, device_context); + result = profiler->profile_all(options_, library::Singleton::get().manifest, device_context); if (result) { return result; diff --git a/tools/profiler/src/cutlass_profiler.h b/tools/profiler/src/cutlass_profiler.h index eda24c5bf..d3b592a4e 100644 --- a/tools/profiler/src/cutlass_profiler.h +++ b/tools/profiler/src/cutlass_profiler.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -30,6 +30,7 @@ // CUTLASS Library includes #include "cutlass/library/library.h" #include "cutlass/library/manifest.h" +#include "cutlass/library/singleton.h" #include "options.h" #include "operation_profiler.h" diff --git a/tools/profiler/src/debug.h b/tools/profiler/src/debug.h index 8aad2ee9a..aed11ca18 100644 --- a/tools/profiler/src/debug.h +++ b/tools/profiler/src/debug.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/device_allocation.cu b/tools/profiler/src/device_allocation.cu index c97f0de4d..4045abfee 100644 --- a/tools/profiler/src/device_allocation.cu +++ b/tools/profiler/src/device_allocation.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -431,6 +431,14 @@ void DeviceAllocation::initialize_random_device(int seed, Distribution dist) { dist ); break; + case library::NumericTypeID::kCF32: + cutlass::reference::device::BlockFillRandom>( + reinterpret_cast *>(pointer_), + capacity_, + seed, + dist + ); + break; case library::NumericTypeID::kF64: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), @@ -548,6 +556,14 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) { dist ); break; + case library::NumericTypeID::kCF32: + cutlass::reference::host::BlockFillRandom>( + reinterpret_cast *>(host_data.data()), + capacity_, + seed, + dist + ); + break; case library::NumericTypeID::kF64: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), @@ -655,6 +671,12 @@ bool DeviceAllocation::block_compare_equal( reinterpret_cast(ptr_A), reinterpret_cast(ptr_B), capacity); + + case library::NumericTypeID::kCF32: + return reference::device::BlockCompareEqual >( + reinterpret_cast const *>(ptr_A), + reinterpret_cast const *>(ptr_B), + capacity); case library::NumericTypeID::kCF16: return reference::device::BlockCompareEqual>( @@ -825,6 +847,23 @@ bool DeviceAllocation::block_compare_relatively_equal( static_cast(epsilon), static_cast(nonzero_floor)); + // No relatively equal comparison for complex numbers. + // + // As a simplification, we can require bitwise equality. This avoids false positives. + // (i.e. "pass" really means passing. "Fail" may not actually mean failure given appropriate epsilon.) 
+ // + case library::NumericTypeID::kCF32: + return reference::device::BlockCompareEqual >( + reinterpret_cast const *>(ptr_A), + reinterpret_cast const *>(ptr_B), + capacity); + + case library::NumericTypeID::kCF64: + return reference::device::BlockCompareEqual >( + reinterpret_cast const *>(ptr_A), + reinterpret_cast const *>(ptr_B), + capacity); + default: throw std::runtime_error("Unsupported numeric type"); } @@ -970,6 +1009,14 @@ void DeviceAllocation::write_tensor_csv( case library::NumericTypeID::kU64: write_tensor_csv_static_type(out, *this); break; + + case library::NumericTypeID::kCF32: + write_tensor_csv_static_type >(out, *this); + break; + + case library::NumericTypeID::kCF64: + write_tensor_csv_static_type >(out, *this); + break; default: throw std::runtime_error("Unsupported numeric type"); diff --git a/tools/profiler/src/device_allocation.h b/tools/profiler/src/device_allocation.h index be69f0373..f57cda143 100644 --- a/tools/profiler/src/device_allocation.h +++ b/tools/profiler/src/device_allocation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/device_context.cu b/tools/profiler/src/device_context.cu index 780e04477..f9cfe9ab5 100644 --- a/tools/profiler/src/device_context.cu +++ b/tools/profiler/src/device_context.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/device_context.h b/tools/profiler/src/device_context.h index 7be0349ad..aea872eff 100644 --- a/tools/profiler/src/device_context.h +++ b/tools/profiler/src/device_context.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/enumerated_types.cpp b/tools/profiler/src/enumerated_types.cpp index 1acefb1f0..29be6f8ba 100644 --- a/tools/profiler/src/enumerated_types.cpp +++ b/tools/profiler/src/enumerated_types.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/enumerated_types.h b/tools/profiler/src/enumerated_types.h index 051406d14..e7e713bdb 100644 --- a/tools/profiler/src/enumerated_types.h +++ b/tools/profiler/src/enumerated_types.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -50,7 +50,7 @@ T from_string(std::string const &); enum class ExecutionMode { kProfile, ///< regular verification and profiling kDryRun, ///< no kernels are launched or workspaces allocated; used to assess what operators might be launched - kEnumerate, ///< no kernels launched or workspaces allocated; lists all function types and functions + kEnumerate, ///< no kernels launched or workspaces allocated; lists all operation kind and operations kTrace, ///< executes a single device-side computation with no other kernel launches kInvalid }; diff --git a/tools/profiler/src/gemm_operation_profiler.cu b/tools/profiler/src/gemm_operation_profiler.cu index cb4309681..f494eeee9 100644 --- a/tools/profiler/src/gemm_operation_profiler.cu +++ b/tools/profiler/src/gemm_operation_profiler.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -31,6 +31,8 @@ #include #include +#include "cutlass/core_io.h" + #include "cublas_helpers.h" #include "gemm_operation_profiler.h" #include "gpu_timer.h" @@ -44,22 +46,27 @@ namespace profiler { ///////////////////////////////////////////////////////////////////////////////////////////////// /// Ctor -GemmOperationProfiler::GemmOperationProfiler(): - OperationProfiler(library::OperationKind::kGemm,{ - {ArgumentTypeID::kEnumerated, {"Gemm_kind"}, "Variant of GEMM (e.g. 
gemm, planar complex, batched, ...)"}, - {ArgumentTypeID::kInteger, {"m", "problem-size::m"}, "M dimension of the GEMM problem space"}, - {ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the GEMM problem space"}, - {ArgumentTypeID::kInteger, {"k", "problem-size::k"}, "K dimension of the GEMM problem space"}, - {ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"}, - {ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"}, - {ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"}, - {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"}, - {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"}, - {ArgumentTypeID::kInteger, {"split_k_slices"}, "Number of partitions of K dimension"}, - {ArgumentTypeID::kInteger, {"batch_count"}, "Number of GEMMs computed in one batch"}, - }) { +GemmOperationProfiler::GemmOperationProfiler(Options const &options): + OperationProfiler( + options, + library::OperationKind::kGemm, + { + {ArgumentTypeID::kEnumerated, {"gemm_kind"}, "Variant of GEMM (gemm, batched, array, universal, planar_complex, planar_complex_array)"}, + {ArgumentTypeID::kInteger, {"m", "problem-size::m"}, "M dimension of the GEMM problem space"}, + {ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the GEMM problem space"}, + {ArgumentTypeID::kInteger, {"k", "problem-size::k"}, "K dimension of the GEMM problem space"}, + {ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"}, + {ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"}, + {ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"}, + {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"}, + {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"}, + {ArgumentTypeID::kInteger, {"split_k_slices", "split-k-slices"}, "Number of partitions of K dimension"}, + {ArgumentTypeID::kInteger, {"batch_count", "batch-count"}, "Number 
of GEMMs computed in one batch"}, + }, + { library::Provider::kCUBLAS} + ) { - description_ = "General matrix-matrix product. D = alpha * A*B + beta * C"; + description_ = " General matrix-matrix product. D = alpha * A*B + beta * C"; } /// Destructor @@ -107,6 +114,8 @@ void GemmOperationProfiler::print_examples(std::ostream &out) const { << " --providers=cutlass --output=functional-test.csv\n\n"; } +///////////////////////////////////////////////////////////////////////////////////////////////// + #if 0 // used this for debugging static std::string byte_string(std::vector const &bytes) { @@ -122,47 +131,34 @@ static std::string byte_string(std::vector const &bytes) { } #endif -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Extracts the problem dimensions -Status GemmOperationProfiler::initialize_configuration( - Options const &options, - PerformanceReport &report, - DeviceContext &device_context, - library::Operation const *operation, +Status GemmOperationProfiler::GemmProblem::parse( + library::GemmDescription const &operation_desc, ProblemSpace const &problem_space, ProblemSpace::Problem const &problem) { - - library::GemmDescription const &operation_desc = - static_cast(operation->description()); - - if (operation_desc.gemm_kind != library::GemmKind::kGemm) { - return Status::kErrorInvalidProblem; + + if (!arg_as_int(this->m, "m", problem_space, problem)) { + // default value + this->m = 1024; } - if (!arg_as_int(problem_.m, "m", problem_space, problem)) { + if (!arg_as_int(this->n, "n", problem_space, problem)) { // default value - problem_.m = 1024; - } - - if (!arg_as_int(problem_.n, "n", problem_space, problem)) { - // default value - problem_.n = 1024; + this->n = 1024; } - if (!arg_as_int(problem_.k, "k", problem_space, problem)) { + if (!arg_as_int(this->k, "k", problem_space, problem)) { // default value - problem_.k = 1024; + this->k = 1024; } - if (!arg_as_int(problem_.split_k_slices, 
"split_k_slices", problem_space, problem)) { + if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) { // default value - problem_.split_k_slices = 1; + this->split_k_slices = 1; } - if (!arg_as_int(problem_.batch_count, "batch_count", problem_space, problem)) { + if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) { // default value - problem_.batch_count = 1; + this->batch_count = 1; } if (!tensor_description_satisfies(operation_desc.A, "A", problem_space, problem)) { @@ -178,37 +174,97 @@ Status GemmOperationProfiler::initialize_configuration( } if (!arg_as_scalar( - problem_.alpha, + this->alpha, operation_desc.element_epilogue, "alpha", problem_space, problem)) { - if (!cast_from_double(problem_.alpha, operation_desc.element_epilogue, 1)) { + if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) { return Status::kErrorInternal; } } if (!arg_as_scalar( - problem_.beta, + this->beta, operation_desc.element_epilogue, "beta", problem_space, problem)) { - if (!cast_from_double(problem_.beta, operation_desc.element_epilogue, 0)) { + if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) { return Status::kErrorInternal; } } - problem_.lda = DeviceAllocation::get_packed_layout( - operation_desc.A.layout, {int(problem_.m), int(problem_.k)}).front(); + this->lda = DeviceAllocation::get_packed_layout( + operation_desc.A.layout, {int(this->m), int(this->k)}).front(); - problem_.ldb = DeviceAllocation::get_packed_layout( - operation_desc.B.layout, {int(problem_.k), int(problem_.n)}).front(); + this->ldb = DeviceAllocation::get_packed_layout( + operation_desc.B.layout, {int(this->k), int(this->n)}).front(); - problem_.ldc = DeviceAllocation::get_packed_layout( - operation_desc.C.layout, {int(problem_.m), int(problem_.n)}).front(); + this->ldc = DeviceAllocation::get_packed_layout( + operation_desc.C.layout, {int(this->m), int(this->n)}).front(); + + return Status::kSuccess; +} + +/// 
Initializes a performance result +void GemmOperationProfiler::GemmProblem::initialize_result( + PerformanceResult &result, + library::GemmDescription const &operation_desc, + ProblemSpace const &problem_space) { + + result.arguments.resize(problem_space.rank()); + + set_argument(result, "gemm_kind", problem_space, library::to_string(operation_desc.gemm_kind)); + + set_argument(result, "A", problem_space, + std::string(library::to_string(operation_desc.A.element)) + ":" + library::to_string(operation_desc.A.layout)); + + set_argument(result, "B", problem_space, + std::string(library::to_string(operation_desc.B.element)) + ":" + library::to_string(operation_desc.B.layout)); + + set_argument(result, "C", problem_space, + std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout)); + + set_argument(result, "m", problem_space, m); + set_argument(result, "n", problem_space, n); + set_argument(result, "k", problem_space, k); + + set_argument(result, "split_k_slices", problem_space, split_k_slices); + set_argument(result, "batch_count", problem_space, batch_count); + + set_argument(result, "alpha", problem_space, + library::lexical_cast(alpha, operation_desc.element_epilogue)); + + set_argument(result, "beta", problem_space, + library::lexical_cast(beta, operation_desc.element_epilogue)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Extracts the problem dimensions +Status GemmOperationProfiler::initialize_configuration( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + library::GemmDescription const &operation_desc = + static_cast(operation->description()); + + if (operation_desc.gemm_kind != library::GemmKind::kUniversal) { + return Status::kErrorInvalidProblem; + } + + Status status = 
problem_.parse(operation_desc, problem_space, problem); + + if (status != Status::kSuccess) { + return status; + } gemm_workspace_.configuration.problem_size.m() = int(problem_.m); gemm_workspace_.configuration.problem_size.n() = int(problem_.n); @@ -217,7 +273,8 @@ Status GemmOperationProfiler::initialize_configuration( gemm_workspace_.configuration.ldb = problem_.ldb; gemm_workspace_.configuration.ldc = problem_.ldc; gemm_workspace_.configuration.ldd = problem_.ldc; - gemm_workspace_.configuration.split_k_slices = int(problem_.split_k_slices); + //gemm_workspace_.configuration.split_k_slices = int(problem_.split_k_slices); + gemm_workspace_.configuration.batch_count = int(problem_.split_k_slices); gemm_workspace_.arguments.A = nullptr; gemm_workspace_.arguments.B = nullptr; @@ -243,37 +300,24 @@ void GemmOperationProfiler::initialize_result_( result.disposition = Disposition::kNotRun; result.status = Status::kSuccess; result.operation_name = operation_desc.name; - - result.arguments.resize(problem_space.rank()); - - set_argument_(result, "A", problem_space, - std::string(library::to_string(operation_desc.A.element)) + ":" + library::to_string(operation_desc.A.layout)); - - set_argument_(result, "B", problem_space, - std::string(library::to_string(operation_desc.B.element)) + ":" + library::to_string(operation_desc.B.layout)); - - set_argument_(result, "C", problem_space, - std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout)); - - set_argument_(result, "m", problem_space, problem_.m); - set_argument_(result, "n", problem_space, problem_.n); - set_argument_(result, "k", problem_space, problem_.k); - - set_argument_(result, "split_k_slices", problem_space, problem_.split_k_slices); - set_argument_(result, "batch_count", problem_space, problem_.batch_count); - - set_argument_(result, "alpha", problem_space, - library::lexical_cast(problem_.alpha, operation_desc.element_epilogue)); - - set_argument_(result, 
"beta", problem_space, - library::lexical_cast(problem_.beta, operation_desc.element_epilogue)); + + problem_.initialize_result(result, operation_desc, problem_space); OperationProfiler::initialize_result_(result, operation_desc, problem_space); + // Input bytes read and Output bytes written for the gemm problem result.bytes = int64_t(library::sizeof_bits(operation_desc.A.element) * problem_.m / 8) * problem_.k + int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.n / 8) * problem_.k + - int64_t(library::sizeof_bits(operation_desc.C.element) * problem_.m / 8) * problem_.n * 2; + int64_t(library::sizeof_bits(operation_desc.C.element) * problem_.m / 8) * problem_.n; + + // Set is_beta_zero true if beta is zero + bool is_beta_zero = std::all_of(problem_.beta.begin(), problem_.beta.end(), [](uint8_t i) { return i==0; }); + + // Output bytes read for the gemm problem for non-zero beta values + if (!is_beta_zero) { + result.bytes += int64_t(library::sizeof_bits(operation_desc.C.element) * problem_.m / 8) * problem_.n; + } result.flops = 2 * (problem_.m * problem_.n * problem_.k + problem_.m * problem_.n); result.runtime = 0; @@ -378,8 +422,9 @@ Status GemmOperationProfiler::initialize_workspace( results_.back().provider = library::Provider::kCUTLASS; results_.back().op_kind = library::OperationKind::kGemm; results_.back().disposition = Disposition::kNotRun; - for(auto &verification_provider : options.verification.providers) { - results_.back().verification_map[verification_provider] = Disposition::kNotRun; + + for(auto provider : verification_providers_) { + results_.back().verification_map[provider] = Disposition::kNotRun; } } @@ -559,8 +604,7 @@ bool GemmOperationProfiler::verify_with_cublas_( ); if (gemm_op.status != Status::kSuccess) { - - results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed; + results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotRun; return true; } diff --git 
a/tools/profiler/src/gemm_operation_profiler.h b/tools/profiler/src/gemm_operation_profiler.h index 3bd0bb62e..e4d23212e 100644 --- a/tools/profiler/src/gemm_operation_profiler.h +++ b/tools/profiler/src/gemm_operation_profiler.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -31,6 +31,7 @@ #include #include #include +#include #include // CUTLASS Library includes @@ -75,6 +76,18 @@ public: GemmProblem(): m(16), n(16), k(16), lda(0), ldb(0), ldc(0), split_k_slices(1), batch_count(1) { } + + /// Parses the problem + Status parse( + library::GemmDescription const &operation_desc, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Initializes a performance result + void initialize_result( + PerformanceResult &result, + library::GemmDescription const &operation_desc, + ProblemSpace const &problem_space); }; /// Workspace used @@ -86,8 +99,8 @@ public: DeviceAllocation *Computed; DeviceAllocation *Reference; - library::GemmConfiguration configuration; - library::GemmArguments arguments; + library::GemmUniversalConfiguration configuration; + library::GemmUniversalArguments arguments; /// Buffer used for the operation's host workspace std::vector host_workspace; @@ -122,7 +135,7 @@ public: // /// Ctor - GemmOperationProfiler(); + GemmOperationProfiler(Options const &options); /// Destructor virtual ~GemmOperationProfiler(); diff --git a/tools/profiler/src/gpu_timer.cpp b/tools/profiler/src/gpu_timer.cpp index 218e09d31..eb3a84115 100644 --- a/tools/profiler/src/gpu_timer.cpp +++ b/tools/profiler/src/gpu_timer.cpp @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/gpu_timer.h b/tools/profiler/src/gpu_timer.h index ca00ad7aa..5cd4b0037 100644 --- a/tools/profiler/src/gpu_timer.h +++ b/tools/profiler/src/gpu_timer.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/main.cpp b/tools/profiler/src/main.cpp index a76fcf9ac..a1e523111 100644 --- a/tools/profiler/src/main.cpp +++ b/tools/profiler/src/main.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/operation_profiler.cu b/tools/profiler/src/operation_profiler.cu index 6d21f87e4..754118a73 100644 --- a/tools/profiler/src/operation_profiler.cu +++ b/tools/profiler/src/operation_profiler.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef __unix__ #include @@ -55,30 +56,41 @@ OperationProfiler::OperationProfiler(): kind_(library::OperationKind::kInvalid) /// Ctor OperationProfiler::OperationProfiler( + Options const &options, library::OperationKind kind, ArgumentDescriptionVector const &arguments, - ProviderVector const & reference_providers + ProviderVector const & verification_providers ): - kind_(kind), arguments_(arguments), reference_providers_(reference_providers) { + kind_(kind), arguments_(arguments) { ArgumentDescriptionVector tile_description_arguments{ - {ArgumentTypeID::kEnumerated, {"op_class", "opcode-class"}, "Class of math instruction (SIMT or TensorOp)."}, - {ArgumentTypeID::kEnumerated, {"accum", "accumulator-type"}, "Math instruction accumulator data type."}, - {ArgumentTypeID::kInteger, {"cta_m", "threadblock-shape::m"}, "Threadblock shape in the M dimension."}, - {ArgumentTypeID::kInteger, {"cta_n", "threadblock-shape::n"}, "Threadblock shape in the N dimension."}, - {ArgumentTypeID::kInteger, {"cta_k", "threadblock-shape::k"}, "Threadblock shape in the K dimension."}, - {ArgumentTypeID::kInteger, {"stages", "threadblock-stages"}, "Number of stages of threadblock-scoped matrix multiply."}, - {ArgumentTypeID::kInteger, {"warps_m", "warp-count::m"}, "Number of warps within threadblock along the M dimension."}, - {ArgumentTypeID::kInteger, {"warps_n", "warp-count::n"}, "Number of warps within threadblock along the N dimension."}, - {ArgumentTypeID::kInteger, {"warps_k", "warp-count::k"}, "Number of warps within threadblock along the K dimension."}, - {ArgumentTypeID::kInteger, {"inst_m", "instruction-shape::m"}, "Math instruction shape in the M dimension."}, - {ArgumentTypeID::kInteger, {"inst_n", 
"instruction-shape::n"}, "Math instruction shape in the N dimension."}, - {ArgumentTypeID::kInteger, {"inst_k", "instruction-shape::k"}, "Math instruction shape in the K dimension."}, - {ArgumentTypeID::kInteger, {"min_cc", "minimum-compute-capability"}, "Minimum device compute capability."}, - {ArgumentTypeID::kInteger, {"max_cc", "maximum-compute-capability"}, "Maximum device compute capability."} + {ArgumentTypeID::kEnumerated, {"op_class", "opcode-class"}, "Class of math instruction (simt, tensorop, wmmatensorop, wmma)"}, + {ArgumentTypeID::kEnumerated, {"accum", "accumulator-type"}, "Math instruction accumulator data type"}, + {ArgumentTypeID::kInteger, {"cta_m", "threadblock-shape::m"}, "Threadblock shape in the M dimension"}, + {ArgumentTypeID::kInteger, {"cta_n", "threadblock-shape::n"}, "Threadblock shape in the N dimension"}, + {ArgumentTypeID::kInteger, {"cta_k", "threadblock-shape::k"}, "Threadblock shape in the K dimension"}, + {ArgumentTypeID::kInteger, {"stages", "threadblock-stages"}, "Number of stages of threadblock-scoped matrix multiply"}, + {ArgumentTypeID::kInteger, {"warps_m", "warp-count::m"}, "Number of warps within threadblock along the M dimension"}, + {ArgumentTypeID::kInteger, {"warps_n", "warp-count::n"}, "Number of warps within threadblock along the N dimension"}, + {ArgumentTypeID::kInteger, {"warps_k", "warp-count::k"}, "Number of warps within threadblock along the K dimension"}, + {ArgumentTypeID::kInteger, {"inst_m", "instruction-shape::m"}, "Math instruction shape in the M dimension"}, + {ArgumentTypeID::kInteger, {"inst_n", "instruction-shape::n"}, "Math instruction shape in the N dimension"}, + {ArgumentTypeID::kInteger, {"inst_k", "instruction-shape::k"}, "Math instruction shape in the K dimension"}, + {ArgumentTypeID::kInteger, {"min_cc", "minimum-compute-capability"}, "Minimum device compute capability"}, + {ArgumentTypeID::kInteger, {"max_cc", "maximum-compute-capability"}, "Maximum device compute capability"} }; 
arguments_.insert(arguments_.end(), tile_description_arguments.begin(), tile_description_arguments.end()); + + for (auto provider : verification_providers) { + if (std::find( + options.verification.providers.begin(), + options.verification.providers.end(), + provider) != options.verification.providers.end()) { + + verification_providers_.push_back(provider); + } + } } /// Destructor @@ -248,8 +260,9 @@ int OperationProfiler::profile_all( auto min_cc = operation->description().tile_description.minimum_compute_capability; auto max_cc = operation->description().tile_description.maximum_compute_capability; - // Execute compatible operations if they satisfy the current device's compute capability + // Execute compatible cutlass operations if they satisfy the current device's compute capability if (operation->description().kind == kind_ && + operation->description().provider == library::Provider::kCUTLASS && options.device.compute_capability() >= min_cc && options.device.compute_capability() <= max_cc) { @@ -259,7 +272,7 @@ int OperationProfiler::profile_all( if (!filtered_by_name) { for (auto const & op_name : options.operation_names) { - if (operation_name.find(op_name) !=std::string::npos) { + if (find_string_matches_(op_name, operation_name)) { filtered_by_name = true; break; } @@ -278,7 +291,7 @@ int OperationProfiler::profile_all( operation, problem_space, problem); - + if (status == Status::kErrorInternal) { // Stop profiling if there was an internal error return false; @@ -548,29 +561,28 @@ void OperationProfiler::initialize_result_( library::OperationDescription const &operation_desc, ProblemSpace const &problem_space) { - set_argument_(result, "op_class", problem_space, + set_argument(result, "op_class", problem_space, library::to_string(operation_desc.tile_description.math_instruction.opcode_class)); - set_argument_(result, "accum", problem_space, + set_argument(result, "accum", problem_space, 
library::to_string(operation_desc.tile_description.math_instruction.element_accumulator)); - set_argument_(result, "cta_m", problem_space, operation_desc.tile_description.threadblock_shape.m()); - set_argument_(result, "cta_n", problem_space, operation_desc.tile_description.threadblock_shape.n()); - set_argument_(result, "cta_k", problem_space, operation_desc.tile_description.threadblock_shape.k()); - set_argument_(result, "stages", problem_space, operation_desc.tile_description.threadblock_stages); - set_argument_(result, "warps_m", problem_space, operation_desc.tile_description.warp_count.m()); - set_argument_(result, "warps_n", problem_space, operation_desc.tile_description.warp_count.n()); - set_argument_(result, "warps_k", problem_space, operation_desc.tile_description.warp_count.k()); - set_argument_(result, "inst_m", problem_space, operation_desc.tile_description.math_instruction.instruction_shape.m()); - set_argument_(result, "inst_n", problem_space, operation_desc.tile_description.math_instruction.instruction_shape.n()); - set_argument_(result, "inst_k", problem_space, operation_desc.tile_description.math_instruction.instruction_shape.k()); - set_argument_(result, "min_cc", problem_space, operation_desc.tile_description.minimum_compute_capability); - set_argument_(result, "max_cc", problem_space, operation_desc.tile_description.maximum_compute_capability); + set_argument(result, "cta_m", problem_space, operation_desc.tile_description.threadblock_shape.m()); + set_argument(result, "cta_n", problem_space, operation_desc.tile_description.threadblock_shape.n()); + set_argument(result, "cta_k", problem_space, operation_desc.tile_description.threadblock_shape.k()); + set_argument(result, "stages", problem_space, operation_desc.tile_description.threadblock_stages); + set_argument(result, "warps_m", problem_space, operation_desc.tile_description.warp_count.m()); + set_argument(result, "warps_n", problem_space, operation_desc.tile_description.warp_count.n()); + 
set_argument(result, "warps_k", problem_space, operation_desc.tile_description.warp_count.k()); + set_argument(result, "inst_m", problem_space, operation_desc.tile_description.math_instruction.instruction_shape.m()); + set_argument(result, "inst_n", problem_space, operation_desc.tile_description.math_instruction.instruction_shape.n()); + set_argument(result, "inst_k", problem_space, operation_desc.tile_description.math_instruction.instruction_shape.k()); + set_argument(result, "min_cc", problem_space, operation_desc.tile_description.minimum_compute_capability); + set_argument(result, "max_cc", problem_space, operation_desc.tile_description.maximum_compute_capability); } - /// Helper -void OperationProfiler::set_argument_( +void OperationProfiler::set_argument( PerformanceResult &result, char const *name, ProblemSpace const &problem_space, @@ -579,7 +591,7 @@ void OperationProfiler::set_argument_( result.arguments.at(problem_space.argument_index(name)) = make_pair(std::string(name), value); } -void OperationProfiler::set_argument_( +void OperationProfiler::set_argument( PerformanceResult &result, char const *name, ProblemSpace const &problem_space, @@ -588,6 +600,39 @@ void OperationProfiler::set_argument_( result.arguments.at(problem_space.argument_index(name)) = make_pair(std::string(name), library::lexical_cast(value)); } + +/// finds string matches filter_string in operation_name +bool OperationProfiler::find_string_matches_( + std::string const &filter_string, + std::string const &operation_name) { + // Returns true if all substrings appear in the operation_name in order + + // Split filter_string of the format "gemm*f32*nt" to tokens ["gemm", "f32", "nt"] + std::string item; + std::istringstream iss(filter_string); + std::vector filter_tokens; + while (std::getline(iss, item, '*')) { + filter_tokens.push_back(item); + } + + // Search filter_tokens in operation_name in order + size_t start = 0, idx = 0; + for(auto & token : filter_tokens) { + // Check if 
characters left to be parsed in operation_name + if (start < operation_name.length()) { + // Find token in operation_name[start:] + idx = operation_name.substr(start).find(token); + if (idx == std::string::npos) { + return false; + } + } + start += (idx + token.length()); + } + + // All tokens in filter_string found in operation_name + return true; +} + /////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace profiler diff --git a/tools/profiler/src/operation_profiler.h b/tools/profiler/src/operation_profiler.h index ce06b1c96..c7e20f36f 100644 --- a/tools/profiler/src/operation_profiler.h +++ b/tools/profiler/src/operation_profiler.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ protected: ArgumentDescriptionVector arguments_; /// List of providers used to verify and compare each result - ProviderVector reference_providers_; + ProviderVector verification_providers_; /// Model performance result initailized by the operation profiler with workload statistics /// and reasonable default state. 
@@ -92,9 +92,10 @@ public: OperationProfiler(); OperationProfiler( + Options const &options, library::OperationKind kind, ArgumentDescriptionVector const &arguments = ArgumentDescriptionVector(), - ProviderVector const & reference_providers = ProviderVector()); + ProviderVector const & verification_providers = ProviderVector()); /// Destructor virtual ~OperationProfiler(); @@ -196,6 +197,20 @@ public: library::OperationDescription const &desc, library::Provider provider, library::Provider verification_provider = library::Provider::kInvalid); + + /// Helper to set a performance result member + static void set_argument( + PerformanceResult &result, + char const *name, + ProblemSpace const &problem_space, + std::string const &value); + + /// Helper to set a performance result member + static void set_argument( + PerformanceResult &result, + char const *name, + ProblemSpace const &problem_space, + int64_t value); protected: @@ -205,20 +220,6 @@ protected: library::OperationDescription const &operation_desc, ProblemSpace const &problem_space); - /// Helper to set a performance result member - static void set_argument_( - PerformanceResult &result, - char const *name, - ProblemSpace const &problem_space, - std::string const &value); - - /// Helper to set a performance result member - static void set_argument_( - PerformanceResult &result, - char const *name, - ProblemSpace const &problem_space, - int64_t value); - /// Method to profile an initialized CUTLASS operation virtual Status profile_cutlass_( double &runtime, @@ -227,6 +228,12 @@ protected: void const *arguments, void *host_workspace, void *device_workspace); + +private: + /// finds string matches filter_string in operation_name + bool find_string_matches_( + std::string const &filter_string, + std::string const &operation_name); }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/options.cu b/tools/profiler/src/options.cu index 
946e536c7..5f62a81e7 100644 --- a/tools/profiler/src/options.cu +++ b/tools/profiler/src/options.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -76,7 +76,7 @@ Options::Device::Device(cutlass::CommandLine const &cmdline) { void Options::Device::print_usage(std::ostream &out) const { out << "Device:\n" - << " --device= " + << " --device= " << " CUDA Device ID\n\n"; int device_count = 0; @@ -106,7 +106,7 @@ void Options::Device::print_usage(std::ostream &out) const { } out - << " --compute-capability= " + << " --compute-capability= " << " Override the compute capability.\n\n"; } @@ -255,12 +255,6 @@ void Options::Initialization::get_distribution( continue; // next token } - // Casts as integer without scaling - if (it->first.compare("integer") == 0) { - dist.int_scale = 0; - continue; // next token - } - // initialize other members for (int m = 0; members[m].label; ++m) { if (it->first == members[m].label && !it->second.empty()) { @@ -276,19 +270,23 @@ void Options::Initialization::print_usage(std::ostream &out) const { out << "Initialization:\n" - << " --initialization= " + << " --initialization= " << " Enables initialization (default: true). If false, device memory is" << end_of_line - << "not initialized after allocation.\n\n" + << " not initialized after allocation.\n\n" - << " --initialization-provider= " - << " Selects 'device' or 'host' initialization.\n\n" + << " --initialization-provider= " + << " Selects initialization provider {host, device*}. 
(default: '*')\n\n" - << " --dist= " - << " Data distribution of input tensors\n\n" + << " --dist= " + << " Data distribution of input tensors {uniform*, gaussian, identity, sequential}" << end_of_line + << " --dist=uniform,min:,max:,scale:" << end_of_line + << " --dist=gaussian,mean:,stddev:,scale:" << end_of_line + << " --dist=sequential,start:,delta:,scale:" << end_of_line + << " --dist=identity\n\n" - << " --seed= " + << " --seed= " << " Random number generator seed. Used to enforce deterministic" << end_of_line - << "initialization.\n\n"; + << " initialization.\n\n"; } @@ -339,12 +337,12 @@ void Options::Library::print_usage(std::ostream &out) const { out << "Library:\n" - << " --library-algo-mode= " + << " --library-algo-mode= " << " Indicates algorithm mode used to call libraries such as cuBLAS and cuDNN.\n" - << " " + << " " << " mode={default*,matching,best}\n\n" - << " --library-algos= " + << " --library-algos= " << " If --algorithm-mode=best, permits specifying a selection of algorithms.\n\n"; } @@ -393,21 +391,25 @@ void Options::Profiling::print_usage(std::ostream &out) const { out << "Profiling:\n" - << " --profiling-iterations= " + << " --profiling-iterations= " << " Number of iterations to profile each kernel. If zero, kernels" << end_of_line - << "are launched up to the profiling duration.\n\n" + << " are launched up to the profiling duration.\n\n" - << " --warmup-iterations= " + << " --warmup-iterations= " << " Number of iterations to execute each kernel prior to profiling.\n\n" - << " --sleep-duration= " - << " Number of ms to sleep between profiling periods (ms)\n\n" + << " --sleep-duration= " + << " Number of ms to sleep between profiling periods (ms).\n\n" - << " --profiling-enabled= " + << " --profiling-enabled= " << " If true, profiling is actually conducted.\n\n" - << " --providers= " - << " List of providers to be profiled for performance\n\n"; + << " --providers= " + << " List of providers to be profiled for performance. 
(default: '*')" << end_of_line + << " Gemm providers {cutlass*" + << "}" << end_of_line + << "\n\n"; + } void Options::Profiling::print_options(std::ostream &out, int indent) const { @@ -477,6 +479,7 @@ Options::Verification::Verification(cutlass::CommandLine const &cmdline) { } else { providers.push_back(library::Provider::kCUBLAS); + providers.push_back(library::Provider::kReferenceDevice); } } @@ -484,22 +487,27 @@ void Options::Verification::print_usage(std::ostream &out) const { out << "Verification:\n" - << " --verification-enabled= " + << " --verification-enabled= " << " Whether to perform verification checks.\n\n" - << " --epsilon= " + << " --epsilon= " << " Error threshold. Setting to zero (default) requires" << end_of_line - << "bit-level equivalence.\n\n" + << " bit-level equivalence.\n\n" - << " --nonzero-floor= " + << " --nonzero-floor= " << " Results whose absolute value is less than this quantity" << end_of_line - << "are treated as zero for comparisons.\n\n" + << " are treated as zero for comparisons.\n\n" - << " --save-workspace={*never,incorrect,always}" - << " Specifies when to save the GEMM inputs and results to the filesystem.\n\n" + << " --save-workspace= " + << " Specifies when to save the GEMM inputs and results to the filesystem." << end_of_line + << " --save-workspace=never never save workspace (default)" << end_of_line + << " --save-workspace=incorrect save workspace for incorrect results" << end_of_line + << " --save-workspace=always always save workspace\n\n" - << " --verification-providers= " - << " List of providers used to verify result. (default: device)\n\n"; + << " --verification-providers= " + << " List of providers used to verify result. 
(default: '*')" << end_of_line + << " Gemm verification-providers {cublas*}" << end_of_line + << "\n\n"; } void Options::Verification::print_options(std::ostream &out, int indent) const { @@ -554,22 +562,22 @@ void Options::Report::print_usage(std::ostream &out) const { out << "Report:\n" - << " --append= " + << " --append= " << " If true, result is appended to possibly existing file. Otherwise, " << end_of_line - << "any existing file is overwritten.\n\n" + << " any existing file is overwritten.\n\n" - << " --output= " - << " Path to output file for machine readable results.\n\n" + << " --output= " + << " Path to output file for machine readable results. Operation kind and '.csv' is appended.\n\n" - << " --report-not-run= " + << " --report-not-run= " << " If true, reports the status of all kernels including those that" << end_of_line - << "do not satisfy the given arguments.\n\n" + << " do not satisfy the given arguments.\n\n" - << " --tags= " + << " --tags= " << " Inserts leading columns in output table and uniform values for each" << end_of_line - << "column. Useful for generating pivot tables.\n\n" + << " column. Useful for generating pivot tables.\n\n" - << " --verbose= " + << " --verbose= " << " Prints human-readable text to stdout. If false, nothing is written to stdout.\n\n"; } @@ -600,7 +608,7 @@ Options::About::About(cutlass::CommandLine const &cmdline) { void Options::About::print_usage(std::ostream &out) const { out << "About:\n" - << " --version "; + << " --version "; print_version(out); @@ -675,22 +683,29 @@ Options::Options(cutlass::CommandLine const &cmdline): void Options::print_usage(std::ostream &out) const { out - << "CUTLASS Performance Tool\n" + << "CUTLASS Profiler\n" << "usage:\n\n" << " cutlass_profiler [options]\n\n" << " --help\n\n" - << " --mode={profile*,single,dry,trace,enumerate} " - << " Regular profiling, single kernel mode only, or no profiling.\n\n" + << " --mode= " + << " Cutlass profiler execution mode." 
<< end_of_line + << " --mode=profile regular verification and profiling (default)" << end_of_line + << " --mode=dry_run no kernels are launched or workspaces allocated" << end_of_line + << " --mode=enumerate lists all operation kind and operations" << end_of_line + << " --mode=trace executes a single device-side computation with" << end_of_line + << " no other kernel launches\n\n" - << " --device-info " + << " --device-info " << " Prints information on all GPUs present in the system\n\n" - << " --operation= " + << " --operation= " << " CUTLASS operation to profile.\n\n" - << " --kernels= " - << " List of substrings to filter operations by name.\n\n" + << " --kernels= " + << " Filter operations by kernel names. For example, call all kernels with" << end_of_line + << " (\"s1688\" and \"nt\") or (\"s844\" and \"tn\" and \"align8\") in their" << end_of_line + << " operation name using --kernels=\"s1688*nt, s884*tn*align8\"\n\n" ; // @@ -755,4 +770,3 @@ std::string Options::indent_str(int indent) { } // namespace profiler } // namespace cutlass - diff --git a/tools/profiler/src/options.h b/tools/profiler/src/options.h index 4f723fa54..f4b5f0a13 100644 --- a/tools/profiler/src/options.h +++ b/tools/profiler/src/options.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/performance_report.cpp b/tools/profiler/src/performance_report.cpp index 52a820997..0ab704492 100644 --- a/tools/profiler/src/performance_report.cpp +++ b/tools/profiler/src/performance_report.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -68,9 +68,11 @@ PerformanceReport::PerformanceReport( ): options_(options), argument_names_(argument_names), problem_index_(0), good_(true), op_kind_(op_kind) { - std::string file_name = options_.report.output_path.substr(0, options_.report.output_path.rfind(".")); - std::string file_extension = options_.report.output_path.substr(options_.report.output_path.rfind(".") + 1); - op_file_name_ = file_name + "." + to_string(op_kind_) + "." + file_extension; + // Strip '.csv' if present + std::string base_path = options_.report.output_path.substr( + 0, options_.report.output_path.rfind(".csv")); + + op_file_name_ = base_path + "." 
+ to_string(op_kind_) + ".csv"; // // Open output file for operation of PerformanceReport::op_kind @@ -166,6 +168,7 @@ void PerformanceReport::close() { static const char *disposition_status_color(Disposition disposition) { switch (disposition) { case Disposition::kPassed: return SHELL_COLOR_GREEN(); + case Disposition::kIncorrect: return SHELL_COLOR_RED(); case Disposition::kFailed: return SHELL_COLOR_RED(); default: break; @@ -195,16 +198,17 @@ std::ostream & PerformanceReport::print_result_pretty_( out << "\n" - << " Provider: " << SHELL_COLOR_BRIGHT() << library::to_string(result.provider, true) << SHELL_COLOR_END() << "\n" - << " Operation: " << result.operation_name << "\n\n" - << " Status: " << SHELL_COLOR_BRIGHT() << library::to_string(result.status, true) << SHELL_COLOR_END() << "\n" - << " Verification: " << SHELL_COLOR_BRIGHT() << (options_.verification.enabled ? "ON":"OFF") << SHELL_COLOR_END() << "\n" - << " Disposition: " << disposition_status_color(result.disposition) << to_string(result.disposition, true) << SHELL_COLOR_END() << "\n\n"; + << " Provider: " << SHELL_COLOR_BRIGHT() << library::to_string(result.provider, true) << SHELL_COLOR_END() << "\n" + << " OperationKind: " << SHELL_COLOR_BRIGHT() << library::to_string(result.op_kind) << SHELL_COLOR_END() << "\n" + << " Operation: " << result.operation_name << "\n\n" + << " Status: " << SHELL_COLOR_BRIGHT() << library::to_string(result.status, true) << SHELL_COLOR_END() << "\n" + << " Verification: " << SHELL_COLOR_BRIGHT() << (options_.verification.enabled ? 
"ON":"OFF") << SHELL_COLOR_END() << "\n" + << " Disposition: " << disposition_status_color(result.disposition) << to_string(result.disposition, true) << SHELL_COLOR_END() << "\n\n"; // Display individual verification results for each verification-provider if (options_.verification.enabled) { - static int const indent_spaces = 22; + static int const indent_spaces = 16; for(auto & m : result.verification_map) { out << std::right << std::setw(indent_spaces) << library::to_string(m.first, true) << ": " << to_string(m.second, true) << "\n"; @@ -212,15 +216,15 @@ std::ostream & PerformanceReport::print_result_pretty_( } out - << "\n Arguments: "; + << "\n Arguments:"; int column_idx = 0; for (auto const &arg : result.arguments) { if (!arg.second.empty()) { out << " --" << arg.first << "=" << arg.second; column_idx += int(4 + arg.first.size() + arg.second.size()); - if (column_idx > 90) { - out << " \\\n "; + if (column_idx > 98) { + out << " \\\n "; column_idx = 0; } } @@ -228,15 +232,15 @@ std::ostream & PerformanceReport::print_result_pretty_( out << "\n\n"; out - << " Bytes: " << result.bytes << " bytes\n" - << " FLOPs: " << result.flops << " flops\n\n"; + << " Bytes: " << result.bytes << " bytes\n" + << " FLOPs: " << result.flops << " flops\n\n"; if (result.good()) { out - << " Runtime: " << result.runtime << " ms\n" - << " Memory: " << result.gbytes_per_sec() << " GiB/s\n" - << "\n Math: " << result.gflops_per_sec() << " GFLOP/s\n"; + << " Runtime: " << result.runtime << " ms\n" + << " Memory: " << result.gbytes_per_sec() << " GiB/s\n" + << "\n Math: " << result.gflops_per_sec() << " GFLOP/s\n"; } @@ -256,7 +260,7 @@ std::ostream & PerformanceReport::print_csv_header_( out << (column_idx ? 
"," : "") << "Problem,Provider" - << ",Operation,Disposition,Status"; + << ",OperationKind,Operation,Disposition,Status"; for (auto const &arg_name : argument_names_) { out << "," << arg_name; @@ -289,6 +293,7 @@ std::ostream & PerformanceReport::print_result_csv_( << (column_idx ? "," : "") << result.problem_index << "," << to_string(result.provider, true) + << "," << to_string(result.op_kind) << "," << result.operation_name << "," << to_string(result.disposition) << "," << library::to_string(result.status); diff --git a/tools/profiler/src/performance_report.h b/tools/profiler/src/performance_report.h index 573a049e7..1c086e618 100644 --- a/tools/profiler/src/performance_report.h +++ b/tools/profiler/src/performance_report.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/performance_result.cu b/tools/profiler/src/performance_result.cu new file mode 100644 index 000000000..86cabfb75 --- /dev/null +++ b/tools/profiler/src/performance_result.cu @@ -0,0 +1,55 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + **************************************************************************************************/ +/* \file + \brief +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" + +// CUTLASS Profiler includes +#include "enumerated_types.h" +#include "performance_result.h" + +// CUTLASS Library includes +#include "cutlass/library/library.h" +#include "cutlass/library/util.h" + +namespace cutlass { +namespace profiler { + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace profiler +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/profiler/src/performance_result.h b/tools/profiler/src/performance_result.h index 23eb60f24..9e3ebeb5c 100644 --- a/tools/profiler/src/performance_result.h +++ b/tools/profiler/src/performance_result.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -65,7 +65,7 @@ struct PerformanceResult { /// Outcome of verification (all verification results) DispositionMap verification_map; - /// Operation object + /// Operation name std::string operation_name; /// Stringified vector of argument values @@ -119,3 +119,4 @@ using PerformanceResultVector = std::vector; } // namespace profiler } // namespace cutlass + diff --git a/tools/profiler/src/problem_space.cpp b/tools/profiler/src/problem_space.cpp index e95b9e1ba..adede0ea1 100644 --- a/tools/profiler/src/problem_space.cpp +++ b/tools/profiler/src/problem_space.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -849,6 +849,47 @@ bool arg_as_OpcodeClassID( return arg_as_OpcodeClassID(opcode_class, value_ptr); } + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_SplitKModeID( + library::SplitKMode &split_k_mode, + KernelArgument::Value const *value_ptr) { + + if (value_ptr->not_null) { + if (value_ptr->argument->description->type == ArgumentTypeID::kEnumerated) { + + split_k_mode = library::from_string( + static_cast(value_ptr)->element); + + if (split_k_mode == library::SplitKMode::kInvalid) { + throw std::runtime_error( + "arg_as_SplitKModeID() - illegal cast."); + } + } + else { + + throw std::runtime_error( + "arg_as_SplitKModeID() - illegal cast."); + } + return true; + } + return false; +} + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. 
+bool arg_as_SplitKModeID( + library::SplitKMode &split_k_mode, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + size_t idx = problem_space.argument_index(name); + KernelArgument::Value const *value_ptr = problem.at(idx).get(); + + return arg_as_SplitKModeID(split_k_mode, value_ptr); +} + + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Lexically casts an argument to a given type stored in a byte array. Returns true if not null. bool arg_as_scalar( @@ -939,7 +980,6 @@ bool tensor_description_satisfies( } ///////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace profiler } // namespace cutlass diff --git a/tools/profiler/src/problem_space.h b/tools/profiler/src/problem_space.h index 8dfd216cf..77a79ca2a 100644 --- a/tools/profiler/src/problem_space.h +++ b/tools/profiler/src/problem_space.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -811,6 +811,17 @@ bool arg_as_OpcodeClassID( ProblemSpace const &problem_space, ProblemSpace::Problem const &problem); + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_SplitKModeID(library::SplitKMode &split_k_mode, KernelArgument::Value const *value_ptr); + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. 
+bool arg_as_SplitKModeID( + library::SplitKMode &split_k_mode, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + /// Lexically casts an argument to a given type stored in a byte array. Returns true if not null. bool arg_as_scalar( std::vector &bytes, diff --git a/tools/util/CMakeLists.txt b/tools/util/CMakeLists.txt index 51be4b541..0d2f86fb9 100644 --- a/tools/util/CMakeLists.txt +++ b/tools/util/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/command_line.h b/tools/util/include/cutlass/util/command_line.h index 31fa7f346..c158ef976 100644 --- a/tools/util/include/cutlass/util/command_line.h +++ b/tools/util/include/cutlass/util/command_line.h @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are not permitted. diff --git a/tools/util/include/cutlass/util/debug.h b/tools/util/include/cutlass/util/debug.h index 065a94e42..3ebbd4d84 100644 --- a/tools/util/include/cutlass/util/debug.h +++ b/tools/util/include/cutlass/util/debug.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/device_dump.h b/tools/util/include/cutlass/util/device_dump.h index 2dd67c890..dac6029c4 100644 --- a/tools/util/include/cutlass/util/device_dump.h +++ b/tools/util/include/cutlass/util/device_dump.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h index 52229425e..79b123687 100644 --- a/tools/util/include/cutlass/util/device_memory.h +++ b/tools/util/include/cutlass/util/device_memory.h @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are not permitted. diff --git a/tools/util/include/cutlass/util/distribution.h b/tools/util/include/cutlass/util/distribution.h index d9b61ca55..033773774 100644 --- a/tools/util/include/cutlass/util/distribution.h +++ b/tools/util/include/cutlass/util/distribution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/exceptions.h b/tools/util/include/cutlass/util/exceptions.h index ab5623bfc..b6cf2fcd8 100644 --- a/tools/util/include/cutlass/util/exceptions.h +++ b/tools/util/include/cutlass/util/exceptions.h @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are not permitted. diff --git a/tools/util/include/cutlass/util/host_reorder.h b/tools/util/include/cutlass/util/host_reorder.h index bb9ed621b..d46d45946 100644 --- a/tools/util/include/cutlass/util/host_reorder.h +++ b/tools/util/include/cutlass/util/host_reorder.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/host_tensor.h b/tools/util/include/cutlass/util/host_tensor.h index b43186a0d..c734a5f5e 100644 --- a/tools/util/include/cutlass/util/host_tensor.h +++ b/tools/util/include/cutlass/util/host_tensor.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/host_tensor_planar_complex.h b/tools/util/include/cutlass/util/host_tensor_planar_complex.h index a5e990cf1..ed85cf223 100644 --- a/tools/util/include/cutlass/util/host_tensor_planar_complex.h +++ b/tools/util/include/cutlass/util/host_tensor_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/detail/inner_product.h b/tools/util/include/cutlass/util/reference/detail/inner_product.h index 77a3076ed..f75f8b888 100644 --- a/tools/util/include/cutlass/util/reference/detail/inner_product.h +++ b/tools/util/include/cutlass/util/reference/detail/inner_product.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/gemm.h b/tools/util/include/cutlass/util/reference/device/gemm.h index 9dc66cca2..5aef19ff2 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h index 10ce474e6..b3003409b 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h +++ b/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h index 6e3891029..4c8e361ec 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h index cf47c9a4e..4d9de5156 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h index b7c2f073a..64cb37bea 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_compare.h b/tools/util/include/cutlass/util/reference/device/tensor_compare.h index dca50c2f7..3323bed51 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_compare.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_compare.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_fill.h b/tools/util/include/cutlass/util/reference/device/tensor_fill.h index 34ba24754..962ded094 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_fill.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_fill.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h index aa6610e1d..d03080b2a 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_relu.h b/tools/util/include/cutlass/util/reference/device/tensor_relu.h new file mode 100644 index 000000000..d78e19533 --- /dev/null +++ b/tools/util/include/cutlass/util/reference/device/tensor_relu.h @@ -0,0 +1,135 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines device-side elementwise operations on TensorView. Note, the operations defined + in this header are not specialized for any particular data layout and are therefore not + intended to offer the best possible performance. Rather, they are intended to be generic + reference implementations to support the CUTLASS unit tests. +*/ + +#pragma once + +// Cutlass includes +#include "cutlass/cutlass.h" +#include "cutlass/tensor_view.h" + +#include "cutlass/util/reference/device/tensor_foreach.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace reference { +namespace device { + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorReLuFunc { + + /// View type + using TensorView = TensorView; + + /// Coordinate in tensor's index space + using TensorCoord = typename TensorView::TensorCoord; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + Element threshold; + + + // + // Methods + // + + Params( + TensorView view_ = TensorView(), + 
Element threshold_ = Element(0) + ): + view(view_), threshold(threshold_) { + + } + }; + + // + // Data members + // + + Params params; + + // + // Methods + // + + CUTLASS_DEVICE + TensorReLuFunc(Params const ¶ms): params(params) { + + } + + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + + Element const & value = params.view.at(coord); + params.view.at(coord) = (value < params.threshold) ? params.threshold : value; + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Apply ReLu on a tensor +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorReLu( + TensorView view, ///< destination tensor + Element threshold = Element(0)) { ///< ReLu threshold + + using Func = detail::TensorReLuFunc; + using Params = typename Func::Params; + + TensorForEach( + view.extent(), + Params(view, threshold) + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass diff --git a/tools/util/include/cutlass/util/reference/device/thread/gemm.h b/tools/util/include/cutlass/util/reference/device/thread/gemm.h index fefc4131d..11485a91d 100644 --- a/tools/util/include/cutlass/util/reference/device/thread/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/thread/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/gemm.h b/tools/util/include/cutlass/util/reference/host/gemm.h index 13dbd5cf0..3e38886dd 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm.h +++ b/tools/util/include/cutlass/util/reference/host/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -37,11 +37,41 @@ #include "cutlass/tensor_view.h" #include "cutlass/gemm/gemm.h" #include "cutlass/arch/mma.h" +#include "cutlass/util/host_tensor.h" namespace cutlass { namespace reference { namespace host { +template +struct CastIfScalar { + static Out cast(In in) { + return Out(in); + } +}; + +template +struct CastIfScalar, In> { + typedef cutlass::complex Out; + static Out cast(In in) { + return Out(static_cast(in)); + } +}; + +template +struct CastIfScalar, cutlass::complex> { + typedef cutlass::complex Out; + typedef cutlass::complex In; + static Out cast(In in) { + return Out(in); + } +}; + +template +Out cast_if_scalar(In in) { + return CastIfScalar::cast(in); +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef @@ -107,7 +137,10 @@ void compute_gemm( ElementA a = tensor_a.at(MatrixCoord(row, k_block)); ElementB b = tensor_b.at(MatrixCoord(k_block, col)); - accum[i][j] = inner_product_op(ComputeType(a), ComputeType(b), accum[i][j]); + ComputeType compute_a(cast_if_scalar(a)); + ComputeType 
compute_b(cast_if_scalar(b)); + + accum[i][j] = inner_product_op(compute_a, compute_b, accum[i][j]); } } } diff --git a/tools/util/include/cutlass/util/reference/host/gemm_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_complex.h index 0f067691e..27f368200 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm_complex.h +++ b/tools/util/include/cutlass/util/reference/host/gemm_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h index 4d02747d3..2a23fd272 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h +++ b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_compare.h b/tools/util/include/cutlass/util/reference/host/tensor_compare.h index bf05a099b..2d7545e90 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_compare.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_compare.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_copy.h b/tools/util/include/cutlass/util/reference/host/tensor_copy.h index 737119e81..a81f02112 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_copy.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_copy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h b/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h index 73eb328d3..88bbb39f4 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_fill.h b/tools/util/include/cutlass/util/reference/host/tensor_fill.h index b298e4c26..87c14d61c 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_fill.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_fill.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h index 23ee9f93d..feb439d72 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_norm.h b/tools/util/include/cutlass/util/reference/host/tensor_norm.h index 6c73d91fa..1d494b9f4 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_norm.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_norm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/tensor_view_io.h b/tools/util/include/cutlass/util/tensor_view_io.h index 590462f72..0043d745c 100644 --- a/tools/util/include/cutlass/util/tensor_view_io.h +++ b/tools/util/include/cutlass/util/tensor_view_io.h @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2019, NVIDIA CORPORATION. 
All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/type_traits.h b/tools/util/include/cutlass/util/type_traits.h index 059a23ab4..d97af0a42 100644 --- a/tools/util/include/cutlass/util/type_traits.h +++ b/tools/util/include/cutlass/util/type_traits.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: