FP16 data in-register transpose (#41)

* start fixing 16bit data packing

* adding StaticTensor

* adding StaticTensor

* adding StaticTensor

* add missing constexpr

* adding static tensor

* adding static tensor

* adding transpose

* add inline asm for transpose 2x2 of half_t

* add general transpose_vectors(), but it still has unnecessary register initialization using v_mov

* fix unnecessary register initialization in transpose_vectors() by passing more arguments by reference

* add hardcoded logic for NHWC wrw

* improve asm for v_pack

* make ThreadwiseTensorSliceTransfer_v3r2 support any tensor

* tweak

* reorganize file
This commit is contained in:
Chao Liu
2021-11-15 10:05:58 -06:00
committed by GitHub
parent e823d518cb
commit b491ebf384
27 changed files with 1857 additions and 376 deletions

View File

@@ -3,7 +3,6 @@
#include <iostream>
#include "device.hpp"
#include "gemm_common.hpp"
#include "device_base.hpp"
#include "device_gemm.hpp"
#include "common_header.hpp"

View File

@@ -1,22 +0,0 @@
#ifndef GEMM_COMMON_HPP
#define GEMM_COMMON_HPP
// Selects one of eight GEMM matrix layout combinations.
// Enumerator values run consecutively from 0 to 7; they are spelled out
// explicitly so the numeric value of each layout is visible at a glance.
// NOTE(review): the names presumably encode the index order of the A, B and C
// matrices (e.g. MK_KN_MN) -- confirm against the code that consumes this enum.
enum GemmMatrixLayout
{
    MK_KN_MN = 0,
    MK_NK_MN = 1,
    KM_KN_MN = 2,
    KM_NK_MN = 3,
    MK_KN_NM = 4,
    MK_NK_NM = 5,
    KM_KN_NM = 6,
    KM_NK_NM = 7
};
// Selects the data-type combination used by a GEMM.
// Enumerator values are consecutive starting at 0 and are written out
// explicitly here.
// NOTE(review): presumably each name lists the element types of the three
// GEMM matrices (F32 = 32-bit float, F16 = 16-bit float) -- confirm with callers.
enum GemmDataType
{
    F32_F32_F32 = 0,
    F16_F16_F16 = 1
};
#endif