mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
AOCL-BLAS Release 4.2
This commit is contained in:
@@ -43,6 +43,7 @@ build_script:
|
||||
- ps: Push-AppveyorArtifact C:\blis.zip
|
||||
|
||||
test_script:
|
||||
# "make checkblas" does not work with shared linking Windows due to inability to override xerbla_
|
||||
- if [%LIB_TYPE%]==[shared] set "TEST_TARGET=checkblis-fast"
|
||||
- if [%LIB_TYPE%]==[static] set "TEST_TARGET=check"
|
||||
- bash -lc "cd /c/projects/blis && mingw32-make %TEST_TARGET% -j4 V=1"
|
||||
|
||||
76
.travis.yml
76
.travis.yml
@@ -1,80 +1,76 @@
|
||||
language: c
|
||||
sudo: required
|
||||
dist: trusty
|
||||
dist: focal
|
||||
branches:
|
||||
only:
|
||||
- master
|
||||
- dev
|
||||
- amd
|
||||
matrix:
|
||||
include:
|
||||
# full testsuite (all tests except for mixed datatype)
|
||||
# full testsuite (all tests + mixed datatype (gemm_nn only) + salt + SDE + OOT)
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto"
|
||||
# mixed-datatype testsuite (gemm_nn only)
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=0 TEST=MD SDE=0 THR="none" CONF="auto"
|
||||
# salt testsuite (fast set of operations+parameters)
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=0 TEST=SALT SDE=0 THR="none" CONF="auto"
|
||||
# test x86_64 ukrs with SDE
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=0 TEST=0 SDE=1 THR="none" CONF="x86_64"
|
||||
env: OOT=1 TEST=ALL SDE=1 THR="none" CONF="x86_64" \
|
||||
PACKAGES="gcc-8 binutils"
|
||||
# openmp build
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=0 TEST=0 SDE=0 THR="openmp" CONF="auto"
|
||||
env: OOT=0 TEST=FAST SDE=0 THR="openmp" CONF="auto" \
|
||||
PACKAGES="gcc-8 binutils"
|
||||
# pthreads build
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=0 TEST=0 SDE=0 THR="pthreads" CONF="auto"
|
||||
# out-of-tree build
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
env: OOT=1 TEST=0 SDE=0 THR="none" CONF="auto"
|
||||
env: OOT=0 TEST=FAST SDE=0 THR="pthreads" CONF="auto" \
|
||||
PACKAGES="gcc-8 binutils"
|
||||
# clang build
|
||||
- os: linux
|
||||
compiler: clang
|
||||
env: OOT=0 TEST=0 SDE=0 THR="none" CONF="auto"
|
||||
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="auto"
|
||||
# There seems to be some difficulty installing 2 Clang toolchains of different versions.
|
||||
# Use the TravisCI default.
|
||||
# PACKAGES="clang-8 binutils"
|
||||
# macOS with system compiler (clang)
|
||||
- os: osx
|
||||
compiler: clang
|
||||
env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto"
|
||||
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="auto"
|
||||
# cortexa15 build and fast testsuite (qemu)
|
||||
- os: linux
|
||||
compiler: arm-linux-gnueabihf-gcc
|
||||
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa15" \
|
||||
PACKAGES="gcc-arm-linux-gnueabihf qemu-system-arm qemu-user" \
|
||||
CC=arm-linux-gnueabihf-gcc CXX=arm-linux-gnueabihf-g++ \
|
||||
PACKAGES="gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-system-arm qemu-user" \
|
||||
TESTSUITE_WRAPPER="qemu-arm -cpu cortex-a15 -L /usr/arm-linux-gnueabihf/"
|
||||
# cortexa57 build and fast testsuite (qemu)
|
||||
- os: linux
|
||||
compiler: aarch64-linux-gnu-gcc
|
||||
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa57" \
|
||||
PACKAGES="gcc-aarch64-linux-gnu qemu-system-arm qemu-user" \
|
||||
CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ \
|
||||
PACKAGES="gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
|
||||
TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/"
|
||||
# armsve build and fast testsuite (qemu)
|
||||
- os: linux
|
||||
compiler: aarch64-linux-gnu-gcc-10
|
||||
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="armsve" \
|
||||
CC=aarch64-linux-gnu-gcc-10 CXX=aarch64-linux-gnu-g++-10 \
|
||||
PACKAGES="gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
|
||||
TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/"
|
||||
install:
|
||||
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/as; fi
|
||||
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/as /usr/bin/as; fi
|
||||
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/ld; fi
|
||||
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/ld /usr/bin/ld; fi
|
||||
- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-6"; fi
|
||||
- if [ -n "$PACKAGES" ]; then sudo apt-get install -y $PACKAGES; fi
|
||||
addons:
|
||||
apt:
|
||||
sources:
|
||||
- ubuntu-toolchain-r-test
|
||||
packages:
|
||||
- gcc-6
|
||||
- binutils-2.26
|
||||
- clang
|
||||
- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-8"; fi
|
||||
- if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi
|
||||
script:
|
||||
- export DIST_PATH=.
|
||||
- pwd
|
||||
- if [ $OOT -eq 1 ]; then export DIST_PATH=`pwd`; mkdir ../oot; cd ../oot; chmod -R a-w $DIST_PATH; fi
|
||||
- pwd
|
||||
- $DIST_PATH/configure -t $THR CC=$CC $CONF
|
||||
- $DIST_PATH/configure -p `pwd`/../install -t $THR CC=$CC $CONF
|
||||
- pwd
|
||||
- ls -l
|
||||
- $CC --version
|
||||
- make -j 2
|
||||
- make install
|
||||
- $DIST_PATH/travis/cxx/cxx-test.sh $DIST_PATH $(ls -1 include)
|
||||
# Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed on real chip (A64fx).
|
||||
- if [ "$CONF" = "armsve" ]; then sed -i 's/.*\<gemmt\>.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi
|
||||
- if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi
|
||||
- if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi
|
||||
|
||||
1721
CMakeLists.txt
1721
CMakeLists.txt
File diff suppressed because it is too large
Load Diff
6
CREDITS
6
CREDITS
@@ -42,6 +42,7 @@ but many others have contributed code and feedback, including
|
||||
Shivaprashanth H (Global Edge)
|
||||
Jean-Michel Hautbois @jhautbois
|
||||
Ian Henriksen @insertinterestingnamehere (The University of Texas at Austin)
|
||||
Greg Henry (Intel)
|
||||
Minh Quan Ho @hominhquan
|
||||
Matthew Honnibal @honnibal
|
||||
Stefan Husmann @stefanhusmann
|
||||
@@ -50,9 +51,11 @@ but many others have contributed code and feedback, including
|
||||
Tony Kelman @tkelman
|
||||
Lee Killough @leekillough (Cray)
|
||||
Mike Kistler @mkistler (IBM, Austin Research Laboratory)
|
||||
Ivan Korostelev @ivan23kor (University of Alberta)
|
||||
Kyungmin Lee @kyungminlee (Ohio State University)
|
||||
Michael Lehn @michael-lehn
|
||||
Shmuel Levine @ShmuelLevine
|
||||
@lschork2
|
||||
Dave Love @loveshack
|
||||
Tze Meng Low (The University of Texas at Austin)
|
||||
Ye Luo @ye-luo (Argonne National Laboratory)
|
||||
@@ -92,6 +95,7 @@ but many others have contributed code and feedback, including
|
||||
Paul Springer @springer13 (RWTH Aachen University)
|
||||
Adam J. Stewart @adamjstewart (University of Illinois at Urbana-Champaign)
|
||||
Vladimir Sukarev
|
||||
Chengguo Sun @chengguosun
|
||||
Santanu Thangaraj (AMD)
|
||||
Nicholai Tukanov @nicholaiTukanov (The University of Texas at Austin)
|
||||
Rhys Ulerich @RhysU (The University of Texas at Austin)
|
||||
@@ -99,6 +103,7 @@ but many others have contributed code and feedback, including
|
||||
Meghana Vankadari @Meghana-vankadari (AMD)
|
||||
Kiran Varaganti @kvaragan (AMD)
|
||||
Natalia Vassilieva (Hewlett Packard Enterprise)
|
||||
Andrew Wildman @awild82 (University of Washington)
|
||||
Zhang Xianyi @xianyi (Chinese Academy of Sciences)
|
||||
Benda Xu @heroxbd
|
||||
Guodong Xu @docularxu (Linaro.org)
|
||||
@@ -106,6 +111,7 @@ but many others have contributed code and feedback, including
|
||||
Costas Yamin @cosstas
|
||||
Chenhan Yu @ChenhanYu (The University of Texas at Austin)
|
||||
Roman Yurchak @rth (Symerio)
|
||||
Stefano Zampini @stefanozampini
|
||||
M. Zhou @cdluminate
|
||||
|
||||
BLIS's development was partially funded by grants from industry
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -15,7 +15,7 @@ copyright info. All parties provide their portions of the code under the
|
||||
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
50
Makefile
50
Makefile
@@ -5,7 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -320,6 +320,7 @@ BLASTEST_INPUT_PATH := $(DIST_PATH)/$(BLASTEST_DIR)/input
|
||||
|
||||
# The location of the BLAS test suite object directory.
|
||||
BASE_OBJ_BLASTEST_PATH := $(BASE_OBJ_PATH)/$(BLASTEST_DIR)
|
||||
BASE_EXE_BLASTEST_PATH := $(BASE_OBJ_BLASTEST_PATH)/$(MK_USE_LIB)
|
||||
|
||||
# The locations of the BLAS test suite source code (f2c and drivers).
|
||||
BLASTEST_F2C_SRC_PATH := $(DIST_PATH)/$(BLASTEST_DIR)/f2c
|
||||
@@ -347,7 +348,7 @@ BLASTEST_DRV_BASES := $(basename $(notdir $(BLASTEST_DRV_OBJS)))
|
||||
|
||||
# The binary executable driver names.
|
||||
BLASTEST_DRV_BINS := $(addsuffix .x,$(BLASTEST_DRV_BASES))
|
||||
BLASTEST_DRV_BIN_PATHS := $(addprefix $(BASE_OBJ_BLASTEST_PATH)/,$(BLASTEST_DRV_BINS))
|
||||
BLASTEST_DRV_BIN_PATHS := $(addprefix $(BASE_EXE_BLASTEST_PATH)/,$(BLASTEST_DRV_BINS))
|
||||
|
||||
# Binary executable driver "run-" names
|
||||
BLASTEST_DRV_BINS_R := $(addprefix run-,$(BLASTEST_DRV_BASES))
|
||||
@@ -393,6 +394,7 @@ TESTSUITE_SALT_OPS_PATH := $(DIST_PATH)/$(TESTSUITE_DIR)/$(TESTSUITE_SALT_OPS)
|
||||
# directory.
|
||||
TESTSUITE_SRC_PATH := $(DIST_PATH)/$(TESTSUITE_DIR)/src
|
||||
BASE_OBJ_TESTSUITE_PATH := $(BASE_OBJ_PATH)/$(TESTSUITE_DIR)
|
||||
BASE_EXE_TESTSUITE_PATH := $(BASE_OBJ_PATH)/$(TESTSUITE_DIR)/$(MK_USE_LIB)
|
||||
|
||||
# Convert source file paths to object file paths by replacing the base source
|
||||
# directories with the base object directories, and also replacing the source
|
||||
@@ -414,7 +416,7 @@ MK_TESTSUITE_OBJS := $(sort \
|
||||
# unusual environments (e.g. ARM) can run the testsuite through some other
|
||||
# binary. See .travis.yml for details on how the variable is employed in
|
||||
# practice.
|
||||
TESTSUITE_BIN := test_$(LIBBLIS).x
|
||||
TESTSUITE_BIN := $(BASE_EXE_TESTSUITE_PATH)/test_$(LIBBLIS).x
|
||||
TESTSUITE_WRAPPER ?=
|
||||
|
||||
# The location of the script that checks the BLIS testsuite output.
|
||||
@@ -504,7 +506,7 @@ endif
|
||||
|
||||
flat-header: check-env $(BLIS_H_FLAT)
|
||||
|
||||
$(BLIS_H_FLAT): $(FRAME_H99_FILES)
|
||||
$(BLIS_H_FLAT): $(ALL_H99_FILES)
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
$(FLATTEN_H) -c -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
|
||||
else
|
||||
@@ -820,7 +822,7 @@ blastest-bin: check-env blastest-f2c $(BLASTEST_DRV_BIN_PATHS)
|
||||
blastest-run: $(BLASTEST_DRV_BINS_R)
|
||||
|
||||
# f2c object file rule.
|
||||
$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_F2C_SRC_PATH)/%.c
|
||||
$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_F2C_SRC_PATH)/%.c $(BLIS_H_FLAT)
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
$(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) $(BLAT_CFLAGS) -c $< -o $@
|
||||
else
|
||||
@@ -829,7 +831,7 @@ else
|
||||
endif
|
||||
|
||||
# driver object file rule.
|
||||
$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_DRV_SRC_PATH)/%.c
|
||||
$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_DRV_SRC_PATH)/%.c $(BLIS_H_FLAT)
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
$(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) $(BLAT_CFLAGS) -c $< -o $@
|
||||
else
|
||||
@@ -850,7 +852,8 @@ endif
|
||||
|
||||
# first argument: the base name of the BLAS test driver.
|
||||
define make-blat-rule
|
||||
$(BASE_OBJ_BLASTEST_PATH)/$(1).x: $(BASE_OBJ_BLASTEST_PATH)/$(1).o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK)
|
||||
$(BASE_EXE_BLASTEST_PATH)/$(1).x: $(BASE_OBJ_BLASTEST_PATH)/$(1).o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK)
|
||||
@mkdir -p $(BASE_EXE_BLASTEST_PATH)
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
$(LINKER) $(BASE_OBJ_BLASTEST_PATH)/$(1).o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $$@
|
||||
else
|
||||
@@ -864,12 +867,12 @@ $(foreach name, $(BLASTEST_DRV_BASES), $(eval $(call make-blat-rule,$(name))))
|
||||
|
||||
# A rule to run ?blat1.x driver files.
|
||||
define make-run-blat1-rule
|
||||
run-$(1): $(BASE_OBJ_BLASTEST_PATH)/$(1).x
|
||||
run-$(1): $(BASE_EXE_BLASTEST_PATH)/$(1).x
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
$(TESTSUITE_WRAPPER) $(BASE_OBJ_BLASTEST_PATH)/$(1).x > out.$(1)
|
||||
$(TESTSUITE_WRAPPER) $(BASE_EXE_BLASTEST_PATH)/$(1).x > out.$(1)
|
||||
else
|
||||
@echo "Running $(1).x > 'out.$(1)'"
|
||||
@$(TESTSUITE_WRAPPER) $(BASE_OBJ_BLASTEST_PATH)/$(1).x > out.$(1)
|
||||
@$(TESTSUITE_WRAPPER) $(BASE_EXE_BLASTEST_PATH)/$(1).x > out.$(1)
|
||||
endif
|
||||
endef
|
||||
|
||||
@@ -878,12 +881,12 @@ $(foreach name, $(BLASTEST_DRV1_BASES), $(eval $(call make-run-blat1-rule,$(name
|
||||
|
||||
# A rule to run ?blat2.x and ?blat3.x driver files.
|
||||
define make-run-blat23-rule
|
||||
run-$(1): $(BASE_OBJ_BLASTEST_PATH)/$(1).x
|
||||
run-$(1): $(BASE_EXE_BLASTEST_PATH)/$(1).x
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
$(TESTSUITE_WRAPPER) $(BASE_OBJ_BLASTEST_PATH)/$(1).x < $(BLASTEST_INPUT_PATH)/$(1).in
|
||||
$(TESTSUITE_WRAPPER) $(BASE_EXE_BLASTEST_PATH)/$(1).x < $(BLASTEST_INPUT_PATH)/$(1).in
|
||||
else
|
||||
@echo "Running $(1).x < '$(BLASTEST_INPUT_PATH)/$(1).in' (output to 'out.$(1)')"
|
||||
@$(TESTSUITE_WRAPPER) $(BASE_OBJ_BLASTEST_PATH)/$(1).x < $(BLASTEST_INPUT_PATH)/$(1).in
|
||||
@$(TESTSUITE_WRAPPER) $(BASE_EXE_BLASTEST_PATH)/$(1).x < $(BLASTEST_INPUT_PATH)/$(1).in
|
||||
endif
|
||||
endef
|
||||
|
||||
@@ -916,7 +919,7 @@ testsuite: testsuite-run
|
||||
testsuite-bin: check-env $(TESTSUITE_BIN)
|
||||
|
||||
# Object file rule.
|
||||
$(BASE_OBJ_TESTSUITE_PATH)/%.o: $(TESTSUITE_SRC_PATH)/%.c
|
||||
$(BASE_OBJ_TESTSUITE_PATH)/%.o: $(TESTSUITE_SRC_PATH)/%.c $(BLIS_H_FLAT)
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
$(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) -c $< -o $@
|
||||
else
|
||||
@@ -926,6 +929,7 @@ endif
|
||||
|
||||
# Testsuite binary rule.
|
||||
$(TESTSUITE_BIN): $(MK_TESTSUITE_OBJS) $(LIBBLIS_LINK)
|
||||
@mkdir -p $(BASE_EXE_TESTSUITE_PATH)
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
$(LINKER) $(MK_TESTSUITE_OBJS) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
|
||||
else
|
||||
@@ -936,13 +940,13 @@ endif
|
||||
# A rule to run the testsuite using the normal input.* files.
|
||||
testsuite-run: testsuite-bin
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
$(TESTSUITE_WRAPPER) ./$(TESTSUITE_BIN) -g $(TESTSUITE_CONF_GEN_PATH) \
|
||||
$(TESTSUITE_WRAPPER) $(TESTSUITE_BIN) -g $(TESTSUITE_CONF_GEN_PATH) \
|
||||
-o $(TESTSUITE_CONF_OPS_PATH) \
|
||||
> $(TESTSUITE_OUT_FILE)
|
||||
|
||||
else
|
||||
@echo "Running $(TESTSUITE_BIN) with output redirected to '$(TESTSUITE_OUT_FILE)'"
|
||||
@$(TESTSUITE_WRAPPER) ./$(TESTSUITE_BIN) -g $(TESTSUITE_CONF_GEN_PATH) \
|
||||
@$(TESTSUITE_WRAPPER) $(TESTSUITE_BIN) -g $(TESTSUITE_CONF_GEN_PATH) \
|
||||
-o $(TESTSUITE_CONF_OPS_PATH) \
|
||||
> $(TESTSUITE_OUT_FILE)
|
||||
endif
|
||||
@@ -1285,7 +1289,7 @@ ifeq ($(IS_CONFIGURED),yes)
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_F) $(BLASTEST_F2C_OBJS) $(BLASTEST_DRV_OBJS)
|
||||
- $(RM_F) $(BLASTEST_F2C_LIB)
|
||||
- $(RM_F) $(BLASTEST_DRV_BIN_PATHS)
|
||||
- $(RM_RF) $(BASE_OBJ_BLASTEST_PATH)/{shared,static}
|
||||
- $(RM_F) $(addprefix out.,$(BLASTEST_DRV_BASES))
|
||||
else
|
||||
@echo "Removing object files from $(BASE_OBJ_BLASTEST_PATH)"
|
||||
@@ -1293,7 +1297,7 @@ else
|
||||
@echo "Removing libf2c.a from $(BASE_OBJ_BLASTEST_PATH)"
|
||||
@- $(RM_F) $(BLASTEST_F2C_LIB)
|
||||
@echo "Removing binaries from $(BASE_OBJ_BLASTEST_PATH)"
|
||||
@- $(RM_F) $(BLASTEST_DRV_BIN_PATHS)
|
||||
@- $(RM_RF) $(BASE_OBJ_BLASTEST_PATH)/{shared,static}
|
||||
@echo "Removing driver output files 'out.*'"
|
||||
@- $(RM_F) $(addprefix out.,$(BLASTEST_DRV_BASES))
|
||||
endif # ENABLE_VERBOSE
|
||||
@@ -1328,13 +1332,13 @@ cleanblistesttop:
|
||||
ifeq ($(IS_CONFIGURED),yes)
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(RM_F) $(MK_TESTSUITE_OBJS)
|
||||
- $(RM_F) $(TESTSUITE_BIN)
|
||||
- $(RM_RF) $(BASE_OBJ_TESTSUITE_PATH)/{shared,static}
|
||||
- $(RM_F) $(TESTSUITE_OUT_FILE)
|
||||
else
|
||||
@echo "Removing object files from $(BASE_OBJ_TESTSUITE_PATH)"
|
||||
@- $(RM_F) $(MK_TESTSUITE_OBJS)
|
||||
@echo "Removing binary $(TESTSUITE_BIN)"
|
||||
@- $(RM_F) $(TESTSUITE_BIN)
|
||||
@- $(RM_RF) $(BASE_OBJ_TESTSUITE_PATH)/{shared,static}
|
||||
@echo "Removing $(TESTSUITE_OUT_FILE)"
|
||||
@- $(RM_F) $(TESTSUITE_OUT_FILE)
|
||||
endif # ENABLE_VERBOSE
|
||||
@@ -1344,13 +1348,13 @@ cleanblistestdir:
|
||||
ifeq ($(IS_CONFIGURED),yes)
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
|
||||
- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
|
||||
- $(RM_RF) $(BASE_OBJ_TESTSUITE_PATH)/{shared,static}
|
||||
- $(MAKE) -C $(VEND_TESTCPP_DIR) clean
|
||||
else
|
||||
@echo "Removing object files from $(TESTSUITE_DIR)/$(OBJ_DIR)"
|
||||
@- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
|
||||
@echo "Removing binary $(TESTSUITE_DIR)/$(TESTSUITE_BIN)"
|
||||
@- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
|
||||
@echo "Removing binary $(TESTSUITE_BIN)"
|
||||
@- $(RM_RF) $(BASE_OBJ_TESTSUITE_PATH)/{shared,static}
|
||||
@$(MAKE) -C $(VEND_TESTCPP_DIR) clean
|
||||
endif # ENABLE_VERBOSE
|
||||
endif # IS_CONFIGURED
|
||||
|
||||
206
addon/CMakeLists.txt
Normal file
206
addon/CMakeLists.txt
Normal file
@@ -0,0 +1,206 @@
|
||||
##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. ##
|
||||
|
||||
# Generate the object libraries required for a single addon.
#
# Arguments:
#   addon_target - name of the addon. It is used both as the subdirectory of
#                  CMAKE_CURRENT_SOURCE_DIR that is scanned for sources and as
#                  the prefix of every target created here.
#
# Up to three OBJECT libraries are created, each one only when it has at least
# one source file:
#   ${addon_target}_C99_ADDON         - C99 sources outside ${addon_target}/kernels/
#   ${addon_target}_C99_KERNEL_ADDON  - C99 sources inside  ${addon_target}/kernels/
#   ${addon_target}_CXX_ADDON         - C++ sources
#
# Every target created depends on the flat-header target and is placed in the
# "object-libs-targets" IDE folder.
function(generate_addon_targets addon_target)
    # Collect the addon's source files whose suffix is in the ADDON_C99_SUFS list.
    get_filepaths_with_suffixes(LOCAL_SOURCE_C99_FILES "${CMAKE_CURRENT_SOURCE_DIR}/${addon_target}" "${ADDON_C99_SUFS}")
    # We want to break the files above in 2 categories, files in kernel directory and the rest.
    # Only list files in kernel directory. The kernel list must be seeded from the
    # C99 list collected above; seeding it from any other variable leaves it empty,
    # so no kernel target is created and kernel files get the non-kernel flags.
    set(LOCAL_KERNEL_FILES_C99 ${LOCAL_SOURCE_C99_FILES})
    list(FILTER LOCAL_KERNEL_FILES_C99 INCLUDE REGEX ${addon_target}/kernels/)
    # All C99 files, except for the ones in the kernels directory.
    # Guarded so that list(REMOVE_ITEM) is never invoked without any value to remove.
    if(LOCAL_KERNEL_FILES_C99)
        list(REMOVE_ITEM LOCAL_SOURCE_C99_FILES ${LOCAL_KERNEL_FILES_C99})
    endif()

    # Collect all subdirectory paths that have at least one file with suffix in ADDON_H99_SUFS list.
    get_dirpaths_with_suffixes(CADDONINCFLAGS "${CMAKE_CURRENT_SOURCE_DIR}/${addon_target}" "${ADDON_H99_SUFS}")

    # Only generate the object library if there is at least one source file.
    list(LENGTH LOCAL_SOURCE_C99_FILES size)
    if(size GREATER 0)
        # Create an object library using the source file list above.
        add_library(${addon_target}_C99_ADDON
                        OBJECT
                        ${LOCAL_SOURCE_C99_FILES}
                    )
        # Include the corresponding make_defs.cmake that holds the required compiler options.
        include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake)
        # Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets.
        # mimicking get-addon-c99flags-for
        target_compile_options(${addon_target}_C99_ADDON
                                PRIVATE
                                # load-var-for,COPTFLAGS
                                ${COPTFLAGS}
                                # get-noopt-cflags-for
                                ${CDBGFLAGS}
                                # get-noopt-cflags-for
                                ${CWARNFLAGS}
                                # get-noopt-cflags-for
                                ${CMISCFLAGS}
                                # get-noopt-cflags-for
                                ${CLANGFLAGS}
                                # in get-addon-c99flags-for
                                ${BUILD_SYMFLAGS}
                            )
        target_compile_definitions(${addon_target}_C99_ADDON
                                    PRIVATE
                                    # in get-noopt-cflags-for
                                    ${CPPROCFLAGS}
                                    # in get-noopt-cflags-for
                                    ${VERS_DEF}
                                    # in get-addon-c99flags-for
                                    ${BUILD_CPPFLAGS}
                                )
        target_include_directories(${addon_target}_C99_ADDON
                                    BEFORE
                                    PRIVATE
                                    # in get-noopt-cflags-for
                                    ${CINFLAGS}
                                    # in get-addon-c99flags-for
                                    ${CADDONINCFLAGS}
                                )
        if(THREADING_MODEL STREQUAL "openmp")
            # Equivalent to CTHREADFLAGS in get-noopt-cflags-for
            target_link_libraries(${addon_target}_C99_ADDON PRIVATE OpenMP::OpenMP_C)
        elseif(THREADING_MODEL STREQUAL "pthreads")
            # in get-noopt-cflags-for
            target_compile_options(${addon_target}_C99_ADDON PRIVATE ${CTHREADFLAGS})
        endif()
        if(BUILD_SHARED_LIBS)
            # Equivalent to CPICFLAGS in get-noopt-cflags-for
            set_target_properties(${addon_target}_C99_ADDON PROPERTIES POSITION_INDEPENDENT_CODE ON)
        endif()
        add_dependencies(${addon_target}_C99_ADDON flat-header)
        # Put all those targets under object-libs-targets folder name so that they appear all together in IDE.
        set_target_properties(${addon_target}_C99_ADDON PROPERTIES FOLDER object-libs-targets)
    endif()

    # Only generate the object library if there is at least one source file.
    list(LENGTH LOCAL_KERNEL_FILES_C99 size)
    if(size GREATER 0)
        # Create an object library using the kernel source file list above.
        add_library(${addon_target}_C99_KERNEL_ADDON
                        OBJECT
                        ${LOCAL_KERNEL_FILES_C99}
                    )
        # Include the corresponding make_defs.cmake that holds the required compiler options.
        include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake)
        # Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets.
        # mimicking get-addon-kernel-c99flags-for
        target_compile_options(${addon_target}_C99_KERNEL_ADDON
                                PRIVATE
                                # load-var-for,CKOPTFLAGS
                                ${CKOPTFLAGS}
                                # load-var-for,CKVECFLAGS
                                ${CKVECFLAGS}
                                # get-noopt-cflags-for
                                ${CDBGFLAGS}
                                # get-noopt-cflags-for
                                ${CWARNFLAGS}
                                # get-noopt-cflags-for
                                ${CMISCFLAGS}
                                # get-noopt-cflags-for
                                ${CLANGFLAGS}
                                # in get-addon-kernel-c99flags-for
                                ${BUILD_SYMFLAGS}
                            )
        target_compile_definitions(${addon_target}_C99_KERNEL_ADDON
                                    PRIVATE
                                    # in get-noopt-cflags-for
                                    ${CPPROCFLAGS}
                                    # in get-noopt-cflags-for
                                    ${VERS_DEF}
                                    # in get-addon-kernel-c99flags-for
                                    ${BUILD_CPPFLAGS}
                                )
        target_include_directories(${addon_target}_C99_KERNEL_ADDON
                                    BEFORE
                                    PRIVATE
                                    # in get-noopt-cflags-for
                                    ${CINFLAGS}
                                    # in get-addon-kernel-c99flags-for
                                    ${CADDONINCFLAGS}
                                )
        if(THREADING_MODEL STREQUAL "openmp")
            # Equivalent to CTHREADFLAGS in get-noopt-cflags-for
            target_link_libraries(${addon_target}_C99_KERNEL_ADDON PRIVATE OpenMP::OpenMP_C)
        elseif(THREADING_MODEL STREQUAL "pthreads")
            # in get-noopt-cflags-for
            target_compile_options(${addon_target}_C99_KERNEL_ADDON PRIVATE ${CTHREADFLAGS})
        endif()
        if(BUILD_SHARED_LIBS)
            # Equivalent to CPICFLAGS in get-noopt-cflags-for
            set_target_properties(${addon_target}_C99_KERNEL_ADDON PROPERTIES POSITION_INDEPENDENT_CODE ON)
        endif()
        add_dependencies(${addon_target}_C99_KERNEL_ADDON flat-header)
        # Put all those targets under object-libs-targets folder name so that they appear all together in IDE.
        set_target_properties(${addon_target}_C99_KERNEL_ADDON PROPERTIES FOLDER object-libs-targets)
    endif()

    # Collect the addon's source files whose suffix is in the ADDON_CXX_SUFS list.
    get_filepaths_with_suffixes(LOCAL_SOURCE_CXX_FILES "${CMAKE_CURRENT_SOURCE_DIR}/${addon_target}" "${ADDON_CXX_SUFS}")

    # Only generate the object library if there is at least one source file.
    list(LENGTH LOCAL_SOURCE_CXX_FILES size)
    if(size GREATER 0)
        # Create an object library using the source file list above.
        add_library(${addon_target}_CXX_ADDON
                        OBJECT
                        ${LOCAL_SOURCE_CXX_FILES}
                    )

        # NOTE(review): unlike the two C99 branches, make_defs.cmake is not included
        # here, so the flag variables below are only populated if a C99 branch ran
        # first or a caller's scope provides them - confirm this is intended.
        # Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets.
        # mimicking get-addon-cxxflags-for
        target_compile_options(${addon_target}_CXX_ADDON
                                PRIVATE
                                # load-var-for,COPTFLAGS
                                ${COPTFLAGS}
                                # get-noopt-cxxflags-for
                                ${CDBGFLAGS}
                                # get-noopt-cxxflags-for
                                ${CWARNFLAGS}
                                # get-noopt-cxxflags-for
                                ${CMISCFLAGS}
                                # get-noopt-cxxflags-for
                                ${CXXLANGFLAGS}
                                # in get-addon-cxxflags-for
                                ${BUILD_SYMFLAGS}
                            )
        target_compile_definitions(${addon_target}_CXX_ADDON
                                    PRIVATE
                                    # in get-noopt-cflags-for
                                    ${CPPROCFLAGS}
                                    # in get-noopt-cflags-for
                                    ${VERS_DEF}
                                    # in get-addon-cxxflags-for
                                    ${BUILD_CPPFLAGS}
                                )
        target_include_directories(${addon_target}_CXX_ADDON
                                    BEFORE
                                    PRIVATE
                                    # in get-noopt-cflags-for
                                    ${CINFLAGS}
                                    # in get-addon-cxxflags-for
                                    ${CADDONINCFLAGS}
                                )
        if(THREADING_MODEL STREQUAL "openmp")
            # Equivalent to CTHREADFLAGS in get-noopt-cflags-for
            # NOTE(review): this links the C OpenMP target for C++ sources - confirm
            # whether OpenMP::OpenMP_CXX is available and should be used instead.
            target_link_libraries(${addon_target}_CXX_ADDON PRIVATE OpenMP::OpenMP_C)
        elseif(THREADING_MODEL STREQUAL "pthreads")
            # in get-noopt-cflags-for
            target_compile_options(${addon_target}_CXX_ADDON PRIVATE ${CTHREADFLAGS})
        endif()
        if(BUILD_SHARED_LIBS)
            # Equivalent to CPICFLAGS in get-noopt-cflags-for
            set_target_properties(${addon_target}_CXX_ADDON PROPERTIES POSITION_INDEPENDENT_CODE ON)
        endif()
        add_dependencies(${addon_target}_CXX_ADDON flat-header)
        # Put all those targets under object-libs-targets folder name so that they appear all together in IDE.
        set_target_properties(${addon_target}_CXX_ADDON PROPERTIES FOLDER object-libs-targets)
    endif()
endfunction()
|
||||
|
||||
# Walk the ENABLE_ADDON list and create the object-library targets of each
# addon named in it.
foreach(ADDON IN ITEMS ${ENABLE_ADDON})
    generate_addon_targets(${ADDON})
endforeach()
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -43,7 +43,7 @@
|
||||
#include "lpgemm_post_ops.h"
|
||||
#include "lpgemm_kernels.h"
|
||||
#include "lpgemm_utils_kernels.h"
|
||||
#include "lpgemm_packb_bf16.h"
|
||||
#include "lpgemm_pack_bf16.h"
|
||||
#include "lpgemm_packb_s16.h"
|
||||
#include "lpgemm_packa.h"
|
||||
#include "lpgemm_packb.h"
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -85,12 +85,34 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
|
||||
|
||||
AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32)
|
||||
{
|
||||
trans_t blis_trans;
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( trans, &blis_trans );
|
||||
|
||||
if ( ( input_buf_addr == NULL ) || ( reorder_buf_addr == NULL ) ||
|
||||
( k <= 0 ) || ( n <= 0 ) || ( ldb < n ) )
|
||||
( k <= 0 ) || ( n <= 0 ) || ( bli_is_notrans( blis_trans ) && ( ldb < n ) ) ||
|
||||
( bli_is_trans( blis_trans ) && ( ldb < k ) ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
inc_t rs_b, cs_b;
|
||||
if( ( order == 'r') || ( order == 'R' ) )
|
||||
{
|
||||
rs_b = bli_is_notrans( blis_trans ) ? ldb : 1;
|
||||
cs_b = bli_is_notrans( blis_trans ) ? 1 : ldb;
|
||||
}
|
||||
else if ( ( order == 'c' ) || ( order == 'C' ) )
|
||||
{
|
||||
rs_b = bli_is_notrans( blis_trans ) ? 1 : ldb;
|
||||
cs_b = bli_is_notrans( blis_trans ) ? ldb : 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
return; // Error
|
||||
}
|
||||
|
||||
// Check if avx512_bf16 ISA is supported, lpgemm matmul only works with it.
|
||||
if ( bli_cpuid_is_avx512bf16_supported() == FALSE )
|
||||
{
|
||||
@@ -117,7 +139,7 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32)
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
bli_membrk_rntm_set_membrk( &rntm_g );
|
||||
bli_pba_rntm_set_pba( &rntm_g );
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
|
||||
|
||||
@@ -128,7 +150,8 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32)
|
||||
// Create dummy original b obj;
|
||||
lpgemm_obj_t b;
|
||||
b.storage.aligned_buffer = ( void* )input_buf_addr;
|
||||
b.rs = ldb;
|
||||
b.rs = rs_b;
|
||||
b.cs = cs_b;
|
||||
b.width = n;
|
||||
b.length = k;
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "aocl_gemm_check.h"
|
||||
#include "lpgemm_types.h"
|
||||
#include "lpgemm_post_ops.h"
|
||||
#include "lpgemm_thread_decor_openmp.h"
|
||||
@@ -73,57 +74,42 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
|
||||
// Set MC, NC, KC, NR, MR.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
// Null check for pointers.
|
||||
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// check for validity of params.
|
||||
AOCL_GEMM_CHECK
|
||||
(
|
||||
"bf16bf16f32obf16",
|
||||
order, transa, transb,
|
||||
m, n, k,
|
||||
a, lda, mem_format_a,
|
||||
b, ldb, mem_format_b,
|
||||
c, ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
|
||||
bli_param_map_netlib_to_blis_trans( transb, &blis_transb );
|
||||
|
||||
/* Perform BLAS parameter checking. */
|
||||
// Transpose not supported.
|
||||
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
|
||||
( blis_transb != BLIS_NO_TRANSPOSE ) )
|
||||
bool is_row_major = ( ( order == 'r' ) || ( order == 'R' ) );
|
||||
bool is_column_major = ( ( order == 'c' ) || ( order == 'C' ) );
|
||||
|
||||
inc_t rs_a = lda;
|
||||
inc_t cs_a = 1;
|
||||
|
||||
if ( bli_is_trans( blis_transa ) )
|
||||
{
|
||||
return; // Error.
|
||||
rs_a = 1;
|
||||
cs_a = lda;
|
||||
}
|
||||
|
||||
// Sanitize order input.
|
||||
char order_use =
|
||||
( ( order == 'r' ) || ( order == 'R' ) ||
|
||||
( order == 'c' ) || ( order == 'C' ) ) ?
|
||||
order : 'r';
|
||||
inc_t rs_b = ldb;
|
||||
inc_t cs_b = 1;
|
||||
|
||||
bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) );
|
||||
bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) );
|
||||
|
||||
// Row major input expected with leading dimensions >= row stride.
|
||||
if ( ( is_row_major == TRUE ) &&
|
||||
( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) )
|
||||
if( bli_is_trans( blis_transb ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// Column major input expected with leading dimensions >= column stride.
|
||||
else if ( ( is_column_major == TRUE ) &&
|
||||
( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) )
|
||||
{
|
||||
return; // Error.
|
||||
rs_b = 1;
|
||||
cs_b = ldb;
|
||||
}
|
||||
|
||||
// Check if dimensions are valid.
|
||||
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
|
||||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
const inc_t rs_a = lda;
|
||||
const inc_t cs_a = 1;
|
||||
const inc_t rs_b = ldb;
|
||||
const inc_t cs_b = 1;
|
||||
const inc_t rs_c = ldc;
|
||||
const inc_t cs_c = 1;
|
||||
|
||||
@@ -133,6 +119,21 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
|
||||
bli_param_map_char_to_lpmtag( mem_format_a, &mtag_a );
|
||||
bli_param_map_char_to_lpmtag( mem_format_b, &mtag_b );
|
||||
|
||||
// Reorder is not supported for A matrix
|
||||
if( ( is_row_major == TRUE ) && ( mtag_a == REORDERED ) )
|
||||
{
|
||||
bli_print_msg(" Reordering of A matrix is not supported in row major case.", __FILE__, __LINE__ );
|
||||
return;
|
||||
}
|
||||
// Inputs swapped in column major, A becomes B from kernel point of view.
|
||||
// Reorder is not supported for column major matrices.
|
||||
else if ( ( is_column_major == TRUE ) && ( ( mtag_b == REORDERED ) || ( mtag_a == REORDERED ) ) )
|
||||
{
|
||||
bli_print_msg(" Reordering of column major matrices is not supported.", __FILE__, __LINE__ );
|
||||
return;
|
||||
}
|
||||
|
||||
// From 5-loop function point of view,
|
||||
// B matrix needs to be packed in a certain format in order to be loaded
|
||||
// and used in bf16 instruction. As such the mtag_b always needs to be either
|
||||
// packed or reordered. B matrix as it is (unpacked) cannot be used, and
|
||||
@@ -147,30 +148,34 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
|
||||
mtag_a = PACK;
|
||||
}
|
||||
|
||||
// Only unpacked A supported now.
|
||||
if ( ( is_row_major == TRUE ) && ( mtag_a != UNPACKED ) )
|
||||
// From 5-loop function point of view,
|
||||
// A matrix when in column major storage needs to be packed to row-major
|
||||
// storage as kernel expects A matrix to be in row-major format.
|
||||
if( ( is_row_major == TRUE ) && ( bli_is_trans(blis_transa ) ) )
|
||||
{
|
||||
return; // Error.
|
||||
mtag_a = PACK;
|
||||
}
|
||||
// Inputs swapped in column major, B becomes A from kernel point of view.
|
||||
else if ( ( is_column_major == TRUE ) && ( mtag_b != UNPACKED ) )
|
||||
// Inputs swapped in column major, A becomes B from kernel point of view.
|
||||
else if ( ( is_column_major == TRUE ) && ( bli_is_trans(blis_transb ) ) )
|
||||
{
|
||||
return; // Error.
|
||||
mtag_b = PACK;
|
||||
}
|
||||
|
||||
// Convert post op struct to post op linked list format.
|
||||
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
|
||||
lpgemm_translate_to_post_ops_list
|
||||
err_t err = lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
post_op_unparsed, post_op_list,
|
||||
( void* )c, ( void* )( &order_use )
|
||||
( void* )c, ( void* )( &order )
|
||||
);
|
||||
|
||||
if( err != BLIS_SUCCESS ) return;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
bli_membrk_rntm_set_membrk( &rntm_g );
|
||||
bli_pba_rntm_set_pba( &rntm_g );
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
|
||||
|
||||
@@ -186,7 +191,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
|
||||
( float* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, TRUE
|
||||
post_op_list, BF16
|
||||
);
|
||||
}
|
||||
else
|
||||
@@ -199,7 +204,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
|
||||
( float* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, TRUE
|
||||
post_op_list, BF16
|
||||
);
|
||||
}
|
||||
#else
|
||||
@@ -214,7 +219,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
|
||||
( float* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, TRUE
|
||||
post_op_list, BF16
|
||||
);
|
||||
}
|
||||
else
|
||||
@@ -227,7 +232,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
|
||||
( float* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, TRUE
|
||||
post_op_list, BF16
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "aocl_gemm_check.h"
|
||||
#include "lpgemm_types.h"
|
||||
#include "lpgemm_post_ops.h"
|
||||
#include "lpgemm_thread_decor_openmp.h"
|
||||
@@ -73,58 +74,42 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
|
||||
// Set MC, NC, KC, NR, MR.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
// Null check for pointers.
|
||||
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// check for validity of params.
|
||||
AOCL_GEMM_CHECK
|
||||
(
|
||||
"bf16bf16f32obf16",
|
||||
order, transa, transb,
|
||||
m, n, k,
|
||||
a, lda, mem_format_a,
|
||||
b, ldb, mem_format_b,
|
||||
c, ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
|
||||
bli_param_map_netlib_to_blis_trans( transb, &blis_transb );
|
||||
|
||||
/* Perform BLAS parameter checking. */
|
||||
// Transpose not supported.
|
||||
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
|
||||
( blis_transb != BLIS_NO_TRANSPOSE ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Sanitize order input.
|
||||
char order_use =
|
||||
( ( order == 'r' ) || ( order == 'R' ) ||
|
||||
( order == 'c' ) || ( order == 'C' ) ) ?
|
||||
order : 'r';
|
||||
|
||||
bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) );
|
||||
bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) );
|
||||
|
||||
// Row major input expected with leading dimensions >= row stride.
|
||||
if ( ( is_row_major == TRUE ) &&
|
||||
( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// Column major input expected with leading dimensions >= column stride.
|
||||
else if ( ( is_column_major == TRUE ) &&
|
||||
( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Check if dimensions are valid.
|
||||
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
|
||||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
bool is_row_major = ( ( order == 'r' ) || ( order == 'R' ) );
|
||||
bool is_column_major = ( ( order == 'c' ) || ( order == 'C' ) );
|
||||
|
||||
// The strides are set assuming a row major kernel.
|
||||
const inc_t rs_a = lda;
|
||||
const inc_t cs_a = 1;
|
||||
const inc_t rs_b = ldb;
|
||||
const inc_t cs_b = 1;
|
||||
inc_t rs_a = lda;
|
||||
inc_t cs_a = 1;
|
||||
|
||||
if ( bli_is_trans( blis_transa ) )
|
||||
{
|
||||
rs_a = 1;
|
||||
cs_a = lda;
|
||||
}
|
||||
|
||||
inc_t rs_b = ldb;
|
||||
inc_t cs_b = 1;
|
||||
|
||||
if( bli_is_trans( blis_transb ) )
|
||||
{
|
||||
rs_b = 1;
|
||||
cs_b = ldb;
|
||||
}
|
||||
const inc_t rs_c = ldc;
|
||||
const inc_t cs_c = 1;
|
||||
|
||||
@@ -134,12 +119,21 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
|
||||
bli_param_map_char_to_lpmtag( mem_format_a, &mtag_a );
|
||||
bli_param_map_char_to_lpmtag( mem_format_b, &mtag_b );
|
||||
|
||||
if ( ( is_column_major == TRUE ) && ( mtag_b == REORDERED ) )
|
||||
// Reorder is not supported for A matrix
|
||||
if( ( is_row_major == TRUE ) && ( mtag_a == REORDERED ) )
|
||||
{
|
||||
// Reorder not supported with column major inputs.
|
||||
bli_print_msg(" Reordering of A matrix is not supported in row major case.", __FILE__, __LINE__ );
|
||||
return;
|
||||
}
|
||||
// Inputs swapped in column major, A becomes B from kernel point of view.
|
||||
// Reorder is not supported for column major matrices.
|
||||
else if ( ( is_column_major == TRUE ) && ( ( mtag_b == REORDERED ) || ( mtag_a == REORDERED ) ) )
|
||||
{
|
||||
bli_print_msg(" Reordering of column major matrices is not supported.", __FILE__, __LINE__ );
|
||||
return;
|
||||
}
|
||||
|
||||
// From 5-loop function point of view
|
||||
// B matrix needs to be packed in a certain format in order to be loaded
|
||||
// and used in bf16 instruction. As such the mtag_b always needs to be either
|
||||
// packed or reordered. B matrix as it is (unpacked) cannot be used, and
|
||||
@@ -154,30 +148,34 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
|
||||
mtag_a = PACK;
|
||||
}
|
||||
|
||||
// Only unpacked A supported now.
|
||||
if ( ( is_row_major == TRUE ) && ( mtag_a != UNPACKED ) )
|
||||
// From 5-loop function point of view,
|
||||
// A matrix when in column major storage needs to be packed to row-major
|
||||
// storage as kernel expects A matrix to be in row-major format.
|
||||
if( ( is_row_major == TRUE ) && ( bli_is_trans(blis_transa ) ) )
|
||||
{
|
||||
return; // Error.
|
||||
mtag_a = PACK;
|
||||
}
|
||||
// Inputs swapped in column major, B becomes A from kernel point of view.
|
||||
else if ( ( is_column_major == TRUE ) && ( mtag_b != UNPACKED ) )
|
||||
// Inputs swapped in column major, A becomes B from kernel point of view.
|
||||
else if ( ( is_column_major == TRUE ) && ( bli_is_trans(blis_transb ) ) )
|
||||
{
|
||||
return; // Error.
|
||||
mtag_b = PACK;
|
||||
}
|
||||
|
||||
// Convert post op struct to post op linked list format.
|
||||
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
|
||||
lpgemm_translate_to_post_ops_list
|
||||
err_t err = lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
post_op_unparsed, post_op_list,
|
||||
( void* )c, ( void* )( &order_use )
|
||||
( void* )c, ( void* )( &order )
|
||||
);
|
||||
|
||||
if( err != BLIS_SUCCESS ) return;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
bli_membrk_rntm_set_membrk( &rntm_g );
|
||||
bli_pba_rntm_set_pba( &rntm_g );
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
|
||||
|
||||
@@ -193,7 +191,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, F32
|
||||
);
|
||||
}
|
||||
else
|
||||
@@ -206,7 +204,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, F32
|
||||
);
|
||||
}
|
||||
#else
|
||||
@@ -221,7 +219,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, F32
|
||||
);
|
||||
}
|
||||
else
|
||||
@@ -234,7 +232,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, F32
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
104
addon/aocl_gemm/aocl_gemm_check.h
Normal file
104
addon/aocl_gemm/aocl_gemm_check.h
Normal file
@@ -0,0 +1,104 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// TODO: validity checks for the post-ops argument are yet to be added.
|
||||
/*
 * AOCL_GEMM_CHECK
 *
 * Argument validation shared by the aocl_gemm matmul entry points.
 *
 * The macro expands to a brace-enclosed block that examines its arguments
 * in a fixed order. On the FIRST argument found to be illegal it prints
 *   "** On entry to <op_str>, parameter number <N> had an illegal value"
 * through bli_print_msg() and then executes `return;`. Because of that bare
 * `return;` it may only be expanded inside a function returning void.
 *
 * Parameter numbers (N) reported, and the condition each must satisfy:
 *    1  order   : 'r'/'R' (row stored) or 'c'/'C' (column stored)
 *    2  transa  : 'n'/'N' or 't'/'T'
 *    3  transb  : 'n'/'N' or 't'/'T'
 *    4  m       : > 0
 *    5  n       : > 0
 *    6  k       : > 0
 *    8  a       : != NULL
 *    9  lda     : row stored: >= k (no-trans) or >= m (trans)
 *                 col stored: >= m (no-trans) or >= k (trans)
 *   10  mtag_a  : 'n'/'N', 'p'/'P' or 'r'/'R'
 *   11  b       : != NULL
 *   12  ldb     : row stored: >= n (no-trans) or >= k (trans)
 *                 col stored: >= k (no-trans) or >= n (trans)
 *   13  mtag_b  : 'n'/'N', 'p'/'P' or 'r'/'R'
 *   15  c       : != NULL
 *   16  ldc     : row stored: >= n, col stored: >= m
 * Numbers 7 and 14 are never produced by this macro.
 *
 * Notes:
 *  - Every argument is evaluated more than once; pass only expressions
 *    without side effects.
 *  - Locals carry an `aocl_chk_` prefix so that they cannot capture an
 *    identifier appearing in a caller-supplied argument expression.
 *  - The message is formatted with snprintf() bounded by the buffer size,
 *    so an unexpectedly long op_str is truncated instead of overflowing.
 *  - The expansion is a plain `{ ... }` block (not do/while(0)) so that
 *    existing call sites keep compiling unchanged.
 */
#define AOCL_GEMM_CHECK( op_str, \
                         order, transa, transb, \
                         m, n, k, \
                         a, lda, mtag_a, \
                         b, ldb, mtag_b, \
                         c, ldc \
                       ) \
{ \
	int32_t aocl_chk_info = 0; \
	\
	/* Storage scheme of all three matrices. */ \
	const bool aocl_chk_col = ( ( order ) == 'c' ) || ( ( order ) == 'C' ); \
	const bool aocl_chk_row = ( ( order ) == 'r' ) || ( ( order ) == 'R' ); \
	\
	/* Transposition flags of A and B. */ \
	const bool aocl_chk_nota = ( ( transa ) == 'n' ) || ( ( transa ) == 'N' ); \
	const bool aocl_chk_notb = ( ( transb ) == 'n' ) || ( ( transb ) == 'N' ); \
	const bool aocl_chk_ta   = ( ( transa ) == 't' ) || ( ( transa ) == 'T' ); \
	const bool aocl_chk_tb   = ( ( transb ) == 't' ) || ( ( transb ) == 'T' ); \
	\
	/* Only the first failing argument is recorded. */ \
	if ( !aocl_chk_row && !aocl_chk_col ) \
		aocl_chk_info = 1; \
	else if ( !aocl_chk_nota && !aocl_chk_ta ) \
		aocl_chk_info = 2; \
	else if ( !aocl_chk_notb && !aocl_chk_tb ) \
		aocl_chk_info = 3; \
	else if ( ( m ) <= 0 ) \
		aocl_chk_info = 4; \
	else if ( ( n ) <= 0 ) \
		aocl_chk_info = 5; \
	else if ( ( k ) <= 0 ) \
		aocl_chk_info = 6; \
	else if ( ( a ) == NULL ) \
		aocl_chk_info = 8; \
	else if ( aocl_chk_row && \
	          ( ( aocl_chk_nota && ( ( lda ) < ( k ) ) ) || \
	            ( aocl_chk_ta   && ( ( lda ) < ( m ) ) ) ) ) \
		aocl_chk_info = 9; \
	else if ( aocl_chk_col && \
	          ( ( aocl_chk_nota && ( ( lda ) < ( m ) ) ) || \
	            ( aocl_chk_ta   && ( ( lda ) < ( k ) ) ) ) ) \
		aocl_chk_info = 9; \
	else if ( ( ( mtag_a ) != 'n' ) && ( ( mtag_a ) != 'N' ) && \
	          ( ( mtag_a ) != 'p' ) && ( ( mtag_a ) != 'P' ) && \
	          ( ( mtag_a ) != 'r' ) && ( ( mtag_a ) != 'R' ) ) \
		aocl_chk_info = 10; \
	else if ( ( b ) == NULL ) \
		aocl_chk_info = 11; \
	else if ( aocl_chk_row && \
	          ( ( aocl_chk_notb && ( ( ldb ) < ( n ) ) ) || \
	            ( aocl_chk_tb   && ( ( ldb ) < ( k ) ) ) ) ) \
		aocl_chk_info = 12; \
	else if ( aocl_chk_col && \
	          ( ( aocl_chk_notb && ( ( ldb ) < ( k ) ) ) || \
	            ( aocl_chk_tb   && ( ( ldb ) < ( n ) ) ) ) ) \
		aocl_chk_info = 12; \
	else if ( ( ( mtag_b ) != 'n' ) && ( ( mtag_b ) != 'N' ) && \
	          ( ( mtag_b ) != 'p' ) && ( ( mtag_b ) != 'P' ) && \
	          ( ( mtag_b ) != 'r' ) && ( ( mtag_b ) != 'R' ) ) \
		aocl_chk_info = 13; \
	else if ( ( c ) == NULL ) \
		aocl_chk_info = 15; \
	else if ( aocl_chk_row && ( ( ldc ) < ( n ) ) ) \
		aocl_chk_info = 16; \
	else if ( aocl_chk_col && ( ( ldc ) < ( m ) ) ) \
		aocl_chk_info = 16; \
	\
	if ( aocl_chk_info != 0 ) \
	{ \
		char aocl_chk_msg[ 100 ]; \
		\
		/* Bounded write: never overruns aocl_chk_msg. */ \
		snprintf( aocl_chk_msg, sizeof( aocl_chk_msg ), \
		          "** On entry to %6s, parameter number %2i had an illegal value", \
		          ( op_str ), ( int )aocl_chk_info ); \
		bli_print_msg( aocl_chk_msg, __FILE__, __LINE__ ); \
		return; \
	} \
}
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "aocl_gemm_check.h"
|
||||
#include "lpgemm_types.h"
|
||||
#include "lpgemm_post_ops.h"
|
||||
#include "lpgemm_thread_decor_openmp.h"
|
||||
@@ -64,13 +65,16 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
|
||||
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), transa, transb, m, n, k,\
|
||||
(void*)&alpha, lda, ldb, (void*)&beta, ldc);
|
||||
|
||||
// Null check for pointers.
|
||||
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, \
|
||||
"Invalid pointers provided for input parameters.");
|
||||
return; // Error.
|
||||
}
|
||||
// check for validity of params.
|
||||
AOCL_GEMM_CHECK
|
||||
(
|
||||
"f32f32f32of32",
|
||||
order, transa, transb,
|
||||
m, n, k,
|
||||
a, lda, mem_format_a,
|
||||
b, ldb, mem_format_b,
|
||||
c, ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
|
||||
@@ -86,36 +90,8 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Sanitize order input.
|
||||
char order_use =
|
||||
( ( order == 'r' ) || ( order == 'R' ) ||
|
||||
( order == 'c' ) || ( order == 'C' ) ) ?
|
||||
order : 'r';
|
||||
|
||||
bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) );
|
||||
bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) );
|
||||
|
||||
// Row major input expected with leading dimensions >= row stride.
|
||||
if ( ( is_row_major == TRUE ) &&
|
||||
( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// Column major input expected with leading dimensions >= column stride.
|
||||
else if ( ( is_column_major == TRUE ) &&
|
||||
( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Check if dimensions are valid.
|
||||
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
|
||||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, \
|
||||
"Invalid matrix dimensions.");
|
||||
return; // Error.
|
||||
}
|
||||
bool is_row_major = ( ( order == 'r' ) || ( order == 'R' ) );
|
||||
bool is_column_major = ( ( order == 'c' ) || ( order == 'C' ) );
|
||||
|
||||
// The strides are set assuming a row major kernel.
|
||||
const inc_t rs_a = lda;
|
||||
@@ -168,17 +144,19 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
|
||||
|
||||
// Convert post op struct to post op linked list format.
|
||||
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
|
||||
lpgemm_translate_to_post_ops_list
|
||||
err_t err = lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
post_op_unparsed, post_op_list,
|
||||
( void* )c, ( void* )( &order_use )
|
||||
( void* )c, ( void* )( &order )
|
||||
);
|
||||
|
||||
if( err != BLIS_SUCCESS ) return;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
bli_membrk_rntm_set_membrk( &rntm_g );
|
||||
bli_pba_rntm_set_pba( &rntm_g );
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( F32F32F32OF32 );
|
||||
|
||||
@@ -197,7 +175,7 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, F32
|
||||
);
|
||||
}
|
||||
else
|
||||
@@ -210,7 +188,7 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, F32
|
||||
);
|
||||
}
|
||||
#else
|
||||
@@ -229,7 +207,7 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, F32
|
||||
);
|
||||
}
|
||||
else
|
||||
@@ -242,7 +220,7 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, F32
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -42,6 +42,8 @@
|
||||
#define AOCL_GEMM_GET_REORDER_BUF_SIZE(LP_SFX) \
|
||||
BLIS_EXPORT_ADDON siz_t aocl_get_reorder_buf_size_ ## LP_SFX \
|
||||
( \
|
||||
const char order, \
|
||||
const char trans, \
|
||||
const char mat_type, \
|
||||
const dim_t k, \
|
||||
const dim_t n \
|
||||
@@ -60,6 +62,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(s8s8s16os16);
|
||||
#define AOCL_GEMM_REORDER(B_type,LP_SFX) \
|
||||
BLIS_EXPORT_ADDON void aocl_reorder_ ## LP_SFX \
|
||||
( \
|
||||
const char order, \
|
||||
const char trans, \
|
||||
const char mat_type, \
|
||||
const B_type* input_buf_addr, \
|
||||
B_type* reorder_buf_addr, \
|
||||
@@ -106,6 +110,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16);
|
||||
AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32);
|
||||
AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8);
|
||||
AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8);
|
||||
AOCL_GEMM_MATMUL(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8);
|
||||
AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16);
|
||||
AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32);
|
||||
AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8);
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "aocl_gemm_check.h"
|
||||
#include "lpgemm_types.h"
|
||||
#include "lpgemm_5loop_interface_apis.h"
|
||||
#include "lpgemm_config.h"
|
||||
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
|
||||
// Set MC, NC, KC, NR, MR.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
// Null check for pointers.
|
||||
if ((a == NULL) || (b == NULL) || (c == NULL))
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// check for validity of params.
|
||||
AOCL_GEMM_CHECK
|
||||
(
|
||||
"s8s8s16os16",
|
||||
order, transa, transb,
|
||||
m, n, k,
|
||||
a, lda, mem_format_a,
|
||||
b, ldb, mem_format_b,
|
||||
c, ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans(transa, &blis_transa);
|
||||
@@ -75,31 +81,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
|
||||
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
|
||||
( blis_transb != BLIS_NO_TRANSPOSE ) )
|
||||
{
|
||||
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Sanitize order input.
|
||||
char order_use =
|
||||
( ( order == 'r' ) || ( order == 'R' ) ||
|
||||
( order == 'c' ) || ( order == 'C' ) ) ?
|
||||
order : 'r';
|
||||
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
|
||||
if ( ( order != 'r' ) && ( order != 'R' ) )
|
||||
{
|
||||
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
|
||||
return; // Only row major supported.
|
||||
}
|
||||
|
||||
// Row major input expected with leading dimensions equal to row stride.
|
||||
if ((lda != k) || (ldb != n) || (ldc != n))
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Check if dimensions are valid.
|
||||
if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0))
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
const inc_t rs_a = lda;
|
||||
const inc_t cs_a = 1;
|
||||
const inc_t rs_b = ldb;
|
||||
@@ -125,22 +116,25 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
|
||||
// Only unpacked A supported now.
|
||||
if (mtag_a != UNPACKED)
|
||||
{
|
||||
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Convert post op struct to post op linked list format.
|
||||
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
|
||||
lpgemm_translate_to_post_ops_list
|
||||
err_t err = lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
post_op_unparsed, post_op_list,
|
||||
( void* )c, ( void* )( &order_use )
|
||||
( void* )c, ( void* )( &order )
|
||||
);
|
||||
|
||||
if( err != BLIS_SUCCESS ) return;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global(&rntm_g);
|
||||
bli_membrk_rntm_set_membrk(&rntm_g);
|
||||
bli_pba_rntm_set_pba(&rntm_g);
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S16OS16 );
|
||||
|
||||
@@ -153,7 +147,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, S16
|
||||
);
|
||||
#else
|
||||
lpgemm_s8s8s16o16_thread_decorator
|
||||
@@ -164,7 +158,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, S16
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ AOCL_GEMM_REORDER(int8_t,s8s8s16os16)
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global(&rntm_g);
|
||||
bli_membrk_rntm_set_membrk(&rntm_g);
|
||||
bli_pba_rntm_set_pba(&rntm_g);
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S16OS16 );
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "aocl_gemm_check.h"
|
||||
#include "lpgemm_types.h"
|
||||
#include "lpgemm_5loop_interface_apis.h"
|
||||
#include "lpgemm_config.h"
|
||||
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
|
||||
// Set MC, NC, KC, NR, MR.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
// Null check for pointers.
|
||||
if ((a == NULL) || (b == NULL) || (c == NULL))
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// check for validity of params.
|
||||
AOCL_GEMM_CHECK
|
||||
(
|
||||
"s8s8s16os8",
|
||||
order, transa, transb,
|
||||
m, n, k,
|
||||
a, lda, mem_format_a,
|
||||
b, ldb, mem_format_b,
|
||||
c, ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans(transa, &blis_transa);
|
||||
@@ -75,31 +81,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
|
||||
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
|
||||
( blis_transb != BLIS_NO_TRANSPOSE ) )
|
||||
{
|
||||
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Sanitize order input.
|
||||
char order_use =
|
||||
( ( order == 'r' ) || ( order == 'R' ) ||
|
||||
( order == 'c' ) || ( order == 'C' ) ) ?
|
||||
order : 'r';
|
||||
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
|
||||
if ( ( order != 'r' ) && ( order != 'R' ) )
|
||||
{
|
||||
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
|
||||
return; // Only row major supported.
|
||||
}
|
||||
|
||||
// Row major input expected with leading dimensions equal to row stride.
|
||||
if ((lda != k) || (ldb != n) || (ldc != n))
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Check if dimensions are valid.
|
||||
if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0))
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
const inc_t rs_a = lda;
|
||||
const inc_t cs_a = 1;
|
||||
const inc_t rs_b = ldb;
|
||||
@@ -125,22 +116,25 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
|
||||
// Only unpacked A supported now.
|
||||
if (mtag_a != UNPACKED)
|
||||
{
|
||||
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Convert post op struct to post op linked list format.
|
||||
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
|
||||
lpgemm_translate_to_post_ops_list
|
||||
err_t err = lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
post_op_unparsed, post_op_list,
|
||||
( void* )c, ( void* )( &order_use )
|
||||
( void* )c, ( void* )( &order )
|
||||
);
|
||||
|
||||
if( err != BLIS_SUCCESS ) return;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global(&rntm_g);
|
||||
bli_membrk_rntm_set_membrk(&rntm_g);
|
||||
bli_pba_rntm_set_pba(&rntm_g);
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S16OS16 );
|
||||
|
||||
@@ -153,7 +147,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
|
||||
( int16_t* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, TRUE
|
||||
post_op_list, S8
|
||||
);
|
||||
#else
|
||||
lpgemm_s8s8s16o16_thread_decorator
|
||||
@@ -164,7 +158,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
|
||||
( int16_t* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, TRUE
|
||||
post_op_list, S8
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "aocl_gemm_check.h"
|
||||
#include "lpgemm_types.h"
|
||||
#include "lpgemm_post_ops.h"
|
||||
#include "lpgemm_thread_decor_openmp.h"
|
||||
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
|
||||
// Set MC, NC, KC, NR, MR.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
// Null check for pointers.
|
||||
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// check for validity of params.
|
||||
AOCL_GEMM_CHECK
|
||||
(
|
||||
"s8s8s32os32",
|
||||
order, transa, transb,
|
||||
m, n, k,
|
||||
a, lda, mem_format_a,
|
||||
b, ldb, mem_format_b,
|
||||
c, ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
|
||||
@@ -75,32 +81,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
|
||||
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
|
||||
( blis_transb != BLIS_NO_TRANSPOSE ) )
|
||||
{
|
||||
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Sanitize order input.
|
||||
char order_use =
|
||||
( ( order == 'r' ) || ( order == 'R' ) ||
|
||||
( order == 'c' ) || ( order == 'C' ) ) ?
|
||||
order : 'r';
|
||||
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
|
||||
if ( ( order != 'r' ) && ( order != 'R' ) )
|
||||
{
|
||||
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
|
||||
return; // Only row major supported.
|
||||
}
|
||||
|
||||
// Row major input expected with leading dimensions equal to row stride.
|
||||
if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Check if dimensions are valid.
|
||||
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
|
||||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
const inc_t rs_a = lda;
|
||||
const inc_t cs_a = 1;
|
||||
const inc_t rs_b = ldb;
|
||||
@@ -126,22 +116,25 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
|
||||
// Only unpacked A supported now.
|
||||
if ( mtag_a != UNPACKED )
|
||||
{
|
||||
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Convert post op struct to post op linked list format.
|
||||
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
|
||||
lpgemm_translate_to_post_ops_list
|
||||
err_t err = lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
post_op_unparsed, post_op_list,
|
||||
( void* )c, ( void* )( &order_use )
|
||||
( void* )c, ( void* )( &order )
|
||||
);
|
||||
|
||||
if( err != BLIS_SUCCESS ) return;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
bli_membrk_rntm_set_membrk( &rntm_g );
|
||||
bli_pba_rntm_set_pba( &rntm_g );
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S32OS32 );
|
||||
|
||||
@@ -154,7 +147,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, S32
|
||||
);
|
||||
#else
|
||||
lpgemm_s8s8s32o32_thread_decorator
|
||||
@@ -165,7 +158,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, S32
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ AOCL_GEMM_REORDER(int8_t,s8s8s32os32)
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
bli_membrk_rntm_set_membrk( &rntm_g );
|
||||
bli_pba_rntm_set_pba( &rntm_g );
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S32OS32 );
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "aocl_gemm_check.h"
|
||||
#include "lpgemm_types.h"
|
||||
#include "lpgemm_post_ops.h"
|
||||
#include "lpgemm_thread_decor_openmp.h"
|
||||
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
|
||||
// Set MC, NC, KC, NR, MR.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
// Null check for pointers.
|
||||
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// check for validity of params.
|
||||
AOCL_GEMM_CHECK
|
||||
(
|
||||
"s8s8s32os8",
|
||||
order, transa, transb,
|
||||
m, n, k,
|
||||
a, lda, mem_format_a,
|
||||
b, ldb, mem_format_b,
|
||||
c, ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
|
||||
@@ -75,32 +81,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
|
||||
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
|
||||
( blis_transb != BLIS_NO_TRANSPOSE ) )
|
||||
{
|
||||
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Sanitize order input.
|
||||
char order_use =
|
||||
( ( order == 'r' ) || ( order == 'R' ) ||
|
||||
( order == 'c' ) || ( order == 'C' ) ) ?
|
||||
order : 'r';
|
||||
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
|
||||
if ( ( order != 'r' ) && ( order != 'R' ) )
|
||||
{
|
||||
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
|
||||
return; // Only row major supported.
|
||||
}
|
||||
|
||||
// Row major input expected with leading dimensions equal to row stride.
|
||||
if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Check if dimensions are valid.
|
||||
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
|
||||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
const inc_t rs_a = lda;
|
||||
const inc_t cs_a = 1;
|
||||
const inc_t rs_b = ldb;
|
||||
@@ -126,22 +116,25 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
|
||||
// Only unpacked A supported now.
|
||||
if ( mtag_a != UNPACKED )
|
||||
{
|
||||
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Convert post op struct to post op linked list format.
|
||||
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
|
||||
lpgemm_translate_to_post_ops_list
|
||||
err_t err = lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
post_op_unparsed, post_op_list,
|
||||
( void* )c, ( void* )( &order_use )
|
||||
( void* )c, ( void* )( &order )
|
||||
);
|
||||
|
||||
if( err != BLIS_SUCCESS ) return;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
bli_membrk_rntm_set_membrk( &rntm_g );
|
||||
bli_pba_rntm_set_pba( &rntm_g );
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S32OS32 );
|
||||
|
||||
@@ -154,7 +147,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
|
||||
( int32_t* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, TRUE
|
||||
post_op_list, S8
|
||||
);
|
||||
#else
|
||||
lpgemm_s8s8s32o32_thread_decorator
|
||||
@@ -165,7 +158,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
|
||||
( int32_t* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, TRUE
|
||||
post_op_list, S8
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "aocl_gemm_check.h"
|
||||
#include "lpgemm_types.h"
|
||||
#include "lpgemm_5loop_interface_apis.h"
|
||||
#include "lpgemm_config.h"
|
||||
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
|
||||
// Set MC, NC, KC, NR, MR.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
// Null check for pointers.
|
||||
if ((a == NULL) || (b == NULL) || (c == NULL))
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// check for validity of params.
|
||||
AOCL_GEMM_CHECK
|
||||
(
|
||||
"u8s8s16os16",
|
||||
order, transa, transb,
|
||||
m, n, k,
|
||||
a, lda, mem_format_a,
|
||||
b, ldb, mem_format_b,
|
||||
c, ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans(transa, &blis_transa);
|
||||
@@ -75,31 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
|
||||
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
|
||||
( blis_transb != BLIS_NO_TRANSPOSE ) )
|
||||
{
|
||||
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Sanitize order input.
|
||||
char order_use =
|
||||
( ( order == 'r' ) || ( order == 'R' ) ||
|
||||
( order == 'c' ) || ( order == 'C' ) ) ?
|
||||
order : 'r';
|
||||
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
|
||||
if ( ( order != 'r' ) && ( order != 'R' ) )
|
||||
{
|
||||
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
|
||||
return; // Only row major supported.
|
||||
}
|
||||
|
||||
// Row major input expected with leading dimensions equal to row stride.
|
||||
if ((lda != k) || (ldb != n) || (ldc != n))
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Check if dimensions are valid.
|
||||
if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0))
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
const inc_t rs_a = lda;
|
||||
const inc_t cs_a = 1;
|
||||
const inc_t rs_b = ldb;
|
||||
@@ -125,22 +116,25 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
|
||||
// Only unpacked A supported now.
|
||||
if (mtag_a != UNPACKED)
|
||||
{
|
||||
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Convert post op struct to post op linked list format.
|
||||
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
|
||||
lpgemm_translate_to_post_ops_list
|
||||
err_t err = lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
post_op_unparsed, post_op_list,
|
||||
( void* )c, ( void* )( &order_use )
|
||||
( void* )c, ( void* )( &order )
|
||||
);
|
||||
|
||||
if( err != BLIS_SUCCESS ) return;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global(&rntm_g);
|
||||
bli_membrk_rntm_set_membrk(&rntm_g);
|
||||
bli_pba_rntm_set_pba(&rntm_g);
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S16OS16 );
|
||||
|
||||
@@ -153,7 +147,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, S16
|
||||
);
|
||||
#else
|
||||
lpgemm_u8s8s16o16_thread_decorator
|
||||
@@ -164,7 +158,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, S16
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -117,7 +117,7 @@ AOCL_GEMM_REORDER(int8_t,u8s8s16os16)
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global(&rntm_g);
|
||||
bli_membrk_rntm_set_membrk(&rntm_g);
|
||||
bli_pba_rntm_set_pba(&rntm_g);
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S16OS16 );
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "aocl_gemm_check.h"
|
||||
#include "lpgemm_types.h"
|
||||
#include "lpgemm_5loop_interface_apis.h"
|
||||
#include "lpgemm_config.h"
|
||||
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
|
||||
// Set MC, NC, KC, NR, MR.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
// Null check for pointers.
|
||||
if ((a == NULL) || (b == NULL) || (c == NULL))
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// check for validity of params.
|
||||
AOCL_GEMM_CHECK
|
||||
(
|
||||
"u8s8s16os8",
|
||||
order, transa, transb,
|
||||
m, n, k,
|
||||
a, lda, mem_format_a,
|
||||
b, ldb, mem_format_b,
|
||||
c, ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans(transa, &blis_transa);
|
||||
@@ -75,31 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
|
||||
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
|
||||
( blis_transb != BLIS_NO_TRANSPOSE ) )
|
||||
{
|
||||
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Sanitize order input.
|
||||
char order_use =
|
||||
( ( order == 'r' ) || ( order == 'R' ) ||
|
||||
( order == 'c' ) || ( order == 'C' ) ) ?
|
||||
order : 'r';
|
||||
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
|
||||
if ( ( order != 'r' ) && ( order != 'R' ) )
|
||||
{
|
||||
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
|
||||
return; // Only row major supported.
|
||||
}
|
||||
|
||||
// Row major input expected with leading dimensions equal to row stride.
|
||||
if ((lda != k) || (ldb != n) || (ldc != n))
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Check if dimensions are valid.
|
||||
if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0))
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
const inc_t rs_a = lda;
|
||||
const inc_t cs_a = 1;
|
||||
const inc_t rs_b = ldb;
|
||||
@@ -125,22 +116,25 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
|
||||
// Only unpacked A supported now.
|
||||
if (mtag_a != UNPACKED)
|
||||
{
|
||||
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Convert post op struct to post op linked list format.
|
||||
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
|
||||
lpgemm_translate_to_post_ops_list
|
||||
err_t err = lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
post_op_unparsed, post_op_list,
|
||||
( void* )c, ( void* )( &order_use )
|
||||
( void* )c, ( void* )( &order )
|
||||
);
|
||||
|
||||
if( err != BLIS_SUCCESS ) return;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global(&rntm_g);
|
||||
bli_membrk_rntm_set_membrk(&rntm_g);
|
||||
bli_pba_rntm_set_pba(&rntm_g);
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S16OS16 );
|
||||
|
||||
@@ -153,7 +147,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
|
||||
( int16_t* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, TRUE
|
||||
post_op_list, S8
|
||||
);
|
||||
#else
|
||||
lpgemm_u8s8s16o16_thread_decorator
|
||||
@@ -164,7 +158,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
|
||||
( int16_t* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, TRUE
|
||||
post_op_list, S8
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
164
addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c
Normal file
164
addon/aocl_gemm/aocl_gemm_u8s8s16ou8.c
Normal file
@@ -0,0 +1,164 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "aocl_gemm_check.h"
|
||||
#include "lpgemm_types.h"
|
||||
#include "lpgemm_5loop_interface_apis.h"
|
||||
#include "lpgemm_config.h"
|
||||
#include "lpgemm_utils.h"
|
||||
#include "lpgemm_thread_decor_openmp.h"
|
||||
#include "lpgemm_post_ops.h"
|
||||
|
||||
AOCL_GEMM_MATMUL(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8)
|
||||
{
|
||||
trans_t blis_transa;
|
||||
trans_t blis_transb;
|
||||
|
||||
// Check if AVX2 ISA is supported, lpgemm u8s8s16os16 matmul only works with it.
|
||||
if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
|
||||
{
|
||||
bli_print_msg(" AVX2 ISA not supported by processor, "
|
||||
"cannot perform u8s8s16 gemm.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
/* Initialize BLIS. */
|
||||
bli_init_auto();
|
||||
|
||||
// Set MC, NC, KC, NR, MR.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
// check for validity of params.
|
||||
AOCL_GEMM_CHECK
|
||||
(
|
||||
"u8s8s16ou8",
|
||||
order, transa, transb,
|
||||
m, n, k,
|
||||
a, lda, mem_format_a,
|
||||
b, ldb, mem_format_b,
|
||||
c, ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans(transa, &blis_transa);
|
||||
bli_param_map_netlib_to_blis_trans(transb, &blis_transb);
|
||||
|
||||
/* Perform BLAS parameter checking. */
|
||||
// Transpose not supported.
|
||||
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
|
||||
( blis_transb != BLIS_NO_TRANSPOSE ) )
|
||||
{
|
||||
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
if ( ( order != 'r' ) && ( order != 'R' ) )
|
||||
{
|
||||
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
|
||||
return; // Only row major supported.
|
||||
}
|
||||
|
||||
const inc_t rs_a = lda;
|
||||
const inc_t cs_a = 1;
|
||||
const inc_t rs_b = ldb;
|
||||
const inc_t cs_b = 1;
|
||||
const inc_t rs_c = ldc;
|
||||
const inc_t cs_c = 1;
|
||||
|
||||
AOCL_MEMORY_TAG mtag_a;
|
||||
AOCL_MEMORY_TAG mtag_b;
|
||||
|
||||
bli_param_map_char_to_lpmtag(mem_format_a, &mtag_a);
|
||||
bli_param_map_char_to_lpmtag(mem_format_b, &mtag_b);
|
||||
|
||||
// B matrix needs to be packed in a certain format in order to be loaded
|
||||
// and used in VNNI instrution. As such the mtag_b always needs to be either
|
||||
// packed or reordered. B matrix as it is (unpacked) cannot be used, and
|
||||
// the mtag_b is set to packed to enable runtime packing.
|
||||
if (mtag_b == UNPACKED)
|
||||
{
|
||||
mtag_b = PACK;
|
||||
}
|
||||
|
||||
// Only unpacked A supported now.
|
||||
if (mtag_a != UNPACKED)
|
||||
{
|
||||
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Convert post op struct to post op linked list format.
|
||||
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
|
||||
err_t err = lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
post_op_unparsed, post_op_list,
|
||||
( void* )c, ( void* )( &order )
|
||||
);
|
||||
|
||||
if( err != BLIS_SUCCESS ) return;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global(&rntm_g);
|
||||
bli_pba_rntm_set_pba(&rntm_g);
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S16OS16 );
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
lpgemm_u8s8s16o16_openmp_thread_decorator
|
||||
(
|
||||
m, n, k,
|
||||
a, rs_a, cs_a, mtag_a,
|
||||
b, rs_b, cs_b, mtag_b,
|
||||
( int16_t* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, U8
|
||||
);
|
||||
#else
|
||||
lpgemm_u8s8s16o16_thread_decorator
|
||||
(
|
||||
m, n, k,
|
||||
a, rs_a, cs_a, mtag_a,
|
||||
b, rs_b, cs_b, mtag_b,
|
||||
( int16_t* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, U8
|
||||
);
|
||||
#endif
|
||||
}
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "aocl_gemm_check.h"
|
||||
#include "lpgemm_types.h"
|
||||
#include "lpgemm_post_ops.h"
|
||||
#include "lpgemm_thread_decor_openmp.h"
|
||||
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
|
||||
// Set MC, NC, KC, NR, MR.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
// Null check for pointers.
|
||||
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// check for validity of params.
|
||||
AOCL_GEMM_CHECK
|
||||
(
|
||||
"u8s8s32os32",
|
||||
order, transa, transb,
|
||||
m, n, k,
|
||||
a, lda, mem_format_a,
|
||||
b, ldb, mem_format_b,
|
||||
c, ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
|
||||
@@ -75,32 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
|
||||
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
|
||||
( blis_transb != BLIS_NO_TRANSPOSE ) )
|
||||
{
|
||||
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Sanitize order input.
|
||||
char order_use =
|
||||
( ( order == 'r' ) || ( order == 'R' ) ||
|
||||
( order == 'c' ) || ( order == 'C' ) ) ?
|
||||
order : 'r';
|
||||
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
|
||||
if ( ( order != 'r' ) && ( order != 'R' ) )
|
||||
{
|
||||
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
|
||||
return; // Only row major supported.
|
||||
}
|
||||
|
||||
// Row major input expected with leading dimensions equal to row stride.
|
||||
if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Check if dimensions are valid.
|
||||
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
|
||||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
const inc_t rs_a = lda;
|
||||
const inc_t cs_a = 1;
|
||||
const inc_t rs_b = ldb;
|
||||
@@ -126,22 +116,25 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
|
||||
// Only unpacked A supported now.
|
||||
if ( mtag_a != UNPACKED )
|
||||
{
|
||||
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Convert post op struct to post op linked list format.
|
||||
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
|
||||
lpgemm_translate_to_post_ops_list
|
||||
err_t err = lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
post_op_unparsed, post_op_list,
|
||||
( void* )c, ( void* )( &order_use )
|
||||
( void* )c, ( void* )( &order )
|
||||
);
|
||||
|
||||
if( err != BLIS_SUCCESS ) return;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
bli_membrk_rntm_set_membrk( &rntm_g );
|
||||
bli_pba_rntm_set_pba( &rntm_g );
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S32OS32 );
|
||||
|
||||
@@ -154,7 +147,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, S32
|
||||
);
|
||||
#else
|
||||
lpgemm_u8s8s32o32_thread_decorator
|
||||
@@ -165,7 +158,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
|
||||
c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, FALSE
|
||||
post_op_list, S32
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -117,7 +117,7 @@ AOCL_GEMM_REORDER(int8_t,u8s8s32os32)
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
bli_membrk_rntm_set_membrk( &rntm_g );
|
||||
bli_pba_rntm_set_pba( &rntm_g );
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S32OS32 );
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocl_gemm_interface_apis.h"
|
||||
#include "aocl_gemm_check.h"
|
||||
#include "lpgemm_types.h"
|
||||
#include "lpgemm_post_ops.h"
|
||||
#include "lpgemm_thread_decor_openmp.h"
|
||||
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
|
||||
// Set MC, NC, KC, NR, MR.
|
||||
aocl_lpgemm_init_global_cntx();
|
||||
|
||||
// Null check for pointers.
|
||||
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
// check for validity of params.
|
||||
AOCL_GEMM_CHECK
|
||||
(
|
||||
"u8s8s32os8",
|
||||
order, transa, transb,
|
||||
m, n, k,
|
||||
a, lda, mem_format_a,
|
||||
b, ldb, mem_format_b,
|
||||
c, ldc
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
|
||||
@@ -75,32 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
|
||||
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
|
||||
( blis_transb != BLIS_NO_TRANSPOSE ) )
|
||||
{
|
||||
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Sanitize order input.
|
||||
char order_use =
|
||||
( ( order == 'r' ) || ( order == 'R' ) ||
|
||||
( order == 'c' ) || ( order == 'C' ) ) ?
|
||||
order : 'r';
|
||||
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
|
||||
if ( ( order != 'r' ) && ( order != 'R' ) )
|
||||
{
|
||||
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
|
||||
return; // Only row major supported.
|
||||
}
|
||||
|
||||
// Row major input expected with leading dimensions equal to row stride.
|
||||
if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Check if dimensions are valid.
|
||||
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
|
||||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
|
||||
{
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
const inc_t rs_a = lda;
|
||||
const inc_t cs_a = 1;
|
||||
const inc_t rs_b = ldb;
|
||||
@@ -126,22 +116,25 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
|
||||
// Only unpacked A supported now.
|
||||
if ( mtag_a != UNPACKED )
|
||||
{
|
||||
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
|
||||
return; // Error.
|
||||
}
|
||||
|
||||
// Convert post op struct to post op linked list format.
|
||||
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
|
||||
lpgemm_translate_to_post_ops_list
|
||||
err_t err = lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
post_op_unparsed, post_op_list,
|
||||
( void* )c, ( void* )( &order_use )
|
||||
( void* )c, ( void* )( &order )
|
||||
);
|
||||
|
||||
if( err != BLIS_SUCCESS ) return;
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_g;
|
||||
bli_rntm_init_from_global( &rntm_g );
|
||||
bli_membrk_rntm_set_membrk( &rntm_g );
|
||||
bli_pba_rntm_set_pba( &rntm_g );
|
||||
|
||||
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S32OS32 );
|
||||
|
||||
@@ -154,7 +147,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
|
||||
( int32_t* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, TRUE
|
||||
post_op_list, S8
|
||||
);
|
||||
#else
|
||||
lpgemm_u8s8s32o32_thread_decorator
|
||||
@@ -165,7 +158,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
|
||||
( int32_t* )c, rs_c, cs_c,
|
||||
alpha, beta,
|
||||
&rntm_g, lcntx_g,
|
||||
post_op_list, TRUE
|
||||
post_op_list, S8
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -37,7 +37,7 @@
|
||||
#include "lpgemm_func_map.h"
|
||||
#include "lpgemm_blksz_map.h"
|
||||
#include "lpgemm_kernels.h"
|
||||
#include "lpgemm_packb_bf16.h"
|
||||
#include "lpgemm_pack_bf16.h"
|
||||
#include "lpgemm_packb_s16.h"
|
||||
#include "lpgemm_packa.h"
|
||||
#include "lpgemm_packb.h"
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -56,7 +56,7 @@
|
||||
#define LPGEMM_PACKA_FUNC_MAP_AVX512_VNNI_BF16 \
|
||||
PAMACRO(U8S8S16OS16, NULL) \
|
||||
PAMACRO(U8S8S32OS32, packa_k64_u8s8s32o32) \
|
||||
PAMACRO(BF16BF16F32OF32, NULL) \
|
||||
PAMACRO(BF16BF16F32OF32, packa_mr16_bf16bf16f32of32) \
|
||||
PAMACRO(S8S8S32OS32, packa_k64_s8s8s32os32) \
|
||||
PAMACRO(S8S8S16OS16, NULL) \
|
||||
|
||||
@@ -84,7 +84,7 @@
|
||||
#define LPGEMM_PACKA_FUNC_MAP_AVX512_VNNI \
|
||||
PAMACRO(U8S8S16OS16, NULL) \
|
||||
PAMACRO(U8S8S32OS32, packa_k64_u8s8s32o32) \
|
||||
PAMACRO(BF16BF16F32OF32, NULL) \
|
||||
PAMACRO(BF16BF16F32OF32, packa_mr16_bf16bf16f32of32) \
|
||||
PAMACRO(S8S8S32OS32, packa_k64_s8s8s32os32) \
|
||||
PAMACRO(S8S8S16OS16, NULL) \
|
||||
|
||||
@@ -112,7 +112,7 @@
|
||||
#define LPGEMM_PACKA_FUNC_MAP_AVX512 \
|
||||
PAMACRO(U8S8S16OS16, NULL) \
|
||||
PAMACRO(U8S8S32OS32, packa_k64_u8s8s32o32) \
|
||||
PAMACRO(BF16BF16F32OF32, NULL) \
|
||||
PAMACRO(BF16BF16F32OF32, packa_mr16_bf16bf16f32of32) \
|
||||
PAMACRO(S8S8S32OS32, packa_k64_s8s8s32os32) \
|
||||
PAMACRO(S8S8S16OS16, NULL) \
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -34,12 +34,14 @@
|
||||
|
||||
#include "blis.h"
|
||||
#include "lpgemm_5loop_interface_apis.h"
|
||||
#include "lpgemm_packb_bf16.h"
|
||||
#include "lpgemm_pack_bf16.h"
|
||||
#include "lpgemm_kernels.h"
|
||||
#include "lpgemm_utils.h"
|
||||
#include "lpgemm_thrinfo_utils.h"
|
||||
#include "lpgemm_config.h"
|
||||
|
||||
|
||||
|
||||
// Kernel function prototypes
|
||||
typedef void (*lpgemm_rowvar_bf16)
|
||||
(
|
||||
@@ -73,6 +75,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
|
||||
const int16_t* a_use = NULL;
|
||||
dim_t cs_a_use = cs_a;
|
||||
dim_t rs_a_use = rs_a;
|
||||
dim_t a_block_stride = 0;
|
||||
|
||||
const int16_t* b_use = NULL;
|
||||
@@ -86,8 +89,11 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
|
||||
// Pack buffer for B.
|
||||
bfloat16* pack_b_buffer_bf16;
|
||||
bfloat16* pack_a_buffer_bf16;
|
||||
mem_t mem_b = BLIS_MEM_INITIALIZER;
|
||||
mem_t mem_a = BLIS_MEM_INITIALIZER;
|
||||
siz_t mem_b_size_req = 0;
|
||||
siz_t mem_a_size_req = 0;
|
||||
dim_t packb_min_NR = 16;
|
||||
|
||||
// Temporary buffer for C accumulation when downscaling is required.
|
||||
@@ -109,7 +115,8 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
bool is_first_k = FALSE;
|
||||
|
||||
lpgemm_post_op_attr post_ops_attr;
|
||||
if ( c_downscale == TRUE )
|
||||
post_ops_attr.c_stor_type = c_downscale;
|
||||
if ( c_downscale < F32 )
|
||||
{
|
||||
post_ops_attr.buf_downscale = c;
|
||||
}
|
||||
@@ -149,12 +156,12 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
);
|
||||
}
|
||||
|
||||
if ( c_downscale == FALSE )
|
||||
if ( c_downscale == F32 )
|
||||
{
|
||||
c_use_jc = c + jc;
|
||||
}
|
||||
// Temp accumulaton buffer for C allocation.
|
||||
else if ( c_downscale == TRUE )
|
||||
else if ( c_downscale < F32 )
|
||||
{
|
||||
// Buffer memory is only required if output needs to be
|
||||
// persisted across iterations of the pc/KC loop.
|
||||
@@ -167,7 +174,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
|
||||
lpgemm_alloc_mem_panel
|
||||
(
|
||||
mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
|
||||
mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE,
|
||||
&mem_scale_c, rntm
|
||||
);
|
||||
|
||||
@@ -254,11 +261,11 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
if ( ( jc_packb_end > jc_packb_start ) &&
|
||||
( jc_packb_start < ( jc + nc0 ) ) )
|
||||
{
|
||||
( ( packb_bf16 )lcntx->packb_fun_ptr )
|
||||
( ( pack_bf16 )lcntx->packb_fun_ptr )
|
||||
(
|
||||
pack_b_buffer_bf16 + ( jc_packb_start * kc0_updated ),
|
||||
( b + ( rs_b * pc ) + ( cs_b * jc ) +
|
||||
( cs_b * jc_packb_start ) ), rs_b,
|
||||
( cs_b * jc_packb_start ) ), rs_b, cs_b,
|
||||
( jc_packb_end - jc_packb_start ), kc0,
|
||||
&rs_b_use, &cs_b_use
|
||||
);
|
||||
@@ -297,7 +304,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
|
||||
// Only per thread C matrix is stored in temp buffer, so both
|
||||
// per thread jc and ic start should be normalized to zero.
|
||||
if ( c_downscale == TRUE )
|
||||
if ( c_downscale < F32 )
|
||||
{
|
||||
c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
|
||||
}
|
||||
@@ -315,6 +322,31 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
// Non bf16 based kernel requires update to this code.
|
||||
cs_a_use = 2;
|
||||
a_block_stride = rs_a;
|
||||
rs_a_use = rs_a;
|
||||
}
|
||||
else if ( mtag_a == PACK )
|
||||
{
|
||||
|
||||
mem_a_size_req = sizeof( bfloat16 ) * mc0 * kc0;
|
||||
|
||||
lpgemm_alloc_mem_panel
|
||||
(
|
||||
mem_a_size_req, BLIS_BUFFER_FOR_A_BLOCK,
|
||||
&mem_a, rntm
|
||||
);
|
||||
|
||||
pack_a_buffer_bf16 =
|
||||
( bfloat16* ) bli_mem_buffer( &mem_a );
|
||||
|
||||
( ( pack_bf16 )lcntx->packa_fun_ptr )
|
||||
(
|
||||
pack_a_buffer_bf16,
|
||||
( a + ( rs_a * ic ) + ( cs_a * pc )), rs_a, cs_a,
|
||||
mc0, kc0,
|
||||
&rs_a_use, &cs_a_use
|
||||
);
|
||||
a_use = pack_a_buffer_bf16;
|
||||
a_block_stride = rs_a_use;
|
||||
}
|
||||
|
||||
for ( dim_t jr = 0; jr < nc0; jr += NR )
|
||||
@@ -330,7 +362,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
( ( lpgemm_rowvar_bf16 )lcntx->kern_fun_ptr )
|
||||
(
|
||||
mc0, nr0, kc0,
|
||||
a_use, rs_a, cs_a_use, a_block_stride,
|
||||
a_use, rs_a_use, cs_a_use, a_block_stride,
|
||||
( b_use + ( jr * kc0_updated ) ), rs_b_use, cs_b_use,
|
||||
( c_use_ic + jr ), rs_c_use, 1,
|
||||
alpha, beta0,
|
||||
@@ -360,15 +392,22 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_b ) )
|
||||
{
|
||||
bli_membrk_release( rntm, &mem_b );
|
||||
bli_pba_release( rntm, &mem_b );
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( c_downscale == TRUE )
|
||||
if( mtag_a == PACK )
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_a ) )
|
||||
{
|
||||
bli_pba_release(rntm, &mem_a);
|
||||
}
|
||||
}
|
||||
if ( c_downscale < F32 )
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_scale_c ) )
|
||||
{
|
||||
bli_membrk_release( rntm, &mem_scale_c );
|
||||
bli_pba_release( rntm, &mem_scale_c );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "blis.h"
|
||||
#include "lpgemm_utils.h"
|
||||
#include "lpgemm_reorder_bf16.h"
|
||||
#include "lpgemm_packb_bf16.h"
|
||||
#include "lpgemm_pack_bf16.h"
|
||||
#include "lpgemm_config.h"
|
||||
#include "aocl_bf16_type.h"
|
||||
|
||||
@@ -53,6 +53,7 @@ void reorderb_nr64_bf16bf16f32of32
|
||||
|
||||
// Extracting the matrix properties from the lpgemm object
|
||||
dim_t rs_b = b->rs;
|
||||
dim_t cs_b = b->cs;
|
||||
dim_t n = b->width;
|
||||
dim_t k = b->length;
|
||||
|
||||
@@ -148,14 +149,14 @@ void reorderb_nr64_bf16bf16f32of32
|
||||
// st = ( jc_cur_loop * k ) <traverse blocks 1,2,3,4>
|
||||
// + ( n_sub_updated * pc ) <traverse block 5>
|
||||
// + ( NC' * kc0_updated) <traverse block 6>
|
||||
( ( packb_bf16 )lcntx->packb_fun_ptr )
|
||||
( ( pack_bf16 )lcntx->packb_fun_ptr )
|
||||
(
|
||||
( ( ( bfloat16* )b_reorder->storage.aligned_buffer ) +
|
||||
( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) +
|
||||
( jc_cur_loop_rem * kc0_updated ) ),
|
||||
( ( bfloat16* )b_reorder->storage.aligned_buffer ) +
|
||||
( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) +
|
||||
( jc_cur_loop_rem * kc0_updated ),
|
||||
( ( ( bfloat16* )b->storage.aligned_buffer ) +
|
||||
( rs_b * pc ) + jc ),
|
||||
rs_b, nc0, kc0, &rs_b_reorder, &cs_b_reorder
|
||||
( rs_b * pc ) + (jc * cs_b)),
|
||||
rs_b, cs_b, nc0, kc0, &rs_b_reorder, &cs_b_reorder
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -150,7 +150,8 @@ LPGEMM_5LOOP(float,float,float,f32f32f32of32)
|
||||
bool is_first_k = FALSE;
|
||||
|
||||
lpgemm_post_op_attr post_ops_attr;
|
||||
if ( c_downscale == TRUE )
|
||||
post_ops_attr.c_stor_type = c_downscale;
|
||||
if ( c_downscale < F32 )
|
||||
{
|
||||
post_ops_attr.buf_downscale = c;
|
||||
}
|
||||
@@ -395,7 +396,7 @@ LPGEMM_5LOOP(float,float,float,f32f32f32of32)
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_b ) )
|
||||
{
|
||||
bli_membrk_release( rntm, &mem_b );
|
||||
bli_pba_release( rntm, &mem_b );
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -403,7 +404,7 @@ LPGEMM_5LOOP(float,float,float,f32f32f32of32)
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_a ) )
|
||||
{
|
||||
bli_membrk_release( rntm, &mem_a );
|
||||
bli_pba_release( rntm, &mem_a );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -62,7 +62,7 @@ void lpgemm_rowvar_ ## LP_SFX \
|
||||
lpgemm_thrinfo_t* thread, \
|
||||
lpgemm_cntx_t* lcntx, \
|
||||
lpgemm_post_op* post_op_list, \
|
||||
bool c_downscale \
|
||||
AOCL_STORAGE_TYPE c_downscale \
|
||||
) \
|
||||
|
||||
LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32);
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -55,7 +55,7 @@ BLIS_INLINE void lpgemm_set_node_params
|
||||
post_op_node->next = NULL;
|
||||
}
|
||||
|
||||
void lpgemm_translate_to_post_ops_list
|
||||
err_t lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
aocl_post_op* post_op_unparsed,
|
||||
lpgemm_post_op* post_op_list,
|
||||
@@ -70,7 +70,7 @@ void lpgemm_translate_to_post_ops_list
|
||||
post_op_list, POST_OPS_DISABLE,
|
||||
NULL, NULL, NULL, NULL, FALSE
|
||||
);
|
||||
return;
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
if ( ( post_op_unparsed->seq_length > AOCL_MAX_POST_OPS ) )
|
||||
@@ -80,7 +80,7 @@ void lpgemm_translate_to_post_ops_list
|
||||
post_op_list, POST_OPS_DISABLE,
|
||||
NULL, NULL, NULL, NULL, FALSE
|
||||
);
|
||||
return; //Error, seq length exceeds max post ops permitted.
|
||||
return BLIS_SUCCESS; //Error, seq length exceeds max post ops permitted.
|
||||
}
|
||||
|
||||
dim_t e_i = 0; //Multiple eltwise supported.
|
||||
@@ -110,6 +110,11 @@ void lpgemm_translate_to_post_ops_list
|
||||
tmp_code = POST_OPS_RELU;
|
||||
break;
|
||||
case PRELU:
|
||||
if( ( post_op_unparsed->eltwise + e_i )->algo.alpha == NULL )
|
||||
{
|
||||
bli_print_msg(" Post_op.alpha is NULL. Exiting..", __FILE__, __LINE__ );
|
||||
return BLIS_NULL_POINTER;
|
||||
}
|
||||
tmp_code = POST_OPS_RELU_SCALE;
|
||||
break;
|
||||
case GELU_TANH:
|
||||
@@ -119,6 +124,12 @@ void lpgemm_translate_to_post_ops_list
|
||||
tmp_code = POST_OPS_GELU_ERF;
|
||||
break;
|
||||
case CLIP:
|
||||
if( ( ( post_op_unparsed->eltwise + e_i )->algo.alpha == NULL ) ||
|
||||
( ( post_op_unparsed->eltwise + e_i )->algo.beta == NULL ) )
|
||||
{
|
||||
bli_print_msg(" Post_op.clip min or max value is NULL. Exiting..", __FILE__, __LINE__ );
|
||||
return BLIS_NULL_POINTER;
|
||||
}
|
||||
tmp_code = POST_OPS_CLIP;
|
||||
break;
|
||||
default:
|
||||
@@ -137,6 +148,11 @@ void lpgemm_translate_to_post_ops_list
|
||||
}
|
||||
break;
|
||||
case BIAS:
|
||||
if( post_op_unparsed->bias.bias == NULL )
|
||||
{
|
||||
bli_print_msg(" Post_op.bias is NULL. Exiting..", __FILE__, __LINE__ );
|
||||
return BLIS_NULL_POINTER;
|
||||
}
|
||||
lpgemm_set_node_params
|
||||
(
|
||||
( post_op_list + i ), POST_OPS_BIAS,
|
||||
@@ -145,6 +161,12 @@ void lpgemm_translate_to_post_ops_list
|
||||
);
|
||||
break;
|
||||
case SCALE:
|
||||
if( ( post_op_unparsed->sum.scale_factor == NULL ) ||
|
||||
( post_op_unparsed->sum.zero_point == NULL ) )
|
||||
{
|
||||
bli_print_msg(" Post_op.scale scale_factor or zero_point is NULL. Exiting..", __FILE__, __LINE__ );
|
||||
return BLIS_NULL_POINTER;
|
||||
}
|
||||
lpgemm_set_node_params
|
||||
(
|
||||
( post_op_list + i ), POST_OPS_DOWNSCALE,
|
||||
@@ -163,4 +185,5 @@ void lpgemm_translate_to_post_ops_list
|
||||
( post_op_list + i )->next = ( post_op_list + i + 1);
|
||||
}
|
||||
}
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -70,12 +70,13 @@ typedef struct lpgemm_post_op_attr_t
|
||||
void* buf_downscale;
|
||||
bool is_first_k;
|
||||
bool is_last_k;
|
||||
AOCL_STORAGE_TYPE c_stor_type;
|
||||
dim_t b_sum_offset;
|
||||
int32_t* b_col_sum_vec;
|
||||
int16_t* b_col_sum_vec_s16;
|
||||
} lpgemm_post_op_attr;
|
||||
|
||||
void lpgemm_translate_to_post_ops_list
|
||||
err_t lpgemm_translate_to_post_ops_list
|
||||
(
|
||||
aocl_post_op* post_op_unparsed,
|
||||
lpgemm_post_op* post_op_list,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -42,6 +42,24 @@ typedef enum
|
||||
INT32 = 2
|
||||
} AOCL_ARRAY_TYPE;
|
||||
|
||||
// Enum to denote the storage data type (output matrix).
|
||||
// It is expected that the enum entries are in ascending order of
|
||||
// storage data type size.
// The explicit numeric values encode that ordering: the lpgemm 5-loop
// code compares entries relationally (e.g. `c_downscale < F32`,
// `c_downscale < S32`, `c_downscale < S16`) to detect a downscaled
// output, so any new entry must be placed without breaking the order.
|
||||
typedef enum
|
||||
{
|
||||
S8 = 0,
|
||||
U8 = 1,
|
||||
S16 = 2,
|
||||
U16 = 3,
|
||||
BF16 = 4,
|
||||
S32 = 5,
|
||||
U32 = 6,
|
||||
F32 = 7,
|
||||
S64 = 8,
|
||||
U64 = 9,
|
||||
F64 = 10
|
||||
} AOCL_STORAGE_TYPE;
|
||||
|
||||
// Enum name template:A_mat_type ## B_mat_type ## Accumulate_type ## C_mat_type.
|
||||
typedef enum
|
||||
{
|
||||
|
||||
@@ -116,7 +116,8 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16)
|
||||
bool is_first_k = FALSE;
|
||||
|
||||
lpgemm_post_op_attr post_ops_attr;
|
||||
if ( c_downscale == TRUE )
|
||||
post_ops_attr.c_stor_type = c_downscale;
|
||||
if ( c_downscale < S16 )
|
||||
{
|
||||
post_ops_attr.buf_downscale = c;
|
||||
}
|
||||
@@ -156,12 +157,12 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16)
|
||||
);
|
||||
}
|
||||
|
||||
if ( c_downscale == FALSE )
|
||||
if ( c_downscale == S16 )
|
||||
{
|
||||
c_use_jc = c + jc;
|
||||
}
|
||||
// Temp accumulaton buffer for C allocation.
|
||||
else if ( c_downscale == TRUE )
|
||||
else if ( c_downscale < S16 )
|
||||
{
|
||||
// Buffer memory is only required if output needs to be
|
||||
// persisted across iterations of the pc/KC loop.
|
||||
@@ -174,7 +175,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16)
|
||||
|
||||
lpgemm_alloc_mem_panel
|
||||
(
|
||||
mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
|
||||
mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE,
|
||||
&mem_scale_c, rntm
|
||||
);
|
||||
|
||||
@@ -329,7 +330,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16)
|
||||
|
||||
// Only per thread C matrix is stored in temp buffer, so both
|
||||
// per thread jc and ic start should be normalized to zero.
|
||||
if ( c_downscale == TRUE )
|
||||
if ( c_downscale < S16 )
|
||||
{
|
||||
c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
|
||||
}
|
||||
@@ -388,15 +389,15 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16)
|
||||
{
|
||||
if (bli_mem_is_alloc(&mem_b))
|
||||
{
|
||||
bli_membrk_release(rntm, &mem_b);
|
||||
bli_pba_release(rntm, &mem_b);
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( c_downscale == TRUE )
|
||||
if ( c_downscale < S16 )
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_scale_c ) )
|
||||
{
|
||||
bli_membrk_release( rntm, &mem_scale_c );
|
||||
bli_pba_release( rntm, &mem_scale_c );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,7 +123,8 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
|
||||
bool is_first_k = FALSE;
|
||||
|
||||
lpgemm_post_op_attr post_ops_attr;
|
||||
if ( c_downscale == TRUE )
|
||||
post_ops_attr.c_stor_type = c_downscale;
|
||||
if ( c_downscale < S32 )
|
||||
{
|
||||
post_ops_attr.buf_downscale = c;
|
||||
}
|
||||
@@ -163,12 +164,12 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
|
||||
);
|
||||
}
|
||||
|
||||
if ( c_downscale == FALSE )
|
||||
if ( c_downscale == S32 )
|
||||
{
|
||||
c_use_jc = c + jc;
|
||||
}
|
||||
// Temp accumulaton buffer for C allocation.
|
||||
else if ( c_downscale == TRUE )
|
||||
else if ( c_downscale < S32 )
|
||||
{
|
||||
// Buffer memory is only required if output needs to be
|
||||
// persisted across iterations of the pc/KC loop.
|
||||
@@ -181,7 +182,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
|
||||
|
||||
lpgemm_alloc_mem_panel
|
||||
(
|
||||
mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
|
||||
mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE,
|
||||
&mem_scale_c, rntm
|
||||
);
|
||||
|
||||
@@ -335,7 +336,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
|
||||
|
||||
// Only per thread C matrix is stored in temp buffer, so both
|
||||
// per thread jc and ic start should be normalized to zero.
|
||||
if ( c_downscale == TRUE )
|
||||
if ( c_downscale < S32 )
|
||||
{
|
||||
c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
|
||||
}
|
||||
@@ -426,7 +427,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_b ) )
|
||||
{
|
||||
bli_membrk_release( rntm, &mem_b );
|
||||
bli_pba_release( rntm, &mem_b );
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -434,14 +435,14 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_a ) )
|
||||
{
|
||||
bli_membrk_release( rntm, &mem_a );
|
||||
bli_pba_release( rntm, &mem_a );
|
||||
}
|
||||
}
|
||||
if ( c_downscale == TRUE )
|
||||
if ( c_downscale < S32 )
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_scale_c ) )
|
||||
{
|
||||
bli_membrk_release( rntm, &mem_scale_c );
|
||||
bli_pba_release( rntm, &mem_scale_c );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,7 +123,7 @@ BLIS_INLINE void lpgemm_alloc_mem_panel
|
||||
{
|
||||
if ( bli_mem_is_unalloc( mem ) )
|
||||
{
|
||||
bli_membrk_acquire_m
|
||||
bli_pba_acquire_m
|
||||
(
|
||||
rntm_l,
|
||||
size_req,
|
||||
@@ -136,8 +136,8 @@ BLIS_INLINE void lpgemm_alloc_mem_panel
|
||||
siz_t mem_size = bli_mem_size( mem );
|
||||
if ( mem_size < size_req )
|
||||
{
|
||||
bli_membrk_release( rntm_l, mem );
|
||||
bli_membrk_acquire_m
|
||||
bli_pba_release( rntm_l, mem );
|
||||
bli_pba_acquire_m
|
||||
(
|
||||
rntm_l,
|
||||
size_req,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -268,7 +268,7 @@ BLIS_INLINE void lpgemm_adjust_ic_jc_ways
|
||||
}
|
||||
}
|
||||
|
||||
BLIS_INLINE void lpgemm_u8s8s16o16_get_threading
|
||||
BLIS_INLINE void lpgemm_s16o16_get_threading
|
||||
(
|
||||
dim_t* n_threads,
|
||||
dim_t* ic_ways,
|
||||
@@ -276,7 +276,8 @@ BLIS_INLINE void lpgemm_u8s8s16o16_get_threading
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
rntm_t* rntm_g
|
||||
rntm_t* rntm_g,
|
||||
AOCL_OPERATION_TYPE op_type
|
||||
)
|
||||
{
|
||||
*n_threads = bli_rntm_num_threads( rntm_g );
|
||||
@@ -295,19 +296,176 @@ BLIS_INLINE void lpgemm_u8s8s16o16_get_threading
|
||||
else if ( ( *n_threads ) > 1 )
|
||||
{
|
||||
|
||||
dim_t NR = lpgemm_get_block_size_NR_global_cntx( U8S8S16OS16 );
|
||||
dim_t NR = lpgemm_get_block_size_NR_global_cntx( op_type );
|
||||
dim_t MR = lpgemm_get_block_size_MR_global_cntx( op_type );
|
||||
dim_t mr_blks = ( m + MR - 1 ) / MR;
|
||||
dim_t nr_blks = ( n + NR - 1 ) / NR;
|
||||
|
||||
if ( n <= NR )
|
||||
{
|
||||
// If n is less than micro panel dimension, allocating all threads
|
||||
// to ic resulted in gains.
|
||||
( *ic_ways ) = ( *n_threads );
|
||||
( *ic_ways ) = ( mr_blks < ( *n_threads ) ) ? mr_blks : ( *n_threads );
|
||||
( *jc_ways ) = 1;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else if ( m <= MR )
|
||||
{
|
||||
( *jc_ways ) = ( nr_blks < ( *n_threads ) ) ? nr_blks : ( *n_threads );
|
||||
( *ic_ways ) = 1;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else
|
||||
{
|
||||
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
|
||||
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
|
||||
if ( ( mr_blks < ( *ic_ways ) ) && ( nr_blks < ( *jc_ways ) ) )
|
||||
{
|
||||
( *ic_ways ) = mr_blks;
|
||||
( *jc_ways ) = nr_blks;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else if ( mr_blks < ( *ic_ways ) )
|
||||
{
|
||||
( *ic_ways ) = mr_blks;
|
||||
dim_t rem_jc_ways = ( dim_t )( ( *n_threads ) / ( *ic_ways ) );
|
||||
( *jc_ways ) = ( rem_jc_ways < nr_blks ) ? rem_jc_ways : nr_blks;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else if ( nr_blks < ( *jc_ways ) )
|
||||
{
|
||||
( *jc_ways ) = nr_blks;
|
||||
dim_t rem_ic_ways = ( dim_t )( ( *n_threads ) / ( *jc_ways ) );
|
||||
( *ic_ways ) = ( rem_ic_ways < mr_blks ) ? rem_ic_ways : mr_blks;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Setting all the values to 1 in case n_threads <= 1. This ensures
|
||||
// the threading parameters are valid.
|
||||
*n_threads = 1;
|
||||
*jc_ways = 1;
|
||||
*ic_ways = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Threading-parameter query for the u8s8s16os16 variant.
// Thin wrapper: forwards every argument unchanged to
// lpgemm_s16o16_get_threading and fixes the operation type to U8S8S16OS16.
// n_threads, ic_ways and jc_ways are output pointers that the shared
// helper writes; this wrapper itself reads or modifies nothing else.
BLIS_INLINE void lpgemm_u8s8s16o16_get_threading
|
||||
(
|
||||
dim_t* n_threads,
|
||||
dim_t* ic_ways,
|
||||
dim_t* jc_ways,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
rntm_t* rntm_g
|
||||
)
|
||||
{
|
||||
lpgemm_s16o16_get_threading
|
||||
(
|
||||
n_threads,
|
||||
ic_ways, jc_ways,
|
||||
m, n, k, rntm_g,
|
||||
U8S8S16OS16
|
||||
);
|
||||
}
|
||||
|
||||
// Threading-parameter query for the s8s8s16os16 variant.
// Thin wrapper: forwards every argument unchanged to
// lpgemm_s16o16_get_threading and fixes the operation type to S8S8S16OS16.
// n_threads, ic_ways and jc_ways are output pointers that the shared
// helper writes; this wrapper itself reads or modifies nothing else.
BLIS_INLINE void lpgemm_s8s8s16o16_get_threading
|
||||
(
|
||||
dim_t* n_threads,
|
||||
dim_t* ic_ways,
|
||||
dim_t* jc_ways,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
rntm_t* rntm_g
|
||||
)
|
||||
{
|
||||
lpgemm_s16o16_get_threading
|
||||
(
|
||||
n_threads,
|
||||
ic_ways, jc_ways,
|
||||
m, n, k, rntm_g,
|
||||
S8S8S16OS16
|
||||
);
|
||||
}
|
||||
|
||||
BLIS_INLINE void lpgemm_s32o32_get_threading
|
||||
(
|
||||
dim_t* n_threads,
|
||||
dim_t* ic_ways,
|
||||
dim_t* jc_ways,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
rntm_t* rntm_g,
|
||||
AOCL_OPERATION_TYPE op_type
|
||||
)
|
||||
{
|
||||
*n_threads = bli_rntm_num_threads( rntm_g );
|
||||
*jc_ways = bli_rntm_jc_ways( rntm_g );
|
||||
*ic_ways = bli_rntm_ic_ways( rntm_g );
|
||||
|
||||
if ( ( ( *ic_ways ) > 0 ) || ( ( *jc_ways ) > 0 ) )
|
||||
{
|
||||
// If BLIS_IC_NT or JC_NT are set.
|
||||
// Default cases.
|
||||
*ic_ways = ( ( *ic_ways ) > 0 ) ? ( *ic_ways ) : 1;
|
||||
*jc_ways = ( ( *jc_ways ) > 0 ) ? ( *jc_ways ) : 1;
|
||||
|
||||
*n_threads = ( *jc_ways ) * ( *ic_ways );
|
||||
}
|
||||
else if ( ( *n_threads ) > 1 )
|
||||
{
|
||||
|
||||
dim_t NR = lpgemm_get_block_size_NR_global_cntx( op_type );
|
||||
dim_t MR = lpgemm_get_block_size_MR_global_cntx( op_type );
|
||||
dim_t mr_blks = ( m + MR - 1 ) / MR;
|
||||
dim_t nr_blks = ( n + NR - 1 ) / NR;
|
||||
|
||||
if ( n <= NR )
|
||||
{
|
||||
( *ic_ways ) = ( mr_blks < ( *n_threads ) ) ? mr_blks : ( *n_threads );
|
||||
( *jc_ways ) = 1;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else if ( m <= MR )
|
||||
{
|
||||
( *jc_ways ) = ( nr_blks < ( *n_threads ) ) ? nr_blks : ( *n_threads );
|
||||
( *ic_ways ) = 1;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else
|
||||
{
|
||||
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
|
||||
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
|
||||
if ( ( mr_blks < ( *ic_ways ) ) && ( nr_blks < ( *jc_ways ) ) )
|
||||
{
|
||||
( *ic_ways ) = mr_blks;
|
||||
( *jc_ways ) = nr_blks;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else if ( mr_blks < ( *ic_ways ) )
|
||||
{
|
||||
( *ic_ways ) = mr_blks;
|
||||
dim_t rem_jc_ways = ( dim_t )( ( *n_threads ) / ( *ic_ways ) );
|
||||
( *jc_ways ) = ( rem_jc_ways < nr_blks ) ? rem_jc_ways : nr_blks;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else if ( nr_blks < ( *jc_ways ) )
|
||||
{
|
||||
( *jc_ways ) = nr_blks;
|
||||
dim_t rem_ic_ways = ( dim_t )( ( *n_threads ) / ( *jc_ways ) );
|
||||
( *ic_ways ) = ( rem_ic_ways < mr_blks ) ? rem_ic_ways : mr_blks;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else
|
||||
{
|
||||
lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
|
||||
(
|
||||
MR, NR, m, n,
|
||||
n_threads, ic_ways, jc_ways
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -331,52 +489,33 @@ BLIS_INLINE void lpgemm_u8s8s32o32_get_threading
|
||||
rntm_t* rntm_g
|
||||
)
|
||||
{
|
||||
*n_threads = bli_rntm_num_threads( rntm_g );
|
||||
*jc_ways = bli_rntm_jc_ways( rntm_g );
|
||||
*ic_ways = bli_rntm_ic_ways( rntm_g );
|
||||
lpgemm_s32o32_get_threading
|
||||
(
|
||||
n_threads,
|
||||
ic_ways, jc_ways,
|
||||
m, n, k, rntm_g,
|
||||
U8S8S32OS32
|
||||
);
|
||||
}
|
||||
|
||||
if ( ( ( *ic_ways ) > 0 ) || ( ( *jc_ways ) > 0 ) )
|
||||
{
|
||||
// If BLIS_IC_NT or JC_NT are set.
|
||||
// Default cases.
|
||||
*ic_ways = ( ( *ic_ways ) > 0 ) ? ( *ic_ways ) : 1;
|
||||
*jc_ways = ( ( *jc_ways ) > 0 ) ? ( *jc_ways ) : 1;
|
||||
|
||||
*n_threads = ( *jc_ways ) * ( *ic_ways );
|
||||
}
|
||||
else if ( ( *n_threads ) > 1 )
|
||||
{
|
||||
|
||||
dim_t NR = lpgemm_get_block_size_NR_global_cntx( U8S8S32OS32 );
|
||||
dim_t MR = lpgemm_get_block_size_MR_global_cntx( U8S8S32OS32 );
|
||||
|
||||
if ( n <= NR )
|
||||
{
|
||||
// If n is less than micro panel dimension, allocating all threads
|
||||
// to ic resulted in gains.
|
||||
( *ic_ways ) = ( *n_threads );
|
||||
( *jc_ways ) = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
|
||||
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
|
||||
|
||||
lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
|
||||
(
|
||||
MR, NR, m, n,
|
||||
n_threads, ic_ways, jc_ways
|
||||
);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Setting all the values to 1 in case n_threads <= 1. This ensures
|
||||
// the threading parameters are valid.
|
||||
*n_threads = 1;
|
||||
*jc_ways = 1;
|
||||
*ic_ways = 1;
|
||||
}
|
||||
// Threading-parameter query for the s8s8s32os32 variant.
// Thin wrapper: forwards every argument unchanged to
// lpgemm_s32o32_get_threading and fixes the operation type to S8S8S32OS32.
// n_threads, ic_ways and jc_ways are output pointers that the shared
// helper writes; this wrapper itself reads or modifies nothing else.
BLIS_INLINE void lpgemm_s8s8s32o32_get_threading
|
||||
(
|
||||
dim_t* n_threads,
|
||||
dim_t* ic_ways,
|
||||
dim_t* jc_ways,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
rntm_t* rntm_g
|
||||
)
|
||||
{
|
||||
lpgemm_s32o32_get_threading
|
||||
(
|
||||
n_threads,
|
||||
ic_ways, jc_ways,
|
||||
m, n, k, rntm_g,
|
||||
S8S8S32OS32
|
||||
);
|
||||
}
|
||||
|
||||
BLIS_INLINE void lpgemm_bf16bf16f32of32_get_threading
|
||||
@@ -408,24 +547,53 @@ BLIS_INLINE void lpgemm_bf16bf16f32of32_get_threading
|
||||
|
||||
dim_t NR = lpgemm_get_block_size_NR_global_cntx( BF16BF16F32OF32 );
|
||||
dim_t MR = lpgemm_get_block_size_MR_global_cntx( BF16BF16F32OF32 );
|
||||
dim_t mr_blks = ( m + MR - 1 ) / MR;
|
||||
dim_t nr_blks = ( n + NR - 1 ) / NR;
|
||||
|
||||
if ( n <= NR )
|
||||
{
|
||||
// If n is less than micro panel dimension, allocating all threads
|
||||
// to ic resulted in gains.
|
||||
( *ic_ways ) = ( *n_threads );
|
||||
( *ic_ways ) = ( mr_blks < ( *n_threads ) ) ? mr_blks : ( *n_threads );
|
||||
( *jc_ways ) = 1;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else if ( m <= MR )
|
||||
{
|
||||
( *jc_ways ) = ( nr_blks < ( *n_threads ) ) ? nr_blks : ( *n_threads );
|
||||
( *ic_ways ) = 1;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else
|
||||
{
|
||||
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
|
||||
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
|
||||
|
||||
lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
|
||||
(
|
||||
MR, NR, m, n,
|
||||
n_threads, ic_ways, jc_ways
|
||||
);
|
||||
if ( ( mr_blks < ( *ic_ways ) ) && ( nr_blks < ( *jc_ways ) ) )
|
||||
{
|
||||
( *ic_ways ) = mr_blks;
|
||||
( *jc_ways ) = nr_blks;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else if ( mr_blks < ( *ic_ways ) )
|
||||
{
|
||||
( *ic_ways ) = mr_blks;
|
||||
dim_t rem_jc_ways = ( dim_t )( ( *n_threads ) / ( *ic_ways ) );
|
||||
( *jc_ways ) = ( rem_jc_ways < nr_blks ) ? rem_jc_ways : nr_blks;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else if ( nr_blks < ( *jc_ways ) )
|
||||
{
|
||||
( *jc_ways ) = nr_blks;
|
||||
dim_t rem_ic_ways = ( dim_t )( ( *n_threads ) / ( *jc_ways ) );
|
||||
( *ic_ways ) = ( rem_ic_ways < mr_blks ) ? rem_ic_ways : mr_blks;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else
|
||||
{
|
||||
lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
|
||||
(
|
||||
MR, NR, m, n,
|
||||
n_threads, ic_ways, jc_ways
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -485,15 +653,55 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading
|
||||
}
|
||||
else if ( ( *n_threads ) > 1 )
|
||||
{
|
||||
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
|
||||
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
|
||||
dim_t mr_blks = ( m + MR - 1 ) / MR;
|
||||
dim_t nr_blks = ( n + NR - 1 ) / NR;
|
||||
|
||||
lpgemm_adjust_ic_jc_ways
|
||||
(
|
||||
m, n, k,
|
||||
MC, NC, KC, MR, NR,
|
||||
n_threads, ic_ways, jc_ways, 5
|
||||
);
|
||||
if ( n <= NR )
|
||||
{
|
||||
( *ic_ways ) = ( mr_blks < ( *n_threads ) ) ? mr_blks : ( *n_threads );
|
||||
( *jc_ways ) = 1;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else if ( m <= MR )
|
||||
{
|
||||
( *jc_ways ) = ( nr_blks < ( *n_threads ) ) ? nr_blks : ( *n_threads );
|
||||
( *ic_ways ) = 1;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else
|
||||
{
|
||||
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
|
||||
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
|
||||
if ( ( mr_blks < ( *ic_ways ) ) && ( nr_blks < ( *jc_ways ) ) )
|
||||
{
|
||||
( *ic_ways ) = mr_blks;
|
||||
( *jc_ways ) = nr_blks;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else if ( mr_blks < ( *ic_ways ) )
|
||||
{
|
||||
( *ic_ways ) = mr_blks;
|
||||
dim_t rem_jc_ways = ( dim_t )( ( *n_threads ) / ( *ic_ways ) );
|
||||
( *jc_ways ) = ( rem_jc_ways < nr_blks ) ? rem_jc_ways : nr_blks;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else if ( nr_blks < ( *jc_ways ) )
|
||||
{
|
||||
( *jc_ways ) = nr_blks;
|
||||
dim_t rem_ic_ways = ( dim_t )( ( *n_threads ) / ( *jc_ways ) );
|
||||
( *ic_ways ) = ( rem_ic_ways < mr_blks ) ? rem_ic_ways : mr_blks;
|
||||
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
|
||||
}
|
||||
else
|
||||
{
|
||||
lpgemm_adjust_ic_jc_ways
|
||||
(
|
||||
m, n, k,
|
||||
MC, NC, KC, MR, NR,
|
||||
n_threads, ic_ways, jc_ways, 5
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -513,9 +721,8 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading
|
||||
|
||||
if ( ( m >= MT ) && ( n >= NT ) && ( k >= KT ) )
|
||||
{
|
||||
if ( ( k > page_size_b_floatx2 ) ||
|
||||
( ( k <= page_size_b_floatx2 ) &&
|
||||
( m_ic > MT_2 ) && ( n_jc >= NT ) ) )
|
||||
if (((k <= page_size_b_floatx2 ) && ( m_ic > MT_2 ) && ( n_jc >= NT ) ) ||
|
||||
((bli_cpuid_is_avx512_supported() == FALSE ) && (k > page_size_b_floatx2)))
|
||||
{
|
||||
bli_rntm_set_pack_b( 1, rntm_g );
|
||||
bli_rntm_set_pack_a( 1, rntm_g );
|
||||
@@ -523,118 +730,6 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading
|
||||
}
|
||||
}
|
||||
|
||||
BLIS_INLINE void lpgemm_s8s8s32o32_get_threading
|
||||
(
|
||||
dim_t* n_threads,
|
||||
dim_t* ic_ways,
|
||||
dim_t* jc_ways,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
rntm_t* rntm_g
|
||||
)
|
||||
{
|
||||
*n_threads = bli_rntm_num_threads( rntm_g );
|
||||
*jc_ways = bli_rntm_jc_ways( rntm_g );
|
||||
*ic_ways = bli_rntm_ic_ways( rntm_g );
|
||||
|
||||
if ( ( ( *ic_ways ) > 0 ) || ( ( *jc_ways ) > 0 ) )
|
||||
{
|
||||
// If BLIS_IC_NT or JC_NT are set.
|
||||
// Default cases.
|
||||
*ic_ways = ( ( *ic_ways ) > 0 ) ? ( *ic_ways ) : 1;
|
||||
*jc_ways = ( ( *jc_ways ) > 0 ) ? ( *jc_ways ) : 1;
|
||||
|
||||
*n_threads = ( *jc_ways ) * ( *ic_ways );
|
||||
}
|
||||
else if ( ( *n_threads ) > 1 )
|
||||
{
|
||||
|
||||
dim_t NR = lpgemm_get_block_size_NR_global_cntx( S8S8S32OS32 );
|
||||
dim_t MR = lpgemm_get_block_size_MR_global_cntx( S8S8S32OS32 );
|
||||
|
||||
if ( n <= NR )
|
||||
{
|
||||
// If n is less than micro panel dimension, allocating all threads
|
||||
// to ic resulted in gains.
|
||||
( *ic_ways ) = ( *n_threads );
|
||||
( *jc_ways ) = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
|
||||
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
|
||||
|
||||
lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
|
||||
(
|
||||
MR, NR, m, n,
|
||||
n_threads, ic_ways, jc_ways
|
||||
);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Setting all the values to 1 in case n_threads <= 1. This ensures
|
||||
// the threading parameters are valid.
|
||||
*n_threads = 1;
|
||||
*jc_ways = 1;
|
||||
*ic_ways = 1;
|
||||
}
|
||||
}
|
||||
|
||||
BLIS_INLINE void lpgemm_s8s8s16o16_get_threading
|
||||
(
|
||||
dim_t* n_threads,
|
||||
dim_t* ic_ways,
|
||||
dim_t* jc_ways,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
rntm_t* rntm_g
|
||||
)
|
||||
{
|
||||
*n_threads = bli_rntm_num_threads( rntm_g );
|
||||
*jc_ways = bli_rntm_jc_ways( rntm_g );
|
||||
*ic_ways = bli_rntm_ic_ways( rntm_g );
|
||||
|
||||
if ( ( ( *ic_ways ) > 0 ) || ( ( *jc_ways ) > 0 ) )
|
||||
{
|
||||
// If BLIS_IC_NT or JC_NT are set.
|
||||
// Default cases.
|
||||
*ic_ways = ( ( *ic_ways ) > 0 ) ? ( *ic_ways ) : 1;
|
||||
*jc_ways = ( ( *jc_ways ) > 0 ) ? ( *jc_ways ) : 1;
|
||||
|
||||
*n_threads = ( *jc_ways ) * ( *ic_ways );
|
||||
}
|
||||
else if ( ( *n_threads ) > 1 )
|
||||
{
|
||||
|
||||
dim_t NR = lpgemm_get_block_size_NR_global_cntx( S8S8S16OS16 );
|
||||
|
||||
if ( n <= NR )
|
||||
{
|
||||
// If n is less than micro panel dimension, allocating all threads
|
||||
// to ic resulted in gains.
|
||||
( *ic_ways ) = ( *n_threads );
|
||||
( *jc_ways ) = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
|
||||
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Setting all the values to 1 in case n_threads <= 1. This ensures
|
||||
// the threading parameters are valid.
|
||||
*n_threads = 1;
|
||||
*jc_ways = 1;
|
||||
*ic_ways = 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define GEN_LPGEMM_OPENMP_DECORATOR(A_type,B_type,C_type,LPGEMM_SFX) \
|
||||
void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \
|
||||
( \
|
||||
@@ -657,7 +752,7 @@ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \
|
||||
rntm_t* rntm_g, \
|
||||
lpgemm_cntx_t* lcntx, \
|
||||
lpgemm_post_op* post_op_list, \
|
||||
bool c_downscale \
|
||||
AOCL_STORAGE_TYPE c_downscale \
|
||||
) \
|
||||
{ \
|
||||
dim_t n_threads; \
|
||||
@@ -676,14 +771,15 @@ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \
|
||||
/* Set the packing block allocator field of the rntm. This will be
|
||||
* inherited by all of the child threads when they make local copies of
|
||||
* the rntm below.*/ \
|
||||
bli_membrk_rntm_set_membrk( rntm_g ); \
|
||||
bli_pba_rntm_set_pba( rntm_g ); \
|
||||
\
|
||||
thrcomm_t static_lpgemm_comms[BLIS_LPGEMM_NUM_STATIC_COMMS]; \
|
||||
thrcomm_t* cur_lpgemm_comms = static_lpgemm_comms; \
|
||||
err_t bli_errors = BLIS_SUCCESS; \
|
||||
\
|
||||
if ( jc_ways > BLIS_LPGEMM_NUM_STATIC_COMMS ) \
|
||||
{ \
|
||||
cur_lpgemm_comms = bli_malloc_intl( jc_ways * sizeof( thrcomm_t ) ); \
|
||||
cur_lpgemm_comms = bli_malloc_intl( jc_ways * sizeof( thrcomm_t ), &bli_errors ); \
|
||||
} \
|
||||
for ( dim_t i = 0; i < jc_ways; ++i ) \
|
||||
{ \
|
||||
@@ -758,7 +854,7 @@ void lpgemm_ ## LPGEMM_SFX ## _thread_decorator \
|
||||
rntm_t* rntm_g, \
|
||||
lpgemm_cntx_t* lcntx, \
|
||||
lpgemm_post_op* post_op_list, \
|
||||
bool c_downscale \
|
||||
AOCL_STORAGE_TYPE c_downscale \
|
||||
) \
|
||||
{ \
|
||||
dim_t n_threads = 1; \
|
||||
@@ -770,7 +866,7 @@ void lpgemm_ ## LPGEMM_SFX ## _thread_decorator \
|
||||
/* Set the packing block allocator field of the rntm. This will be
|
||||
* inherited by all of the child threads when they make local copies of
|
||||
* the rntm below.*/ \
|
||||
bli_membrk_rntm_set_membrk( rntm_g ); \
|
||||
bli_pba_rntm_set_pba( rntm_g ); \
|
||||
\
|
||||
thrcomm_t static_lpgemm_comm; \
|
||||
thrcomm_t* cur_lpgemm_comm = &static_lpgemm_comm; \
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -63,7 +63,7 @@ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \
|
||||
rntm_t* rntm_g, \
|
||||
lpgemm_cntx_t* lcntx, \
|
||||
lpgemm_post_op* post_op_list, \
|
||||
bool c_downscale \
|
||||
AOCL_STORAGE_TYPE c_downscale \
|
||||
); \
|
||||
|
||||
GEN_LPGEMM_OPENMP_DECORATOR_FN(uint8_t,int8_t,int16_t,u8s8s16o16)
|
||||
@@ -97,7 +97,7 @@ void lpgemm_ ## LPGEMM_SFX ## _thread_decorator \
|
||||
rntm_t* rntm_g, \
|
||||
lpgemm_cntx_t* lcntx, \
|
||||
lpgemm_post_op* post_op_list, \
|
||||
bool c_downscale \
|
||||
AOCL_STORAGE_TYPE c_downscale \
|
||||
); \
|
||||
|
||||
GEN_LPGEMM_DECORATOR_FN(uint8_t,int8_t,int16_t,u8s8s16o16)
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -113,7 +113,8 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
|
||||
bool is_first_k = FALSE;
|
||||
|
||||
lpgemm_post_op_attr post_ops_attr;
|
||||
if ( c_downscale == TRUE )
|
||||
post_ops_attr.c_stor_type = c_downscale;
|
||||
if ( c_downscale < S16 )
|
||||
{
|
||||
post_ops_attr.buf_downscale = c;
|
||||
}
|
||||
@@ -153,12 +154,12 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
|
||||
);
|
||||
}
|
||||
|
||||
if ( c_downscale == FALSE )
|
||||
if ( c_downscale == S16 )
|
||||
{
|
||||
c_use_jc = c + jc;
|
||||
}
|
||||
// Temp accumulation buffer for C allocation.
|
||||
else if ( c_downscale == TRUE )
|
||||
else if ( c_downscale < S16 )
|
||||
{
|
||||
// Buffer memory is only required if output needs to be
|
||||
// persisted across iterations of the pc/KC loop.
|
||||
@@ -171,7 +172,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
|
||||
|
||||
lpgemm_alloc_mem_panel
|
||||
(
|
||||
mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
|
||||
mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE,
|
||||
&mem_scale_c, rntm
|
||||
);
|
||||
|
||||
@@ -305,7 +306,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
|
||||
|
||||
// Only per thread C matrix is stored in temp buffer, so both
|
||||
// per thread jc and ic start should be normalized to zero.
|
||||
if ( c_downscale == TRUE )
|
||||
if ( c_downscale < S16 )
|
||||
{
|
||||
c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
|
||||
}
|
||||
@@ -361,15 +362,15 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
|
||||
{
|
||||
if (bli_mem_is_alloc(&mem_b))
|
||||
{
|
||||
bli_membrk_release(rntm, &mem_b);
|
||||
bli_pba_release(rntm, &mem_b);
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( c_downscale == TRUE )
|
||||
if ( c_downscale < S16 )
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_scale_c ) )
|
||||
{
|
||||
bli_membrk_release( rntm, &mem_scale_c );
|
||||
bli_pba_release( rntm, &mem_scale_c );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -122,7 +122,8 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
|
||||
bool is_first_k = FALSE;
|
||||
|
||||
lpgemm_post_op_attr post_ops_attr;
|
||||
if ( c_downscale == TRUE )
|
||||
post_ops_attr.c_stor_type = c_downscale;
|
||||
if ( c_downscale < S32 )
|
||||
{
|
||||
post_ops_attr.buf_downscale = c;
|
||||
}
|
||||
@@ -162,12 +163,12 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
|
||||
);
|
||||
}
|
||||
|
||||
if ( c_downscale == FALSE )
|
||||
if ( c_downscale == S32 )
|
||||
{
|
||||
c_use_jc = c + jc;
|
||||
}
|
||||
// Temp accumulation buffer for C allocation.
|
||||
else if ( c_downscale == TRUE )
|
||||
else if ( c_downscale < S32 )
|
||||
{
|
||||
// Buffer memory is only required if output needs to be
|
||||
// persisted across iterations of the pc/KC loop.
|
||||
@@ -180,7 +181,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
|
||||
|
||||
lpgemm_alloc_mem_panel
|
||||
(
|
||||
mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
|
||||
mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE,
|
||||
&mem_scale_c, rntm
|
||||
);
|
||||
|
||||
@@ -313,7 +314,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
|
||||
|
||||
// Only per thread C matrix is stored in temp buffer, so both
|
||||
// per thread jc and ic start should be normalized to zero.
|
||||
if ( c_downscale == TRUE )
|
||||
if ( c_downscale < S32 )
|
||||
{
|
||||
c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
|
||||
}
|
||||
@@ -405,7 +406,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_b ) )
|
||||
{
|
||||
bli_membrk_release( rntm, &mem_b );
|
||||
bli_pba_release( rntm, &mem_b );
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -413,14 +414,14 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_a ) )
|
||||
{
|
||||
bli_membrk_release( rntm, &mem_a );
|
||||
bli_pba_release( rntm, &mem_a );
|
||||
}
|
||||
}
|
||||
if ( c_downscale == TRUE )
|
||||
if ( c_downscale < S32 )
|
||||
{
|
||||
if ( bli_mem_is_alloc( &mem_scale_c ) )
|
||||
{
|
||||
bli_membrk_release( rntm, &mem_scale_c );
|
||||
bli_pba_release( rntm, &mem_scale_c );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,7 +123,7 @@ BLIS_INLINE void lpgemm_alloc_mem_panel
|
||||
{
|
||||
if ( bli_mem_is_unalloc( mem ) )
|
||||
{
|
||||
bli_membrk_acquire_m
|
||||
bli_pba_acquire_m
|
||||
(
|
||||
rntm_l,
|
||||
size_req,
|
||||
@@ -136,8 +136,8 @@ BLIS_INLINE void lpgemm_alloc_mem_panel
|
||||
siz_t mem_size = bli_mem_size( mem );
|
||||
if ( mem_size < size_req )
|
||||
{
|
||||
bli_membrk_release( rntm_l, mem );
|
||||
bli_membrk_acquire_m
|
||||
bli_pba_release( rntm_l, mem );
|
||||
bli_pba_acquire_m
|
||||
(
|
||||
rntm_l,
|
||||
size_req,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -47,13 +47,14 @@ BLIS_INLINE dim_t get_packb_bf16bf16f32of32_min_NR()
|
||||
return 16;
|
||||
}
|
||||
|
||||
typedef void (*packb_bf16)
|
||||
typedef void (*pack_bf16)
|
||||
(
|
||||
bfloat16*,
|
||||
const bfloat16*,
|
||||
const dim_t,
|
||||
const dim_t,
|
||||
const dim_t,
|
||||
const dim_t,
|
||||
dim_t*,
|
||||
dim_t*
|
||||
);
|
||||
@@ -62,11 +63,24 @@ void packb_nr64_bf16bf16f32of32
|
||||
(
|
||||
bfloat16* pack_b_buffer_bf16bf16f32of32,
|
||||
const bfloat16* b,
|
||||
const dim_t ldb,
|
||||
const dim_t rs_b,
|
||||
const dim_t cs_b,
|
||||
const dim_t NC,
|
||||
const dim_t KC,
|
||||
dim_t* rs_b,
|
||||
dim_t* cs_b
|
||||
dim_t* rs_p,
|
||||
dim_t* cs_p
|
||||
);
|
||||
|
||||
|
||||
void packa_mr16_bf16bf16f32of32
|
||||
(
|
||||
bfloat16* pack_a_buffer,
|
||||
const bfloat16* a,
|
||||
const dim_t rs_a,
|
||||
const dim_t cs_a,
|
||||
const dim_t MC,
|
||||
const dim_t KC,
|
||||
dim_t* rs_p,
|
||||
dim_t* cs_p
|
||||
);
|
||||
#endif //BLIS_GEMM_BF16_PACKB
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -93,7 +93,7 @@ void bao_l3_thread_decorator
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
// Check for a somewhat obscure OpenMP thread-mismatch issue.
|
||||
// NOTE: This calls the same function used for the conventional/large
|
||||
// code path.
|
||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
|
||||
@@ -1,10 +1,59 @@
|
||||
##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. ##
|
||||
##Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. ##
|
||||
|
||||
target_sources("${PROJECT_NAME}"
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/aocldtl.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/aocldtl_blis.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/aoclfal.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/aoclflist.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/aoclos.c
|
||||
)
|
||||
# Collect all subdirectory paths that have at least one file with suffix in AOCLDTL_SRC_SUFS list.
|
||||
get_filepaths_with_suffixes(LOCAL_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR} "${AOCLDTL_SRC_SUFS}")
|
||||
|
||||
# Create an object library using the source file list above.
|
||||
add_library(AOCL_DTL
|
||||
OBJECT
|
||||
${LOCAL_SOURCE_FILES}
|
||||
)
|
||||
|
||||
# Include the corresponding make_defs.cmake that holds the required compiler options.
|
||||
include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake)
|
||||
# Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets.
|
||||
# mimicking get-aocldtl-cflags-for
|
||||
target_compile_options(AOCL_DTL
|
||||
PRIVATE
|
||||
# load-var-for,COPTFLAGS
|
||||
${COPTFLAGS}
|
||||
# get-noopt-cflags-for
|
||||
${CDBGFLAGS}
|
||||
# get-noopt-cflags-for
|
||||
${CWARNFLAGS}
|
||||
# get-noopt-cflags-for
|
||||
${CMISCFLAGS}
|
||||
# get-noopt-cflags-for
|
||||
${CLANGFLAGS}
|
||||
# in get-aocldtl-cflags-for
|
||||
${BUILD_SYMFLAGS}
|
||||
)
|
||||
target_compile_definitions(AOCL_DTL
|
||||
PRIVATE
|
||||
# in get-noopt-cflags-for
|
||||
${VERS_DEF}
|
||||
# in get-aocldtl-cflags-for
|
||||
${BUILD_CPPFLAGS}
|
||||
# in get-aocldtl-cflags-for
|
||||
${CPPROCFLAGS}
|
||||
)
|
||||
target_include_directories(AOCL_DTL
|
||||
BEFORE
|
||||
PRIVATE
|
||||
# in get-noopt-cflags-for
|
||||
${CINFLAGS}
|
||||
)
|
||||
if(THREADING_MODEL STREQUAL "openmp")
|
||||
# Equivalent to CTHREADFLAGS in get-noopt-cflags-for
|
||||
target_link_libraries(AOCL_DTL PRIVATE OpenMP::OpenMP_C)
|
||||
elseif(THREADING_MODEL STREQUAL "pthreads")
|
||||
# in get-noopt-cflags-for
|
||||
target_compile_options(AOCL_DTL PRIVATE ${CTHREADFLAGS})
|
||||
endif()
|
||||
if(BUILD_SHARED_LIBS)
|
||||
# Equivalent to CPICFLAGS in get-noopt-cflags-for
|
||||
set_target_properties(AOCL_DTL PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
add_dependencies(AOCL_DTL flat-header)
|
||||
# Put all those targets under object-libs-targets folder name so that they appear all together in IDE.
|
||||
set_target_properties(AOCL_DTL PROPERTIES FOLDER object-libs-targets)
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* These functions are invoked though macros by
|
||||
* end user.
|
||||
*
|
||||
* Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*=======================================================================*/
|
||||
#include "blis.h"
|
||||
@@ -539,11 +539,11 @@ uint64 AOCL_DTL_get_time_spent(void)
|
||||
#ifdef AOCL_DTL_AUTO_TRACE_ENABLE
|
||||
|
||||
/*
|
||||
Disable intrumentation for these functions as they will also be
|
||||
called from compiler generated instumation code to trace
|
||||
Disable instrumentation for these functions as they will also be
|
||||
called from compiler generated instrumentation code to trace
|
||||
function execution.
|
||||
|
||||
It needs to be part of declration in the C file so can't be
|
||||
It needs to be part of declaration in the C file so can't be
|
||||
moved to header file.
|
||||
|
||||
WARNING: These functions are automatically invoked. however any function
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* It provides definitions for all macros to be
|
||||
* used by user to add debug/trace information.
|
||||
*
|
||||
* Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*==================================================================*/
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
*
|
||||
* Description : BLIS library specific debug helpers.
|
||||
*
|
||||
* Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*==================================================================*/
|
||||
|
||||
@@ -92,6 +92,7 @@ void AOCL_DTL_log_gemm_sizes(int8 loglevel,
|
||||
}
|
||||
|
||||
void AOCL_DTL_log_gemm_stats(int8 loglevel,
|
||||
char dt_type,
|
||||
const f77_int m,
|
||||
const f77_int n,
|
||||
const f77_int k)
|
||||
@@ -99,14 +100,52 @@ void AOCL_DTL_log_gemm_stats(int8 loglevel,
|
||||
char buffer[256];
|
||||
|
||||
double flops = 2.0 * m * n * k;
|
||||
if (dt_type == 'c' || dt_type == 'C' || dt_type == 'z' || dt_type == 'Z')
|
||||
{
|
||||
flops = 4.0 * flops;
|
||||
}
|
||||
|
||||
// Execution time is in micro seconds.
|
||||
Double execution_time = AOCL_DTL_get_time_spent();
|
||||
|
||||
sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS",
|
||||
AOCL_get_requested_threads_count(),
|
||||
execution_time/1000.0,
|
||||
flops/(execution_time * 1e3));
|
||||
if (execution_time != 0.0)
|
||||
sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS",
|
||||
AOCL_get_requested_threads_count(),
|
||||
execution_time/1000.0,
|
||||
flops/(execution_time * 1e3));
|
||||
else
|
||||
sprintf(buffer, " nt=%ld %.3f ms",
|
||||
AOCL_get_requested_threads_count(),
|
||||
execution_time/1000.0);
|
||||
|
||||
DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer);
|
||||
}
|
||||
|
||||
void AOCL_DTL_log_gemmt_stats(int8 loglevel,
|
||||
char dt_type,
|
||||
const f77_int n,
|
||||
const f77_int k)
|
||||
{
|
||||
char buffer[256];
|
||||
|
||||
double flops = n * n * k;
|
||||
if (dt_type == 'c' || dt_type == 'C' || dt_type == 'z' || dt_type == 'Z')
|
||||
{
|
||||
flops = 4.0 * flops;
|
||||
}
|
||||
|
||||
// Execution time is in micro seconds.
|
||||
Double execution_time = AOCL_DTL_get_time_spent();
|
||||
|
||||
if (execution_time != 0.0)
|
||||
sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS",
|
||||
AOCL_get_requested_threads_count(),
|
||||
execution_time/1000.0,
|
||||
flops/(execution_time * 1e3));
|
||||
else
|
||||
sprintf(buffer, " nt=%ld %.3f ms",
|
||||
AOCL_get_requested_threads_count(),
|
||||
execution_time/1000.0);
|
||||
|
||||
DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer);
|
||||
}
|
||||
@@ -131,17 +170,57 @@ void AOCL_DTL_log_trsm_sizes(int8 loglevel,
|
||||
double alpha_real = 0.0;
|
||||
double alpha_imag = 0.0;
|
||||
|
||||
|
||||
DTL_get_complex_parts(dt_type, alpha, &alpha_real, &alpha_imag);
|
||||
|
||||
//{S, D, C, Z} side, uplo, transa, diaga, m, n, lda, ldb, alpha_real, alpha_imag
|
||||
sprintf(buffer, "%c %c %c %c %c %ld %ld %ld %ld %lf %lf\n", dt_type,
|
||||
sprintf(buffer, "%c %c %c %c %c %ld %ld %ld %ld %lf %lf", dt_type,
|
||||
side, uploa, transa, diaga,
|
||||
(dim_t)m, (dim_t)n, (dim_t)lda, (dim_t)ldb,
|
||||
alpha_real, alpha_imag);
|
||||
|
||||
AOCL_DTL_START_PERF_TIMER();
|
||||
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
|
||||
}
|
||||
|
||||
void AOCL_DTL_log_trsm_stats(int8 loglevel,
|
||||
char dt_type,
|
||||
f77_char side,
|
||||
const f77_int m,
|
||||
const f77_int n)
|
||||
{
|
||||
char buffer[256];
|
||||
|
||||
double flops = 0.0;
|
||||
if (side == 'L' || side =='l')
|
||||
{
|
||||
flops = 1.0 * m * n * m;
|
||||
}
|
||||
else
|
||||
{
|
||||
flops = 1.0 * m * n * n;
|
||||
}
|
||||
if (dt_type == 'c' || dt_type == 'C' || dt_type == 'z' || dt_type == 'Z')
|
||||
{
|
||||
flops = 4.0 * flops;
|
||||
}
|
||||
|
||||
// Execution time is in micro seconds.
|
||||
Double execution_time = AOCL_DTL_get_time_spent();
|
||||
|
||||
if (execution_time != 0.0)
|
||||
sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS",
|
||||
AOCL_get_requested_threads_count(),
|
||||
execution_time/1000.0,
|
||||
flops/(execution_time * 1e3));
|
||||
else
|
||||
sprintf(buffer, " nt=%ld %.3f ms",
|
||||
AOCL_get_requested_threads_count(),
|
||||
execution_time/1000.0);
|
||||
|
||||
DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer);
|
||||
}
|
||||
|
||||
void AOCL_DTL_log_gemmt_sizes(int8 loglevel,
|
||||
char dt_type,
|
||||
char uplo,
|
||||
@@ -165,18 +244,20 @@ void AOCL_DTL_log_gemmt_sizes(int8 loglevel,
|
||||
double beta_real = 0.0;
|
||||
double beta_imag = 0.0;
|
||||
|
||||
|
||||
DTL_get_complex_parts(dt_type, alpha, &alpha_real, &alpha_imag);
|
||||
DTL_get_complex_parts(dt_type, beta, &beta_real, &beta_imag);
|
||||
|
||||
// {S,D,C,Z} {triangC : l or u} {n k lda ldb ldc transa transb alpha_real alpha_imaginary
|
||||
// beta_real, beta_imaginary}
|
||||
sprintf(buffer, "%c %c %ld %ld %lu %lu %lu %c %c %lf %lf %lf %lf\n",
|
||||
sprintf(buffer, "%c %c %ld %ld %lu %lu %lu %c %c %lf %lf %lf %lf",
|
||||
dt_type, uplo, (dim_t)n, (dim_t)k,
|
||||
(dim_t)lda, (dim_t)ldb, (dim_t)ldc,
|
||||
transa, transb,
|
||||
alpha_real, alpha_imag,
|
||||
beta_real, beta_imag);
|
||||
|
||||
AOCL_DTL_START_PERF_TIMER();
|
||||
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
|
||||
}
|
||||
|
||||
@@ -639,12 +720,41 @@ void AOCL_DTL_log_nrm2_sizes(int8 loglevel,
|
||||
{
|
||||
char buffer[256];
|
||||
// {S, D, C, Z} {n, incx}
|
||||
sprintf(buffer, "%c %ld %ld\n",
|
||||
sprintf(buffer, "%c %ld %ld",
|
||||
dt_type, (dim_t)n, (dim_t)incx);
|
||||
|
||||
AOCL_DTL_START_PERF_TIMER();
|
||||
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
|
||||
}
|
||||
|
||||
void AOCL_DTL_log_nrm2_stats(int8 loglevel,
|
||||
char dt_type,
|
||||
const f77_int n)
|
||||
{
|
||||
char buffer[256];
|
||||
|
||||
double flops = 2.0 * n;
|
||||
if (dt_type == 'c' || dt_type == 'C' || dt_type == 'z' || dt_type == 'Z')
|
||||
{
|
||||
flops = 2.0 * flops;
|
||||
}
|
||||
|
||||
// Execution time is in micro seconds.
|
||||
Double execution_time = AOCL_DTL_get_time_spent();
|
||||
|
||||
if (execution_time != 0.0)
|
||||
sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS",
|
||||
AOCL_get_requested_threads_count(),
|
||||
execution_time/1000.0,
|
||||
flops/(execution_time * 1e3));
|
||||
else
|
||||
sprintf(buffer, " nt=%ld %.3f ms",
|
||||
AOCL_get_requested_threads_count(),
|
||||
execution_time/1000.0);
|
||||
|
||||
DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer);
|
||||
}
|
||||
|
||||
//Level-2
|
||||
void AOCL_DTL_log_syr2_sizes(int8 loglevel,
|
||||
char dt_type,
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
*
|
||||
* Description : BLIS library specific debug helpes.
|
||||
*
|
||||
* Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*==================================================================*/
|
||||
|
||||
@@ -33,10 +33,17 @@ void AOCL_DTL_log_gemm_sizes(int8 loglevel,
|
||||
int line);
|
||||
|
||||
void AOCL_DTL_log_gemm_stats(int8 loglevel,
|
||||
char dt_type,
|
||||
const f77_int m,
|
||||
const f77_int n,
|
||||
const f77_int k);
|
||||
|
||||
void AOCL_DTL_log_trsm_stats(int8 loglevel,
|
||||
char dt_type,
|
||||
f77_char side,
|
||||
const f77_int m,
|
||||
const f77_int n);
|
||||
|
||||
void AOCL_DTL_log_trsm_sizes(int8 loglevel,
|
||||
char dt,
|
||||
f77_char side,
|
||||
@@ -68,6 +75,11 @@ void AOCL_DTL_log_gemmt_sizes(int8 loglevel,
|
||||
const char* function_name,
|
||||
int line);
|
||||
|
||||
void AOCL_DTL_log_gemmt_stats(int8 loglevel,
|
||||
char dt_type,
|
||||
const f77_int n,
|
||||
const f77_int k);
|
||||
|
||||
void AOCL_DTL_log_hemm_sizes(int8 loglevel,
|
||||
char dt_type,
|
||||
const f77_char side,
|
||||
@@ -243,6 +255,10 @@ void AOCL_DTL_log_nrm2_sizes( int8 loglevel,
|
||||
const char* function_name,
|
||||
int line);
|
||||
|
||||
void AOCL_DTL_log_nrm2_stats(int8 loglevel,
|
||||
char dt_type,
|
||||
const f77_int n);
|
||||
|
||||
void AOCL_DTL_log_amax_sizes ( int8 loglevel,
|
||||
char dt_type,
|
||||
const f77_int n,
|
||||
@@ -389,15 +405,23 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel,
|
||||
AOCL_DTL_log_gemm_sizes(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc, \
|
||||
__FILE__, __FUNCTION__, __LINE__);
|
||||
|
||||
#define AOCL_DTL_LOG_GEMM_STATS(loglevel, m, n, k) \
|
||||
#define AOCL_DTL_LOG_GEMM_STATS(loglevel, dt_type, m, n, k) \
|
||||
if (gbIsLoggingEnabled) \
|
||||
AOCL_DTL_log_gemm_stats(loglevel, m, n, k);
|
||||
AOCL_DTL_log_gemm_stats(loglevel, dt_type, m, n, k);
|
||||
|
||||
#define AOCL_DTL_LOG_GEMMT_STATS(loglevel, dt_type, n, k) \
|
||||
if (gbIsLoggingEnabled) \
|
||||
AOCL_DTL_log_gemmt_stats(loglevel, dt_type, n, k);
|
||||
|
||||
#define AOCL_DTL_LOG_TRSM_INPUTS(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb) \
|
||||
if (gbIsLoggingEnabled) \
|
||||
AOCL_DTL_log_trsm_sizes(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb, \
|
||||
__FILE__, __FUNCTION__, __LINE__);
|
||||
|
||||
#define AOCL_DTL_LOG_TRSM_STATS(loglevel, dt_type, side, m, n) \
|
||||
if (gbIsLoggingEnabled) \
|
||||
AOCL_DTL_log_trsm_stats(loglevel, dt_type, side, m, n);
|
||||
|
||||
#define AOCL_DTL_LOG_GEMMT_INPUTS(loglevel, dt, uplo, transa, transb, n, k, alpha, lda, ldb, beta, ldc) \
|
||||
if (gbIsLoggingEnabled) \
|
||||
AOCL_DTL_log_gemmt_sizes(loglevel, dt, uplo, transa, transb, n, k, alpha, lda, ldb, beta, ldc, \
|
||||
@@ -460,6 +484,10 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel,
|
||||
if (gbIsLoggingEnabled) \
|
||||
AOCL_DTL_log_nrm2_sizes(loglevel, dt_type, n, incx, __FILE__,__FUNCTION__,__LINE__);
|
||||
|
||||
#define AOCL_DTL_LOG_NRM2_STATS(loglevel, dt_type, n) \
|
||||
if (gbIsLoggingEnabled) \
|
||||
AOCL_DTL_log_nrm2_stats(loglevel, dt_type, n);
|
||||
|
||||
#define AOCL_DTL_LOG_HEMV_INPUTS(loglevel, dt_type, uploa, m, alpha, lda, incx, beta, incy) \
|
||||
if (gbIsLoggingEnabled) \
|
||||
AOCL_DTL_log_hemv_sizes(loglevel, dt_type, uploa, m, alpha, lda, incx, beta, incy, \
|
||||
@@ -531,12 +559,16 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel,
|
||||
|
||||
#define AOCL_DTL_LOG_GEMM_INPUTS(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc)
|
||||
|
||||
#define AOCL_DTL_LOG_GEMM_STATS(loglevel, m, n, k)
|
||||
#define AOCL_DTL_LOG_GEMM_STATS(loglevel, dt_type, m, n, k)
|
||||
|
||||
#define AOCL_DTL_LOG_TRSM_INPUTS(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb)
|
||||
|
||||
#define AOCL_DTL_LOG_TRSM_STATS(loglevel, dt_type, side, m, n)
|
||||
|
||||
#define AOCL_DTL_LOG_GEMMT_INPUTS(loglevel, dt, uplo, transa, transb, n, k, alpha, lda, ldb, beta, ldc)
|
||||
|
||||
#define AOCL_DTL_LOG_GEMMT_STATS(loglevel, dt_type, n, k)
|
||||
|
||||
#define AOCL_DTL_LOG_HEMM_INPUTS(loglevel, dt_type, side, uplo, m, n, alpha, lda, ldb, beta, ldc)
|
||||
|
||||
#define AOCL_DTL_LOG_HERK_INPUTS(loglevel, dt_type, uploc, transa, m, k, alpha, lda, beta, ldc)
|
||||
@@ -561,6 +593,8 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel,
|
||||
|
||||
#define AOCL_DTL_LOG_NRM2_INPUTS(loglevel, dt_type, n, incx)
|
||||
|
||||
#define AOCL_DTL_LOG_NRM2_STATS(loglevel, dt_type, n)
|
||||
|
||||
#define AOCL_DTL_LOG_HEMV_INPUTS(loglevel, dt_type, uploa, m, alpha, lda, incx, beta, incy)
|
||||
|
||||
#define AOCL_DTL_LOG_HER2_INPUTS(loglevel, dt_type, uploa, m, alpha, incx, incy, lda)
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* libaray, all debug features (except auto trace)
|
||||
* can be enabled/disabled in this file.
|
||||
*
|
||||
* Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*==================================================================*/
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
*
|
||||
* Description : Platform/os independed file handling API's
|
||||
*
|
||||
* Copyright (C) 2020, Advanced Micro Devices, Inc
|
||||
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*==================================================================*/
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
* Description : Interfaces for platform/os independed file
|
||||
* handling API's
|
||||
*
|
||||
* Copyright (C) 2020, Advanced Micro Devices, Inc
|
||||
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*==================================================================*/
|
||||
|
||||
|
||||
@@ -5,10 +5,11 @@
|
||||
* each thread. This is used to log the data
|
||||
* to correct file as per the current thread id.
|
||||
*
|
||||
* Copyright (C) 2020, Advanced Micro Devices, Inc
|
||||
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*==================================================================*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "aocltpdef.h"
|
||||
#include "aocldtl.h"
|
||||
#include "aoclfal.h"
|
||||
@@ -63,7 +64,11 @@ AOCL_FLIST_Node * AOCL_FLIST_GetNode(AOCL_FLIST_Node *plist, AOCL_TID tid)
|
||||
{
|
||||
if (temp->fp == NULL)
|
||||
{
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
AOCL_DEBUGPRINT("Could not get saved time stamp for thread = %ld", tid);
|
||||
#else
|
||||
AOCL_DEBUGPRINT("Could not get saved time stamp for thread = %d", tid);
|
||||
#endif
|
||||
}
|
||||
return temp;
|
||||
}
|
||||
@@ -92,7 +97,11 @@ AOCL_FAL_FILE *AOCL_FLIST_GetFile(AOCL_FLIST_Node *plist, AOCL_TID tid)
|
||||
{
|
||||
if (temp->fp == NULL)
|
||||
{
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
AOCL_DEBUGPRINT("File associated with this thread id %ld does not exists or closed", tid);
|
||||
#else
|
||||
AOCL_DEBUGPRINT("File associated with this thread id %d does not exists or closed", tid);
|
||||
#endif
|
||||
}
|
||||
return temp->fp;
|
||||
}
|
||||
@@ -118,8 +127,11 @@ AOCL_FAL_FILE *AOCL_FLIST_AddFile(const int8 *pchFilePrefix, AOCL_FLIST_Node **p
|
||||
}
|
||||
|
||||
/* We don't have exiting file, lets try to open new one */
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
sprintf(pchFileName, "P%d_T%lu_%s", AOCL_getpid(), tid, pchFilePrefix);
|
||||
#else
|
||||
sprintf(pchFileName, "P%d_T%u_%s", AOCL_getpid(), tid, pchFilePrefix);
|
||||
|
||||
#endif
|
||||
file = AOCL_FAL_Open(pchFileName, "wb");
|
||||
if (file == NULL)
|
||||
{
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* each thread. This is used to log the deta
|
||||
* to correct file as per the current thread id.
|
||||
*
|
||||
* Copyright (C) 2020, Advanced Micro Devices, Inc
|
||||
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*==================================================================*/
|
||||
|
||||
|
||||
@@ -3,9 +3,10 @@
|
||||
*
|
||||
* Description : Abstraction for os services used by DTL.
|
||||
*
|
||||
* Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*==================================================================*/
|
||||
#include "blis.h"
|
||||
#include "aocltpdef.h"
|
||||
#include "aocldtl.h"
|
||||
#include "aoclfal.h"
|
||||
@@ -20,18 +21,18 @@
|
||||
#endif
|
||||
|
||||
// BLIS TODO: This is workaround to check if BLIS is built with
|
||||
// openmp support. Ideally we dont' want any library
|
||||
// openmp support. Ideally we don't want any library
|
||||
// specific code in dtl.
|
||||
#include <blis.h>
|
||||
|
||||
#if defined(__linux__)
|
||||
|
||||
/*
|
||||
Disable intrumentation for these functions as they will also be
|
||||
called from compiler generated instumation code to trace
|
||||
Disable instrumentation for these functions as they will also be
|
||||
called from compiler generated instrumentation code to trace
|
||||
function execution.
|
||||
|
||||
It needs to be part of declration in the C file so can't be
|
||||
It needs to be part of declaration in the C file so can't be
|
||||
moved to header file.
|
||||
|
||||
*/
|
||||
@@ -47,7 +48,10 @@ AOCL_TID AOCL_gettid(void)
|
||||
return omp_get_thread_num();
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
return pthread_self();
|
||||
// pthread_self is not suitable for this purpose and may be replaced
|
||||
// in a later release with something else. It returns a value of type
|
||||
// pthread_t, which on linux is an unsigned long int.
|
||||
return (AOCL_TID) pthread_self();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
@@ -89,7 +93,11 @@ AOCL_TID AOCL_gettid(void)
|
||||
return omp_get_thread_num();
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
return pthread_self();
|
||||
// pthread_self is not suitable for this purpose and may be replaced
|
||||
// in a later release with something else. It returns a value of type
|
||||
// pthread_t, whose type may depend upon the operating system. On
|
||||
// freeBSD it is a pointer to an empty struct.
|
||||
return (AOCL_TID) pthread_self();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
*
|
||||
* Description : Abstraction for os services used by DTL.
|
||||
*
|
||||
* Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*==================================================================*/
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#define _AOCL_OS_H_
|
||||
|
||||
#include "aocltpdef.h"
|
||||
#include "malloc.h"
|
||||
#include "stdlib.h"
|
||||
|
||||
/* The OS Services function declaration */
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
*
|
||||
* Description : Abstraction for various datatypes used by DTL.
|
||||
*
|
||||
* Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*==================================================================*/
|
||||
#ifndef AOCL_TYPEDEF_H_
|
||||
@@ -35,8 +35,11 @@ typedef signed long int int32;
|
||||
typedef short int int16;
|
||||
|
||||
typedef Void *AOCL_HANDLE;
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
typedef long int AOCL_TID;
|
||||
#else
|
||||
typedef pid_t AOCL_TID;
|
||||
|
||||
#endif
|
||||
#endif /*AOCL_TYPEDEF_H_ */
|
||||
|
||||
/* --------------- End of aocltpdef.h ----------------- */
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
*
|
||||
* Description : Unit test cases for dtl.
|
||||
*
|
||||
* Copyright (C) 2020, Advanced Micro Devices, Inc
|
||||
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
*==================================================================*/
|
||||
|
||||
|
||||
@@ -61,6 +61,13 @@ if(ENABLE_OPENMP)
|
||||
endif()
|
||||
target_link_libraries(BenchGer optimized "${LIB_NAME}.lib")
|
||||
|
||||
add_executable(BenchNrm2 bench_nrm2.c)
|
||||
target_link_libraries(BenchNrm2 debug "${LIB_NAME}.lib")
|
||||
if(ENABLE_OPENMP)
|
||||
target_link_libraries(BenchNrm2 OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
target_link_libraries(BenchNrm2 optimized "${LIB_NAME}.lib")
|
||||
|
||||
add_executable(BenchScalv bench_scalv.c)
|
||||
target_link_libraries(BenchScalv debug "${LIB_NAME}.lib")
|
||||
if(ENABLE_OPENMP)
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -193,7 +193,8 @@ blis: \
|
||||
bench_amaxv_blis.x \
|
||||
bench_copyv_blis.x \
|
||||
bench_swapv_blis.x \
|
||||
bench_axpbyv_blis.x
|
||||
bench_axpbyv_blis.x \
|
||||
bench_gemm_pack_compute_blis.x
|
||||
|
||||
openblas: \
|
||||
bench_gemm_openblas.x \
|
||||
@@ -240,7 +241,8 @@ mkl: \
|
||||
bench_amaxv_mkl.x \
|
||||
bench_copyv_mkl.x \
|
||||
bench_swapv_mkl.x \
|
||||
bench_axpbyv_mkl.x
|
||||
bench_axpbyv_mkl.x \
|
||||
bench_gemm_pack_compute_mkl.x
|
||||
|
||||
|
||||
# --Object file rules --
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -157,7 +157,7 @@ void softmax_bench_driver_ ## SOFTMAX_SFX \
|
||||
|
||||
GEN_SOFTMAX_BENCH_DRV_FN(float,softmax_f32)
|
||||
|
||||
inline float gelu_tanh_f32
|
||||
static inline float gelu_tanh_f32
|
||||
(
|
||||
float temp_accum
|
||||
)
|
||||
@@ -168,7 +168,7 @@ inline float gelu_tanh_f32
|
||||
return temp_accum;
|
||||
}\
|
||||
|
||||
inline float gelu_erf_f32
|
||||
static inline float gelu_erf_f32
|
||||
(
|
||||
float temp_accum
|
||||
)
|
||||
@@ -261,10 +261,11 @@ void gelu_bench_main_ ## GELU_SFX \
|
||||
n_repeats = global_n_repeat; \
|
||||
} \
|
||||
\
|
||||
V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \
|
||||
err_t bli_errors = BLIS_SUCCESS; \
|
||||
V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx, &bli_errors ); \
|
||||
GEN_FUNC_NAME(fill_array_,V_type)( x, ( n * incx ) ); \
|
||||
\
|
||||
V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \
|
||||
V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx, &bli_errors ); \
|
||||
GEN_FUNC_NAME(fill_array_,V_type)( ref_x, ( n * incx ) ); \
|
||||
\
|
||||
GEN_FUNC_NAME(gelu_bench_driver_,GELU_SFX)(n_repeats,n,x,incx); \
|
||||
@@ -292,10 +293,11 @@ void softmax_bench_main_ ## SOFTMAX_SFX \
|
||||
n_repeats = global_n_repeat; \
|
||||
} \
|
||||
\
|
||||
V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \
|
||||
err_t bli_errors = BLIS_SUCCESS; \
|
||||
V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx, &bli_errors ); \
|
||||
GEN_FUNC_NAME(fill_array_,V_type)( x, ( n * incx ) ); \
|
||||
\
|
||||
V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \
|
||||
V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx, &bli_errors ); \
|
||||
GEN_FUNC_NAME(fill_array_,V_type)( ref_x, ( n * incx ) ); \
|
||||
\
|
||||
GEN_FUNC_NAME(softmax_bench_driver_,SOFTMAX_SFX)(n_repeats,n,x,incx); \
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
996
bench/bench_gemm_pack_compute.c
Executable file
996
bench/bench_gemm_pack_compute.c
Executable file
@@ -0,0 +1,996 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
// Benchmark application to process aocl logs generated by BLIS library.
|
||||
#ifndef DT
|
||||
#define DT BLIS_DOUBLE
|
||||
#endif
|
||||
|
||||
#ifndef IND
|
||||
#define IND BLIS_NAT
|
||||
#endif
|
||||
|
||||
#ifndef N_REPEAT
|
||||
//#define N_REPEAT 100
|
||||
#endif
|
||||
|
||||
|
||||
#define AOCL_MATRIX_INITIALISATION
|
||||
#define BUFFER_SIZE 256
|
||||
|
||||
/* For BLIS since logs are collected at BLAS interfaces
|
||||
* we disable cblas interfaces for this benchmark application
|
||||
*/
|
||||
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
// #define CBLAS
|
||||
#endif
|
||||
|
||||
// #define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, b, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha, beta, alpha_one;
|
||||
dim_t m, n, k;
|
||||
dim_t p_inc = 0; // to keep track of number of inputs
|
||||
num_t dt;
|
||||
// ind_t ind;
|
||||
char dt_ch;
|
||||
int r, n_repeats;
|
||||
trans_t transa;
|
||||
trans_t transb;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
int packA, packB;
|
||||
|
||||
FILE* fin = NULL;
|
||||
FILE* fout = NULL;
|
||||
|
||||
n_repeats = N_REPEAT; // This macro will get from Makefile.
|
||||
|
||||
dt = DT;
|
||||
|
||||
if (argc < 3)
|
||||
{
|
||||
printf("Usage: ./test_gemm_pack_compute_XX.x input.csv output.csv\n");
|
||||
exit(1);
|
||||
}
|
||||
fin = fopen(argv[1], "r");
|
||||
if (fin == NULL)
|
||||
{
|
||||
printf("Error opening the file %s\n", argv[1]);
|
||||
exit(1);
|
||||
}
|
||||
fout = fopen(argv[2], "w");
|
||||
if (fout == NULL)
|
||||
{
|
||||
printf("Error opening output file %s\n", argv[2]);
|
||||
exit(1);
|
||||
}
|
||||
if (argc > 3)
|
||||
{
|
||||
n_repeats = atoi(argv[3]);
|
||||
}
|
||||
|
||||
fprintf(fout, "Dt transa transb identifier m n k alphaR alphaI lda ldb betaR betaI ldc gflops\n");
|
||||
|
||||
// Following variables are needed for scanf to read inputs properly
|
||||
// however they are not used in bench.
|
||||
char api_name[BUFFER_SIZE]; // to store function name, line no present in logs
|
||||
char dummy_buffer[BUFFER_SIZE];
|
||||
|
||||
// Variables extracted from the logs which are used by bench
|
||||
char stor_scheme, transA_c, transB_c, packA_c, packB_c;
|
||||
double alpha_r, beta_r, alpha_i, beta_i;
|
||||
dim_t m_trans, n_trans;
|
||||
inc_t lda, ldb, ldc;
|
||||
|
||||
stor_scheme = 'C'; // By default set it to Column Major
|
||||
|
||||
//{S, D, C, Z} transa, transb, packA, packB, m, n, k, alpha_real,
|
||||
// alpha_imag, lda ldb, beta_real, beta_imag, ldc,
|
||||
//
|
||||
// number of threads, execution time, gflops ---> ignored by bench
|
||||
while (fscanf(fin, "%s %c %c %c %c %c " INT_FS INT_FS INT_FS " %lf %lf " INT_FS INT_FS " %lf %lf " INT_FS"[^\n]",
|
||||
api_name, &dt_ch, &transA_c, &transB_c, &packA_c, &packB_c, &m, &n, &k, &alpha_r, &alpha_i,
|
||||
&lda, &ldb, &beta_r, &beta_i, &ldc) == 16)
|
||||
{
|
||||
// Discard any extra data on current line in the input file.
|
||||
fgets(dummy_buffer, BUFFER_SIZE, fin );
|
||||
|
||||
// At BLAS level only column major order is supported.
|
||||
stor_scheme = 'C';
|
||||
|
||||
if (dt_ch == 'D' || dt_ch == 'd') dt = BLIS_DOUBLE;
|
||||
else if (dt_ch == 'S' || dt_ch == 's') dt = BLIS_FLOAT;
|
||||
else
|
||||
{
|
||||
printf("Invalid data type %c\n", dt_ch);
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( transA_c == 'n' || transA_c == 'N' ) transa = BLIS_NO_TRANSPOSE;
|
||||
else if ( transA_c == 't' || transA_c == 'T' ) transa = BLIS_TRANSPOSE;
|
||||
else if ( transA_c == 'c' || transA_c == 'C' ) transa = BLIS_CONJ_TRANSPOSE;
|
||||
else
|
||||
{
|
||||
printf("Invalid option for transA \n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( transB_c == 'n' || transB_c == 'N' ) transb = BLIS_NO_TRANSPOSE;
|
||||
else if ( transB_c == 't' || transB_c == 'T' ) transb = BLIS_TRANSPOSE;
|
||||
else if ( transB_c == 'c' || transB_c == 'C' ) transb = BLIS_CONJ_TRANSPOSE;
|
||||
else
|
||||
{
|
||||
printf("Invalid option for transB \n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( packA_c == 'p' || packA_c == 'P' ) packA = TRUE;
|
||||
else if ( packA_c == 'u' || packA_c == 'U' ) packA = FALSE;
|
||||
else
|
||||
{
|
||||
printf("Invalid option for packA \n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( packB_c == 'p' || packB_c == 'P') packB = TRUE;
|
||||
else if ( packB_c == 'u' || packB_c == 'U') packB = FALSE;
|
||||
else
|
||||
{
|
||||
printf("Invalid option for packB \n");
|
||||
continue;
|
||||
}
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha);
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha_one);
|
||||
|
||||
if( (stor_scheme == 'C') || (stor_scheme == 'c') )
|
||||
{
|
||||
// leading dimension should be greater than number of rows
|
||||
// if ((m > lda) || (k > ldb) || (m > ldc)) continue;
|
||||
// Since this bench app is run on logs generated by AOCL trace logs
|
||||
// - we have relaxed the checks on the input parameters.
|
||||
|
||||
// if A is transpose - A(lda x m), lda >= max(1,k)
|
||||
// if A is non-transpose - A (lda x k), lda >= max(1,m)
|
||||
// if B is transpose - B (ldb x k), ldb >= max(1,n)
|
||||
// if B is non-transpose - B (ldb x n), ldb >= max(1,k)
|
||||
// C is ldc x n - ldc >= max(1, m)
|
||||
//if(transa) lda = k; // We will end up overwriting lda
|
||||
bli_set_dims_with_trans( transa, m, k, &m_trans, &n_trans);
|
||||
bli_obj_create( dt, m_trans, n_trans, 1, lda, &a);
|
||||
|
||||
//if(transb) ldb = n; // we will end up overwriting ldb, ldb >= n
|
||||
bli_set_dims_with_trans( transb, k, n, &m_trans, &n_trans);
|
||||
bli_obj_create( dt, m_trans, n_trans, 1, ldb, &b);
|
||||
|
||||
bli_obj_create( dt, m, n, 1, ldc, &c);
|
||||
bli_obj_create( dt, m, n, 1, ldc, &c_save );
|
||||
}
|
||||
else if( (stor_scheme == 'r') || (stor_scheme == 'R') )
|
||||
{
|
||||
//leading dimension should be greater than number of columns
|
||||
//if ((k > lda) || (n > ldb) || (n > ldc)) continue;
|
||||
// Since this bench app is run on logs generated by AOCL trace logs
|
||||
// - we have relaxed the checks on the input parameters.
|
||||
|
||||
// if A is transpose - A(k x lda), lda >= max(1,m)
|
||||
// if A is non-transpose - A (m x lda), lda >= max(1,k)
|
||||
// if B is transpose - B (n x ldb), ldb >= max(1,k)
|
||||
// if B is non-transpose - B (k x ldb ), ldb >= max(1,n)
|
||||
// C is m x ldc - ldc >= max(1, n)
|
||||
|
||||
//if(transa) lda = m; // this will overwrite lda
|
||||
bli_set_dims_with_trans(transa, m, k, &m_trans, &n_trans);
|
||||
bli_obj_create( dt, m_trans, n_trans, lda, 1, &a);
|
||||
|
||||
//if(transb) ldb = k; // this will overwrite ldb
|
||||
bli_set_dims_with_trans(transb, k, n, &m_trans, &n_trans);
|
||||
bli_obj_create( dt, m_trans, n_trans, ldb, 1, &b);
|
||||
|
||||
bli_obj_create( dt, m, n, ldc, 1, &c);
|
||||
bli_obj_create( dt, m, n, ldc, 1, &c_save );
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("Invalid storage scheme\n");
|
||||
continue;
|
||||
}
|
||||
#ifndef BLIS // Incase if we are using blis interface we don't have to check for col-storage.
|
||||
#ifndef CBLAS
|
||||
if( ( stor_scheme == 'R' ) || ( stor_scheme == 'r' ) )
|
||||
{
|
||||
printf("BLAS APIs doesn't support row-storage: Enable CBLAS\n");
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef AOCL_MATRIX_INITIALISATION
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
#endif
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
bli_obj_set_conjtrans( transa, &a);
|
||||
bli_obj_set_conjtrans( transb, &b);
|
||||
|
||||
bli_setsc( 1.0, 1.0, &alpha_one );
|
||||
bli_setsc( alpha_r, alpha_i, &alpha );
|
||||
bli_setsc( beta_r, beta_i, &beta );
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
#ifdef PRINT
|
||||
bli_printm( "a", &a, "%4.6f", "" );
|
||||
bli_printm( "b", &b, "%4.6f", "" );
|
||||
bli_printm( "c", &c, "%4.6f", "" );
|
||||
#endif
|
||||
|
||||
#ifdef BLIS
|
||||
|
||||
printf( "BLAS Extension APIs don't have a BLIS interface."
|
||||
"Enable CBLAS or BLAS interface!\n" );
|
||||
|
||||
#else
|
||||
|
||||
#ifdef CBLAS
|
||||
enum CBLAS_ORDER cblas_order;
|
||||
enum CBLAS_TRANSPOSE cblas_transa;
|
||||
enum CBLAS_TRANSPOSE cblas_transb;
|
||||
enum CBLAS_IDENTIFIER cblas_identifierA;
|
||||
enum CBLAS_IDENTIFIER cblas_identifierB;
|
||||
|
||||
size_t bufSizeA;
|
||||
size_t bufSizeB;
|
||||
|
||||
if ( ( stor_scheme == 'C' ) || ( stor_scheme == 'c' ) )
|
||||
cblas_order = CblasColMajor;
|
||||
else
|
||||
cblas_order = CblasRowMajor;
|
||||
|
||||
if( bli_is_trans( transa ) )
|
||||
cblas_transa = CblasTrans;
|
||||
else if( bli_is_conjtrans( transa ) )
|
||||
cblas_transa = CblasConjTrans;
|
||||
else
|
||||
cblas_transa = CblasNoTrans;
|
||||
|
||||
if( bli_is_trans( transb ) )
|
||||
cblas_transb = CblasTrans;
|
||||
else if( bli_is_conjtrans( transb ) )
|
||||
cblas_transb = CblasConjTrans;
|
||||
else
|
||||
cblas_transb = CblasNoTrans;
|
||||
|
||||
if ( packA )
|
||||
cblas_identifierA = CblasAMatrix;
|
||||
|
||||
if ( packB )
|
||||
cblas_identifierB = CblasBMatrix;
|
||||
#else
|
||||
f77_char f77_transa;
|
||||
f77_char f77_transb;
|
||||
f77_char f77_identifierA;
|
||||
f77_char f77_identifierB;
|
||||
f77_int f77_bufSizeA;
|
||||
f77_int f77_bufSizeB;
|
||||
|
||||
f77_char f77_packed = 'P';
|
||||
f77_identifierA = 'A';
|
||||
f77_identifierB = 'B';
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
|
||||
|
||||
err_t err = BLIS_SUCCESS;
|
||||
|
||||
#endif
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
|
||||
float* alphaonep = bli_obj_buffer( &alpha_one );
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
float* bp = bli_obj_buffer( &b );
|
||||
float* betap = bli_obj_buffer( &beta );
|
||||
float* cp = bli_obj_buffer( &c );
|
||||
|
||||
#ifdef CBLAS
|
||||
float* aBuffer;
|
||||
float* bBuffer;
|
||||
|
||||
if ( packA && !packB )
|
||||
{
|
||||
// Only A is pre-packed.
|
||||
bufSizeA = cblas_sgemm_pack_get_size( CblasAMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
aBuffer = (float*) bli_malloc_user( bufSizeA, &err );
|
||||
|
||||
cblas_sgemm_pack( cblas_order,
|
||||
CblasAMatrix,
|
||||
cblas_transa,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
aBuffer );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
cblas_sgemm_compute( cblas_order,
|
||||
CblasPacked,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
aBuffer, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
|
||||
bli_free_user(aBuffer);
|
||||
}
|
||||
else if ( !packA && packB )
|
||||
{
|
||||
// Only B is pre-packed.
|
||||
bufSizeB = cblas_sgemm_pack_get_size( CblasBMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
bBuffer = (float*) bli_malloc_user( bufSizeB, &err );
|
||||
|
||||
cblas_sgemm_pack( cblas_order,
|
||||
CblasBMatrix,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
bp, ldb,
|
||||
bBuffer );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
cblas_sgemm_compute( cblas_order,
|
||||
cblas_transa,
|
||||
CblasPacked,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
ap, lda,
|
||||
bBuffer, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
|
||||
|
||||
bli_free_user(bBuffer);
|
||||
}
|
||||
else if ( packA && packB )
|
||||
{
|
||||
// Both A & B are pre-packed.
|
||||
bufSizeA = cblas_sgemm_pack_get_size( CblasAMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
aBuffer = (float*) bli_malloc_user( bufSizeA, &err );
|
||||
|
||||
bufSizeB = cblas_sgemm_pack_get_size( CblasBMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
bBuffer = (float*) bli_malloc_user( bufSizeB, &err );
|
||||
|
||||
cblas_sgemm_pack( cblas_order,
|
||||
CblasAMatrix,
|
||||
cblas_transa,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
aBuffer );
|
||||
|
||||
cblas_sgemm_pack( cblas_order,
|
||||
CblasBMatrix,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphaonep,
|
||||
bp, ldb,
|
||||
bBuffer );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
cblas_sgemm_compute( cblas_order,
|
||||
CblasPacked,
|
||||
CblasPacked,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
aBuffer, lda,
|
||||
bBuffer, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
|
||||
bli_free_user(aBuffer);
|
||||
bli_free_user(bBuffer);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Neither A nor B is pre-packed.
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
cblas_sgemm_compute( cblas_order,
|
||||
cblas_transa,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
#else // -- BLAS API --
|
||||
float* aBuffer;
|
||||
float* bBuffer;
|
||||
|
||||
if ( packA && !packB )
|
||||
{
|
||||
// Only A is pre-packed.
|
||||
f77_bufSizeA = sgemm_pack_get_size_( &f77_identifierA,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
aBuffer = (float*) bli_malloc_user( f77_bufSizeA, &err );
|
||||
|
||||
sgemm_pack_( &f77_identifierA,
|
||||
&f77_transa,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap,
|
||||
(f77_int*)&lda,
|
||||
aBuffer );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
sgemm_compute_( &f77_packed,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
aBuffer, (f77_int*)&lda,
|
||||
bp, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
|
||||
bli_free_user( aBuffer );
|
||||
}
|
||||
else if ( !packA && packB )
|
||||
{
|
||||
// Only B is pre-packed.
|
||||
f77_bufSizeB = sgemm_pack_get_size_( &f77_identifierB,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
bBuffer = (float*) bli_malloc_user( f77_bufSizeB, &err );
|
||||
|
||||
sgemm_pack_( &f77_identifierB,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
bp,
|
||||
(f77_int*)&ldb,
|
||||
bBuffer );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
sgemm_compute_( &f77_transa,
|
||||
&f77_packed,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
ap, (f77_int*)&lda,
|
||||
bBuffer, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
|
||||
bli_free_user( bBuffer );
|
||||
}
|
||||
else if ( packA && packB )
|
||||
{
|
||||
// Both A & B are pre-packed.
|
||||
f77_bufSizeB = sgemm_pack_get_size_( &f77_identifierB,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
|
||||
bBuffer = (float*) bli_malloc_user( f77_bufSizeB, &err );
|
||||
|
||||
f77_bufSizeA = sgemm_pack_get_size_( &f77_identifierA,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
|
||||
aBuffer = (float*) bli_malloc_user( f77_bufSizeA, &err );
|
||||
|
||||
sgemm_pack_( &f77_identifierA,
|
||||
&f77_transa,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap,
|
||||
(f77_int*)&lda,
|
||||
aBuffer );
|
||||
|
||||
sgemm_pack_( &f77_identifierB,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphaonep,
|
||||
bp,
|
||||
(f77_int*)&ldb,
|
||||
bBuffer );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
sgemm_compute_( &f77_packed,
|
||||
&f77_packed,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
aBuffer, (f77_int*)&lda,
|
||||
bBuffer, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
|
||||
bli_free_user(aBuffer);
|
||||
bli_free_user(bBuffer);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Neither A nor B is reordered.
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
sgemm_compute_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
ap, (f77_int*)&lda,
|
||||
bp, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
double* alphaonep = bli_obj_buffer( &alpha_one );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
double* bp = bli_obj_buffer( &b );
|
||||
double* betap = bli_obj_buffer( &beta );
|
||||
double* cp = bli_obj_buffer( &c );
|
||||
|
||||
#ifdef CBLAS
|
||||
double* aBuffer;
|
||||
double* bBuffer;
|
||||
|
||||
if ( packA && !packB )
|
||||
{
|
||||
// Only A is pre-packed.
|
||||
bufSizeA = cblas_dgemm_pack_get_size( CblasAMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
aBuffer = (double*) bli_malloc_user( bufSizeA, &err );
|
||||
|
||||
cblas_dgemm_pack( cblas_order,
|
||||
CblasAMatrix,
|
||||
cblas_transa,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
aBuffer );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
cblas_dgemm_compute( cblas_order,
|
||||
CblasPacked,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
aBuffer, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
|
||||
bli_free_user(aBuffer);
|
||||
}
|
||||
else if ( !packA && packB )
|
||||
{
|
||||
// Only B is pre-packed.
|
||||
bufSizeB = cblas_dgemm_pack_get_size( CblasBMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
|
||||
cblas_dgemm_pack( cblas_order,
|
||||
CblasBMatrix,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
bp, ldb,
|
||||
bBuffer );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
cblas_dgemm_compute( cblas_order,
|
||||
cblas_transa,
|
||||
CblasPacked,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
ap, lda,
|
||||
bBuffer, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
|
||||
bli_free_user(bBuffer);
|
||||
}
|
||||
else if ( packA && packB )
|
||||
{
|
||||
// Both A & B are pre-packed.
|
||||
bufSizeA = cblas_dgemm_pack_get_size( CblasAMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
aBuffer = (double*) bli_malloc_user( bufSizeA, &err );
|
||||
|
||||
bufSizeB = cblas_dgemm_pack_get_size( CblasBMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
bBuffer = (double*) bli_malloc_user( bufSizeB, &err );
|
||||
|
||||
cblas_dgemm_pack( cblas_order,
|
||||
CblasAMatrix,
|
||||
cblas_transa,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
aBuffer );
|
||||
|
||||
cblas_dgemm_pack( cblas_order,
|
||||
CblasBMatrix,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
bp, ldb,
|
||||
bBuffer );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
cblas_dgemm_compute( cblas_order,
|
||||
CblasPacked,
|
||||
CblasPacked,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
aBuffer, lda,
|
||||
bBuffer, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
|
||||
bli_free_user(aBuffer);
|
||||
bli_free_user(bBuffer);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Neither A nor B is pre-packed.
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
cblas_dgemm_compute( cblas_order,
|
||||
cblas_transa,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
#else // -- BLAS API --
|
||||
double* aBuffer;
|
||||
double* bBuffer;
|
||||
|
||||
if ( packA && !packB )
|
||||
{
|
||||
// Only A is pre-packed.
|
||||
f77_bufSizeA = dgemm_pack_get_size_( &f77_identifierA,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
aBuffer = (double*) bli_malloc_user( f77_bufSizeA, &err );
|
||||
|
||||
dgemm_pack_( &f77_identifierA,
|
||||
&f77_transa,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap,
|
||||
(f77_int*)&lda,
|
||||
aBuffer );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
dgemm_compute_( &f77_packed,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
aBuffer, (f77_int*)&lda,
|
||||
bp, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
|
||||
bli_free_user( aBuffer );
|
||||
}
|
||||
else if ( !packA && packB )
|
||||
{
|
||||
// Only B is pre-packed.
|
||||
f77_bufSizeB = dgemm_pack_get_size_( &f77_identifierB,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
bBuffer = (double*) bli_malloc_user( f77_bufSizeB, &err );
|
||||
|
||||
dgemm_pack_( &f77_identifierB,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
bp,
|
||||
(f77_int*)&ldb,
|
||||
bBuffer );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
dgemm_compute_( &f77_transa,
|
||||
&f77_packed,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
ap, (f77_int*)&lda,
|
||||
bBuffer, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
|
||||
bli_free_user( bBuffer );
|
||||
}
|
||||
else if ( packA && packB )
|
||||
{
|
||||
// Both A & B are pre-packed.
|
||||
f77_bufSizeA = dgemm_pack_get_size_( &f77_identifierA,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
aBuffer = (double*) bli_malloc_user( f77_bufSizeA, &err );
|
||||
|
||||
f77_bufSizeB = dgemm_pack_get_size_( &f77_identifierB,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
bBuffer = (double*) bli_malloc_user( f77_bufSizeB, &err );
|
||||
|
||||
dgemm_pack_( &f77_identifierA,
|
||||
&f77_transa,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap,
|
||||
(f77_int*)&lda,
|
||||
aBuffer );
|
||||
|
||||
dgemm_pack_( &f77_identifierB,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphaonep,
|
||||
bp,
|
||||
(f77_int*)&ldb,
|
||||
bBuffer );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
dgemm_compute_( &f77_packed,
|
||||
&f77_packed,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
aBuffer, (f77_int*)&lda,
|
||||
bBuffer, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
|
||||
bli_free_user(aBuffer);
|
||||
bli_free_user(bBuffer);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Neither A nor B is reordered.
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
dgemm_compute_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
ap, (f77_int*)&lda,
|
||||
bp, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c compute", &c, "%4.6f", "" );
|
||||
#endif
|
||||
}
|
||||
|
||||
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
printf( "data_%cgemm_%s", dt_ch, BLAS );
|
||||
|
||||
p_inc++;
|
||||
printf("( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
(unsigned long)(p_inc),
|
||||
(unsigned long)m,
|
||||
(unsigned long)n,
|
||||
(unsigned long)k, gflops);
|
||||
|
||||
fprintf (fout, "%c %c %c %c %c %ld %ld %ld %lf %lf %ld %ld %lf %lf %ld %6.3f\n", \
|
||||
dt_ch, transA_c, transB_c, packA_c, packB_c, m, n, k, alpha_r, alpha_i, lda, ldb, beta_r, beta_i, ldc, gflops);
|
||||
|
||||
fflush(fout);
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
bli_obj_free( &beta );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &b );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
fclose(fin);
|
||||
fclose(fout);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
@@ -3,8 +3,10 @@
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
92
bench/inputgemmpackcompute.txt
Normal file
92
bench/inputgemmpackcompute.txt
Normal file
@@ -0,0 +1,92 @@
|
||||
sgemm_ S N N P U 1 1 1 1 0 1 1 1 0 1
|
||||
sgemm_ S N N P U 2 2 2 1 0 2 2 1 0 2
|
||||
sgemm_ S N N P U 3 3 3 1 0 3 3 1 0 3
|
||||
sgemm_ S N N P U 4 4 4 1 0 4 4 1 0 4
|
||||
sgemm_ S N N P U 5 5 5 1 0 5 5 1 0 5
|
||||
sgemm_ S N N P U 6 6 6 1 0 6 6 1 0 6
|
||||
sgemm_ S N N P U 7 7 7 1 0 7 7 1 0 7
|
||||
sgemm_ S N N P U 8 8 8 1 0 8 8 1 0 8
|
||||
sgemm_ S N N P U 9 9 9 1 0 9 9 1 0 9
|
||||
sgemm_ S N N P U 10 10 10 1 0 10 10 1 0 10
|
||||
sgemm_ S N N P U 20 20 20 1 0 20 20 1 0 20
|
||||
sgemm_ S N N P U 30 30 30 1 0 30 30 1 0 30
|
||||
sgemm_ S N N P U 40 40 40 1 0 40 40 1 0 40
|
||||
sgemm_ S N N P U 50 50 50 1 0 50 50 1 0 50
|
||||
sgemm_ S N N P U 60 60 60 1 0 60 60 1 0 60
|
||||
sgemm_ S N N P U 70 70 70 1 0 70 70 1 0 70
|
||||
sgemm_ S N N P U 80 80 80 1 0 80 80 1 0 80
|
||||
sgemm_ S N N P U 90 90 90 1 0 90 90 1 0 90
|
||||
sgemm_ S N N P U 100 100 100 1 0 100 100 1 0 100
|
||||
sgemm_ S N N P U 200 200 200 1 0 200 200 1 0 200
|
||||
sgemm_ S N N P U 300 300 300 1 0 300 300 1 0 300
|
||||
sgemm_ S N N P U 400 400 400 1 0 400 400 1 0 400
|
||||
sgemm_ S N N P U 500 500 500 1 0 500 500 1 0 500
|
||||
dgemm_ D N N P U 1 1 1 1 0 1 1 1 0 1
|
||||
dgemm_ D N N P U 2 2 2 1 0 2 2 1 0 2
|
||||
dgemm_ D N N P U 3 3 3 1 0 3 3 1 0 3
|
||||
dgemm_ D N N P U 4 4 4 1 0 4 4 1 0 4
|
||||
dgemm_ D N N P U 5 5 5 1 0 5 5 1 0 5
|
||||
dgemm_ D N N P U 6 6 6 1 0 6 6 1 0 6
|
||||
dgemm_ D N N P U 7 7 7 1 0 7 7 1 0 7
|
||||
dgemm_ D N N P U 8 8 8 1 0 8 8 1 0 8
|
||||
dgemm_ D N N P U 9 9 9 1 0 9 9 1 0 9
|
||||
dgemm_ D N N P U 10 10 10 1 0 10 10 1 0 10
|
||||
dgemm_ D N N P U 20 20 20 1 0 20 20 1 0 20
|
||||
dgemm_ D N N P U 30 30 30 1 0 30 30 1 0 30
|
||||
dgemm_ D N N P U 40 40 40 1 0 40 40 1 0 40
|
||||
dgemm_ D N N P U 50 50 50 1 0 50 50 1 0 50
|
||||
dgemm_ D N N P U 60 60 60 1 0 60 60 1 0 60
|
||||
dgemm_ D N N P U 70 70 70 1 0 70 70 1 0 70
|
||||
dgemm_ D N N P U 80 80 80 1 0 80 80 1 0 80
|
||||
dgemm_ D N N P U 90 90 90 1 0 90 90 1 0 90
|
||||
dgemm_ D N N P U 100 100 100 1 0 100 100 1 0 100
|
||||
dgemm_ D N N P U 200 200 200 1 0 200 200 1 0 200
|
||||
dgemm_ D N N P U 300 300 300 1 0 300 300 1 0 300
|
||||
dgemm_ D N N P U 400 400 400 1 0 400 400 1 0 400
|
||||
dgemm_ D N N P U 500 500 500 1 0 500 500 1 0 500
|
||||
sgemm_ S N N U P 1 1 1 1 0 1 1 1 0 1
|
||||
sgemm_ S N N U P 2 2 2 1 0 2 2 1 0 2
|
||||
sgemm_ S N N U P 3 3 3 1 0 3 3 1 0 3
|
||||
sgemm_ S N N U P 4 4 4 1 0 4 4 1 0 4
|
||||
sgemm_ S N N U P 5 5 5 1 0 5 5 1 0 5
|
||||
sgemm_ S N N U P 6 6 6 1 0 6 6 1 0 6
|
||||
sgemm_ S N N U P 7 7 7 1 0 7 7 1 0 7
|
||||
sgemm_ S N N U P 8 8 8 1 0 8 8 1 0 8
|
||||
sgemm_ S N N U P 9 9 9 1 0 9 9 1 0 9
|
||||
sgemm_ S N N U P 10 10 10 1 0 10 10 1 0 10
|
||||
sgemm_ S N N U P 20 20 20 1 0 20 20 1 0 20
|
||||
sgemm_ S N N U P 30 30 30 1 0 30 30 1 0 30
|
||||
sgemm_ S N N U P 40 40 40 1 0 40 40 1 0 40
|
||||
sgemm_ S N N U P 50 50 50 1 0 50 50 1 0 50
|
||||
sgemm_ S N N U P 60 60 60 1 0 60 60 1 0 60
|
||||
sgemm_ S N N U P 70 70 70 1 0 70 70 1 0 70
|
||||
sgemm_ S N N U P 80 80 80 1 0 80 80 1 0 80
|
||||
sgemm_ S N N U P 90 90 90 1 0 90 90 1 0 90
|
||||
sgemm_ S N N U P 100 100 100 1 0 100 100 1 0 100
|
||||
sgemm_ S N N U P 200 200 200 1 0 200 200 1 0 200
|
||||
sgemm_ S N N U P 300 300 300 1 0 300 300 1 0 300
|
||||
sgemm_ S N N U P 400 400 400 1 0 400 400 1 0 400
|
||||
sgemm_ S N N U P 500 500 500 1 0 500 500 1 0 500
|
||||
dgemm_ D N N U P 1 1 1 1 0 1 1 1 0 1
|
||||
dgemm_ D N N U P 2 2 2 1 0 2 2 1 0 2
|
||||
dgemm_ D N N U P 3 3 3 1 0 3 3 1 0 3
|
||||
dgemm_ D N N U P 4 4 4 1 0 4 4 1 0 4
|
||||
dgemm_ D N N U P 5 5 5 1 0 5 5 1 0 5
|
||||
dgemm_ D N N U P 6 6 6 1 0 6 6 1 0 6
|
||||
dgemm_ D N N U P 7 7 7 1 0 7 7 1 0 7
|
||||
dgemm_ D N N U P 8 8 8 1 0 8 8 1 0 8
|
||||
dgemm_ D N N U P 9 9 9 1 0 9 9 1 0 9
|
||||
dgemm_ D N N U P 10 10 10 1 0 10 10 1 0 10
|
||||
dgemm_ D N N U P 20 20 20 1 0 20 20 1 0 20
|
||||
dgemm_ D N N U P 30 30 30 1 0 30 30 1 0 30
|
||||
dgemm_ D N N U P 40 40 40 1 0 40 40 1 0 40
|
||||
dgemm_ D N N U P 50 50 50 1 0 50 50 1 0 50
|
||||
dgemm_ D N N U P 60 60 60 1 0 60 60 1 0 60
|
||||
dgemm_ D N N U P 70 70 70 1 0 70 70 1 0 70
|
||||
dgemm_ D N N U P 80 80 80 1 0 80 80 1 0 80
|
||||
dgemm_ D N N U P 90 90 90 1 0 90 90 1 0 90
|
||||
dgemm_ D N N U P 100 100 100 1 0 100 100 1 0 100
|
||||
dgemm_ D N N U P 200 200 200 1 0 200 200 1 0 200
|
||||
dgemm_ D N N U P 300 300 300 1 0 300 300 1 0 300
|
||||
dgemm_ D N N U P 400 400 400 1 0 400 400 1 0 400
|
||||
dgemm_ D N N U P 500 500 500 1 0 500 500 1 0 500
|
||||
@@ -1,13 +1,133 @@
|
||||
##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.##
|
||||
##Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.##
|
||||
|
||||
set(F2C_LIB "libf2c")
|
||||
# Comments:
|
||||
# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
|
||||
# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
|
||||
# the second case because CONFIG_NAME is not yet set.
|
||||
if(NOT DEFINED BLIS_INSTALL_PATH)
|
||||
set(DIST_PATH ${CMAKE_BINARY_DIR})
|
||||
set(LIB_PATH ${DIST_PATH}/lib/${BLIS_CONFIG_FAMILY})
|
||||
set(INC_PATH ${DIST_PATH}/include/${BLIS_CONFIG_FAMILY})
|
||||
else()
|
||||
set(LIB_PATH ${BLIS_INSTALL_PATH}/lib)
|
||||
set(INC_PATH ${BLIS_INSTALL_PATH}/include/blis)
|
||||
endif()
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/f2c)
|
||||
# Include the corresponding make_defs.cmake that holds the required compiler options.
|
||||
include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake)
|
||||
|
||||
# Generate F2C library
|
||||
add_library("${F2C_LIB}" STATIC )
|
||||
set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C)
|
||||
# Create a static library using the sources in f2c directory.
|
||||
file(GLOB f2c_sources LIST_DIRECTORIES false ${CMAKE_CURRENT_SOURCE_DIR}/f2c/*.c)
|
||||
add_library(f2c STATIC ${f2c_sources})
|
||||
target_compile_options(f2c
|
||||
PRIVATE
|
||||
# load-var-for,COPTFLAGS
|
||||
${COPTFLAGS}
|
||||
# get-noopt-cflags-for
|
||||
${CDBGFLAGS}
|
||||
${CWARNFLAGS}
|
||||
${CPICFLAGS}
|
||||
${CMISCFLAGS}
|
||||
${CLANGFLAGS}
|
||||
# Suppress warnings about uninitialized functions
|
||||
-Wno-maybe-uninitialized -Wno-parentheses -Wfatal-errors
|
||||
)
|
||||
target_compile_definitions(f2c
|
||||
PRIVATE
|
||||
# in get-noopt-cflags-for
|
||||
${VERS_DEF}
|
||||
${CPPROCFLAGS}
|
||||
-DHAVE_BLIS_H
|
||||
)
|
||||
target_include_directories(f2c
|
||||
BEFORE
|
||||
PRIVATE
|
||||
# Add local header paths
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/f2c
|
||||
# and the path to blis.h
|
||||
${INC_PATH}
|
||||
)
|
||||
target_link_libraries(f2c PRIVATE ${LDFLAGS})
|
||||
if(THREADING_MODEL STREQUAL "openmp")
|
||||
target_link_libraries(f2c PRIVATE OpenMP::OpenMP_C)
|
||||
endif()
|
||||
# Put all those targets under blastest-targets-targets folder name so that they appear all together in IDE.
|
||||
set_target_properties(f2c PROPERTIES FOLDER blastest-targets)
|
||||
add_dependencies(f2c flat-header)
|
||||
|
||||
# Gather all local source files.
|
||||
file(GLOB blastest_sources LIST_DIRECTORIES false ${CMAKE_CURRENT_SOURCE_DIR}/src/*.c)
|
||||
list(TRANSFORM blastest_sources REPLACE ${CMAKE_CURRENT_SOURCE_DIR}/src/ "")
|
||||
|
||||
add_subdirectory(f2c)
|
||||
add_subdirectory(src)
|
||||
# Create one executable for each of the sources.
|
||||
foreach(source ${blastest_sources})
|
||||
string(REPLACE .c "" exec_name ${source})
|
||||
add_executable(${exec_name}.x src/${source})
|
||||
target_compile_options(${exec_name}.x
|
||||
PRIVATE
|
||||
# load-var-for,COPTFLAGS
|
||||
${COPTFLAGS}
|
||||
# get-noopt-cflags-for
|
||||
${CDBGFLAGS}
|
||||
${CWARNFLAGS}
|
||||
${CPICFLAGS}
|
||||
${CMISCFLAGS}
|
||||
${CLANGFLAGS}
|
||||
# Suppress warnings about uninitialized functions
|
||||
-Wno-parentheses -Wno-maybe-uninitialized
|
||||
)
|
||||
target_compile_definitions(${exec_name}.x
|
||||
PRIVATE
|
||||
# in get-noopt-cflags-for
|
||||
${VERS_DEF}
|
||||
${CPPROCFLAGS}
|
||||
-DHAVE_BLIS_H
|
||||
)
|
||||
target_include_directories(${exec_name}.x
|
||||
BEFORE
|
||||
PRIVATE
|
||||
# Add local header paths
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/f2c
|
||||
# and the path to blis.h
|
||||
${INC_PATH}
|
||||
)
|
||||
target_link_libraries(${exec_name}.x PRIVATE f2c libblis ${LDFLAGS})
|
||||
if(THREADING_MODEL STREQUAL "openmp")
|
||||
target_link_libraries(${exec_name}.x PRIVATE OpenMP::OpenMP_C)
|
||||
endif()
|
||||
set_target_properties(${exec_name}.x PROPERTIES CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
|
||||
# Put all those targets under blastest-targets-targets folder name so that they appear all together in IDE.
|
||||
set_target_properties(${exec_name}.x PROPERTIES FOLDER blastest-targets)
|
||||
# Add a target for running the tests. Rules are different for level-1 APIs, compared to levels 2 and 3.
|
||||
if(${exec_name} MATCHES 1)
|
||||
add_custom_target(run-${exec_name}
|
||||
COMMAND ${exec_name}.x > out.${exec_name}
|
||||
COMMENT "Running ${exec_name}.x with output redirected to out.${exec_name}"
|
||||
DEPENDS ${exec_name}.x
|
||||
BYPRODUCTS ${CMAKE_BINARY_DIR}/out.${exec_name}
|
||||
WORKING_DIRECTORY $<TARGET_FILE_DIR:libblis>
|
||||
VERBATIM
|
||||
)
|
||||
else()# name has 2 or 3
|
||||
add_custom_target(run-${exec_name}
|
||||
COMMAND ${exec_name}.x < ${CMAKE_CURRENT_SOURCE_DIR}/input/${exec_name}.in
|
||||
COMMENT "Running ${exec_name}.x with input ${CMAKE_CURRENT_SOURCE_DIR}/input/${exec_name}.in and output saved to out.${exec_name}"
|
||||
DEPENDS ${exec_name}.x
|
||||
BYPRODUCTS ${CMAKE_BINARY_DIR}/out.${exec_name}
|
||||
WORKING_DIRECTORY $<TARGET_FILE_DIR:libblis>
|
||||
VERBATIM
|
||||
)
|
||||
endif()
|
||||
# Put all those targets under blastest-targets-targets folder name so that they appear all together in IDE.
|
||||
set_target_properties(run-${exec_name} PROPERTIES FOLDER blastest-targets)
|
||||
list(APPEND test_executables "run-${exec_name}")
|
||||
endforeach()
|
||||
|
||||
add_custom_target(testblas DEPENDS ${test_executables})
|
||||
add_custom_target(checkblas
|
||||
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/cmake/check-blastest.py "."
|
||||
DEPENDS testblas
|
||||
WORKING_DIRECTORY $<TARGET_FILE_DIR:libblis>
|
||||
)
|
||||
# Put all those targets under blastest-targets-targets folder name so that they appear all together in IDE.
|
||||
set_target_properties(testblas checkblas PROPERTIES FOLDER blastest-targets)
|
||||
@@ -1,59 +0,0 @@
|
||||
##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.##
|
||||
|
||||
target_sources("${F2C_LIB}"
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/abs.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/acos.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/asin.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/atan.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/atn2.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/close.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cnjg.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cos.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cosh.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/dim.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/div.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/dolio.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/endfile.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/epsilon.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/err.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/exit_.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/exp.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/fmt.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/fmtlib.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/h_dnnt.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/hl_cmp.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/i_dnnt.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/i_len.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/imag.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/int.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/l_cmp.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/lg10.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/log.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/lread.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/lwrite.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mod.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/nint.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/open.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/pow.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/prod.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/rdfmt.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/rewind.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/rsfe.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/s_cmp.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/s_copy.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/s_stop.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/sfe.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/sig_die.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/sign.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/sin.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/sinh.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/sqrt.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/tan.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/tanh.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/util.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/wref.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/wrtfmt.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/wsfe.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/wsle.c
|
||||
)
|
||||
@@ -28,6 +28,7 @@ use or performance of this software.
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef _MSC_VER
|
||||
#include <io.h>
|
||||
#define access _access
|
||||
#endif
|
||||
#include "f2c.h"
|
||||
|
||||
@@ -1,37 +0,0 @@
|
||||
##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.##
|
||||
|
||||
add_executable(cblat1 cblat1.c)
|
||||
target_link_libraries(cblat1 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
|
||||
|
||||
add_executable(cblat2 cblat2.c)
|
||||
target_link_libraries(cblat2 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
|
||||
|
||||
add_executable(cblat3 cblat3.c)
|
||||
target_link_libraries(cblat3 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
|
||||
|
||||
add_executable(dblat1 dblat1.c)
|
||||
target_link_libraries(dblat1 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
|
||||
|
||||
add_executable(dblat2 dblat2.c)
|
||||
target_link_libraries(dblat2 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
|
||||
|
||||
add_executable(dblat3 dblat3.c)
|
||||
target_link_libraries(dblat3 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
|
||||
|
||||
add_executable(sblat1 sblat1.c)
|
||||
target_link_libraries(sblat1 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
|
||||
|
||||
add_executable(sblat2 sblat2.c)
|
||||
target_link_libraries(sblat2 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
|
||||
|
||||
add_executable(sblat3 sblat3.c)
|
||||
target_link_libraries(sblat3 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
|
||||
|
||||
add_executable(zblat1 zblat1.c)
|
||||
target_link_libraries(zblat1 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
|
||||
|
||||
add_executable(zblat2 zblat2.c)
|
||||
target_link_libraries(zblat2 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
|
||||
|
||||
add_executable(zblat3 zblat3.c)
|
||||
target_link_libraries(zblat3 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
|
||||
@@ -1,4 +1,4 @@
|
||||
"""Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved"""
|
||||
"""Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved."""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef BLIS_CONFIG_H
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
#cmakedefine AOCL_DYNAMIC
|
||||
|
||||
#cmakedefine AOCL_BLIS_ZEN
|
||||
|
||||
#cmakedefine BLIS_ENABLE_OPENMP
|
||||
|
||||
#cmakedefine BLIS_ENABLE_JRIR_SLAB
|
||||
|
||||
#cmakedefine BLIS_ENABLE_JRIR_RR
|
||||
|
||||
#cmakedefine BLIS_ENABLE_PBA_POOLS
|
||||
|
||||
#cmakedefine BLIS_ENABLE_SBA_POOLS
|
||||
|
||||
#cmakedefine BLIS_ENABLE_MEM_TRACING
|
||||
|
||||
#cmakedefine BLIS_INT_TYPE_SIZE @INT_TYPE_SIZE@
|
||||
|
||||
#cmakedefine BLIS_BLAS_INT_TYPE_SIZE @BLAS_INT_TYPE_SIZE@
|
||||
|
||||
#cmakedefine BLIS_ENABLE_BLAS
|
||||
|
||||
#cmakedefine BLIS_ENABLE_CBLAS
|
||||
|
||||
#cmakedefine BLIS_ENABLE_MIXED_DT
|
||||
|
||||
#cmakedefine BLIS_ENABLE_MIXED_DT_EXTRA_MEM
|
||||
|
||||
#cmakedefine BLIS_ENABLE_SUP_HANDLING
|
||||
|
||||
#cmakedefine BLIS_ENABLE_MEMKIND
|
||||
|
||||
#cmakedefine BLIS_ENABLE_TRSM_PREINVERSION
|
||||
|
||||
#cmakedefine BLIS_ENABLE_PRAGMA_OMP_SIMD
|
||||
|
||||
#cmakedefine BLIS_ENABLE_SANDBOX
|
||||
|
||||
#cmakedefine BLIS_ENABLE_SHARED
|
||||
|
||||
#cmakedefine BLIS_ENABLE_COMPLEX_RETURN_INTEL
|
||||
|
||||
#cmakedefine DISABLE_BLIS_ARCH_TYPE
|
||||
|
||||
#cmakedefine DISABLE_BLIS_MODEL_TYPE
|
||||
|
||||
#cmakedefine __blis_arch_type_name "@rename_blis_arch_type@"
|
||||
|
||||
#cmakedefine __blis_model_type_name "@rename_blis_model_type@"
|
||||
|
||||
#endif
|
||||
@@ -1,4 +1,4 @@
|
||||
"""Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All Rights Reserved"""
|
||||
"""Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved."""
|
||||
|
||||
################################################################################
|
||||
# This file is used to mirroring the refkernels folder data into to zen, zen2, #
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user