AOCL-BLAS Release 4.2

This commit is contained in:
sireesha.sanga
2024-02-27 13:24:28 +00:00
1363 changed files with 89534 additions and 27591 deletions

View File

@@ -43,6 +43,7 @@ build_script:
- ps: Push-AppveyorArtifact C:\blis.zip
test_script:
# "make checkblas" does not work with shared linking Windows due to inability to override xerbla_
- if [%LIB_TYPE%]==[shared] set "TEST_TARGET=checkblis-fast"
- if [%LIB_TYPE%]==[static] set "TEST_TARGET=check"
- bash -lc "cd /c/projects/blis && mingw32-make %TEST_TARGET% -j4 V=1"

View File

@@ -1,80 +1,76 @@
language: c
sudo: required
dist: trusty
dist: focal
branches:
only:
- master
- dev
- amd
matrix:
include:
# full testsuite (all tests except for mixed datatype)
# full testsuite (all tests + mixed datatype (gemm_nn only) + salt + SDE + OOT)
- os: linux
compiler: gcc
env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto"
# mixed-datatype testsuite (gemm_nn only)
- os: linux
compiler: gcc
env: OOT=0 TEST=MD SDE=0 THR="none" CONF="auto"
# salt testsuite (fast set of operations+parameters)
- os: linux
compiler: gcc
env: OOT=0 TEST=SALT SDE=0 THR="none" CONF="auto"
# test x86_64 ukrs with SDE
- os: linux
compiler: gcc
env: OOT=0 TEST=0 SDE=1 THR="none" CONF="x86_64"
env: OOT=1 TEST=ALL SDE=1 THR="none" CONF="x86_64" \
PACKAGES="gcc-8 binutils"
# openmp build
- os: linux
compiler: gcc
env: OOT=0 TEST=0 SDE=0 THR="openmp" CONF="auto"
env: OOT=0 TEST=FAST SDE=0 THR="openmp" CONF="auto" \
PACKAGES="gcc-8 binutils"
# pthreads build
- os: linux
compiler: gcc
env: OOT=0 TEST=0 SDE=0 THR="pthreads" CONF="auto"
# out-of-tree build
- os: linux
compiler: gcc
env: OOT=1 TEST=0 SDE=0 THR="none" CONF="auto"
env: OOT=0 TEST=FAST SDE=0 THR="pthreads" CONF="auto" \
PACKAGES="gcc-8 binutils"
# clang build
- os: linux
compiler: clang
env: OOT=0 TEST=0 SDE=0 THR="none" CONF="auto"
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="auto"
# There seems to be some difficulty installing 2 Clang toolchains of different versions.
# Use the TravisCI default.
# PACKAGES="clang-8 binutils"
# macOS with system compiler (clang)
- os: osx
compiler: clang
env: OOT=0 TEST=1 SDE=0 THR="none" CONF="auto"
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="auto"
# cortexa15 build and fast testsuite (qemu)
- os: linux
compiler: arm-linux-gnueabihf-gcc
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa15" \
PACKAGES="gcc-arm-linux-gnueabihf qemu-system-arm qemu-user" \
CC=arm-linux-gnueabihf-gcc CXX=arm-linux-gnueabihf-g++ \
PACKAGES="gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-system-arm qemu-user" \
TESTSUITE_WRAPPER="qemu-arm -cpu cortex-a15 -L /usr/arm-linux-gnueabihf/"
# cortexa57 build and fast testsuite (qemu)
- os: linux
compiler: aarch64-linux-gnu-gcc
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="cortexa57" \
PACKAGES="gcc-aarch64-linux-gnu qemu-system-arm qemu-user" \
CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ \
PACKAGES="gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
TESTSUITE_WRAPPER="qemu-aarch64 -L /usr/aarch64-linux-gnu/"
# armsve build and fast testsuite (qemu)
- os: linux
compiler: aarch64-linux-gnu-gcc-10
env: OOT=0 TEST=FAST SDE=0 THR="none" CONF="armsve" \
CC=aarch64-linux-gnu-gcc-10 CXX=aarch64-linux-gnu-g++-10 \
PACKAGES="gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \
TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/"
install:
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/as; fi
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/as /usr/bin/as; fi
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo rm -f /usr/bin/ld; fi
- if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo ln -s /usr/lib/binutils-2.26/bin/ld /usr/bin/ld; fi
- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-6"; fi
- if [ -n "$PACKAGES" ]; then sudo apt-get install -y $PACKAGES; fi
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-6
- binutils-2.26
- clang
- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-8"; fi
- if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi
script:
- export DIST_PATH=.
- pwd
- if [ $OOT -eq 1 ]; then export DIST_PATH=`pwd`; mkdir ../oot; cd ../oot; chmod -R a-w $DIST_PATH; fi
- pwd
- $DIST_PATH/configure -t $THR CC=$CC $CONF
- $DIST_PATH/configure -p `pwd`/../install -t $THR CC=$CC $CONF
- pwd
- ls -l
- $CC --version
- make -j 2
- make install
- $DIST_PATH/travis/cxx/cxx-test.sh $DIST_PATH $(ls -1 include)
# Qemu SVE is failing sgemmt in some cases. Skip as this issue is not observed on real chip (A64fx).
- if [ "$CONF" = "armsve" ]; then sed -i 's/.*\<gemmt\>.*/0/' $DIST_PATH/testsuite/input.operations.fast; fi
- if [ "$TEST" != "0" ]; then travis_wait 30 $DIST_PATH/travis/do_testsuite.sh; fi
- if [ "$SDE" = "1" ]; then travis_wait 30 $DIST_PATH/travis/do_sde.sh; fi

File diff suppressed because it is too large Load Diff

View File

@@ -42,6 +42,7 @@ but many others have contributed code and feedback, including
Shivaprashanth H (Global Edge)
Jean-Michel Hautbois @jhautbois
Ian Henriksen @insertinterestingnamehere (The University of Texas at Austin)
Greg Henry (Intel)
Minh Quan Ho @hominhquan
Matthew Honnibal @honnibal
Stefan Husmann @stefanhusmann
@@ -50,9 +51,11 @@ but many others have contributed code and feedback, including
Tony Kelman @tkelman
Lee Killough @leekillough (Cray)
Mike Kistler @mkistler (IBM, Austin Research Laboratory)
Ivan Korostelev @ivan23kor (University of Alberta)
Kyungmin Lee @kyungminlee (Ohio State University)
Michael Lehn @michael-lehn
Shmuel Levine @ShmuelLevine
@lschork2
Dave Love @loveshack
Tze Meng Low (The University of Texas at Austin)
Ye Luo @ye-luo (Argonne National Laboratory)
@@ -92,6 +95,7 @@ but many others have contributed code and feedback, including
Paul Springer @springer13 (RWTH Aachen University)
Adam J. Stewart @adamjstewart (University of Illinois at Urbana-Champaign)
Vladimir Sukarev
Chengguo Sun @chengguosun
Santanu Thangaraj (AMD)
Nicholai Tukanov @nicholaiTukanov (The University of Texas at Austin)
Rhys Ulerich @RhysU (The University of Texas at Austin)
@@ -99,6 +103,7 @@ but many others have contributed code and feedback, including
Meghana Vankadari @Meghana-vankadari (AMD)
Kiran Varaganti @kvaragan (AMD)
Natalia Vassilieva (Hewlett Packard Enterprise)
Andrew Wildman @awild82 (University of Washington)
Zhang Xianyi @xianyi (Chinese Academy of Sciences)
Benda Xu @heroxbd
Guodong Xu @docularxu (Linaro.org)
@@ -106,6 +111,7 @@ but many others have contributed code and feedback, including
Costas Yamin @cosstas
Chenhan Yu @ChenhanYu (The University of Texas at Austin)
Roman Yurchak @rth (Symerio)
Stefano Zampini @stefanozampini
M. Zhou @cdluminate
BLIS's development was partially funded by grants from industry

View File

@@ -15,7 +15,7 @@ copyright info. All parties provide their portions of the code under the
Copyright (C) 2018, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -320,6 +320,7 @@ BLASTEST_INPUT_PATH := $(DIST_PATH)/$(BLASTEST_DIR)/input
# The location of the BLAS test suite object directory.
BASE_OBJ_BLASTEST_PATH := $(BASE_OBJ_PATH)/$(BLASTEST_DIR)
BASE_EXE_BLASTEST_PATH := $(BASE_OBJ_BLASTEST_PATH)/$(MK_USE_LIB)
# The locations of the BLAS test suite source code (f2c and drivers).
BLASTEST_F2C_SRC_PATH := $(DIST_PATH)/$(BLASTEST_DIR)/f2c
@@ -347,7 +348,7 @@ BLASTEST_DRV_BASES := $(basename $(notdir $(BLASTEST_DRV_OBJS)))
# The binary executable driver names.
BLASTEST_DRV_BINS := $(addsuffix .x,$(BLASTEST_DRV_BASES))
BLASTEST_DRV_BIN_PATHS := $(addprefix $(BASE_OBJ_BLASTEST_PATH)/,$(BLASTEST_DRV_BINS))
BLASTEST_DRV_BIN_PATHS := $(addprefix $(BASE_EXE_BLASTEST_PATH)/,$(BLASTEST_DRV_BINS))
# Binary executable driver "run-" names
BLASTEST_DRV_BINS_R := $(addprefix run-,$(BLASTEST_DRV_BASES))
@@ -393,6 +394,7 @@ TESTSUITE_SALT_OPS_PATH := $(DIST_PATH)/$(TESTSUITE_DIR)/$(TESTSUITE_SALT_OPS)
# directory.
TESTSUITE_SRC_PATH := $(DIST_PATH)/$(TESTSUITE_DIR)/src
BASE_OBJ_TESTSUITE_PATH := $(BASE_OBJ_PATH)/$(TESTSUITE_DIR)
BASE_EXE_TESTSUITE_PATH := $(BASE_OBJ_PATH)/$(TESTSUITE_DIR)/$(MK_USE_LIB)
# Convert source file paths to object file paths by replacing the base source
# directories with the base object directories, and also replacing the source
@@ -414,7 +416,7 @@ MK_TESTSUITE_OBJS := $(sort \
# unusual environments (e.g. ARM) can run the testsuite through some other
# binary. See .travis.yml for details on how the variable is employed in
# practice.
TESTSUITE_BIN := test_$(LIBBLIS).x
TESTSUITE_BIN := $(BASE_EXE_TESTSUITE_PATH)/test_$(LIBBLIS).x
TESTSUITE_WRAPPER ?=
# The location of the script that checks the BLIS testsuite output.
@@ -504,7 +506,7 @@ endif
flat-header: check-env $(BLIS_H_FLAT)
$(BLIS_H_FLAT): $(FRAME_H99_FILES)
$(BLIS_H_FLAT): $(ALL_H99_FILES)
ifeq ($(ENABLE_VERBOSE),yes)
$(FLATTEN_H) -c -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)"
else
@@ -820,7 +822,7 @@ blastest-bin: check-env blastest-f2c $(BLASTEST_DRV_BIN_PATHS)
blastest-run: $(BLASTEST_DRV_BINS_R)
# f2c object file rule.
$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_F2C_SRC_PATH)/%.c
$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_F2C_SRC_PATH)/%.c $(BLIS_H_FLAT)
ifeq ($(ENABLE_VERBOSE),yes)
$(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) $(BLAT_CFLAGS) -c $< -o $@
else
@@ -829,7 +831,7 @@ else
endif
# driver object file rule.
$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_DRV_SRC_PATH)/%.c
$(BASE_OBJ_BLASTEST_PATH)/%.o: $(BLASTEST_DRV_SRC_PATH)/%.c $(BLIS_H_FLAT)
ifeq ($(ENABLE_VERBOSE),yes)
$(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) $(BLAT_CFLAGS) -c $< -o $@
else
@@ -850,7 +852,8 @@ endif
# first argument: the base name of the BLAS test driver.
define make-blat-rule
$(BASE_OBJ_BLASTEST_PATH)/$(1).x: $(BASE_OBJ_BLASTEST_PATH)/$(1).o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK)
$(BASE_EXE_BLASTEST_PATH)/$(1).x: $(BASE_OBJ_BLASTEST_PATH)/$(1).o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK)
@mkdir -p $(BASE_EXE_BLASTEST_PATH)
ifeq ($(ENABLE_VERBOSE),yes)
$(LINKER) $(BASE_OBJ_BLASTEST_PATH)/$(1).o $(BLASTEST_F2C_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $$@
else
@@ -864,12 +867,12 @@ $(foreach name, $(BLASTEST_DRV_BASES), $(eval $(call make-blat-rule,$(name))))
# A rule to run ?blat1.x driver files.
define make-run-blat1-rule
run-$(1): $(BASE_OBJ_BLASTEST_PATH)/$(1).x
run-$(1): $(BASE_EXE_BLASTEST_PATH)/$(1).x
ifeq ($(ENABLE_VERBOSE),yes)
$(TESTSUITE_WRAPPER) $(BASE_OBJ_BLASTEST_PATH)/$(1).x > out.$(1)
$(TESTSUITE_WRAPPER) $(BASE_EXE_BLASTEST_PATH)/$(1).x > out.$(1)
else
@echo "Running $(1).x > 'out.$(1)'"
@$(TESTSUITE_WRAPPER) $(BASE_OBJ_BLASTEST_PATH)/$(1).x > out.$(1)
@$(TESTSUITE_WRAPPER) $(BASE_EXE_BLASTEST_PATH)/$(1).x > out.$(1)
endif
endef
@@ -878,12 +881,12 @@ $(foreach name, $(BLASTEST_DRV1_BASES), $(eval $(call make-run-blat1-rule,$(name
# A rule to run ?blat2.x and ?blat3.x driver files.
define make-run-blat23-rule
run-$(1): $(BASE_OBJ_BLASTEST_PATH)/$(1).x
run-$(1): $(BASE_EXE_BLASTEST_PATH)/$(1).x
ifeq ($(ENABLE_VERBOSE),yes)
$(TESTSUITE_WRAPPER) $(BASE_OBJ_BLASTEST_PATH)/$(1).x < $(BLASTEST_INPUT_PATH)/$(1).in
$(TESTSUITE_WRAPPER) $(BASE_EXE_BLASTEST_PATH)/$(1).x < $(BLASTEST_INPUT_PATH)/$(1).in
else
@echo "Running $(1).x < '$(BLASTEST_INPUT_PATH)/$(1).in' (output to 'out.$(1)')"
@$(TESTSUITE_WRAPPER) $(BASE_OBJ_BLASTEST_PATH)/$(1).x < $(BLASTEST_INPUT_PATH)/$(1).in
@$(TESTSUITE_WRAPPER) $(BASE_EXE_BLASTEST_PATH)/$(1).x < $(BLASTEST_INPUT_PATH)/$(1).in
endif
endef
@@ -916,7 +919,7 @@ testsuite: testsuite-run
testsuite-bin: check-env $(TESTSUITE_BIN)
# Object file rule.
$(BASE_OBJ_TESTSUITE_PATH)/%.o: $(TESTSUITE_SRC_PATH)/%.c
$(BASE_OBJ_TESTSUITE_PATH)/%.o: $(TESTSUITE_SRC_PATH)/%.c $(BLIS_H_FLAT)
ifeq ($(ENABLE_VERBOSE),yes)
$(CC) $(call get-user-cflags-for,$(CONFIG_NAME)) -c $< -o $@
else
@@ -926,6 +929,7 @@ endif
# Testsuite binary rule.
$(TESTSUITE_BIN): $(MK_TESTSUITE_OBJS) $(LIBBLIS_LINK)
@mkdir -p $(BASE_EXE_TESTSUITE_PATH)
ifeq ($(ENABLE_VERBOSE),yes)
$(LINKER) $(MK_TESTSUITE_OBJS) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
else
@@ -936,13 +940,13 @@ endif
# A rule to run the testsuite using the normal input.* files.
testsuite-run: testsuite-bin
ifeq ($(ENABLE_VERBOSE),yes)
$(TESTSUITE_WRAPPER) ./$(TESTSUITE_BIN) -g $(TESTSUITE_CONF_GEN_PATH) \
$(TESTSUITE_WRAPPER) $(TESTSUITE_BIN) -g $(TESTSUITE_CONF_GEN_PATH) \
-o $(TESTSUITE_CONF_OPS_PATH) \
> $(TESTSUITE_OUT_FILE)
else
@echo "Running $(TESTSUITE_BIN) with output redirected to '$(TESTSUITE_OUT_FILE)'"
@$(TESTSUITE_WRAPPER) ./$(TESTSUITE_BIN) -g $(TESTSUITE_CONF_GEN_PATH) \
@$(TESTSUITE_WRAPPER) $(TESTSUITE_BIN) -g $(TESTSUITE_CONF_GEN_PATH) \
-o $(TESTSUITE_CONF_OPS_PATH) \
> $(TESTSUITE_OUT_FILE)
endif
@@ -1285,7 +1289,7 @@ ifeq ($(IS_CONFIGURED),yes)
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(BLASTEST_F2C_OBJS) $(BLASTEST_DRV_OBJS)
- $(RM_F) $(BLASTEST_F2C_LIB)
- $(RM_F) $(BLASTEST_DRV_BIN_PATHS)
- $(RM_RF) $(BASE_OBJ_BLASTEST_PATH)/{shared,static}
- $(RM_F) $(addprefix out.,$(BLASTEST_DRV_BASES))
else
@echo "Removing object files from $(BASE_OBJ_BLASTEST_PATH)"
@@ -1293,7 +1297,7 @@ else
@echo "Removing libf2c.a from $(BASE_OBJ_BLASTEST_PATH)"
@- $(RM_F) $(BLASTEST_F2C_LIB)
@echo "Removing binaries from $(BASE_OBJ_BLASTEST_PATH)"
@- $(RM_F) $(BLASTEST_DRV_BIN_PATHS)
@- $(RM_RF) $(BASE_OBJ_BLASTEST_PATH)/{shared,static}
@echo "Removing driver output files 'out.*'"
@- $(RM_F) $(addprefix out.,$(BLASTEST_DRV_BASES))
endif # ENABLE_VERBOSE
@@ -1328,13 +1332,13 @@ cleanblistesttop:
ifeq ($(IS_CONFIGURED),yes)
ifeq ($(ENABLE_VERBOSE),yes)
- $(RM_F) $(MK_TESTSUITE_OBJS)
- $(RM_F) $(TESTSUITE_BIN)
- $(RM_RF) $(BASE_OBJ_TESTSUITE_PATH)/{shared,static}
- $(RM_F) $(TESTSUITE_OUT_FILE)
else
@echo "Removing object files from $(BASE_OBJ_TESTSUITE_PATH)"
@- $(RM_F) $(MK_TESTSUITE_OBJS)
@echo "Removing binary $(TESTSUITE_BIN)"
@- $(RM_F) $(TESTSUITE_BIN)
@- $(RM_RF) $(BASE_OBJ_TESTSUITE_PATH)/{shared,static}
@echo "Removing $(TESTSUITE_OUT_FILE)"
@- $(RM_F) $(TESTSUITE_OUT_FILE)
endif # ENABLE_VERBOSE
@@ -1344,13 +1348,13 @@ cleanblistestdir:
ifeq ($(IS_CONFIGURED),yes)
ifeq ($(ENABLE_VERBOSE),yes)
- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
- $(RM_RF) $(BASE_OBJ_TESTSUITE_PATH)/{shared,static}
- $(MAKE) -C $(VEND_TESTCPP_DIR) clean
else
@echo "Removing object files from $(TESTSUITE_DIR)/$(OBJ_DIR)"
@- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
@echo "Removing binary $(TESTSUITE_DIR)/$(TESTSUITE_BIN)"
@- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
@echo "Removing binary $(TESTSUITE_BIN)"
@- $(RM_RF) $(BASE_OBJ_TESTSUITE_PATH)/{shared,static}
@$(MAKE) -C $(VEND_TESTCPP_DIR) clean
endif # ENABLE_VERBOSE
endif # IS_CONFIGURED

206
addon/CMakeLists.txt Normal file
View File

@@ -0,0 +1,206 @@
##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. ##
# generate_addon_targets(<addon_target>)
#
# Generates the object libraries required for one addon. <addon_target> is the
# name of the addon subdirectory (relative to this directory) whose sources
# are scanned.
#
# Up to three OBJECT libraries are created, each one only if it would contain
# at least one source file:
#   <addon_target>_C99_ADDON        - C99 sources outside <addon_target>/kernels/
#   <addon_target>_C99_KERNEL_ADDON - C99 sources inside <addon_target>/kernels/
#   <addon_target>_CXX_ADDON        - C++ sources
#
# Every created target depends on the flat-header target and is placed in the
# object-libs-targets IDE folder. All options are set PRIVATE so that they do
# not propagate to other targets.
#
# NOTE(review): get_filepaths_with_suffixes() and get_dirpaths_with_suffixes()
# are defined outside this file; they are assumed to store a list of file
# (resp. directory) paths in the variable named by their first argument.
function(generate_addon_targets addon_target)
    # Collect all file paths under the addon directory with a suffix in the ADDON_C99_SUFS list.
    get_filepaths_with_suffixes(LOCAL_SOURCE_C99_FILES "${CMAKE_CURRENT_SOURCE_DIR}/${addon_target}" "${ADDON_C99_SUFS}")
    # We want to break the files above in 2 categories, files in kernel directory and the rest.
    # Only list files in kernel directory. Start from the full C99 list that was
    # just collected and keep only the entries below <addon_target>/kernels/.
    set(LOCAL_KERNEL_FILES_C99 ${LOCAL_SOURCE_C99_FILES})
    list(FILTER LOCAL_KERNEL_FILES_C99 INCLUDE REGEX ${addon_target}/kernels/)
    # All C99 files, except for the ones in kernels directory.
    # Skip the removal when there are no kernel files, so that
    # list(REMOVE_ITEM) is never invoked without any value to remove.
    if(LOCAL_KERNEL_FILES_C99)
        list(REMOVE_ITEM LOCAL_SOURCE_C99_FILES ${LOCAL_KERNEL_FILES_C99})
    endif()

    # Collect all subdirectory paths that have at least one file with suffix in ADDON_H99_SUFS list.
    get_dirpaths_with_suffixes(CADDONINCFLAGS "${CMAKE_CURRENT_SOURCE_DIR}/${addon_target}" "${ADDON_H99_SUFS}")

    # Only generate the object library if there is at least one source file.
    list(LENGTH LOCAL_SOURCE_C99_FILES size)
    if(size GREATER 0)
        # Create an object library using the source file list above.
        add_library(${addon_target}_C99_ADDON
                        OBJECT
                        ${LOCAL_SOURCE_C99_FILES}
                    )
        # Include the corresponding make_defs.cmake that holds the required compiler options.
        include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake)
        # Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets.
        # mimicking get-addon-c99flags-for
        target_compile_options(${addon_target}_C99_ADDON
                                PRIVATE
                                # load-var-for,COPTFLAGS
                                ${COPTFLAGS}
                                # get-noopt-cflags-for
                                ${CDBGFLAGS}
                                # get-noopt-cflags-for
                                ${CWARNFLAGS}
                                # get-noopt-cflags-for
                                ${CMISCFLAGS}
                                # get-noopt-cflags-for
                                ${CLANGFLAGS}
                                # in get-addon-c99flags-for
                                ${BUILD_SYMFLAGS}
                            )
        target_compile_definitions(${addon_target}_C99_ADDON
                                    PRIVATE
                                    # in get-noopt-cflags-for
                                    ${CPPROCFLAGS}
                                    # in get-noopt-cflags-for
                                    ${VERS_DEF}
                                    # in get-addon-c99flags-for
                                    ${BUILD_CPPFLAGS}
                                )
        target_include_directories(${addon_target}_C99_ADDON
                                    BEFORE
                                    PRIVATE
                                    # in get-noopt-cflags-for
                                    ${CINFLAGS}
                                    # in get-addon-c99flags-for
                                    ${CADDONINCFLAGS}
                                )
        if(THREADING_MODEL STREQUAL "openmp")
            # Equivalent to CTHREADFLAGS in get-noopt-cflags-for
            target_link_libraries(${addon_target}_C99_ADDON PRIVATE OpenMP::OpenMP_C)
        elseif(THREADING_MODEL STREQUAL "pthreads")
            # in get-noopt-cflags-for
            target_compile_options(${addon_target}_C99_ADDON PRIVATE ${CTHREADFLAGS})
        endif()
        if(BUILD_SHARED_LIBS)
            # Equivalent to CPICFLAGS in get-noopt-cflags-for
            set_target_properties(${addon_target}_C99_ADDON PROPERTIES POSITION_INDEPENDENT_CODE ON)
        endif()
        # The sources need the flattened header to exist before they are compiled.
        add_dependencies(${addon_target}_C99_ADDON flat-header)
        # Put all those targets under object-libs-targets folder name so that they appear all together in IDE.
        set_target_properties(${addon_target}_C99_ADDON PROPERTIES FOLDER object-libs-targets)
    endif()

    # Only generate the object library if there is at least one source file.
    list(LENGTH LOCAL_KERNEL_FILES_C99 size)
    if(size GREATER 0)
        # Create an object library using the kernel source file list above.
        add_library(${addon_target}_C99_KERNEL_ADDON
                        OBJECT
                        ${LOCAL_KERNEL_FILES_C99}
                    )
        # Include the corresponding make_defs.cmake that holds the required compiler options.
        include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake)
        # Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets.
        # mimicking get-addon-kernel-c99flags-for
        target_compile_options(${addon_target}_C99_KERNEL_ADDON
                                PRIVATE
                                # load-var-for,CKOPTFLAGS
                                ${CKOPTFLAGS}
                                # load-var-for,CKVECFLAGS
                                ${CKVECFLAGS}
                                # get-noopt-cflags-for
                                ${CDBGFLAGS}
                                # get-noopt-cflags-for
                                ${CWARNFLAGS}
                                # get-noopt-cflags-for
                                ${CMISCFLAGS}
                                # get-noopt-cflags-for
                                ${CLANGFLAGS}
                                # in get-addon-kernel-c99flags-for
                                ${BUILD_SYMFLAGS}
                            )
        target_compile_definitions(${addon_target}_C99_KERNEL_ADDON
                                    PRIVATE
                                    # in get-noopt-cflags-for
                                    ${CPPROCFLAGS}
                                    # in get-noopt-cflags-for
                                    ${VERS_DEF}
                                    # in get-addon-kernel-c99flags-for
                                    ${BUILD_CPPFLAGS}
                                )
        target_include_directories(${addon_target}_C99_KERNEL_ADDON
                                    BEFORE
                                    PRIVATE
                                    # in get-noopt-cflags-for
                                    ${CINFLAGS}
                                    # in get-addon-kernel-c99flags-for
                                    ${CADDONINCFLAGS}
                                )
        if(THREADING_MODEL STREQUAL "openmp")
            # Equivalent to CTHREADFLAGS in get-noopt-cflags-for
            target_link_libraries(${addon_target}_C99_KERNEL_ADDON PRIVATE OpenMP::OpenMP_C)
        elseif(THREADING_MODEL STREQUAL "pthreads")
            # in get-noopt-cflags-for
            target_compile_options(${addon_target}_C99_KERNEL_ADDON PRIVATE ${CTHREADFLAGS})
        endif()
        if(BUILD_SHARED_LIBS)
            # Equivalent to CPICFLAGS in get-noopt-cflags-for
            set_target_properties(${addon_target}_C99_KERNEL_ADDON PROPERTIES POSITION_INDEPENDENT_CODE ON)
        endif()
        # The sources need the flattened header to exist before they are compiled.
        add_dependencies(${addon_target}_C99_KERNEL_ADDON flat-header)
        # Put all those targets under object-libs-targets folder name so that they appear all together in IDE.
        set_target_properties(${addon_target}_C99_KERNEL_ADDON PROPERTIES FOLDER object-libs-targets)
    endif()

    # Collect all file paths under the addon directory with a suffix in the ADDON_CXX_SUFS list.
    get_filepaths_with_suffixes(LOCAL_SOURCE_CXX_FILES "${CMAKE_CURRENT_SOURCE_DIR}/${addon_target}" "${ADDON_CXX_SUFS}")

    # Only generate the object library if there is at least one source file.
    list(LENGTH LOCAL_SOURCE_CXX_FILES size)
    if(size GREATER 0)
        # Create an object library using the source file list above.
        add_library(${addon_target}_CXX_ADDON
                        OBJECT
                        ${LOCAL_SOURCE_CXX_FILES}
                    )
        # NOTE(review): unlike the two C99 targets, make_defs.cmake is not
        # included here, so the flag variables below hold whatever is already
        # visible in this scope - confirm this is intended for C++-only addons.
        # Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets.
        # mimicking get-addon-cxxflags-for
        target_compile_options(${addon_target}_CXX_ADDON
                                PRIVATE
                                # load-var-for,COPTFLAGS
                                ${COPTFLAGS}
                                # get-noopt-cxxflags-for
                                ${CDBGFLAGS}
                                # get-noopt-cxxflags-for
                                ${CWARNFLAGS}
                                # get-noopt-cxxflags-for
                                ${CMISCFLAGS}
                                # get-noopt-cxxflags-for
                                ${CXXLANGFLAGS}
                                # in get-addon-cxxflags-for
                                ${BUILD_SYMFLAGS}
                            )
        target_compile_definitions(${addon_target}_CXX_ADDON
                                    PRIVATE
                                    # in get-noopt-cflags-for
                                    ${CPPROCFLAGS}
                                    # in get-noopt-cflags-for
                                    ${VERS_DEF}
                                    # in get-addon-cxxflags-for
                                    ${BUILD_CPPFLAGS}
                                )
        target_include_directories(${addon_target}_CXX_ADDON
                                    BEFORE
                                    PRIVATE
                                    # in get-noopt-cflags-for
                                    ${CINFLAGS}
                                    # in get-addon-cxxflags-for
                                    ${CADDONINCFLAGS}
                                )
        if(THREADING_MODEL STREQUAL "openmp")
            # Equivalent to CTHREADFLAGS in get-noopt-cflags-for
            target_link_libraries(${addon_target}_CXX_ADDON PRIVATE OpenMP::OpenMP_C)
        elseif(THREADING_MODEL STREQUAL "pthreads")
            # in get-noopt-cflags-for
            target_compile_options(${addon_target}_CXX_ADDON PRIVATE ${CTHREADFLAGS})
        endif()
        if(BUILD_SHARED_LIBS)
            # Equivalent to CPICFLAGS in get-noopt-cflags-for
            set_target_properties(${addon_target}_CXX_ADDON PROPERTIES POSITION_INDEPENDENT_CODE ON)
        endif()
        # The sources need the flattened header to exist before they are compiled.
        add_dependencies(${addon_target}_CXX_ADDON flat-header)
        # Put all those targets under object-libs-targets folder name so that they appear all together in IDE.
        set_target_properties(${addon_target}_CXX_ADDON PROPERTIES FOLDER object-libs-targets)
    endif()
endfunction()
# Generate targets for each of the addons.
# ENABLE_ADDON is not set in this file and is expected to come from the
# including project. It is expanded unquoted, so the loop body does not run
# when the variable is empty or undefined. Each element is passed to
# generate_addon_targets() as the addon name.
foreach(ADDON ${ENABLE_ADDON})
generate_addon_targets(${ADDON})
endforeach()

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -43,7 +43,7 @@
#include "lpgemm_post_ops.h"
#include "lpgemm_kernels.h"
#include "lpgemm_utils_kernels.h"
#include "lpgemm_packb_bf16.h"
#include "lpgemm_pack_bf16.h"
#include "lpgemm_packb_s16.h"
#include "lpgemm_packa.h"
#include "lpgemm_packb.h"

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -85,12 +85,34 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32)
{
trans_t blis_trans;
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( trans, &blis_trans );
if ( ( input_buf_addr == NULL ) || ( reorder_buf_addr == NULL ) ||
( k <= 0 ) || ( n <= 0 ) || ( ldb < n ) )
( k <= 0 ) || ( n <= 0 ) || ( bli_is_notrans( blis_trans ) && ( ldb < n ) ) ||
( bli_is_trans( blis_trans ) && ( ldb < k ) ) )
{
return; // Error.
}
inc_t rs_b, cs_b;
if( ( order == 'r') || ( order == 'R' ) )
{
rs_b = bli_is_notrans( blis_trans ) ? ldb : 1;
cs_b = bli_is_notrans( blis_trans ) ? 1 : ldb;
}
else if ( ( order == 'c' ) || ( order == 'C' ) )
{
rs_b = bli_is_notrans( blis_trans ) ? 1 : ldb;
cs_b = bli_is_notrans( blis_trans ) ? ldb : 1;
}
else
{
return; // Error
}
// Check if avx512_bf16 ISA is supported, lpgemm matmul only works with it.
if ( bli_cpuid_is_avx512bf16_supported() == FALSE )
{
@@ -117,7 +139,7 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32)
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global( &rntm_g );
bli_membrk_rntm_set_membrk( &rntm_g );
bli_pba_rntm_set_pba( &rntm_g );
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
@@ -128,7 +150,8 @@ AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32)
// Create dummy original b obj;
lpgemm_obj_t b;
b.storage.aligned_buffer = ( void* )input_buf_addr;
b.rs = ldb;
b.rs = rs_b;
b.cs = cs_b;
b.width = n;
b.length = k;

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,6 +34,7 @@
#include "blis.h"
#include "aocl_gemm_interface_apis.h"
#include "aocl_gemm_check.h"
#include "lpgemm_types.h"
#include "lpgemm_post_ops.h"
#include "lpgemm_thread_decor_openmp.h"
@@ -73,57 +74,42 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
// Set MC, NC, KC, NR, MR.
aocl_lpgemm_init_global_cntx();
// Null check for pointers.
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
{
return; // Error.
}
// check for validity of params.
AOCL_GEMM_CHECK
(
"bf16bf16f32obf16",
order, transa, transb,
m, n, k,
a, lda, mem_format_a,
b, ldb, mem_format_b,
c, ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
bli_param_map_netlib_to_blis_trans( transb, &blis_transb );
/* Perform BLAS parameter checking. */
// Transpose not supported.
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
( blis_transb != BLIS_NO_TRANSPOSE ) )
bool is_row_major = ( ( order == 'r' ) || ( order == 'R' ) );
bool is_column_major = ( ( order == 'c' ) || ( order == 'C' ) );
inc_t rs_a = lda;
inc_t cs_a = 1;
if ( bli_is_trans( blis_transa ) )
{
return; // Error.
rs_a = 1;
cs_a = lda;
}
// Sanitize order input.
char order_use =
( ( order == 'r' ) || ( order == 'R' ) ||
( order == 'c' ) || ( order == 'C' ) ) ?
order : 'r';
inc_t rs_b = ldb;
inc_t cs_b = 1;
bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) );
bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) );
// Row major input expected with leading dimensions >= row stride.
if ( ( is_row_major == TRUE ) &&
( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) )
if( bli_is_trans( blis_transb ) )
{
return; // Error.
}
// Column major input expected with leading dimensions >= column stride.
else if ( ( is_column_major == TRUE ) &&
( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) )
{
return; // Error.
rs_b = 1;
cs_b = ldb;
}
// Check if dimensions are valid.
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
{
return; // Error.
}
const inc_t rs_a = lda;
const inc_t cs_a = 1;
const inc_t rs_b = ldb;
const inc_t cs_b = 1;
const inc_t rs_c = ldc;
const inc_t cs_c = 1;
@@ -133,6 +119,21 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
bli_param_map_char_to_lpmtag( mem_format_a, &mtag_a );
bli_param_map_char_to_lpmtag( mem_format_b, &mtag_b );
// Reorder is not supported for A matrix
if( ( is_row_major == TRUE ) && ( mtag_a == REORDERED ) )
{
bli_print_msg(" Reordering of A matrix is not supported in row major case.", __FILE__, __LINE__ );
return;
}
// Inputs swapped in column major, A becomes B from kernel point of view.
// Reorder is not supported for column major matrices.
else if ( ( is_column_major == TRUE ) && ( ( mtag_b == REORDERED ) || ( mtag_a == REORDERED ) ) )
{
bli_print_msg(" Reordering of column major matrices is not supported.", __FILE__, __LINE__ );
return;
}
// From 5-loop function point of view,
// B matrix needs to be packed in a certain format in order to be loaded
// and used in bf16 instruction. As such the mtag_b always needs to be either
// packed or reordered. B matrix as it is (unpacked) cannot be used, and
@@ -147,30 +148,34 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
mtag_a = PACK;
}
// Only unpacked A supported now.
if ( ( is_row_major == TRUE ) && ( mtag_a != UNPACKED ) )
// From 5-loop function point of view,
// A matrix when in column major storage needs to be packed to row-major
// storage as kernel expects A matrix to be in row-major format.
if( ( is_row_major == TRUE ) && ( bli_is_trans(blis_transa ) ) )
{
return; // Error.
mtag_a = PACK;
}
// Inputs swapped in column major, B becomes A from kernel point of view.
else if ( ( is_column_major == TRUE ) && ( mtag_b != UNPACKED ) )
// Inputs swapped in column major, A becomes B from kernel point of view.
else if ( ( is_column_major == TRUE ) && ( bli_is_trans(blis_transb ) ) )
{
return; // Error.
mtag_b = PACK;
}
// Convert post op struct to post op linked list format.
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
lpgemm_translate_to_post_ops_list
err_t err = lpgemm_translate_to_post_ops_list
(
post_op_unparsed, post_op_list,
( void* )c, ( void* )( &order_use )
( void* )c, ( void* )( &order )
);
if( err != BLIS_SUCCESS ) return;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global( &rntm_g );
bli_membrk_rntm_set_membrk( &rntm_g );
bli_pba_rntm_set_pba( &rntm_g );
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
@@ -186,7 +191,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
( float* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, TRUE
post_op_list, BF16
);
}
else
@@ -199,7 +204,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
( float* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, TRUE
post_op_list, BF16
);
}
#else
@@ -214,7 +219,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
( float* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, TRUE
post_op_list, BF16
);
}
else
@@ -227,7 +232,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
( float* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, TRUE
post_op_list, BF16
);
}
#endif

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,6 +34,7 @@
#include "blis.h"
#include "aocl_gemm_interface_apis.h"
#include "aocl_gemm_check.h"
#include "lpgemm_types.h"
#include "lpgemm_post_ops.h"
#include "lpgemm_thread_decor_openmp.h"
@@ -73,58 +74,42 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
// Set MC, NC, KC, NR, MR.
aocl_lpgemm_init_global_cntx();
// Null check for pointers.
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
{
return; // Error.
}
// check for validity of params.
AOCL_GEMM_CHECK
(
"bf16bf16f32obf16",
order, transa, transb,
m, n, k,
a, lda, mem_format_a,
b, ldb, mem_format_b,
c, ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
bli_param_map_netlib_to_blis_trans( transb, &blis_transb );
/* Perform BLAS parameter checking. */
// Transpose not supported.
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
( blis_transb != BLIS_NO_TRANSPOSE ) )
{
return; // Error.
}
// Sanitize order input.
char order_use =
( ( order == 'r' ) || ( order == 'R' ) ||
( order == 'c' ) || ( order == 'C' ) ) ?
order : 'r';
bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) );
bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) );
// Row major input expected with leading dimensions >= row stride.
if ( ( is_row_major == TRUE ) &&
( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) )
{
return; // Error.
}
// Column major input expected with leading dimensions >= column stride.
else if ( ( is_column_major == TRUE ) &&
( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) )
{
return; // Error.
}
// Check if dimensions are valid.
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
{
return; // Error.
}
bool is_row_major = ( ( order == 'r' ) || ( order == 'R' ) );
bool is_column_major = ( ( order == 'c' ) || ( order == 'C' ) );
// The strides are set assuming a row major kernel.
const inc_t rs_a = lda;
const inc_t cs_a = 1;
const inc_t rs_b = ldb;
const inc_t cs_b = 1;
inc_t rs_a = lda;
inc_t cs_a = 1;
if ( bli_is_trans( blis_transa ) )
{
rs_a = 1;
cs_a = lda;
}
inc_t rs_b = ldb;
inc_t cs_b = 1;
if( bli_is_trans( blis_transb ) )
{
rs_b = 1;
cs_b = ldb;
}
const inc_t rs_c = ldc;
const inc_t cs_c = 1;
@@ -134,12 +119,21 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
bli_param_map_char_to_lpmtag( mem_format_a, &mtag_a );
bli_param_map_char_to_lpmtag( mem_format_b, &mtag_b );
if ( ( is_column_major == TRUE ) && ( mtag_b == REORDERED ) )
// Reorder is not supported for A matrix
if( ( is_row_major == TRUE ) && ( mtag_a == REORDERED ) )
{
// Reorder not supported with column major inputs.
bli_print_msg(" Reordering of A matrix is not supported in row major case.", __FILE__, __LINE__ );
return;
}
// Inputs swapped in column major, A becomes B from kernel point of view.
// Reorder is not supported for column major matrices.
else if ( ( is_column_major == TRUE ) && ( ( mtag_b == REORDERED ) || ( mtag_a == REORDERED ) ) )
{
bli_print_msg(" Reordering of column major matrices is not supported.", __FILE__, __LINE__ );
return;
}
// From 5-loop function point of view
// B matrix needs to be packed in a certain format in order to be loaded
// and used in bf16 instruction. As such the mtag_b always needs to be either
// packed or reordered. B matrix as it is (unpacked) cannot be used, and
@@ -154,30 +148,34 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
mtag_a = PACK;
}
// Only unpacked A supported now.
if ( ( is_row_major == TRUE ) && ( mtag_a != UNPACKED ) )
// From 5-loop function point of view,
// A matrix when in column major storage needs to be packed to row-major
// storage as kernel expects A matrix to be in row-major format.
if( ( is_row_major == TRUE ) && ( bli_is_trans(blis_transa ) ) )
{
return; // Error.
mtag_a = PACK;
}
// Inputs swapped in column major, B becomes A from kernel point of view.
else if ( ( is_column_major == TRUE ) && ( mtag_b != UNPACKED ) )
// Inputs swapped in column major, A becomes B from kernel point of view.
else if ( ( is_column_major == TRUE ) && ( bli_is_trans(blis_transb ) ) )
{
return; // Error.
mtag_b = PACK;
}
// Convert post op struct to post op linked list format.
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
lpgemm_translate_to_post_ops_list
err_t err = lpgemm_translate_to_post_ops_list
(
post_op_unparsed, post_op_list,
( void* )c, ( void* )( &order_use )
( void* )c, ( void* )( &order )
);
if( err != BLIS_SUCCESS ) return;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global( &rntm_g );
bli_membrk_rntm_set_membrk( &rntm_g );
bli_pba_rntm_set_pba( &rntm_g );
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
@@ -193,7 +191,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, F32
);
}
else
@@ -206,7 +204,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, F32
);
}
#else
@@ -221,7 +219,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, F32
);
}
else
@@ -234,7 +232,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, F32
);
}
#endif

View File

@@ -0,0 +1,104 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
   AOCL_GEMM_CHECK

   Validates the arguments of an LPGEMM matmul entry point and bails out of
   the calling function on the first illegal one.

   Parameters:
     op_str          C string naming the operation; only used in the message.
     order           'r'/'R' (row major) or 'c'/'C' (column major).
     transa, transb  'n'/'N' (no transpose) or 't'/'T' (transpose).
     m, n, k         problem dimensions; each must be > 0.
     a, b, c         matrix pointers; each must be non-NULL.
     lda, ldb, ldc   leading dimensions; the minimum accepted value depends
                     on order and, for A and B, on the transpose flag.
     mtag_a, mtag_b  memory format chars; 'n', 'p' or 'r' in either case.

   On failure a message of the form
     "** On entry to <op_str>, parameter number <info> had an illegal value"
   is emitted through bli_print_msg() and the macro executes a bare
   `return;`. It can therefore only be expanded inside a function returning
   void.

   Reported parameter numbers: 1 order, 2 transa, 3 transb, 4 m, 5 n, 6 k,
   8 a, 9 lda, 10 mtag_a, 11 b, 12 ldb, 13 mtag_b, 15 c, 16 ldc.
   NOTE(review): 7 and 14 are never reported; presumably these are the
   positions of alpha and beta in the matmul signature - confirm.

   Usage notes:
     - Arguments are evaluated more than once; pass side-effect-free
       expressions only.
     - The expansion is a plain compound statement (kept for compatibility
       with existing call sites), so do not use it as the un-braced body of
       an if that has an else.

   TODO: validity checks for post-ops are not performed here yet.
*/
#define AOCL_GEMM_CHECK( op_str, \
                         order, transa, transb, \
                         m, n, k, \
                         a, lda, mtag_a, \
                         b, ldb, mtag_b, \
                         c, ldc \
                       ) \
{ \
	int32_t info = 0; \
	bool col_stored, row_stored; \
	bool nota, notb, ta, tb; \
	bool mtag_a_ok, mtag_b_ok; \
 \
	col_stored = ( ( order ) == 'c' ) || ( ( order ) == 'C' ); \
	row_stored = ( ( order ) == 'r' ) || ( ( order ) == 'R' ); \
 \
	nota = ( ( transa ) == 'n' ) || ( ( transa ) == 'N' ); \
	notb = ( ( transb ) == 'n' ) || ( ( transb ) == 'N' ); \
 \
	ta = ( ( transa ) == 't' ) || ( ( transa ) == 'T' ); \
	tb = ( ( transb ) == 't' ) || ( ( transb ) == 'T' ); \
 \
	mtag_a_ok = ( ( mtag_a ) == 'n' ) || ( ( mtag_a ) == 'N' ) || \
	            ( ( mtag_a ) == 'p' ) || ( ( mtag_a ) == 'P' ) || \
	            ( ( mtag_a ) == 'r' ) || ( ( mtag_a ) == 'R' ); \
	mtag_b_ok = ( ( mtag_b ) == 'n' ) || ( ( mtag_b ) == 'N' ) || \
	            ( ( mtag_b ) == 'p' ) || ( ( mtag_b ) == 'P' ) || \
	            ( ( mtag_b ) == 'r' ) || ( ( mtag_b ) == 'R' ); \
 \
	/* The chain stops at the first failure, so only the lowest-numbered */ \
	/* offending parameter is reported. */ \
	if ( !row_stored && !col_stored ) \
		info = 1; \
	else if ( !nota && !ta ) \
		info = 2; \
	else if ( !notb && !tb ) \
		info = 3; \
	else if ( ( m ) <= 0 ) \
		info = 4; \
	else if ( ( n ) <= 0 ) \
		info = 5; \
	else if ( ( k ) <= 0 ) \
		info = 6; \
	else if ( ( a ) == NULL ) \
		info = 8; \
	/* A is m x k: row major needs lda >= k (>= m when transposed). */ \
	else if ( row_stored && ( ( nota && ( ( lda ) < ( k ) ) ) || \
	                          ( ta   && ( ( lda ) < ( m ) ) ) ) ) \
		info = 9; \
	/* Column major swaps the two bounds. */ \
	else if ( col_stored && ( ( nota && ( ( lda ) < ( m ) ) ) || \
	                          ( ta   && ( ( lda ) < ( k ) ) ) ) ) \
		info = 9; \
	else if ( !mtag_a_ok ) \
		info = 10; \
	else if ( ( b ) == NULL ) \
		info = 11; \
	/* B is k x n: row major needs ldb >= n (>= k when transposed). */ \
	else if ( row_stored && ( ( notb && ( ( ldb ) < ( n ) ) ) || \
	                          ( tb   && ( ( ldb ) < ( k ) ) ) ) ) \
		info = 12; \
	else if ( col_stored && ( ( notb && ( ( ldb ) < ( k ) ) ) || \
	                          ( tb   && ( ( ldb ) < ( n ) ) ) ) ) \
		info = 12; \
	else if ( !mtag_b_ok ) \
		info = 13; \
	else if ( ( c ) == NULL ) \
		info = 15; \
	/* C is m x n and is never transposed. */ \
	else if ( row_stored && ( ( ldc ) < ( n ) ) ) \
		info = 16; \
	else if ( col_stored && ( ( ldc ) < ( m ) ) ) \
		info = 16; \
 \
	if ( info != 0 ) \
	{ \
		char print_msg[ 100 ]; \
 \
		/* Bounded write: op_str is caller supplied, so never let it */ \
		/* overrun the fixed-size buffer (output is truncated instead). */ \
		snprintf( print_msg, sizeof( print_msg ), \
		          "** On entry to %6s, parameter number %2i had an illegal value", \
		          ( op_str ), ( int )info ); \
		bli_print_msg( print_msg, __FILE__, __LINE__ ); \
		return; \
	} \
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,6 +34,7 @@
#include "blis.h"
#include "aocl_gemm_interface_apis.h"
#include "aocl_gemm_check.h"
#include "lpgemm_types.h"
#include "lpgemm_post_ops.h"
#include "lpgemm_thread_decor_openmp.h"
@@ -64,13 +65,16 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), transa, transb, m, n, k,\
(void*)&alpha, lda, ldb, (void*)&beta, ldc);
// Null check for pointers.
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
{
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, \
"Invalid pointers provided for input parameters.");
return; // Error.
}
// check for validity of params.
AOCL_GEMM_CHECK
(
"f32f32f32of32",
order, transa, transb,
m, n, k,
a, lda, mem_format_a,
b, ldb, mem_format_b,
c, ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
@@ -86,36 +90,8 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
return; // Error.
}
// Sanitize order input.
char order_use =
( ( order == 'r' ) || ( order == 'R' ) ||
( order == 'c' ) || ( order == 'C' ) ) ?
order : 'r';
bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) );
bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) );
// Row major input expected with leading dimensions >= row stride.
if ( ( is_row_major == TRUE ) &&
( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) )
{
return; // Error.
}
// Column major input expected with leading dimensions >= column stride.
else if ( ( is_column_major == TRUE ) &&
( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) )
{
return; // Error.
}
// Check if dimensions are valid.
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
{
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, \
"Invalid matrix dimensions.");
return; // Error.
}
bool is_row_major = ( ( order == 'r' ) || ( order == 'R' ) );
bool is_column_major = ( ( order == 'c' ) || ( order == 'C' ) );
// The strides are set assuming a row major kernel.
const inc_t rs_a = lda;
@@ -168,17 +144,19 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
// Convert post op struct to post op linked list format.
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
lpgemm_translate_to_post_ops_list
err_t err = lpgemm_translate_to_post_ops_list
(
post_op_unparsed, post_op_list,
( void* )c, ( void* )( &order_use )
( void* )c, ( void* )( &order )
);
if( err != BLIS_SUCCESS ) return;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global( &rntm_g );
bli_membrk_rntm_set_membrk( &rntm_g );
bli_pba_rntm_set_pba( &rntm_g );
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( F32F32F32OF32 );
@@ -197,7 +175,7 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, F32
);
}
else
@@ -210,7 +188,7 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, F32
);
}
#else
@@ -229,7 +207,7 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, F32
);
}
else
@@ -242,7 +220,7 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, F32
);
}
#endif

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -42,6 +42,8 @@
#define AOCL_GEMM_GET_REORDER_BUF_SIZE(LP_SFX) \
BLIS_EXPORT_ADDON siz_t aocl_get_reorder_buf_size_ ## LP_SFX \
( \
const char order, \
const char trans, \
const char mat_type, \
const dim_t k, \
const dim_t n \
@@ -60,6 +62,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(s8s8s16os16);
#define AOCL_GEMM_REORDER(B_type,LP_SFX) \
BLIS_EXPORT_ADDON void aocl_reorder_ ## LP_SFX \
( \
const char order, \
const char trans, \
const char mat_type, \
const B_type* input_buf_addr, \
B_type* reorder_buf_addr, \
@@ -106,6 +110,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16);
AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32);
AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8);
AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8);
AOCL_GEMM_MATMUL(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8);
AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16);
AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32);
AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8);

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -34,6 +34,7 @@
#include "blis.h"
#include "aocl_gemm_interface_apis.h"
#include "aocl_gemm_check.h"
#include "lpgemm_types.h"
#include "lpgemm_5loop_interface_apis.h"
#include "lpgemm_config.h"
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
// Set MC, NC, KC, NR, MR.
aocl_lpgemm_init_global_cntx();
// Null check for pointers.
if ((a == NULL) || (b == NULL) || (c == NULL))
{
return; // Error.
}
// check for validity of params.
AOCL_GEMM_CHECK
(
"s8s8s16os16",
order, transa, transb,
m, n, k,
a, lda, mem_format_a,
b, ldb, mem_format_b,
c, ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans(transa, &blis_transa);
@@ -75,31 +81,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
( blis_transb != BLIS_NO_TRANSPOSE ) )
{
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
return; // Error.
}
// Sanitize order input.
char order_use =
( ( order == 'r' ) || ( order == 'R' ) ||
( order == 'c' ) || ( order == 'C' ) ) ?
order : 'r';
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
if ( ( order != 'r' ) && ( order != 'R' ) )
{
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
return; // Only row major supported.
}
// Row major input expected with leading dimensions equal to row stride.
if ((lda != k) || (ldb != n) || (ldc != n))
{
return; // Error.
}
// Check if dimensions are valid.
if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0))
{
return; // Error.
}
const inc_t rs_a = lda;
const inc_t cs_a = 1;
const inc_t rs_b = ldb;
@@ -125,22 +116,25 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
// Only unpacked A supported now.
if (mtag_a != UNPACKED)
{
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
return; // Error.
}
// Convert post op struct to post op linked list format.
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
lpgemm_translate_to_post_ops_list
err_t err = lpgemm_translate_to_post_ops_list
(
post_op_unparsed, post_op_list,
( void* )c, ( void* )( &order_use )
( void* )c, ( void* )( &order )
);
if( err != BLIS_SUCCESS ) return;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global(&rntm_g);
bli_membrk_rntm_set_membrk(&rntm_g);
bli_pba_rntm_set_pba(&rntm_g);
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S16OS16 );
@@ -153,7 +147,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, S16
);
#else
lpgemm_s8s8s16o16_thread_decorator
@@ -164,7 +158,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, S16
);
#endif
}

View File

@@ -118,7 +118,7 @@ AOCL_GEMM_REORDER(int8_t,s8s8s16os16)
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global(&rntm_g);
bli_membrk_rntm_set_membrk(&rntm_g);
bli_pba_rntm_set_pba(&rntm_g);
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S16OS16 );

View File

@@ -34,6 +34,7 @@
#include "blis.h"
#include "aocl_gemm_interface_apis.h"
#include "aocl_gemm_check.h"
#include "lpgemm_types.h"
#include "lpgemm_5loop_interface_apis.h"
#include "lpgemm_config.h"
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
// Set MC, NC, KC, NR, MR.
aocl_lpgemm_init_global_cntx();
// Null check for pointers.
if ((a == NULL) || (b == NULL) || (c == NULL))
{
return; // Error.
}
// check for validity of params.
AOCL_GEMM_CHECK
(
"s8s8s16os8",
order, transa, transb,
m, n, k,
a, lda, mem_format_a,
b, ldb, mem_format_b,
c, ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans(transa, &blis_transa);
@@ -75,31 +81,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
( blis_transb != BLIS_NO_TRANSPOSE ) )
{
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
return; // Error.
}
// Sanitize order input.
char order_use =
( ( order == 'r' ) || ( order == 'R' ) ||
( order == 'c' ) || ( order == 'C' ) ) ?
order : 'r';
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
if ( ( order != 'r' ) && ( order != 'R' ) )
{
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
return; // Only row major supported.
}
// Row major input expected with leading dimensions equal to row stride.
if ((lda != k) || (ldb != n) || (ldc != n))
{
return; // Error.
}
// Check if dimensions are valid.
if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0))
{
return; // Error.
}
const inc_t rs_a = lda;
const inc_t cs_a = 1;
const inc_t rs_b = ldb;
@@ -125,22 +116,25 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
// Only unpacked A supported now.
if (mtag_a != UNPACKED)
{
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
return; // Error.
}
// Convert post op struct to post op linked list format.
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
lpgemm_translate_to_post_ops_list
err_t err = lpgemm_translate_to_post_ops_list
(
post_op_unparsed, post_op_list,
( void* )c, ( void* )( &order_use )
( void* )c, ( void* )( &order )
);
if( err != BLIS_SUCCESS ) return;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global(&rntm_g);
bli_membrk_rntm_set_membrk(&rntm_g);
bli_pba_rntm_set_pba(&rntm_g);
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S16OS16 );
@@ -153,7 +147,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
( int16_t* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, TRUE
post_op_list, S8
);
#else
lpgemm_s8s8s16o16_thread_decorator
@@ -164,7 +158,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
( int16_t* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, TRUE
post_op_list, S8
);
#endif
}

View File

@@ -34,6 +34,7 @@
#include "blis.h"
#include "aocl_gemm_interface_apis.h"
#include "aocl_gemm_check.h"
#include "lpgemm_types.h"
#include "lpgemm_post_ops.h"
#include "lpgemm_thread_decor_openmp.h"
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
// Set MC, NC, KC, NR, MR.
aocl_lpgemm_init_global_cntx();
// Null check for pointers.
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
{
return; // Error.
}
// check for validity of params.
AOCL_GEMM_CHECK
(
"s8s8s32os32",
order, transa, transb,
m, n, k,
a, lda, mem_format_a,
b, ldb, mem_format_b,
c, ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
@@ -75,32 +81,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
( blis_transb != BLIS_NO_TRANSPOSE ) )
{
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
return; // Error.
}
// Sanitize order input.
char order_use =
( ( order == 'r' ) || ( order == 'R' ) ||
( order == 'c' ) || ( order == 'C' ) ) ?
order : 'r';
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
if ( ( order != 'r' ) && ( order != 'R' ) )
{
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
return; // Only row major supported.
}
// Row major input expected with leading dimensions equal to row stride.
if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) )
{
return; // Error.
}
// Check if dimensions are valid.
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
{
return; // Error.
}
const inc_t rs_a = lda;
const inc_t cs_a = 1;
const inc_t rs_b = ldb;
@@ -126,22 +116,25 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
// Only unpacked A supported now.
if ( mtag_a != UNPACKED )
{
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
return; // Error.
}
// Convert post op struct to post op linked list format.
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
lpgemm_translate_to_post_ops_list
err_t err = lpgemm_translate_to_post_ops_list
(
post_op_unparsed, post_op_list,
( void* )c, ( void* )( &order_use )
( void* )c, ( void* )( &order )
);
if( err != BLIS_SUCCESS ) return;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global( &rntm_g );
bli_membrk_rntm_set_membrk( &rntm_g );
bli_pba_rntm_set_pba( &rntm_g );
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S32OS32 );
@@ -154,7 +147,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, S32
);
#else
lpgemm_s8s8s32o32_thread_decorator
@@ -165,7 +158,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, S32
);
#endif
}

View File

@@ -118,7 +118,7 @@ AOCL_GEMM_REORDER(int8_t,s8s8s32os32)
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global( &rntm_g );
bli_membrk_rntm_set_membrk( &rntm_g );
bli_pba_rntm_set_pba( &rntm_g );
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S32OS32 );

View File

@@ -34,6 +34,7 @@
#include "blis.h"
#include "aocl_gemm_interface_apis.h"
#include "aocl_gemm_check.h"
#include "lpgemm_types.h"
#include "lpgemm_post_ops.h"
#include "lpgemm_thread_decor_openmp.h"
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
// Set MC, NC, KC, NR, MR.
aocl_lpgemm_init_global_cntx();
// Null check for pointers.
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
{
return; // Error.
}
// check for validity of params.
AOCL_GEMM_CHECK
(
"s8s8s32os8",
order, transa, transb,
m, n, k,
a, lda, mem_format_a,
b, ldb, mem_format_b,
c, ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
@@ -75,32 +81,16 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
( blis_transb != BLIS_NO_TRANSPOSE ) )
{
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
return; // Error.
}
// Sanitize order input.
char order_use =
( ( order == 'r' ) || ( order == 'R' ) ||
( order == 'c' ) || ( order == 'C' ) ) ?
order : 'r';
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
if ( ( order != 'r' ) && ( order != 'R' ) )
{
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
return; // Only row major supported.
}
// Row major input expected with leading dimensions equal to row stride.
if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) )
{
return; // Error.
}
// Check if dimensions are valid.
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
{
return; // Error.
}
const inc_t rs_a = lda;
const inc_t cs_a = 1;
const inc_t rs_b = ldb;
@@ -126,22 +116,25 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
// Only unpacked A supported now.
if ( mtag_a != UNPACKED )
{
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
return; // Error.
}
// Convert post op struct to post op linked list format.
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
lpgemm_translate_to_post_ops_list
err_t err = lpgemm_translate_to_post_ops_list
(
post_op_unparsed, post_op_list,
( void* )c, ( void* )( &order_use )
( void* )c, ( void* )( &order )
);
if( err != BLIS_SUCCESS ) return;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global( &rntm_g );
bli_membrk_rntm_set_membrk( &rntm_g );
bli_pba_rntm_set_pba( &rntm_g );
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S32OS32 );
@@ -154,7 +147,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
( int32_t* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, TRUE
post_op_list, S8
);
#else
lpgemm_s8s8s32o32_thread_decorator
@@ -165,7 +158,7 @@ AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
( int32_t* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, TRUE
post_op_list, S8
);
#endif
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,6 +34,7 @@
#include "blis.h"
#include "aocl_gemm_interface_apis.h"
#include "aocl_gemm_check.h"
#include "lpgemm_types.h"
#include "lpgemm_5loop_interface_apis.h"
#include "lpgemm_config.h"
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
// Set MC, NC, KC, NR, MR.
aocl_lpgemm_init_global_cntx();
// Null check for pointers.
if ((a == NULL) || (b == NULL) || (c == NULL))
{
return; // Error.
}
// check for validity of params.
AOCL_GEMM_CHECK
(
"u8s8s16os16",
order, transa, transb,
m, n, k,
a, lda, mem_format_a,
b, ldb, mem_format_b,
c, ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans(transa, &blis_transa);
@@ -75,31 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
( blis_transb != BLIS_NO_TRANSPOSE ) )
{
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
return; // Error.
}
// Sanitize order input.
char order_use =
( ( order == 'r' ) || ( order == 'R' ) ||
( order == 'c' ) || ( order == 'C' ) ) ?
order : 'r';
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
if ( ( order != 'r' ) && ( order != 'R' ) )
{
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
return; // Only row major supported.
}
// Row major input expected with leading dimensions equal to row stride.
if ((lda != k) || (ldb != n) || (ldc != n))
{
return; // Error.
}
// Check if dimensions are valid.
if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0))
{
return; // Error.
}
const inc_t rs_a = lda;
const inc_t cs_a = 1;
const inc_t rs_b = ldb;
@@ -125,22 +116,25 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
// Only unpacked A supported now.
if (mtag_a != UNPACKED)
{
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
return; // Error.
}
// Convert post op struct to post op linked list format.
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
lpgemm_translate_to_post_ops_list
err_t err = lpgemm_translate_to_post_ops_list
(
post_op_unparsed, post_op_list,
( void* )c, ( void* )( &order_use )
( void* )c, ( void* )( &order )
);
if( err != BLIS_SUCCESS ) return;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global(&rntm_g);
bli_membrk_rntm_set_membrk(&rntm_g);
bli_pba_rntm_set_pba(&rntm_g);
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S16OS16 );
@@ -153,7 +147,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, S16
);
#else
lpgemm_u8s8s16o16_thread_decorator
@@ -164,7 +158,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, S16
);
#endif
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -117,7 +117,7 @@ AOCL_GEMM_REORDER(int8_t,u8s8s16os16)
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global(&rntm_g);
bli_membrk_rntm_set_membrk(&rntm_g);
bli_pba_rntm_set_pba(&rntm_g);
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S16OS16 );

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,6 +34,7 @@
#include "blis.h"
#include "aocl_gemm_interface_apis.h"
#include "aocl_gemm_check.h"
#include "lpgemm_types.h"
#include "lpgemm_5loop_interface_apis.h"
#include "lpgemm_config.h"
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
// Set MC, NC, KC, NR, MR.
aocl_lpgemm_init_global_cntx();
// Null check for pointers.
if ((a == NULL) || (b == NULL) || (c == NULL))
{
return; // Error.
}
// check for validity of params.
AOCL_GEMM_CHECK
(
"u8s8s16os8",
order, transa, transb,
m, n, k,
a, lda, mem_format_a,
b, ldb, mem_format_b,
c, ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans(transa, &blis_transa);
@@ -75,31 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
( blis_transb != BLIS_NO_TRANSPOSE ) )
{
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
return; // Error.
}
// Sanitize order input.
char order_use =
( ( order == 'r' ) || ( order == 'R' ) ||
( order == 'c' ) || ( order == 'C' ) ) ?
order : 'r';
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
if ( ( order != 'r' ) && ( order != 'R' ) )
{
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
return; // Only row major supported.
}
// Row major input expected with leading dimensions equal to row stride.
if ((lda != k) || (ldb != n) || (ldc != n))
{
return; // Error.
}
// Check if dimensions are valid.
if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0))
{
return; // Error.
}
const inc_t rs_a = lda;
const inc_t cs_a = 1;
const inc_t rs_b = ldb;
@@ -125,22 +116,25 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
// Only unpacked A supported now.
if (mtag_a != UNPACKED)
{
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
return; // Error.
}
// Convert post op struct to post op linked list format.
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
lpgemm_translate_to_post_ops_list
err_t err = lpgemm_translate_to_post_ops_list
(
post_op_unparsed, post_op_list,
( void* )c, ( void* )( &order_use )
( void* )c, ( void* )( &order )
);
if( err != BLIS_SUCCESS ) return;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global(&rntm_g);
bli_membrk_rntm_set_membrk(&rntm_g);
bli_pba_rntm_set_pba(&rntm_g);
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S16OS16 );
@@ -153,7 +147,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
( int16_t* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, TRUE
post_op_list, S8
);
#else
lpgemm_u8s8s16o16_thread_decorator
@@ -164,7 +158,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
( int16_t* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, TRUE
post_op_list, S8
);
#endif
}

View File

@@ -0,0 +1,164 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "aocl_gemm_interface_apis.h"
#include "aocl_gemm_check.h"
#include "lpgemm_types.h"
#include "lpgemm_5loop_interface_apis.h"
#include "lpgemm_config.h"
#include "lpgemm_utils.h"
#include "lpgemm_thread_decor_openmp.h"
#include "lpgemm_post_ops.h"
/*
 * u8s8s16ou8 low-precision GEMM entry point.
 *
 * The function signature is generated by the AOCL_GEMM_MATMUL macro, which
 * is defined outside this file. The macro arguments name uint8_t, int8_t,
 * uint8_t and int16_t -- presumably the A, B, C and accumulation element
 * types (TODO confirm against the macro definition).
 *
 * Flow: verify AVX2/FMA3 support, initialise BLIS and the global lpgemm
 * context, validate the arguments, translate the user supplied post-ops
 * into the internal list form, then hand the problem to the u8s8s16o16
 * thread decorator with U8 as the C storage type.
 *
 * Every failure path visible here is a bare `return;`, so the caller gets
 * no status back; several of those paths print a diagnostic through
 * bli_print_msg() first.
 */
AOCL_GEMM_MATMUL(uint8_t,int8_t,uint8_t,int16_t,u8s8s16ou8)
{
// BLIS-side representation of the transa/transb characters.
trans_t blis_transa;
trans_t blis_transb;
// Check if AVX2 ISA is supported, lpgemm u8s8s16os16 matmul only works with it.
if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
{
bli_print_msg(" AVX2 ISA not supported by processor, "
"cannot perform u8s8s16 gemm.", __FILE__, __LINE__ );
return; // Error.
}
/* Initialize BLIS. */
bli_init_auto();
// Set MC, NC, KC, NR, MR.
aocl_lpgemm_init_global_cntx();
// check for validity of params.
// NOTE(review): the macro expansion is not visible in this file; it
// presumably reports the problem and returns from this function when an
// argument is invalid -- confirm in aocl_gemm_check.h.
AOCL_GEMM_CHECK
(
"u8s8s16ou8",
order, transa, transb,
m, n, k,
a, lda, mem_format_a,
b, ldb, mem_format_b,
c, ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans(transa, &blis_transa);
bli_param_map_netlib_to_blis_trans(transb, &blis_transb);
/* Perform BLAS parameter checking. */
// Transpose not supported.
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
( blis_transb != BLIS_NO_TRANSPOSE ) )
{
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
return; // Error.
}
// Anything other than 'r'/'R' is rejected, including column-major.
if ( ( order != 'r' ) && ( order != 'R' ) )
{
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
return; // Only row major supported.
}
// Row-major layout (enforced above): the row stride of each matrix is its
// leading dimension and the column stride is 1.
const inc_t rs_a = lda;
const inc_t cs_a = 1;
const inc_t rs_b = ldb;
const inc_t cs_b = 1;
const inc_t rs_c = ldc;
const inc_t cs_c = 1;
// Translate the memory format characters into lpgemm memory tags.
AOCL_MEMORY_TAG mtag_a;
AOCL_MEMORY_TAG mtag_b;
bli_param_map_char_to_lpmtag(mem_format_a, &mtag_a);
bli_param_map_char_to_lpmtag(mem_format_b, &mtag_b);
// B matrix needs to be packed in a certain format in order to be loaded
// and used in VNNI instruction. As such the mtag_b always needs to be either
// packed or reordered. B matrix as it is (unpacked) cannot be used, and
// the mtag_b is set to packed to enable runtime packing.
if (mtag_b == UNPACKED)
{
mtag_b = PACK;
}
// Only unpacked A supported now.
if (mtag_a != UNPACKED)
{
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
return; // Error.
}
// Convert post op struct to post op linked list format.
// The translation receives the C pointer and the address of `order`; a
// result other than BLIS_SUCCESS aborts the call without running the GEMM.
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
err_t err = lpgemm_translate_to_post_ops_list
(
post_op_unparsed, post_op_list,
( void* )c, ( void* )( &order )
);
if( err != BLIS_SUCCESS ) return;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global(&rntm_g);
bli_pba_rntm_set_pba(&rntm_g);
// The U8S8S16OS16 global context is used for this variant as well; no
// dedicated u8s8s16ou8 context is requested here.
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S16OS16 );
// Both decorators are called with the same argument list; which one is
// compiled in is decided by BLIS_ENABLE_OPENMP. C is passed through an
// int16_t* cast while the trailing U8 argument identifies the storage type
// of the output. NOTE(review): presumably the decorator signature is shared
// with the s16-output variant and the kernels use the U8 tag to store
// 8-bit results -- confirm against the decorator declaration.
#ifdef BLIS_ENABLE_OPENMP
lpgemm_u8s8s16o16_openmp_thread_decorator
(
m, n, k,
a, rs_a, cs_a, mtag_a,
b, rs_b, cs_b, mtag_b,
( int16_t* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, U8
);
#else
lpgemm_u8s8s16o16_thread_decorator
(
m, n, k,
a, rs_a, cs_a, mtag_a,
b, rs_b, cs_b, mtag_b,
( int16_t* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, U8
);
#endif
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,6 +34,7 @@
#include "blis.h"
#include "aocl_gemm_interface_apis.h"
#include "aocl_gemm_check.h"
#include "lpgemm_types.h"
#include "lpgemm_post_ops.h"
#include "lpgemm_thread_decor_openmp.h"
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
// Set MC, NC, KC, NR, MR.
aocl_lpgemm_init_global_cntx();
// Null check for pointers.
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
{
return; // Error.
}
// check for validity of params.
AOCL_GEMM_CHECK
(
"u8s8s32os32",
order, transa, transb,
m, n, k,
a, lda, mem_format_a,
b, ldb, mem_format_b,
c, ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
@@ -75,32 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
( blis_transb != BLIS_NO_TRANSPOSE ) )
{
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
return; // Error.
}
// Sanitize order input.
char order_use =
( ( order == 'r' ) || ( order == 'R' ) ||
( order == 'c' ) || ( order == 'C' ) ) ?
order : 'r';
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
if ( ( order != 'r' ) && ( order != 'R' ) )
{
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
return; // Only row major supported.
}
// Row major input expected with leading dimensions equal to row stride.
if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) )
{
return; // Error.
}
// Check if dimensions are valid.
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
{
return; // Error.
}
const inc_t rs_a = lda;
const inc_t cs_a = 1;
const inc_t rs_b = ldb;
@@ -126,22 +116,25 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
// Only unpacked A supported now.
if ( mtag_a != UNPACKED )
{
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
return; // Error.
}
// Convert post op struct to post op linked list format.
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
lpgemm_translate_to_post_ops_list
err_t err = lpgemm_translate_to_post_ops_list
(
post_op_unparsed, post_op_list,
( void* )c, ( void* )( &order_use )
( void* )c, ( void* )( &order )
);
if( err != BLIS_SUCCESS ) return;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global( &rntm_g );
bli_membrk_rntm_set_membrk( &rntm_g );
bli_pba_rntm_set_pba( &rntm_g );
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S32OS32 );
@@ -154,7 +147,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, S32
);
#else
lpgemm_u8s8s32o32_thread_decorator
@@ -165,7 +158,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, FALSE
post_op_list, S32
);
#endif
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -117,7 +117,7 @@ AOCL_GEMM_REORDER(int8_t,u8s8s32os32)
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global( &rntm_g );
bli_membrk_rntm_set_membrk( &rntm_g );
bli_pba_rntm_set_pba( &rntm_g );
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S32OS32 );

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,6 +34,7 @@
#include "blis.h"
#include "aocl_gemm_interface_apis.h"
#include "aocl_gemm_check.h"
#include "lpgemm_types.h"
#include "lpgemm_post_ops.h"
#include "lpgemm_thread_decor_openmp.h"
@@ -60,11 +61,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
// Set MC, NC, KC, NR, MR.
aocl_lpgemm_init_global_cntx();
// Null check for pointers.
if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
{
return; // Error.
}
// check for validity of params.
AOCL_GEMM_CHECK
(
"u8s8s32os8",
order, transa, transb,
m, n, k,
a, lda, mem_format_a,
b, ldb, mem_format_b,
c, ldc
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
@@ -75,32 +81,16 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
( blis_transb != BLIS_NO_TRANSPOSE ) )
{
bli_print_msg(" Transpose of matrices is not supported.", __FILE__, __LINE__ );
return; // Error.
}
// Sanitize order input.
char order_use =
( ( order == 'r' ) || ( order == 'R' ) ||
( order == 'c' ) || ( order == 'C' ) ) ?
order : 'r';
if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
if ( ( order != 'r' ) && ( order != 'R' ) )
{
bli_print_msg(" Operation only supports row-major matrices.", __FILE__, __LINE__ );
return; // Only row major supported.
}
// Row major input expected with leading dimensions equal to row stride.
if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) )
{
return; // Error.
}
// Check if dimensions are valid.
if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
{
return; // Error.
}
const inc_t rs_a = lda;
const inc_t cs_a = 1;
const inc_t rs_b = ldb;
@@ -126,22 +116,25 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
// Only unpacked A supported now.
if ( mtag_a != UNPACKED )
{
bli_print_msg(" A matrix needs to be unpacked.", __FILE__, __LINE__ );
return; // Error.
}
// Convert post op struct to post op linked list format.
lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
lpgemm_translate_to_post_ops_list
err_t err = lpgemm_translate_to_post_ops_list
(
post_op_unparsed, post_op_list,
( void* )c, ( void* )( &order_use )
( void* )c, ( void* )( &order )
);
if( err != BLIS_SUCCESS ) return;
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_g;
bli_rntm_init_from_global( &rntm_g );
bli_membrk_rntm_set_membrk( &rntm_g );
bli_pba_rntm_set_pba( &rntm_g );
lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S32OS32 );
@@ -154,7 +147,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
( int32_t* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, TRUE
post_op_list, S8
);
#else
lpgemm_u8s8s32o32_thread_decorator
@@ -165,7 +158,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
( int32_t* )c, rs_c, cs_c,
alpha, beta,
&rntm_g, lcntx_g,
post_op_list, TRUE
post_op_list, S8
);
#endif
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -37,7 +37,7 @@
#include "lpgemm_func_map.h"
#include "lpgemm_blksz_map.h"
#include "lpgemm_kernels.h"
#include "lpgemm_packb_bf16.h"
#include "lpgemm_pack_bf16.h"
#include "lpgemm_packb_s16.h"
#include "lpgemm_packa.h"
#include "lpgemm_packb.h"

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -56,7 +56,7 @@
#define LPGEMM_PACKA_FUNC_MAP_AVX512_VNNI_BF16 \
PAMACRO(U8S8S16OS16, NULL) \
PAMACRO(U8S8S32OS32, packa_k64_u8s8s32o32) \
PAMACRO(BF16BF16F32OF32, NULL) \
PAMACRO(BF16BF16F32OF32, packa_mr16_bf16bf16f32of32) \
PAMACRO(S8S8S32OS32, packa_k64_s8s8s32os32) \
PAMACRO(S8S8S16OS16, NULL) \
@@ -84,7 +84,7 @@
#define LPGEMM_PACKA_FUNC_MAP_AVX512_VNNI \
PAMACRO(U8S8S16OS16, NULL) \
PAMACRO(U8S8S32OS32, packa_k64_u8s8s32o32) \
PAMACRO(BF16BF16F32OF32, NULL) \
PAMACRO(BF16BF16F32OF32, packa_mr16_bf16bf16f32of32) \
PAMACRO(S8S8S32OS32, packa_k64_s8s8s32os32) \
PAMACRO(S8S8S16OS16, NULL) \
@@ -112,7 +112,7 @@
#define LPGEMM_PACKA_FUNC_MAP_AVX512 \
PAMACRO(U8S8S16OS16, NULL) \
PAMACRO(U8S8S32OS32, packa_k64_u8s8s32o32) \
PAMACRO(BF16BF16F32OF32, NULL) \
PAMACRO(BF16BF16F32OF32, packa_mr16_bf16bf16f32of32) \
PAMACRO(S8S8S32OS32, packa_k64_s8s8s32os32) \
PAMACRO(S8S8S16OS16, NULL) \

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,12 +34,14 @@
#include "blis.h"
#include "lpgemm_5loop_interface_apis.h"
#include "lpgemm_packb_bf16.h"
#include "lpgemm_pack_bf16.h"
#include "lpgemm_kernels.h"
#include "lpgemm_utils.h"
#include "lpgemm_thrinfo_utils.h"
#include "lpgemm_config.h"
// Kernel function prototypes
typedef void (*lpgemm_rowvar_bf16)
(
@@ -73,6 +75,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
const int16_t* a_use = NULL;
dim_t cs_a_use = cs_a;
dim_t rs_a_use = rs_a;
dim_t a_block_stride = 0;
const int16_t* b_use = NULL;
@@ -86,8 +89,11 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
// Pack buffer for B.
bfloat16* pack_b_buffer_bf16;
bfloat16* pack_a_buffer_bf16;
mem_t mem_b = BLIS_MEM_INITIALIZER;
mem_t mem_a = BLIS_MEM_INITIALIZER;
siz_t mem_b_size_req = 0;
siz_t mem_a_size_req = 0;
dim_t packb_min_NR = 16;
// Temporary buffer for C accumulation when downscaling is required.
@@ -109,7 +115,8 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
bool is_first_k = FALSE;
lpgemm_post_op_attr post_ops_attr;
if ( c_downscale == TRUE )
post_ops_attr.c_stor_type = c_downscale;
if ( c_downscale < F32 )
{
post_ops_attr.buf_downscale = c;
}
@@ -149,12 +156,12 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
);
}
if ( c_downscale == FALSE )
if ( c_downscale == F32 )
{
c_use_jc = c + jc;
}
// Temp accumulation buffer for C allocation.
else if ( c_downscale == TRUE )
else if ( c_downscale < F32 )
{
// Buffer memory is only required if output needs to be
// persisted across iterations of the pc/KC loop.
@@ -167,7 +174,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
lpgemm_alloc_mem_panel
(
mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE,
&mem_scale_c, rntm
);
@@ -254,11 +261,11 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
if ( ( jc_packb_end > jc_packb_start ) &&
( jc_packb_start < ( jc + nc0 ) ) )
{
( ( packb_bf16 )lcntx->packb_fun_ptr )
( ( pack_bf16 )lcntx->packb_fun_ptr )
(
pack_b_buffer_bf16 + ( jc_packb_start * kc0_updated ),
( b + ( rs_b * pc ) + ( cs_b * jc ) +
( cs_b * jc_packb_start ) ), rs_b,
( cs_b * jc_packb_start ) ), rs_b, cs_b,
( jc_packb_end - jc_packb_start ), kc0,
&rs_b_use, &cs_b_use
);
@@ -297,7 +304,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
// Only per thread C matrix is stored in temp buffer, so both
// per thread jc and ic start should be normalized to zero.
if ( c_downscale == TRUE )
if ( c_downscale < F32 )
{
c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
}
@@ -315,6 +322,31 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
// Non bf16 based kernel requires update to this code.
cs_a_use = 2;
a_block_stride = rs_a;
rs_a_use = rs_a;
}
else if ( mtag_a == PACK )
{
mem_a_size_req = sizeof( bfloat16 ) * mc0 * kc0;
lpgemm_alloc_mem_panel
(
mem_a_size_req, BLIS_BUFFER_FOR_A_BLOCK,
&mem_a, rntm
);
pack_a_buffer_bf16 =
( bfloat16* ) bli_mem_buffer( &mem_a );
( ( pack_bf16 )lcntx->packa_fun_ptr )
(
pack_a_buffer_bf16,
( a + ( rs_a * ic ) + ( cs_a * pc )), rs_a, cs_a,
mc0, kc0,
&rs_a_use, &cs_a_use
);
a_use = pack_a_buffer_bf16;
a_block_stride = rs_a_use;
}
for ( dim_t jr = 0; jr < nc0; jr += NR )
@@ -330,7 +362,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
( ( lpgemm_rowvar_bf16 )lcntx->kern_fun_ptr )
(
mc0, nr0, kc0,
a_use, rs_a, cs_a_use, a_block_stride,
a_use, rs_a_use, cs_a_use, a_block_stride,
( b_use + ( jr * kc0_updated ) ), rs_b_use, cs_b_use,
( c_use_ic + jr ), rs_c_use, 1,
alpha, beta0,
@@ -360,15 +392,22 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
{
if ( bli_mem_is_alloc( &mem_b ) )
{
bli_membrk_release( rntm, &mem_b );
bli_pba_release( rntm, &mem_b );
}
}
}
if ( c_downscale == TRUE )
if( mtag_a == PACK )
{
if ( bli_mem_is_alloc( &mem_a ) )
{
bli_pba_release(rntm, &mem_a);
}
}
if ( c_downscale < F32 )
{
if ( bli_mem_is_alloc( &mem_scale_c ) )
{
bli_membrk_release( rntm, &mem_scale_c );
bli_pba_release( rntm, &mem_scale_c );
}
}
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -35,7 +35,7 @@
#include "blis.h"
#include "lpgemm_utils.h"
#include "lpgemm_reorder_bf16.h"
#include "lpgemm_packb_bf16.h"
#include "lpgemm_pack_bf16.h"
#include "lpgemm_config.h"
#include "aocl_bf16_type.h"
@@ -53,6 +53,7 @@ void reorderb_nr64_bf16bf16f32of32
// Extracting the matrix properties from the lpgemm object
dim_t rs_b = b->rs;
dim_t cs_b = b->cs;
dim_t n = b->width;
dim_t k = b->length;
@@ -148,14 +149,14 @@ void reorderb_nr64_bf16bf16f32of32
// st = ( jc_cur_loop * k ) <traverse blocks 1,2,3,4>
// + ( n_sub_updated * pc ) <traverse block 5>
// + ( NC' * kc0_updated) <traverse block 6>
( ( packb_bf16 )lcntx->packb_fun_ptr )
( ( pack_bf16 )lcntx->packb_fun_ptr )
(
( ( ( bfloat16* )b_reorder->storage.aligned_buffer ) +
( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) +
( jc_cur_loop_rem * kc0_updated ) ),
( ( bfloat16* )b_reorder->storage.aligned_buffer ) +
( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) +
( jc_cur_loop_rem * kc0_updated ),
( ( ( bfloat16* )b->storage.aligned_buffer ) +
( rs_b * pc ) + jc ),
rs_b, nc0, kc0, &rs_b_reorder, &cs_b_reorder
( rs_b * pc ) + (jc * cs_b)),
rs_b, cs_b, nc0, kc0, &rs_b_reorder, &cs_b_reorder
);
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -150,7 +150,8 @@ LPGEMM_5LOOP(float,float,float,f32f32f32of32)
bool is_first_k = FALSE;
lpgemm_post_op_attr post_ops_attr;
if ( c_downscale == TRUE )
post_ops_attr.c_stor_type = c_downscale;
if ( c_downscale < F32 )
{
post_ops_attr.buf_downscale = c;
}
@@ -395,7 +396,7 @@ LPGEMM_5LOOP(float,float,float,f32f32f32of32)
{
if ( bli_mem_is_alloc( &mem_b ) )
{
bli_membrk_release( rntm, &mem_b );
bli_pba_release( rntm, &mem_b );
}
}
}
@@ -403,7 +404,7 @@ LPGEMM_5LOOP(float,float,float,f32f32f32of32)
{
if ( bli_mem_is_alloc( &mem_a ) )
{
bli_membrk_release( rntm, &mem_a );
bli_pba_release( rntm, &mem_a );
}
}
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -62,7 +62,7 @@ void lpgemm_rowvar_ ## LP_SFX \
lpgemm_thrinfo_t* thread, \
lpgemm_cntx_t* lcntx, \
lpgemm_post_op* post_op_list, \
bool c_downscale \
AOCL_STORAGE_TYPE c_downscale \
) \
LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32);

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -55,7 +55,7 @@ BLIS_INLINE void lpgemm_set_node_params
post_op_node->next = NULL;
}
void lpgemm_translate_to_post_ops_list
err_t lpgemm_translate_to_post_ops_list
(
aocl_post_op* post_op_unparsed,
lpgemm_post_op* post_op_list,
@@ -70,7 +70,7 @@ void lpgemm_translate_to_post_ops_list
post_op_list, POST_OPS_DISABLE,
NULL, NULL, NULL, NULL, FALSE
);
return;
return BLIS_SUCCESS;
}
if ( ( post_op_unparsed->seq_length > AOCL_MAX_POST_OPS ) )
@@ -80,7 +80,7 @@ void lpgemm_translate_to_post_ops_list
post_op_list, POST_OPS_DISABLE,
NULL, NULL, NULL, NULL, FALSE
);
return; //Error, seq length exceeds max post ops permitted.
return BLIS_SUCCESS; //Error, seq length exceeds max post ops permitted.
}
dim_t e_i = 0; //Multiple eltwise supported.
@@ -110,6 +110,11 @@ void lpgemm_translate_to_post_ops_list
tmp_code = POST_OPS_RELU;
break;
case PRELU:
if( ( post_op_unparsed->eltwise + e_i )->algo.alpha == NULL )
{
bli_print_msg(" Post_op.alpha is NULL. Exiting..", __FILE__, __LINE__ );
return BLIS_NULL_POINTER;
}
tmp_code = POST_OPS_RELU_SCALE;
break;
case GELU_TANH:
@@ -119,6 +124,12 @@ void lpgemm_translate_to_post_ops_list
tmp_code = POST_OPS_GELU_ERF;
break;
case CLIP:
if( ( ( post_op_unparsed->eltwise + e_i )->algo.alpha == NULL ) ||
( ( post_op_unparsed->eltwise + e_i )->algo.beta == NULL ) )
{
bli_print_msg(" Post_op.clip min or max value is NULL. Exiting..", __FILE__, __LINE__ );
return BLIS_NULL_POINTER;
}
tmp_code = POST_OPS_CLIP;
break;
default:
@@ -137,6 +148,11 @@ void lpgemm_translate_to_post_ops_list
}
break;
case BIAS:
if( post_op_unparsed->bias.bias == NULL )
{
bli_print_msg(" Post_op.bias is NULL. Exiting..", __FILE__, __LINE__ );
return BLIS_NULL_POINTER;
}
lpgemm_set_node_params
(
( post_op_list + i ), POST_OPS_BIAS,
@@ -145,6 +161,12 @@ void lpgemm_translate_to_post_ops_list
);
break;
case SCALE:
if( ( post_op_unparsed->sum.scale_factor == NULL ) ||
( post_op_unparsed->sum.zero_point == NULL ) )
{
bli_print_msg(" Post_op.scale scale_factor or zero_point is NULL. Exiting..", __FILE__, __LINE__ );
return BLIS_NULL_POINTER;
}
lpgemm_set_node_params
(
( post_op_list + i ), POST_OPS_DOWNSCALE,
@@ -163,4 +185,5 @@ void lpgemm_translate_to_post_ops_list
( post_op_list + i )->next = ( post_op_list + i + 1);
}
}
return BLIS_SUCCESS;
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -70,12 +70,13 @@ typedef struct lpgemm_post_op_attr_t
void* buf_downscale;
bool is_first_k;
bool is_last_k;
AOCL_STORAGE_TYPE c_stor_type;
dim_t b_sum_offset;
int32_t* b_col_sum_vec;
int16_t* b_col_sum_vec_s16;
} lpgemm_post_op_attr;
void lpgemm_translate_to_post_ops_list
err_t lpgemm_translate_to_post_ops_list
(
aocl_post_op* post_op_unparsed,
lpgemm_post_op* post_op_list,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -42,6 +42,24 @@ typedef enum
INT32 = 2
} AOCL_ARRAY_TYPE;
// Enum to denote the storage data type (output matrix).
// It is expected that the enum entries are in ascending order of
// storage data type size.
//
// The ordering is load-bearing: the 5-loop drivers compare the requested
// storage type against their accumulation type with `<` (for example
// `c_downscale < S16`, `c_downscale < S32`, `c_downscale < F32`) to decide
// whether C is written in a narrower type than the accumulator. A new entry
// therefore has to be inserted at the position that matches its size and
// the following values renumbered; it must not simply be appended.
//
// NOTE(review): the names presumably follow a <S|U|F|BF><bit-width> scheme
// (signed integer, unsigned integer, float, bfloat16), which would make the
// order non-decreasing rather than strictly increasing (e.g. S8/U8 share a
// width) -- confirm with the kernels that consume the storage type.
typedef enum
{
S8 = 0,
U8 = 1,
S16 = 2,
U16 = 3,
BF16 = 4,
S32 = 5,
U32 = 6,
F32 = 7,
S64 = 8,
U64 = 9,
F64 = 10
} AOCL_STORAGE_TYPE;
// Enum name template:A_mat_type ## B_mat_type ## Accumulate_type ## C_mat_type.
typedef enum
{

View File

@@ -116,7 +116,8 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16)
bool is_first_k = FALSE;
lpgemm_post_op_attr post_ops_attr;
if ( c_downscale == TRUE )
post_ops_attr.c_stor_type = c_downscale;
if ( c_downscale < S16 )
{
post_ops_attr.buf_downscale = c;
}
@@ -156,12 +157,12 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16)
);
}
if ( c_downscale == FALSE )
if ( c_downscale == S16 )
{
c_use_jc = c + jc;
}
// Temp accumulation buffer for C allocation.
else if ( c_downscale == TRUE )
else if ( c_downscale < S16 )
{
// Buffer memory is only required if output needs to be
// persisted across iterations of the pc/KC loop.
@@ -174,7 +175,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16)
lpgemm_alloc_mem_panel
(
mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE,
&mem_scale_c, rntm
);
@@ -329,7 +330,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16)
// Only per thread C matrix is stored in temp buffer, so both
// per thread jc and ic start should be normalized to zero.
if ( c_downscale == TRUE )
if ( c_downscale < S16 )
{
c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
}
@@ -388,15 +389,15 @@ LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16)
{
if (bli_mem_is_alloc(&mem_b))
{
bli_membrk_release(rntm, &mem_b);
bli_pba_release(rntm, &mem_b);
}
}
}
if ( c_downscale == TRUE )
if ( c_downscale < S16 )
{
if ( bli_mem_is_alloc( &mem_scale_c ) )
{
bli_membrk_release( rntm, &mem_scale_c );
bli_pba_release( rntm, &mem_scale_c );
}
}
}

View File

@@ -123,7 +123,8 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
bool is_first_k = FALSE;
lpgemm_post_op_attr post_ops_attr;
if ( c_downscale == TRUE )
post_ops_attr.c_stor_type = c_downscale;
if ( c_downscale < S32 )
{
post_ops_attr.buf_downscale = c;
}
@@ -163,12 +164,12 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
);
}
if ( c_downscale == FALSE )
if ( c_downscale == S32 )
{
c_use_jc = c + jc;
}
// Temp accumulation buffer for C allocation.
else if ( c_downscale == TRUE )
else if ( c_downscale < S32 )
{
// Buffer memory is only required if output needs to be
// persisted across iterations of the pc/KC loop.
@@ -181,7 +182,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
lpgemm_alloc_mem_panel
(
mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE,
&mem_scale_c, rntm
);
@@ -335,7 +336,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
// Only per thread C matrix is stored in temp buffer, so both
// per thread jc and ic start should be normalized to zero.
if ( c_downscale == TRUE )
if ( c_downscale < S32 )
{
c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
}
@@ -426,7 +427,7 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
{
if ( bli_mem_is_alloc( &mem_b ) )
{
bli_membrk_release( rntm, &mem_b );
bli_pba_release( rntm, &mem_b );
}
}
}
@@ -434,14 +435,14 @@ LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
{
if ( bli_mem_is_alloc( &mem_a ) )
{
bli_membrk_release( rntm, &mem_a );
bli_pba_release( rntm, &mem_a );
}
}
if ( c_downscale == TRUE )
if ( c_downscale < S32 )
{
if ( bli_mem_is_alloc( &mem_scale_c ) )
{
bli_membrk_release( rntm, &mem_scale_c );
bli_pba_release( rntm, &mem_scale_c );
}
}
}

View File

@@ -123,7 +123,7 @@ BLIS_INLINE void lpgemm_alloc_mem_panel
{
if ( bli_mem_is_unalloc( mem ) )
{
bli_membrk_acquire_m
bli_pba_acquire_m
(
rntm_l,
size_req,
@@ -136,8 +136,8 @@ BLIS_INLINE void lpgemm_alloc_mem_panel
siz_t mem_size = bli_mem_size( mem );
if ( mem_size < size_req )
{
bli_membrk_release( rntm_l, mem );
bli_membrk_acquire_m
bli_pba_release( rntm_l, mem );
bli_pba_acquire_m
(
rntm_l,
size_req,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -268,7 +268,7 @@ BLIS_INLINE void lpgemm_adjust_ic_jc_ways
}
}
BLIS_INLINE void lpgemm_u8s8s16o16_get_threading
BLIS_INLINE void lpgemm_s16o16_get_threading
(
dim_t* n_threads,
dim_t* ic_ways,
@@ -276,7 +276,8 @@ BLIS_INLINE void lpgemm_u8s8s16o16_get_threading
dim_t m,
dim_t n,
dim_t k,
rntm_t* rntm_g
rntm_t* rntm_g,
AOCL_OPERATION_TYPE op_type
)
{
*n_threads = bli_rntm_num_threads( rntm_g );
@@ -295,19 +296,176 @@ BLIS_INLINE void lpgemm_u8s8s16o16_get_threading
else if ( ( *n_threads ) > 1 )
{
dim_t NR = lpgemm_get_block_size_NR_global_cntx( U8S8S16OS16 );
dim_t NR = lpgemm_get_block_size_NR_global_cntx( op_type );
dim_t MR = lpgemm_get_block_size_MR_global_cntx( op_type );
dim_t mr_blks = ( m + MR - 1 ) / MR;
dim_t nr_blks = ( n + NR - 1 ) / NR;
if ( n <= NR )
{
// If n is less than micro panel dimension, allocating all threads
// to ic resulted in gains.
( *ic_ways ) = ( *n_threads );
( *ic_ways ) = ( mr_blks < ( *n_threads ) ) ? mr_blks : ( *n_threads );
( *jc_ways ) = 1;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else if ( m <= MR )
{
( *jc_ways ) = ( nr_blks < ( *n_threads ) ) ? nr_blks : ( *n_threads );
( *ic_ways ) = 1;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else
{
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
if ( ( mr_blks < ( *ic_ways ) ) && ( nr_blks < ( *jc_ways ) ) )
{
( *ic_ways ) = mr_blks;
( *jc_ways ) = nr_blks;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else if ( mr_blks < ( *ic_ways ) )
{
( *ic_ways ) = mr_blks;
dim_t rem_jc_ways = ( dim_t )( ( *n_threads ) / ( *ic_ways ) );
( *jc_ways ) = ( rem_jc_ways < nr_blks ) ? rem_jc_ways : nr_blks;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else if ( nr_blks < ( *jc_ways ) )
{
( *jc_ways ) = nr_blks;
dim_t rem_ic_ways = ( dim_t )( ( *n_threads ) / ( *jc_ways ) );
( *ic_ways ) = ( rem_ic_ways < mr_blks ) ? rem_ic_ways : mr_blks;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
}
}
else
{
// Setting all the values to 1 in case n_threads <= 1. This ensures
// the threading parameters are valid.
*n_threads = 1;
*jc_ways = 1;
*ic_ways = 1;
}
}
// Threading-parameter query for the u8s8s16o16 LPGEMM variant.
// Thin wrapper: every argument is forwarded unchanged to the shared
// lpgemm_s16o16_get_threading() routine, with the operation type fixed
// to U8S8S16OS16.
BLIS_INLINE void lpgemm_u8s8s16o16_get_threading
(
dim_t* n_threads,
dim_t* ic_ways,
dim_t* jc_ways,
dim_t m,
dim_t n,
dim_t k,
rntm_t* rntm_g
)
{
// Delegate to the common s16o16 implementation; only the op type differs
// between the u8s8 and s8s8 entry points.
lpgemm_s16o16_get_threading
(
n_threads,
ic_ways, jc_ways,
m, n, k, rntm_g,
U8S8S16OS16
);
}
// Threading-parameter query for the s8s8s16o16 LPGEMM variant.
// Thin wrapper: every argument is forwarded unchanged to the shared
// lpgemm_s16o16_get_threading() routine, with the operation type fixed
// to S8S8S16OS16.
BLIS_INLINE void lpgemm_s8s8s16o16_get_threading
(
dim_t* n_threads,
dim_t* ic_ways,
dim_t* jc_ways,
dim_t m,
dim_t n,
dim_t k,
rntm_t* rntm_g
)
{
// Delegate to the common s16o16 implementation; only the op type differs
// between the u8s8 and s8s8 entry points.
lpgemm_s16o16_get_threading
(
n_threads,
ic_ways, jc_ways,
m, n, k, rntm_g,
S8S8S16OS16
);
}
BLIS_INLINE void lpgemm_s32o32_get_threading
(
dim_t* n_threads,
dim_t* ic_ways,
dim_t* jc_ways,
dim_t m,
dim_t n,
dim_t k,
rntm_t* rntm_g,
AOCL_OPERATION_TYPE op_type
)
{
*n_threads = bli_rntm_num_threads( rntm_g );
*jc_ways = bli_rntm_jc_ways( rntm_g );
*ic_ways = bli_rntm_ic_ways( rntm_g );
if ( ( ( *ic_ways ) > 0 ) || ( ( *jc_ways ) > 0 ) )
{
// If BLIS_IC_NT or JC_NT are set.
// Default cases.
*ic_ways = ( ( *ic_ways ) > 0 ) ? ( *ic_ways ) : 1;
*jc_ways = ( ( *jc_ways ) > 0 ) ? ( *jc_ways ) : 1;
*n_threads = ( *jc_ways ) * ( *ic_ways );
}
else if ( ( *n_threads ) > 1 )
{
dim_t NR = lpgemm_get_block_size_NR_global_cntx( op_type );
dim_t MR = lpgemm_get_block_size_MR_global_cntx( op_type );
dim_t mr_blks = ( m + MR - 1 ) / MR;
dim_t nr_blks = ( n + NR - 1 ) / NR;
if ( n <= NR )
{
( *ic_ways ) = ( mr_blks < ( *n_threads ) ) ? mr_blks : ( *n_threads );
( *jc_ways ) = 1;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else if ( m <= MR )
{
( *jc_ways ) = ( nr_blks < ( *n_threads ) ) ? nr_blks : ( *n_threads );
( *ic_ways ) = 1;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else
{
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
if ( ( mr_blks < ( *ic_ways ) ) && ( nr_blks < ( *jc_ways ) ) )
{
( *ic_ways ) = mr_blks;
( *jc_ways ) = nr_blks;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else if ( mr_blks < ( *ic_ways ) )
{
( *ic_ways ) = mr_blks;
dim_t rem_jc_ways = ( dim_t )( ( *n_threads ) / ( *ic_ways ) );
( *jc_ways ) = ( rem_jc_ways < nr_blks ) ? rem_jc_ways : nr_blks;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else if ( nr_blks < ( *jc_ways ) )
{
( *jc_ways ) = nr_blks;
dim_t rem_ic_ways = ( dim_t )( ( *n_threads ) / ( *jc_ways ) );
( *ic_ways ) = ( rem_ic_ways < mr_blks ) ? rem_ic_ways : mr_blks;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else
{
lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
(
MR, NR, m, n,
n_threads, ic_ways, jc_ways
);
}
}
}
else
@@ -331,52 +489,33 @@ BLIS_INLINE void lpgemm_u8s8s32o32_get_threading
rntm_t* rntm_g
)
{
*n_threads = bli_rntm_num_threads( rntm_g );
*jc_ways = bli_rntm_jc_ways( rntm_g );
*ic_ways = bli_rntm_ic_ways( rntm_g );
lpgemm_s32o32_get_threading
(
n_threads,
ic_ways, jc_ways,
m, n, k, rntm_g,
U8S8S32OS32
);
}
if ( ( ( *ic_ways ) > 0 ) || ( ( *jc_ways ) > 0 ) )
{
// If BLIS_IC_NT or JC_NT are set.
// Default cases.
*ic_ways = ( ( *ic_ways ) > 0 ) ? ( *ic_ways ) : 1;
*jc_ways = ( ( *jc_ways ) > 0 ) ? ( *jc_ways ) : 1;
*n_threads = ( *jc_ways ) * ( *ic_ways );
}
else if ( ( *n_threads ) > 1 )
{
dim_t NR = lpgemm_get_block_size_NR_global_cntx( U8S8S32OS32 );
dim_t MR = lpgemm_get_block_size_MR_global_cntx( U8S8S32OS32 );
if ( n <= NR )
{
// If n is less than micro panel dimension, allocating all threads
// to ic resulted in gains.
( *ic_ways ) = ( *n_threads );
( *jc_ways ) = 1;
}
else
{
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
(
MR, NR, m, n,
n_threads, ic_ways, jc_ways
);
}
}
else
{
// Setting all the values to 1 in case n_threads <= 1. This ensures
// the threading parameters are valid.
*n_threads = 1;
*jc_ways = 1;
*ic_ways = 1;
}
BLIS_INLINE void lpgemm_s8s8s32o32_get_threading
(
dim_t* n_threads,
dim_t* ic_ways,
dim_t* jc_ways,
dim_t m,
dim_t n,
dim_t k,
rntm_t* rntm_g
)
{
lpgemm_s32o32_get_threading
(
n_threads,
ic_ways, jc_ways,
m, n, k, rntm_g,
S8S8S32OS32
);
}
BLIS_INLINE void lpgemm_bf16bf16f32of32_get_threading
@@ -408,24 +547,53 @@ BLIS_INLINE void lpgemm_bf16bf16f32of32_get_threading
dim_t NR = lpgemm_get_block_size_NR_global_cntx( BF16BF16F32OF32 );
dim_t MR = lpgemm_get_block_size_MR_global_cntx( BF16BF16F32OF32 );
dim_t mr_blks = ( m + MR - 1 ) / MR;
dim_t nr_blks = ( n + NR - 1 ) / NR;
if ( n <= NR )
{
// If n is less than micro panel dimension, allocating all threads
// to ic resulted in gains.
( *ic_ways ) = ( *n_threads );
( *ic_ways ) = ( mr_blks < ( *n_threads ) ) ? mr_blks : ( *n_threads );
( *jc_ways ) = 1;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else if ( m <= MR )
{
( *jc_ways ) = ( nr_blks < ( *n_threads ) ) ? nr_blks : ( *n_threads );
( *ic_ways ) = 1;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else
{
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
(
MR, NR, m, n,
n_threads, ic_ways, jc_ways
);
if ( ( mr_blks < ( *ic_ways ) ) && ( nr_blks < ( *jc_ways ) ) )
{
( *ic_ways ) = mr_blks;
( *jc_ways ) = nr_blks;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else if ( mr_blks < ( *ic_ways ) )
{
( *ic_ways ) = mr_blks;
dim_t rem_jc_ways = ( dim_t )( ( *n_threads ) / ( *ic_ways ) );
( *jc_ways ) = ( rem_jc_ways < nr_blks ) ? rem_jc_ways : nr_blks;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else if ( nr_blks < ( *jc_ways ) )
{
( *jc_ways ) = nr_blks;
dim_t rem_ic_ways = ( dim_t )( ( *n_threads ) / ( *jc_ways ) );
( *ic_ways ) = ( rem_ic_ways < mr_blks ) ? rem_ic_ways : mr_blks;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else
{
lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
(
MR, NR, m, n,
n_threads, ic_ways, jc_ways
);
}
}
}
else
@@ -485,15 +653,55 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading
}
else if ( ( *n_threads ) > 1 )
{
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
dim_t mr_blks = ( m + MR - 1 ) / MR;
dim_t nr_blks = ( n + NR - 1 ) / NR;
lpgemm_adjust_ic_jc_ways
(
m, n, k,
MC, NC, KC, MR, NR,
n_threads, ic_ways, jc_ways, 5
);
if ( n <= NR )
{
( *ic_ways ) = ( mr_blks < ( *n_threads ) ) ? mr_blks : ( *n_threads );
( *jc_ways ) = 1;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else if ( m <= MR )
{
( *jc_ways ) = ( nr_blks < ( *n_threads ) ) ? nr_blks : ( *n_threads );
( *ic_ways ) = 1;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else
{
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
if ( ( mr_blks < ( *ic_ways ) ) && ( nr_blks < ( *jc_ways ) ) )
{
( *ic_ways ) = mr_blks;
( *jc_ways ) = nr_blks;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else if ( mr_blks < ( *ic_ways ) )
{
( *ic_ways ) = mr_blks;
dim_t rem_jc_ways = ( dim_t )( ( *n_threads ) / ( *ic_ways ) );
( *jc_ways ) = ( rem_jc_ways < nr_blks ) ? rem_jc_ways : nr_blks;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else if ( nr_blks < ( *jc_ways ) )
{
( *jc_ways ) = nr_blks;
dim_t rem_ic_ways = ( dim_t )( ( *n_threads ) / ( *jc_ways ) );
( *ic_ways ) = ( rem_ic_ways < mr_blks ) ? rem_ic_ways : mr_blks;
( *n_threads ) = ( *ic_ways ) * ( *jc_ways );
}
else
{
lpgemm_adjust_ic_jc_ways
(
m, n, k,
MC, NC, KC, MR, NR,
n_threads, ic_ways, jc_ways, 5
);
}
}
}
else
{
@@ -513,9 +721,8 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading
if ( ( m >= MT ) && ( n >= NT ) && ( k >= KT ) )
{
if ( ( k > page_size_b_floatx2 ) ||
( ( k <= page_size_b_floatx2 ) &&
( m_ic > MT_2 ) && ( n_jc >= NT ) ) )
if (((k <= page_size_b_floatx2 ) && ( m_ic > MT_2 ) && ( n_jc >= NT ) ) ||
((bli_cpuid_is_avx512_supported() == FALSE ) && (k > page_size_b_floatx2)))
{
bli_rntm_set_pack_b( 1, rntm_g );
bli_rntm_set_pack_a( 1, rntm_g );
@@ -523,118 +730,6 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading
}
}
BLIS_INLINE void lpgemm_s8s8s32o32_get_threading
(
dim_t* n_threads,
dim_t* ic_ways,
dim_t* jc_ways,
dim_t m,
dim_t n,
dim_t k,
rntm_t* rntm_g
)
{
*n_threads = bli_rntm_num_threads( rntm_g );
*jc_ways = bli_rntm_jc_ways( rntm_g );
*ic_ways = bli_rntm_ic_ways( rntm_g );
if ( ( ( *ic_ways ) > 0 ) || ( ( *jc_ways ) > 0 ) )
{
// If BLIS_IC_NT or JC_NT are set.
// Default cases.
*ic_ways = ( ( *ic_ways ) > 0 ) ? ( *ic_ways ) : 1;
*jc_ways = ( ( *jc_ways ) > 0 ) ? ( *jc_ways ) : 1;
*n_threads = ( *jc_ways ) * ( *ic_ways );
}
else if ( ( *n_threads ) > 1 )
{
dim_t NR = lpgemm_get_block_size_NR_global_cntx( S8S8S32OS32 );
dim_t MR = lpgemm_get_block_size_MR_global_cntx( S8S8S32OS32 );
if ( n <= NR )
{
// If n is less than micro panel dimension, allocating all threads
// to ic resulted in gains.
( *ic_ways ) = ( *n_threads );
( *jc_ways ) = 1;
}
else
{
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
(
MR, NR, m, n,
n_threads, ic_ways, jc_ways
);
}
}
else
{
// Setting all the values to 1 in case n_threads <= 1. This ensures
// the threading parameters are valid.
*n_threads = 1;
*jc_ways = 1;
*ic_ways = 1;
}
}
BLIS_INLINE void lpgemm_s8s8s16o16_get_threading
(
dim_t* n_threads,
dim_t* ic_ways,
dim_t* jc_ways,
dim_t m,
dim_t n,
dim_t k,
rntm_t* rntm_g
)
{
*n_threads = bli_rntm_num_threads( rntm_g );
*jc_ways = bli_rntm_jc_ways( rntm_g );
*ic_ways = bli_rntm_ic_ways( rntm_g );
if ( ( ( *ic_ways ) > 0 ) || ( ( *jc_ways ) > 0 ) )
{
// If BLIS_IC_NT or JC_NT are set.
// Default cases.
*ic_ways = ( ( *ic_ways ) > 0 ) ? ( *ic_ways ) : 1;
*jc_ways = ( ( *jc_ways ) > 0 ) ? ( *jc_ways ) : 1;
*n_threads = ( *jc_ways ) * ( *ic_ways );
}
else if ( ( *n_threads ) > 1 )
{
dim_t NR = lpgemm_get_block_size_NR_global_cntx( S8S8S16OS16 );
if ( n <= NR )
{
// If n is less than micro panel dimension, allocating all threads
// to ic resulted in gains.
( *ic_ways ) = ( *n_threads );
( *jc_ways ) = 1;
}
else
{
// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
}
}
else
{
// Setting all the values to 1 in case n_threads <= 1. This ensures
// the threading parameters are valid.
*n_threads = 1;
*jc_ways = 1;
*ic_ways = 1;
}
}
#define GEN_LPGEMM_OPENMP_DECORATOR(A_type,B_type,C_type,LPGEMM_SFX) \
void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \
( \
@@ -657,7 +752,7 @@ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \
rntm_t* rntm_g, \
lpgemm_cntx_t* lcntx, \
lpgemm_post_op* post_op_list, \
bool c_downscale \
AOCL_STORAGE_TYPE c_downscale \
) \
{ \
dim_t n_threads; \
@@ -676,14 +771,15 @@ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \
/* Set the packing block allocator field of the rntm. This will be
* inherited by all of the child threads when they make local copies of
* the rntm below.*/ \
bli_membrk_rntm_set_membrk( rntm_g ); \
bli_pba_rntm_set_pba( rntm_g ); \
\
thrcomm_t static_lpgemm_comms[BLIS_LPGEMM_NUM_STATIC_COMMS]; \
thrcomm_t* cur_lpgemm_comms = static_lpgemm_comms; \
err_t bli_errors = BLIS_SUCCESS; \
\
if ( jc_ways > BLIS_LPGEMM_NUM_STATIC_COMMS ) \
{ \
cur_lpgemm_comms = bli_malloc_intl( jc_ways * sizeof( thrcomm_t ) ); \
cur_lpgemm_comms = bli_malloc_intl( jc_ways * sizeof( thrcomm_t ), &bli_errors ); \
} \
for ( dim_t i = 0; i < jc_ways; ++i ) \
{ \
@@ -758,7 +854,7 @@ void lpgemm_ ## LPGEMM_SFX ## _thread_decorator \
rntm_t* rntm_g, \
lpgemm_cntx_t* lcntx, \
lpgemm_post_op* post_op_list, \
bool c_downscale \
AOCL_STORAGE_TYPE c_downscale \
) \
{ \
dim_t n_threads = 1; \
@@ -770,7 +866,7 @@ void lpgemm_ ## LPGEMM_SFX ## _thread_decorator \
/* Set the packing block allocator field of the rntm. This will be
* inherited by all of the child threads when they make local copies of
* the rntm below.*/ \
bli_membrk_rntm_set_membrk( rntm_g ); \
bli_pba_rntm_set_pba( rntm_g ); \
\
thrcomm_t static_lpgemm_comm; \
thrcomm_t* cur_lpgemm_comm = &static_lpgemm_comm; \

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -63,7 +63,7 @@ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \
rntm_t* rntm_g, \
lpgemm_cntx_t* lcntx, \
lpgemm_post_op* post_op_list, \
bool c_downscale \
AOCL_STORAGE_TYPE c_downscale \
); \
GEN_LPGEMM_OPENMP_DECORATOR_FN(uint8_t,int8_t,int16_t,u8s8s16o16)
@@ -97,7 +97,7 @@ void lpgemm_ ## LPGEMM_SFX ## _thread_decorator \
rntm_t* rntm_g, \
lpgemm_cntx_t* lcntx, \
lpgemm_post_op* post_op_list, \
bool c_downscale \
AOCL_STORAGE_TYPE c_downscale \
); \
GEN_LPGEMM_DECORATOR_FN(uint8_t,int8_t,int16_t,u8s8s16o16)

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -113,7 +113,8 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
bool is_first_k = FALSE;
lpgemm_post_op_attr post_ops_attr;
if ( c_downscale == TRUE )
post_ops_attr.c_stor_type = c_downscale;
if ( c_downscale < S16 )
{
post_ops_attr.buf_downscale = c;
}
@@ -153,12 +154,12 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
);
}
if ( c_downscale == FALSE )
if ( c_downscale == S16 )
{
c_use_jc = c + jc;
}
// Temp accumulation buffer for C allocation.
else if ( c_downscale == TRUE )
else if ( c_downscale < S16 )
{
// Buffer memory is only required if output needs to be
// persisted across iterations of the pc/KC loop.
@@ -171,7 +172,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
lpgemm_alloc_mem_panel
(
mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE,
&mem_scale_c, rntm
);
@@ -305,7 +306,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
// Only per thread C matrix is stored in temp buffer, so both
// per thread jc and ic start should be normalized to zero.
if ( c_downscale == TRUE )
if ( c_downscale < S16 )
{
c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
}
@@ -361,15 +362,15 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
{
if (bli_mem_is_alloc(&mem_b))
{
bli_membrk_release(rntm, &mem_b);
bli_pba_release(rntm, &mem_b);
}
}
}
if ( c_downscale == TRUE )
if ( c_downscale < S16 )
{
if ( bli_mem_is_alloc( &mem_scale_c ) )
{
bli_membrk_release( rntm, &mem_scale_c );
bli_pba_release( rntm, &mem_scale_c );
}
}
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -122,7 +122,8 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
bool is_first_k = FALSE;
lpgemm_post_op_attr post_ops_attr;
if ( c_downscale == TRUE )
post_ops_attr.c_stor_type = c_downscale;
if ( c_downscale < S32 )
{
post_ops_attr.buf_downscale = c;
}
@@ -162,12 +163,12 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
);
}
if ( c_downscale == FALSE )
if ( c_downscale == S32 )
{
c_use_jc = c + jc;
}
// Temp accumulation buffer for C allocation.
else if ( c_downscale == TRUE )
else if ( c_downscale < S32 )
{
// Buffer memory is only required if output needs to be
// persisted across iterations of the pc/KC loop.
@@ -180,7 +181,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
lpgemm_alloc_mem_panel
(
mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
mem_scale_c_size_req, BLIS_BUFFER_FOR_GEN_USE,
&mem_scale_c, rntm
);
@@ -313,7 +314,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
// Only per thread C matrix is stored in temp buffer, so both
// per thread jc and ic start should be normalized to zero.
if ( c_downscale == TRUE )
if ( c_downscale < S32 )
{
c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
}
@@ -405,7 +406,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
{
if ( bli_mem_is_alloc( &mem_b ) )
{
bli_membrk_release( rntm, &mem_b );
bli_pba_release( rntm, &mem_b );
}
}
}
@@ -413,14 +414,14 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
{
if ( bli_mem_is_alloc( &mem_a ) )
{
bli_membrk_release( rntm, &mem_a );
bli_pba_release( rntm, &mem_a );
}
}
if ( c_downscale == TRUE )
if ( c_downscale < S32 )
{
if ( bli_mem_is_alloc( &mem_scale_c ) )
{
bli_membrk_release( rntm, &mem_scale_c );
bli_pba_release( rntm, &mem_scale_c );
}
}
}

View File

@@ -123,7 +123,7 @@ BLIS_INLINE void lpgemm_alloc_mem_panel
{
if ( bli_mem_is_unalloc( mem ) )
{
bli_membrk_acquire_m
bli_pba_acquire_m
(
rntm_l,
size_req,
@@ -136,8 +136,8 @@ BLIS_INLINE void lpgemm_alloc_mem_panel
siz_t mem_size = bli_mem_size( mem );
if ( mem_size < size_req )
{
bli_membrk_release( rntm_l, mem );
bli_membrk_acquire_m
bli_pba_release( rntm_l, mem );
bli_pba_acquire_m
(
rntm_l,
size_req,

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -47,13 +47,14 @@ BLIS_INLINE dim_t get_packb_bf16bf16f32of32_min_NR()
return 16;
}
typedef void (*packb_bf16)
typedef void (*pack_bf16)
(
bfloat16*,
const bfloat16*,
const dim_t,
const dim_t,
const dim_t,
const dim_t,
dim_t*,
dim_t*
);
@@ -62,11 +63,24 @@ void packb_nr64_bf16bf16f32of32
(
bfloat16* pack_b_buffer_bf16bf16f32of32,
const bfloat16* b,
const dim_t ldb,
const dim_t rs_b,
const dim_t cs_b,
const dim_t NC,
const dim_t KC,
dim_t* rs_b,
dim_t* cs_b
dim_t* rs_p,
dim_t* cs_p
);
void packa_mr16_bf16bf16f32of32
(
bfloat16* pack_a_buffer,
const bfloat16* a,
const dim_t rs_a,
const dim_t cs_a,
const dim_t MC,
const dim_t KC,
dim_t* rs_p,
dim_t* cs_p
);
#endif //BLIS_GEMM_BF16_PACKB

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -93,7 +93,7 @@ void bao_l3_thread_decorator
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
// Check for a somewhat obscure OpenMP thread-mismatch issue.
// NOTE: This calls the same function used for the conventional/large
// code path.
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );

View File

@@ -1,10 +1,59 @@
##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. ##
##Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. ##
target_sources("${PROJECT_NAME}"
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/aocldtl.c
${CMAKE_CURRENT_SOURCE_DIR}/aocldtl_blis.c
${CMAKE_CURRENT_SOURCE_DIR}/aoclfal.c
${CMAKE_CURRENT_SOURCE_DIR}/aoclflist.c
${CMAKE_CURRENT_SOURCE_DIR}/aoclos.c
)
# Collect all subdirectory paths that have at least one file with suffix in AOCLDTL_SRC_SUFS list.
get_filepaths_with_suffixes(LOCAL_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR} "${AOCLDTL_SRC_SUFS}")
# Create an object library using the source file list above.
add_library(AOCL_DTL
OBJECT
${LOCAL_SOURCE_FILES}
)
# Include the corresponding make_defs.cmake that holds the required compiler options.
include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake)
# Use PRIVATE keyword for option setting since we do not want the properties to propagate in other targets.
# mimicking get-aocldtl-cflags-for
target_compile_options(AOCL_DTL
PRIVATE
# load-var-for,COPTFLAGS
${COPTFLAGS}
# get-noopt-cflags-for
${CDBGFLAGS}
# get-noopt-cflags-for
${CWARNFLAGS}
# get-noopt-cflags-for
${CMISCFLAGS}
# get-noopt-cflags-for
${CLANGFLAGS}
# in get-aocldtl-cflags-for
${BUILD_SYMFLAGS}
)
target_compile_definitions(AOCL_DTL
PRIVATE
# in get-noopt-cflags-for
${VERS_DEF}
# in get-aocldtl-cflags-for
${BUILD_CPPFLAGS}
# in get-aocldtl-cflags-for
${CPPROCFLAGS}
)
target_include_directories(AOCL_DTL
BEFORE
PRIVATE
# in get-noopt-cflags-for
${CINFLAGS}
)
if(THREADING_MODEL STREQUAL "openmp")
# Equivalent to CTHREADFLAGS in get-noopt-cflags-for
target_link_libraries(AOCL_DTL PRIVATE OpenMP::OpenMP_C)
elseif(THREADING_MODEL STREQUAL "pthreads")
# in get-noopt-cflags-for
target_compile_options(AOCL_DTL PRIVATE ${CTHREADFLAGS})
endif()
if(BUILD_SHARED_LIBS)
# Equivalent to CPICFLAGS in get-noopt-cflags-for
set_target_properties(AOCL_DTL PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
add_dependencies(AOCL_DTL flat-header)
# Put all those targets under object-libs-targets folder name so that they appear all together in IDE.
set_target_properties(AOCL_DTL PROPERTIES FOLDER object-libs-targets)

View File

@@ -5,7 +5,7 @@
* These functions are invoked though macros by
* end user.
*
* Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*=======================================================================*/
#include "blis.h"
@@ -539,11 +539,11 @@ uint64 AOCL_DTL_get_time_spent(void)
#ifdef AOCL_DTL_AUTO_TRACE_ENABLE
/*
Disable intrumentation for these functions as they will also be
called from compiler generated instumation code to trace
Disable instrumentation for these functions as they will also be
called from compiler generated instrumentation code to trace
function execution.
It needs to be part of declration in the C file so can't be
It needs to be part of declaration in the C file so can't be
moved to header file.
WARNING: These functions are automatically invoked. however any function

View File

@@ -5,7 +5,7 @@
* It provides definition for all macros to be
* used by user to add debug/trace information.
*
* Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/

View File

@@ -3,7 +3,7 @@
*
* Description : BLIS library specific debug helpers.
*
* Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/
@@ -92,6 +92,7 @@ void AOCL_DTL_log_gemm_sizes(int8 loglevel,
}
void AOCL_DTL_log_gemm_stats(int8 loglevel,
char dt_type,
const f77_int m,
const f77_int n,
const f77_int k)
@@ -99,14 +100,52 @@ void AOCL_DTL_log_gemm_stats(int8 loglevel,
char buffer[256];
double flops = 2.0 * m * n * k;
if (dt_type == 'c' || dt_type == 'C' || dt_type == 'z' || dt_type == 'Z')
{
flops = 4.0 * flops;
}
// Execution time is in micro seconds.
Double execution_time = AOCL_DTL_get_time_spent();
sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS",
AOCL_get_requested_threads_count(),
execution_time/1000.0,
flops/(execution_time * 1e3));
if (execution_time != 0.0)
sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS",
AOCL_get_requested_threads_count(),
execution_time/1000.0,
flops/(execution_time * 1e3));
else
sprintf(buffer, " nt=%ld %.3f ms",
AOCL_get_requested_threads_count(),
execution_time/1000.0);
DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer);
}
/*
 * Logs timing statistics for a GEMMT call as a raw trace record.
 *
 * loglevel : trace level passed through to DTL_Trace.
 * dt_type  : datatype character; 'c'/'C'/'z'/'Z' are treated as complex
 *            and scale the flop count by 4.
 * n, k     : problem dimensions used for the flop estimate (n * n * k).
 *
 * The record contains the requested thread count and the elapsed time in
 * milliseconds. GFLOPS is appended only when the measured time is non-zero,
 * so that no division by zero takes place.
 */
void AOCL_DTL_log_gemmt_stats(int8 loglevel,
                              char dt_type,
                              const f77_int n,
                              const f77_int k)
{
    char buffer[256];

    // Promote to double before multiplying: evaluating n * n * k in
    // f77_int arithmetic can overflow for large problem sizes. This also
    // matches how the gemm and trsm stats helpers compute their counts.
    double flops = 1.0 * n * n * k;
    if (dt_type == 'c' || dt_type == 'C' || dt_type == 'z' || dt_type == 'Z')
    {
        flops = 4.0 * flops;
    }

    // Execution time is in micro seconds.
    Double execution_time = AOCL_DTL_get_time_spent();

    if (execution_time != 0.0)
        sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS",
                AOCL_get_requested_threads_count(),
                execution_time/1000.0,
                flops/(execution_time * 1e3));
    else
        // Timer reported zero elapsed time: skip the GFLOPS figure.
        sprintf(buffer, " nt=%ld %.3f ms",
                AOCL_get_requested_threads_count(),
                execution_time/1000.0);

    DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer);
}
@@ -131,17 +170,57 @@ void AOCL_DTL_log_trsm_sizes(int8 loglevel,
double alpha_real = 0.0;
double alpha_imag = 0.0;
DTL_get_complex_parts(dt_type, alpha, &alpha_real, &alpha_imag);
//{S, D, C, Z} side, uplo, transa, diaga, m, n, lda, ldb, alpha_real, alpha_imag
sprintf(buffer, "%c %c %c %c %c %ld %ld %ld %ld %lf %lf\n", dt_type,
sprintf(buffer, "%c %c %c %c %c %ld %ld %ld %ld %lf %lf", dt_type,
side, uploa, transa, diaga,
(dim_t)m, (dim_t)n, (dim_t)lda, (dim_t)ldb,
alpha_real, alpha_imag);
AOCL_DTL_START_PERF_TIMER();
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
}
/*
 * Writes the timing record for a TRSM call as a raw trace entry.
 *
 * The flop estimate is m * n * m when side is 'L'/'l' and m * n * n
 * otherwise; complex datatypes ('c', 'C', 'z', 'Z') multiply it by 4.
 * GFLOPS is reported only when the measured time is non-zero.
 */
void AOCL_DTL_log_trsm_stats(int8 loglevel,
                             char dt_type,
                             f77_char side,
                             const f77_int m,
                             const f77_int n)
{
    char buffer[256];

    // The third factor depends on which side the triangular matrix is on.
    double flops = 1.0 * m * n * ((side == 'L' || side == 'l') ? m : n);

    switch (dt_type)
    {
        case 'c':
        case 'C':
        case 'z':
        case 'Z':
            flops = 4.0 * flops;
            break;
        default:
            break;
    }

    // Elapsed time is reported in microseconds.
    Double execution_time = AOCL_DTL_get_time_spent();

    if (execution_time == 0.0)
    {
        // No measurable time: leave out the GFLOPS figure.
        sprintf(buffer, " nt=%ld %.3f ms",
                AOCL_get_requested_threads_count(),
                execution_time/1000.0);
    }
    else
    {
        sprintf(buffer, " nt=%ld %.3f ms %0.3f GFLOPS",
                AOCL_get_requested_threads_count(),
                execution_time/1000.0,
                flops/(execution_time * 1e3));
    }

    DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer);
}
void AOCL_DTL_log_gemmt_sizes(int8 loglevel,
char dt_type,
char uplo,
@@ -165,18 +244,20 @@ void AOCL_DTL_log_gemmt_sizes(int8 loglevel,
double beta_real = 0.0;
double beta_imag = 0.0;
DTL_get_complex_parts(dt_type, alpha, &alpha_real, &alpha_imag);
DTL_get_complex_parts(dt_type, beta, &beta_real, &beta_imag);
// {S,D,C,Z} {triangC : l or u} {n k lda ldb ldc transa transb alpha_real alpha_imaginary
// beta_real, beta_imaginary}
sprintf(buffer, "%c %c %ld %ld %lu %lu %lu %c %c %lf %lf %lf %lf\n",
sprintf(buffer, "%c %c %ld %ld %lu %lu %lu %c %c %lf %lf %lf %lf",
dt_type, uplo, (dim_t)n, (dim_t)k,
(dim_t)lda, (dim_t)ldb, (dim_t)ldc,
transa, transb,
alpha_real, alpha_imag,
beta_real, beta_imag);
AOCL_DTL_START_PERF_TIMER();
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
}
@@ -639,12 +720,41 @@ void AOCL_DTL_log_nrm2_sizes(int8 loglevel,
{
char buffer[256];
// {S, D, C, Z} {n, incx}
sprintf(buffer, "%c %ld %ld\n",
sprintf(buffer, "%c %ld %ld",
dt_type, (dim_t)n, (dim_t)incx);
AOCL_DTL_START_PERF_TIMER();
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
}
/*
 * Emits a raw trace record with run-time statistics for an NRM2 call:
 * the requested thread count, the elapsed time in milliseconds and,
 * when the elapsed time is non-zero, the achieved GFLOPS.
 *
 * loglevel : trace level forwarded to DTL_Trace().
 * dt_type  : datatype character; 'c'/'C'/'z'/'Z' double the flop
 *            estimate.
 * n        : vector length; the base flop estimate is 2*n.
 */
void AOCL_DTL_log_nrm2_stats(int8 loglevel,
                             char dt_type,
                             const f77_int n)
{
    char buffer[256];
    double flops = 2.0 * n;

    if (dt_type == 'c' || dt_type == 'C' || dt_type == 'z' || dt_type == 'Z')
    {
        flops = 2.0 * flops;
    }

    // Execution time is in micro seconds.
    Double execution_time = AOCL_DTL_get_time_spent();

    // snprintf bounds the write to the buffer; an unexpectedly large
    // value is truncated instead of overrunning the stack buffer.
    if (execution_time != 0.0)
    {
        // flops / (microseconds * 1e3) == flops per nanosecond == GFLOPS.
        snprintf(buffer, sizeof(buffer), " nt=%ld %.3f ms %0.3f GFLOPS",
                 AOCL_get_requested_threads_count(),
                 execution_time/1000.0,
                 flops/(execution_time * 1e3));
    }
    else
    {
        // No measurable time: skip the GFLOPS field to avoid dividing by zero.
        snprintf(buffer, sizeof(buffer), " nt=%ld %.3f ms",
                 AOCL_get_requested_threads_count(),
                 execution_time/1000.0);
    }

    DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer);
}
//Level-2
void AOCL_DTL_log_syr2_sizes(int8 loglevel,
char dt_type,

View File

@@ -3,7 +3,7 @@
*
* Description : BLIS library specific debug helpers.
*
* Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/
@@ -33,10 +33,17 @@ void AOCL_DTL_log_gemm_sizes(int8 loglevel,
int line);
void AOCL_DTL_log_gemm_stats(int8 loglevel,
char dt_type,
const f77_int m,
const f77_int n,
const f77_int k);
void AOCL_DTL_log_trsm_stats(int8 loglevel,
char dt_type,
f77_char side,
const f77_int m,
const f77_int n);
void AOCL_DTL_log_trsm_sizes(int8 loglevel,
char dt,
f77_char side,
@@ -68,6 +75,11 @@ void AOCL_DTL_log_gemmt_sizes(int8 loglevel,
const char* function_name,
int line);
void AOCL_DTL_log_gemmt_stats(int8 loglevel,
char dt_type,
const f77_int n,
const f77_int k);
void AOCL_DTL_log_hemm_sizes(int8 loglevel,
char dt_type,
const f77_char side,
@@ -243,6 +255,10 @@ void AOCL_DTL_log_nrm2_sizes( int8 loglevel,
const char* function_name,
int line);
void AOCL_DTL_log_nrm2_stats(int8 loglevel,
char dt_type,
const f77_int n);
void AOCL_DTL_log_amax_sizes ( int8 loglevel,
char dt_type,
const f77_int n,
@@ -389,15 +405,23 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel,
AOCL_DTL_log_gemm_sizes(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc, \
__FILE__, __FUNCTION__, __LINE__);
#define AOCL_DTL_LOG_GEMM_STATS(loglevel, m, n, k) \
#define AOCL_DTL_LOG_GEMM_STATS(loglevel, dt_type, m, n, k) \
if (gbIsLoggingEnabled) \
AOCL_DTL_log_gemm_stats(loglevel, m, n, k);
AOCL_DTL_log_gemm_stats(loglevel, dt_type, m, n, k);
// Calls AOCL_DTL_log_gemmt_stats() with the given arguments, but only when
// gbIsLoggingEnabled is set.
// NOTE: expands to an unbraced `if` that already ends in `;` - do not use it
// as the unbraced body of an if/else.
#define AOCL_DTL_LOG_GEMMT_STATS(loglevel, dt_type, n, k) \
if (gbIsLoggingEnabled) \
AOCL_DTL_log_gemmt_stats(loglevel, dt_type, n, k);
// Calls AOCL_DTL_log_trsm_sizes() with the TRSM call parameters plus the
// caller's __FILE__, __FUNCTION__ and __LINE__, but only when
// gbIsLoggingEnabled is set.
// NOTE: expands to an unbraced `if` that already ends in `;` - do not use it
// as the unbraced body of an if/else.
#define AOCL_DTL_LOG_TRSM_INPUTS(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb) \
if (gbIsLoggingEnabled) \
AOCL_DTL_log_trsm_sizes(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb, \
__FILE__, __FUNCTION__, __LINE__);
// Calls AOCL_DTL_log_trsm_stats() with the given arguments, but only when
// gbIsLoggingEnabled is set.
// NOTE: expands to an unbraced `if` that already ends in `;` - do not use it
// as the unbraced body of an if/else.
#define AOCL_DTL_LOG_TRSM_STATS(loglevel, dt_type, side, m, n) \
if (gbIsLoggingEnabled) \
AOCL_DTL_log_trsm_stats(loglevel, dt_type, side, m, n);
#define AOCL_DTL_LOG_GEMMT_INPUTS(loglevel, dt, uplo, transa, transb, n, k, alpha, lda, ldb, beta, ldc) \
if (gbIsLoggingEnabled) \
AOCL_DTL_log_gemmt_sizes(loglevel, dt, uplo, transa, transb, n, k, alpha, lda, ldb, beta, ldc, \
@@ -460,6 +484,10 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel,
if (gbIsLoggingEnabled) \
AOCL_DTL_log_nrm2_sizes(loglevel, dt_type, n, incx, __FILE__,__FUNCTION__,__LINE__);
// Calls AOCL_DTL_log_nrm2_stats() with the given arguments, but only when
// gbIsLoggingEnabled is set.
// NOTE: expands to an unbraced `if` that already ends in `;` - do not use it
// as the unbraced body of an if/else.
#define AOCL_DTL_LOG_NRM2_STATS(loglevel, dt_type, n) \
if (gbIsLoggingEnabled) \
AOCL_DTL_log_nrm2_stats(loglevel, dt_type, n);
#define AOCL_DTL_LOG_HEMV_INPUTS(loglevel, dt_type, uploa, m, alpha, lda, incx, beta, incy) \
if (gbIsLoggingEnabled) \
AOCL_DTL_log_hemv_sizes(loglevel, dt_type, uploa, m, alpha, lda, incx, beta, incy, \
@@ -531,12 +559,16 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel,
#define AOCL_DTL_LOG_GEMM_INPUTS(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc)
#define AOCL_DTL_LOG_GEMM_STATS(loglevel, m, n, k)
#define AOCL_DTL_LOG_GEMM_STATS(loglevel, dt_type, m, n, k)
#define AOCL_DTL_LOG_TRSM_INPUTS(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb)
#define AOCL_DTL_LOG_TRSM_STATS(loglevel, dt_type, side, m, n)
#define AOCL_DTL_LOG_GEMMT_INPUTS(loglevel, dt, uplo, transa, transb, n, k, alpha, lda, ldb, beta, ldc)
#define AOCL_DTL_LOG_GEMMT_STATS(loglevel, dt_type, n, k)
#define AOCL_DTL_LOG_HEMM_INPUTS(loglevel, dt_type, side, uplo, m, n, alpha, lda, ldb, beta, ldc)
#define AOCL_DTL_LOG_HERK_INPUTS(loglevel, dt_type, uploc, transa, m, k, alpha, lda, beta, ldc)
@@ -561,6 +593,8 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel,
#define AOCL_DTL_LOG_NRM2_INPUTS(loglevel, dt_type, n, incx)
#define AOCL_DTL_LOG_NRM2_STATS(loglevel, dt_type, n)
#define AOCL_DTL_LOG_HEMV_INPUTS(loglevel, dt_type, uploa, m, alpha, lda, incx, beta, incy)
#define AOCL_DTL_LOG_HER2_INPUTS(loglevel, dt_type, uploa, m, alpha, incx, incy, lda)

View File

@@ -5,7 +5,7 @@
* library, all debug features (except auto trace)
* can be enabled/disabled in this file.
*
* Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/

View File

@@ -3,7 +3,7 @@
*
* Description : Platform/OS independent file handling APIs
*
* Copyright (C) 2020, Advanced Micro Devices, Inc
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/

View File

@@ -4,7 +4,7 @@
* Description : Interfaces for platform/OS independent file
* handling API's
*
* Copyright (C) 2020, Advanced Micro Devices, Inc
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/

View File

@@ -5,10 +5,11 @@
* each thread. This is used to log the data
* to correct file as per the current thread id.
*
* Copyright (C) 2020, Advanced Micro Devices, Inc
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/
#include "blis.h"
#include "aocltpdef.h"
#include "aocldtl.h"
#include "aoclfal.h"
@@ -63,7 +64,11 @@ AOCL_FLIST_Node * AOCL_FLIST_GetNode(AOCL_FLIST_Node *plist, AOCL_TID tid)
{
if (temp->fp == NULL)
{
#ifdef BLIS_ENABLE_PTHREADS
AOCL_DEBUGPRINT("Could not get saved time stamp for thread = %ld", tid);
#else
AOCL_DEBUGPRINT("Could not get saved time stamp for thread = %d", tid);
#endif
}
return temp;
}
@@ -92,7 +97,11 @@ AOCL_FAL_FILE *AOCL_FLIST_GetFile(AOCL_FLIST_Node *plist, AOCL_TID tid)
{
if (temp->fp == NULL)
{
#ifdef BLIS_ENABLE_PTHREADS
AOCL_DEBUGPRINT("File associated with this thread id %ld does not exists or closed", tid);
#else
AOCL_DEBUGPRINT("File associated with this thread id %d does not exists or closed", tid);
#endif
}
return temp->fp;
}
@@ -118,8 +127,11 @@ AOCL_FAL_FILE *AOCL_FLIST_AddFile(const int8 *pchFilePrefix, AOCL_FLIST_Node **p
}
/* We don't have exiting file, lets try to open new one */
#ifdef BLIS_ENABLE_PTHREADS
sprintf(pchFileName, "P%d_T%lu_%s", AOCL_getpid(), tid, pchFilePrefix);
#else
sprintf(pchFileName, "P%d_T%u_%s", AOCL_getpid(), tid, pchFilePrefix);
#endif
file = AOCL_FAL_Open(pchFileName, "wb");
if (file == NULL)
{

View File

@@ -5,7 +5,7 @@
* each thread. This is used to log the data
* to correct file as per the current thread id.
*
* Copyright (C) 2020, Advanced Micro Devices, Inc
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/

View File

@@ -3,9 +3,10 @@
*
* Description : Abstraction for os services used by DTL.
*
* Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/
#include "blis.h"
#include "aocltpdef.h"
#include "aocldtl.h"
#include "aoclfal.h"
@@ -20,18 +21,18 @@
#endif
// BLIS TODO: This is workaround to check if BLIS is built with
// openmp support. Ideally we dont' want any library
// openmp support. Ideally we don't want any library
// specific code in dtl.
#include <blis.h>
#if defined(__linux__)
/*
Disable intrumentation for these functions as they will also be
called from compiler generated instumation code to trace
Disable instrumentation for these functions as they will also be
called from compiler generated instrumentation code to trace
function execution.
It needs to be part of declration in the C file so can't be
It needs to be part of declaration in the C file so can't be
moved to header file.
*/
@@ -47,7 +48,10 @@ AOCL_TID AOCL_gettid(void)
return omp_get_thread_num();
#else
#ifdef BLIS_ENABLE_PTHREADS
return pthread_self();
// pthread_self is not suitable for this purpose and may be replaced
// in a later release with something else. It returns a value of type
// pthread_t, which on linux is an unsigned long int.
return (AOCL_TID) pthread_self();
#else
return 0;
#endif
@@ -89,7 +93,11 @@ AOCL_TID AOCL_gettid(void)
return omp_get_thread_num();
#else
#ifdef BLIS_ENABLE_PTHREADS
return pthread_self();
// pthread_self is not suitable for this purpose and may be replaced
// in a later release with something else. It returns a value of type
// pthread_t, whose type may depend upon the operating system. On
// freeBSD it is a pointer to an empty struct.
return (AOCL_TID) pthread_self();
#else
return 0;
#endif

View File

@@ -3,7 +3,7 @@
*
* Description : Abstraction for os services used by DTL.
*
* Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/
@@ -11,7 +11,7 @@
#define _AOCL_OS_H_
#include "aocltpdef.h"
#include "malloc.h"
#include "stdlib.h"
/* The OS Services function declaration */

View File

@@ -4,7 +4,7 @@
*
* Description : Abstraction for various datatypes used by DTL.
*
* Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/
#ifndef AOCL_TYPEDEF_H_
@@ -35,8 +35,11 @@ typedef signed long int int32;
typedef short int int16;
typedef Void *AOCL_HANDLE;
/* Thread identifier type used by DTL.
 * When BLIS_ENABLE_PTHREADS is defined it is a long int, wide enough for the
 * value AOCL_gettid() obtains by casting pthread_self(); otherwise it is
 * pid_t.
 * NOTE(review): format strings that print an AOCL_TID must match the variant
 * selected here - confirm all call sites when changing this type. */
#ifdef BLIS_ENABLE_PTHREADS
typedef long int AOCL_TID;
#else
typedef pid_t AOCL_TID;
#endif
#endif /*AOCL_TYPEDEF_H_ */
/* --------------- End of aocltpdef.h ----------------- */

View File

@@ -7,7 +7,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -3,7 +3,7 @@
*
* Description : Unit test cases for dtl.
*
* Copyright (C) 2020, Advanced Micro Devices, Inc
* Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/

View File

@@ -61,6 +61,13 @@ if(ENABLE_OPENMP)
endif()
target_link_libraries(BenchGer optimized "${LIB_NAME}.lib")
# BenchNrm2: benchmark executable built from bench_nrm2.c.
# Links against ${LIB_NAME}.lib for both debug and optimized configurations,
# and against OpenMP when ENABLE_OPENMP is set.
# NOTE(review): the OpenMP target linked is the CXX one although the source is
# a C file - confirm this is intentional.
add_executable(BenchNrm2 bench_nrm2.c)
target_link_libraries(BenchNrm2 debug "${LIB_NAME}.lib")
if(ENABLE_OPENMP)
target_link_libraries(BenchNrm2 OpenMP::OpenMP_CXX)
endif()
target_link_libraries(BenchNrm2 optimized "${LIB_NAME}.lib")
add_executable(BenchScalv bench_scalv.c)
target_link_libraries(BenchScalv debug "${LIB_NAME}.lib")
if(ENABLE_OPENMP)

View File

@@ -6,7 +6,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -193,7 +193,8 @@ blis: \
bench_amaxv_blis.x \
bench_copyv_blis.x \
bench_swapv_blis.x \
bench_axpbyv_blis.x
bench_axpbyv_blis.x \
bench_gemm_pack_compute_blis.x
openblas: \
bench_gemm_openblas.x \
@@ -240,7 +241,8 @@ mkl: \
bench_amaxv_mkl.x \
bench_copyv_mkl.x \
bench_swapv_mkl.x \
bench_axpbyv_mkl.x
bench_axpbyv_mkl.x \
bench_gemm_pack_compute_mkl.x
# --Object file rules --

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -157,7 +157,7 @@ void softmax_bench_driver_ ## SOFTMAX_SFX \
GEN_SOFTMAX_BENCH_DRV_FN(float,softmax_f32)
inline float gelu_tanh_f32
static inline float gelu_tanh_f32
(
float temp_accum
)
@@ -168,7 +168,7 @@ inline float gelu_tanh_f32
return temp_accum;
}\
inline float gelu_erf_f32
static inline float gelu_erf_f32
(
float temp_accum
)
@@ -261,10 +261,11 @@ void gelu_bench_main_ ## GELU_SFX \
n_repeats = global_n_repeat; \
} \
\
V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \
err_t bli_errors = BLIS_SUCCESS; \
V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx, &bli_errors ); \
GEN_FUNC_NAME(fill_array_,V_type)( x, ( n * incx ) ); \
\
V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \
V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx, &bli_errors ); \
GEN_FUNC_NAME(fill_array_,V_type)( ref_x, ( n * incx ) ); \
\
GEN_FUNC_NAME(gelu_bench_driver_,GELU_SFX)(n_repeats,n,x,incx); \
@@ -292,10 +293,11 @@ void softmax_bench_main_ ## SOFTMAX_SFX \
n_repeats = global_n_repeat; \
} \
\
V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \
err_t bli_errors = BLIS_SUCCESS; \
V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx, &bli_errors ); \
GEN_FUNC_NAME(fill_array_,V_type)( x, ( n * incx ) ); \
\
V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \
V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx, &bli_errors ); \
GEN_FUNC_NAME(fill_array_,V_type)( ref_x, ( n * incx ) ); \
\
GEN_FUNC_NAME(softmax_bench_driver_,SOFTMAX_SFX)(n_repeats,n,x,incx); \

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

996
bench/bench_gemm_pack_compute.c Executable file
View File

@@ -0,0 +1,996 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
// Benchmark application to process aocl logs generated by BLIS library.
#ifndef DT
#define DT BLIS_DOUBLE
#endif
#ifndef IND
#define IND BLIS_NAT
#endif
#ifndef N_REPEAT
//#define N_REPEAT 100
#endif
#define AOCL_MATRIX_INITIALISATION
#define BUFFER_SIZE 256
/* For BLIS since logs are collected at BLAS interfaces
* we disable cblas interfaces for this benchmark application
*/
#ifdef BLIS_ENABLE_CBLAS
// #define CBLAS
#endif
// #define PRINT
int main( int argc, char** argv )
{
obj_t a, b, c;
obj_t c_save;
obj_t alpha, beta, alpha_one;
dim_t m, n, k;
dim_t p_inc = 0; // to keep track of number of inputs
num_t dt;
// ind_t ind;
char dt_ch;
int r, n_repeats;
trans_t transa;
trans_t transb;
double dtime;
double dtime_save;
double gflops;
int packA, packB;
FILE* fin = NULL;
FILE* fout = NULL;
n_repeats = N_REPEAT; // This macro will get from Makefile.
dt = DT;
if (argc < 3)
{
printf("Usage: ./test_gemm_pack_compute_XX.x input.csv output.csv\n");
exit(1);
}
fin = fopen(argv[1], "r");
if (fin == NULL)
{
printf("Error opening the file %s\n", argv[1]);
exit(1);
}
fout = fopen(argv[2], "w");
if (fout == NULL)
{
printf("Error opening output file %s\n", argv[2]);
exit(1);
}
if (argc > 3)
{
n_repeats = atoi(argv[3]);
}
fprintf(fout, "Dt transa transb identifier m n k alphaR alphaI lda ldb betaR betaI ldc gflops\n");
// Following variables are needed for scanf to read inputs properly
// however they are not used in bench.
char api_name[BUFFER_SIZE]; // to store function name, line no present in logs
char dummy_buffer[BUFFER_SIZE];
// Variables extracted from the logs which are used by bench
char stor_scheme, transA_c, transB_c, packA_c, packB_c;
double alpha_r, beta_r, alpha_i, beta_i;
dim_t m_trans, n_trans;
inc_t lda, ldb, ldc;
stor_scheme = 'C'; // By default set it to Column Major
//{S, D, C, Z} transa, transb, packA, packB, m, n, k, alpha_real,
// alpha_imag, lda ldb, beta_real, beta_imag, ldc,
//
// number of threads, execution time, gflops ---> ignored by bench
while (fscanf(fin, "%s %c %c %c %c %c " INT_FS INT_FS INT_FS " %lf %lf " INT_FS INT_FS " %lf %lf " INT_FS"[^\n]",
api_name, &dt_ch, &transA_c, &transB_c, &packA_c, &packB_c, &m, &n, &k, &alpha_r, &alpha_i,
&lda, &ldb, &beta_r, &beta_i, &ldc) == 16)
{
// Discard any extra data on current line in the input file.
fgets(dummy_buffer, BUFFER_SIZE, fin );
// At BLAS level only column major order is supported.
stor_scheme = 'C';
if (dt_ch == 'D' || dt_ch == 'd') dt = BLIS_DOUBLE;
else if (dt_ch == 'S' || dt_ch == 's') dt = BLIS_FLOAT;
else
{
printf("Invalid data type %c\n", dt_ch);
continue;
}
if ( transA_c == 'n' || transA_c == 'N' ) transa = BLIS_NO_TRANSPOSE;
else if ( transA_c == 't' || transA_c == 'T' ) transa = BLIS_TRANSPOSE;
else if ( transA_c == 'c' || transA_c == 'C' ) transa = BLIS_CONJ_TRANSPOSE;
else
{
printf("Invalid option for transA \n");
continue;
}
if ( transB_c == 'n' || transB_c == 'N' ) transb = BLIS_NO_TRANSPOSE;
else if ( transB_c == 't' || transB_c == 'T' ) transb = BLIS_TRANSPOSE;
else if ( transB_c == 'c' || transB_c == 'C' ) transb = BLIS_CONJ_TRANSPOSE;
else
{
printf("Invalid option for transB \n");
continue;
}
if ( packA_c == 'p' || packA_c == 'P' ) packA = TRUE;
else if ( packA_c == 'u' || packA_c == 'U' ) packA = FALSE;
else
{
printf("Invalid option for packA \n");
continue;
}
if ( packB_c == 'p' || packB_c == 'P') packB = TRUE;
else if ( packB_c == 'u' || packB_c == 'U') packB = FALSE;
else
{
printf("Invalid option for packB \n");
continue;
}
bli_obj_create( dt, 1, 1, 0, 0, &alpha);
bli_obj_create( dt, 1, 1, 0, 0, &beta );
bli_obj_create( dt, 1, 1, 0, 0, &alpha_one);
if( (stor_scheme == 'C') || (stor_scheme == 'c') )
{
// leading dimension should be greater than number of rows
// if ((m > lda) || (k > ldb) || (m > ldc)) continue;
// Since this bench app is run on logs generated by AOCL trace logs
// - we have relaxed the checks on the input parameters.
// if A is transpose - A(lda x m), lda >= max(1,k)
// if A is non-transpose - A (lda x k), lda >= max(1,m)
// if B is transpose - B (ldb x k), ldb >= max(1,n)
// if B is non-transpose - B (ldb x n), ldb >= max(1,k)
// C is ldc x n - ldc >= max(1, m)
//if(transa) lda = k; // We will end up overwriting lda
bli_set_dims_with_trans( transa, m, k, &m_trans, &n_trans);
bli_obj_create( dt, m_trans, n_trans, 1, lda, &a);
//if(transb) ldb = n; // we will end up overwriting ldb, ldb >= n
bli_set_dims_with_trans( transb, k, n, &m_trans, &n_trans);
bli_obj_create( dt, m_trans, n_trans, 1, ldb, &b);
bli_obj_create( dt, m, n, 1, ldc, &c);
bli_obj_create( dt, m, n, 1, ldc, &c_save );
}
else if( (stor_scheme == 'r') || (stor_scheme == 'R') )
{
//leading dimension should be greater than number of columns
//if ((k > lda) || (n > ldb) || (n > ldc)) continue;
// Since this bench app is run on logs generated by AOCL trace logs
// - we have relaxed the checks on the input parameters.
// if A is transpose - A(k x lda), lda >= max(1,m)
// if A is non-transpose - A (m x lda), lda >= max(1,k)
// if B is transpose - B (n x ldb), ldb >= max(1,k)
// if B is non-transpose - B (k x ldb ), ldb >= max(1,n)
// C is m x ldc - ldc >= max(1, n)
//if(transa) lda = m; // this will overwrite lda
bli_set_dims_with_trans(transa, m, k, &m_trans, &n_trans);
bli_obj_create( dt, m_trans, n_trans, lda, 1, &a);
//if(transb) ldb = k; // this will overwrite ldb
bli_set_dims_with_trans(transb, k, n, &m_trans, &n_trans);
bli_obj_create( dt, m_trans, n_trans, ldb, 1, &b);
bli_obj_create( dt, m, n, ldc, 1, &c);
bli_obj_create( dt, m, n, ldc, 1, &c_save );
}
else
{
printf("Invalid storage scheme\n");
continue;
}
#ifndef BLIS // Incase if we are using blis interface we don't have to check for col-storage.
#ifndef CBLAS
if( ( stor_scheme == 'R' ) || ( stor_scheme == 'r' ) )
{
printf("BLAS APIs doesn't support row-storage: Enable CBLAS\n");
continue;
}
#endif
#endif
#ifdef AOCL_MATRIX_INITIALISATION
bli_randm( &a );
bli_randm( &b );
bli_randm( &c );
#endif
bli_copym( &c, &c_save );
bli_obj_set_conjtrans( transa, &a);
bli_obj_set_conjtrans( transb, &b);
bli_setsc( 1.0, 1.0, &alpha_one );
bli_setsc( alpha_r, alpha_i, &alpha );
bli_setsc( beta_r, beta_i, &beta );
dtime_save = DBL_MAX;
for ( r = 0; r < n_repeats; ++r )
{
bli_copym( &c_save, &c );
#ifdef PRINT
bli_printm( "a", &a, "%4.6f", "" );
bli_printm( "b", &b, "%4.6f", "" );
bli_printm( "c", &c, "%4.6f", "" );
#endif
#ifdef BLIS
printf( "BLAS Extension APIs don't have a BLIS interface."
"Enable CBLAS or BLAS interface!\n" );
#else
#ifdef CBLAS
enum CBLAS_ORDER cblas_order;
enum CBLAS_TRANSPOSE cblas_transa;
enum CBLAS_TRANSPOSE cblas_transb;
enum CBLAS_IDENTIFIER cblas_identifierA;
enum CBLAS_IDENTIFIER cblas_identifierB;
size_t bufSizeA;
size_t bufSizeB;
if ( ( stor_scheme == 'C' ) || ( stor_scheme == 'c' ) )
cblas_order = CblasColMajor;
else
cblas_order = CblasRowMajor;
if( bli_is_trans( transa ) )
cblas_transa = CblasTrans;
else if( bli_is_conjtrans( transa ) )
cblas_transa = CblasConjTrans;
else
cblas_transa = CblasNoTrans;
if( bli_is_trans( transb ) )
cblas_transb = CblasTrans;
else if( bli_is_conjtrans( transb ) )
cblas_transb = CblasConjTrans;
else
cblas_transb = CblasNoTrans;
if ( packA )
cblas_identifierA = CblasAMatrix;
if ( packB )
cblas_identifierB = CblasBMatrix;
#else
f77_char f77_transa;
f77_char f77_transb;
f77_char f77_identifierA;
f77_char f77_identifierB;
f77_int f77_bufSizeA;
f77_int f77_bufSizeB;
f77_char f77_packed = 'P';
f77_identifierA = 'A';
f77_identifierB = 'B';
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
err_t err = BLIS_SUCCESS;
#endif
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
float* alphaonep = bli_obj_buffer( &alpha_one );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* bp = bli_obj_buffer( &b );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
#ifdef CBLAS
float* aBuffer;
float* bBuffer;
if ( packA && !packB )
{
// Only A is pre-packed.
bufSizeA = cblas_sgemm_pack_get_size( CblasAMatrix,
mm,
nn,
kk );
aBuffer = (float*) bli_malloc_user( bufSizeA, &err );
cblas_sgemm_pack( cblas_order,
CblasAMatrix,
cblas_transa,
mm,
nn,
kk,
*alphap,
ap, lda,
aBuffer );
dtime = bli_clock();
cblas_sgemm_compute( cblas_order,
CblasPacked,
cblas_transb,
mm,
nn,
kk,
aBuffer, lda,
bp, ldb,
*betap,
cp, ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
bli_free_user(aBuffer);
}
else if ( !packA && packB )
{
// Only B is pre-packed.
bufSizeB = cblas_sgemm_pack_get_size( CblasBMatrix,
mm,
nn,
kk );
bBuffer = (float*) bli_malloc_user( bufSizeB, &err );
cblas_sgemm_pack( cblas_order,
CblasBMatrix,
cblas_transb,
mm,
nn,
kk,
*alphap,
bp, ldb,
bBuffer );
dtime = bli_clock();
cblas_sgemm_compute( cblas_order,
cblas_transa,
CblasPacked,
mm,
nn,
kk,
ap, lda,
bBuffer, ldb,
*betap,
cp, ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
bli_free_user(bBuffer);
}
else if ( packA && packB )
{
// Both A & B are pre-packed.
bufSizeA = cblas_sgemm_pack_get_size( CblasAMatrix,
mm,
nn,
kk );
aBuffer = (float*) bli_malloc_user( bufSizeA, &err );
bufSizeB = cblas_sgemm_pack_get_size( CblasBMatrix,
mm,
nn,
kk );
bBuffer = (float*) bli_malloc_user( bufSizeB, &err );
cblas_sgemm_pack( cblas_order,
CblasAMatrix,
cblas_transa,
mm,
nn,
kk,
*alphap,
ap, lda,
aBuffer );
cblas_sgemm_pack( cblas_order,
CblasBMatrix,
cblas_transb,
mm,
nn,
kk,
*alphaonep,
bp, ldb,
bBuffer );
dtime = bli_clock();
cblas_sgemm_compute( cblas_order,
CblasPacked,
CblasPacked,
mm,
nn,
kk,
aBuffer, lda,
bBuffer, ldb,
*betap,
cp, ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
bli_free_user(aBuffer);
bli_free_user(bBuffer);
}
else
{
// Neither A nor B is pre-packed.
dtime = bli_clock();
cblas_sgemm_compute( cblas_order,
cblas_transa,
cblas_transb,
mm,
nn,
kk,
ap, lda,
bp, ldb,
*betap,
cp, ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
#else // -- BLAS API --
float* aBuffer;
float* bBuffer;
if ( packA && !packB )
{
// Only A is pre-packed.
f77_bufSizeA = sgemm_pack_get_size_( &f77_identifierA,
&mm,
&nn,
&kk );
aBuffer = (float*) bli_malloc_user( f77_bufSizeA, &err );
sgemm_pack_( &f77_identifierA,
&f77_transa,
&mm,
&nn,
&kk,
alphap,
ap,
(f77_int*)&lda,
aBuffer );
dtime = bli_clock();
sgemm_compute_( &f77_packed,
&f77_transb,
&mm,
&nn,
&kk,
aBuffer, (f77_int*)&lda,
bp, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
bli_free_user( aBuffer );
}
else if ( !packA && packB )
{
// Only B is pre-packed.
f77_bufSizeB = sgemm_pack_get_size_( &f77_identifierB,
&mm,
&nn,
&kk );
bBuffer = (float*) bli_malloc_user( f77_bufSizeB, &err );
sgemm_pack_( &f77_identifierB,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
bp,
(f77_int*)&ldb,
bBuffer );
dtime = bli_clock();
sgemm_compute_( &f77_transa,
&f77_packed,
&mm,
&nn,
&kk,
ap, (f77_int*)&lda,
bBuffer, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
bli_free_user( bBuffer );
}
else if ( packA && packB )
{
// Both A & B are pre-packed.
f77_bufSizeB = sgemm_pack_get_size_( &f77_identifierB,
&mm,
&nn,
&kk );
bBuffer = (float*) bli_malloc_user( f77_bufSizeB, &err );
f77_bufSizeA = sgemm_pack_get_size_( &f77_identifierA,
&mm,
&nn,
&kk );
aBuffer = (float*) bli_malloc_user( f77_bufSizeA, &err );
sgemm_pack_( &f77_identifierA,
&f77_transa,
&mm,
&nn,
&kk,
alphap,
ap,
(f77_int*)&lda,
aBuffer );
sgemm_pack_( &f77_identifierB,
&f77_transb,
&mm,
&nn,
&kk,
alphaonep,
bp,
(f77_int*)&ldb,
bBuffer );
dtime = bli_clock();
sgemm_compute_( &f77_packed,
&f77_packed,
&mm,
&nn,
&kk,
aBuffer, (f77_int*)&lda,
bBuffer, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
bli_free_user(aBuffer);
bli_free_user(bBuffer);
}
else
{
// Neither A nor B is reordered.
dtime = bli_clock();
sgemm_compute_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
ap, (f77_int*)&lda,
bp, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
#endif
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
double* alphap = bli_obj_buffer( &alpha );
double* alphaonep = bli_obj_buffer( &alpha_one );
double* ap = bli_obj_buffer( &a );
double* bp = bli_obj_buffer( &b );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
#ifdef CBLAS
double* aBuffer;
double* bBuffer;
if ( packA && !packB )
{
// Only A is pre-packed.
bufSizeA = cblas_dgemm_pack_get_size( CblasAMatrix,
mm,
nn,
kk );
aBuffer = (double*) bli_malloc_user( bufSizeA, &err );
cblas_dgemm_pack( cblas_order,
CblasAMatrix,
cblas_transa,
mm,
nn,
kk,
*alphap,
ap, lda,
aBuffer );
dtime = bli_clock();
cblas_dgemm_compute( cblas_order,
CblasPacked,
cblas_transb,
mm,
nn,
kk,
aBuffer, lda,
bp, ldb,
*betap,
cp, ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
bli_free_user(aBuffer);
}
else if ( !packA && packB )
{
    // Only B is pre-packed: query the packed-buffer size, allocate the
    // buffer, pack B, and then time only the compute call.
    bufSizeB = cblas_dgemm_pack_get_size( CblasBMatrix,
                                          mm,
                                          nn,
                                          kk );
    // The destination buffer must be allocated before packing; every
    // other packing branch does this, and it is released below with
    // bli_free_user().
    bBuffer = (double*) bli_malloc_user( bufSizeB, &err );
    cblas_dgemm_pack( cblas_order,
                      CblasBMatrix,
                      cblas_transb,
                      mm,
                      nn,
                      kk,
                      *alphap,
                      bp, ldb,
                      bBuffer );
    // Packing is kept outside the timed region.
    dtime = bli_clock();
    cblas_dgemm_compute( cblas_order,
                         cblas_transa,
                         CblasPacked,
                         mm,
                         nn,
                         kk,
                         ap, lda,
                         bBuffer, ldb,
                         *betap,
                         cp, ldc );
    dtime_save = bli_clock_min_diff( dtime_save, dtime );
    bli_free_user(bBuffer);
}
else if ( packA && packB )
{
    // Both A & B are pre-packed: size and allocate both buffers, pack
    // both operands, and then time only the compute call.
    bufSizeA = cblas_dgemm_pack_get_size( CblasAMatrix,
                                          mm,
                                          nn,
                                          kk );
    aBuffer = (double*) bli_malloc_user( bufSizeA, &err );
    bufSizeB = cblas_dgemm_pack_get_size( CblasBMatrix,
                                          mm,
                                          nn,
                                          kk );
    bBuffer = (double*) bli_malloc_user( bufSizeB, &err );
    // alpha is supplied while packing A ...
    cblas_dgemm_pack( cblas_order,
                      CblasAMatrix,
                      cblas_transa,
                      mm,
                      nn,
                      kk,
                      *alphap,
                      ap, lda,
                      aBuffer );
    // ... so B is packed with alpha_one, exactly as the BLAS-API
    // branches of this benchmark do; passing alpha to both pack calls
    // would hand the scaling factor to the library twice.
    cblas_dgemm_pack( cblas_order,
                      CblasBMatrix,
                      cblas_transb,
                      mm,
                      nn,
                      kk,
                      *alphaonep,
                      bp, ldb,
                      bBuffer );
    // Packing is kept outside the timed region.
    dtime = bli_clock();
    cblas_dgemm_compute( cblas_order,
                         CblasPacked,
                         CblasPacked,
                         mm,
                         nn,
                         kk,
                         aBuffer, lda,
                         bBuffer, ldb,
                         *betap,
                         cp, ldc );
    dtime_save = bli_clock_min_diff( dtime_save, dtime );
    bli_free_user(aBuffer);
    bli_free_user(bBuffer);
}
else
{
// Neither A nor B is pre-packed.
dtime = bli_clock();
cblas_dgemm_compute( cblas_order,
cblas_transa,
cblas_transb,
mm,
nn,
kk,
ap, lda,
bp, ldb,
*betap,
cp, ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
#else // -- BLAS API --
double* aBuffer;
double* bBuffer;
if ( packA && !packB )
{
// Only A is pre-packed.
f77_bufSizeA = dgemm_pack_get_size_( &f77_identifierA,
&mm,
&nn,
&kk );
aBuffer = (double*) bli_malloc_user( f77_bufSizeA, &err );
dgemm_pack_( &f77_identifierA,
&f77_transa,
&mm,
&nn,
&kk,
alphap,
ap,
(f77_int*)&lda,
aBuffer );
dtime = bli_clock();
dgemm_compute_( &f77_packed,
&f77_transb,
&mm,
&nn,
&kk,
aBuffer, (f77_int*)&lda,
bp, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
bli_free_user( aBuffer );
}
else if ( !packA && packB )
{
// Only B is pre-packed.
f77_bufSizeB = dgemm_pack_get_size_( &f77_identifierB,
&mm,
&nn,
&kk );
bBuffer = (double*) bli_malloc_user( f77_bufSizeB, &err );
dgemm_pack_( &f77_identifierB,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
bp,
(f77_int*)&ldb,
bBuffer );
dtime = bli_clock();
dgemm_compute_( &f77_transa,
&f77_packed,
&mm,
&nn,
&kk,
ap, (f77_int*)&lda,
bBuffer, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
bli_free_user( bBuffer );
}
else if ( packA && packB )
{
// Both A & B are pre-packed.
f77_bufSizeA = dgemm_pack_get_size_( &f77_identifierA,
&mm,
&nn,
&kk );
aBuffer = (double*) bli_malloc_user( f77_bufSizeA, &err );
f77_bufSizeB = dgemm_pack_get_size_( &f77_identifierB,
&mm,
&nn,
&kk );
bBuffer = (double*) bli_malloc_user( f77_bufSizeB, &err );
dgemm_pack_( &f77_identifierA,
&f77_transa,
&mm,
&nn,
&kk,
alphap,
ap,
(f77_int*)&lda,
aBuffer );
dgemm_pack_( &f77_identifierB,
&f77_transb,
&mm,
&nn,
&kk,
alphaonep,
bp,
(f77_int*)&ldb,
bBuffer );
dtime = bli_clock();
dgemm_compute_( &f77_packed,
&f77_packed,
&mm,
&nn,
&kk,
aBuffer, (f77_int*)&lda,
bBuffer, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
bli_free_user(aBuffer);
bli_free_user(bBuffer);
}
else
{
// Neither A nor B is pre-packed.
dtime = bli_clock();
dgemm_compute_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
ap, (f77_int*)&lda,
bp, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
#endif
}
#endif
#ifdef PRINT
bli_printm( "c compute", &c, "%4.6f", "" );
#endif
}
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
if ( bli_is_complex( dt ) ) gflops *= 4.0;
printf( "data_%cgemm_%s", dt_ch, BLAS );
p_inc++;
printf("( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
(unsigned long)(p_inc),
(unsigned long)m,
(unsigned long)n,
(unsigned long)k, gflops);
fprintf (fout, "%c %c %c %c %c %ld %ld %ld %lf %lf %ld %ld %lf %lf %ld %6.3f\n", \
dt_ch, transA_c, transB_c, packA_c, packB_c, m, n, k, alpha_r, alpha_i, lda, ldb, beta_r, beta_i, ldc, gflops);
fflush(fout);
bli_obj_free( &alpha );
bli_obj_free( &beta );
bli_obj_free( &a );
bli_obj_free( &b );
bli_obj_free( &c );
bli_obj_free( &c_save );
}
//bli_finalize();
fclose(fin);
fclose(fout);
return 0;
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
modification, are permitted provided that the following conditions are
met:

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
modification, are permitted provided that the following conditions are
met:

View File

@@ -3,8 +3,10 @@
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -0,0 +1,92 @@
sgemm_ S N N P U 1 1 1 1 0 1 1 1 0 1
sgemm_ S N N P U 2 2 2 1 0 2 2 1 0 2
sgemm_ S N N P U 3 3 3 1 0 3 3 1 0 3
sgemm_ S N N P U 4 4 4 1 0 4 4 1 0 4
sgemm_ S N N P U 5 5 5 1 0 5 5 1 0 5
sgemm_ S N N P U 6 6 6 1 0 6 6 1 0 6
sgemm_ S N N P U 7 7 7 1 0 7 7 1 0 7
sgemm_ S N N P U 8 8 8 1 0 8 8 1 0 8
sgemm_ S N N P U 9 9 9 1 0 9 9 1 0 9
sgemm_ S N N P U 10 10 10 1 0 10 10 1 0 10
sgemm_ S N N P U 20 20 20 1 0 20 20 1 0 20
sgemm_ S N N P U 30 30 30 1 0 30 30 1 0 30
sgemm_ S N N P U 40 40 40 1 0 40 40 1 0 40
sgemm_ S N N P U 50 50 50 1 0 50 50 1 0 50
sgemm_ S N N P U 60 60 60 1 0 60 60 1 0 60
sgemm_ S N N P U 70 70 70 1 0 70 70 1 0 70
sgemm_ S N N P U 80 80 80 1 0 80 80 1 0 80
sgemm_ S N N P U 90 90 90 1 0 90 90 1 0 90
sgemm_ S N N P U 100 100 100 1 0 100 100 1 0 100
sgemm_ S N N P U 200 200 200 1 0 200 200 1 0 200
sgemm_ S N N P U 300 300 300 1 0 300 300 1 0 300
sgemm_ S N N P U 400 400 400 1 0 400 400 1 0 400
sgemm_ S N N P U 500 500 500 1 0 500 500 1 0 500
dgemm_ D N N P U 1 1 1 1 0 1 1 1 0 1
dgemm_ D N N P U 2 2 2 1 0 2 2 1 0 2
dgemm_ D N N P U 3 3 3 1 0 3 3 1 0 3
dgemm_ D N N P U 4 4 4 1 0 4 4 1 0 4
dgemm_ D N N P U 5 5 5 1 0 5 5 1 0 5
dgemm_ D N N P U 6 6 6 1 0 6 6 1 0 6
dgemm_ D N N P U 7 7 7 1 0 7 7 1 0 7
dgemm_ D N N P U 8 8 8 1 0 8 8 1 0 8
dgemm_ D N N P U 9 9 9 1 0 9 9 1 0 9
dgemm_ D N N P U 10 10 10 1 0 10 10 1 0 10
dgemm_ D N N P U 20 20 20 1 0 20 20 1 0 20
dgemm_ D N N P U 30 30 30 1 0 30 30 1 0 30
dgemm_ D N N P U 40 40 40 1 0 40 40 1 0 40
dgemm_ D N N P U 50 50 50 1 0 50 50 1 0 50
dgemm_ D N N P U 60 60 60 1 0 60 60 1 0 60
dgemm_ D N N P U 70 70 70 1 0 70 70 1 0 70
dgemm_ D N N P U 80 80 80 1 0 80 80 1 0 80
dgemm_ D N N P U 90 90 90 1 0 90 90 1 0 90
dgemm_ D N N P U 100 100 100 1 0 100 100 1 0 100
dgemm_ D N N P U 200 200 200 1 0 200 200 1 0 200
dgemm_ D N N P U 300 300 300 1 0 300 300 1 0 300
dgemm_ D N N P U 400 400 400 1 0 400 400 1 0 400
dgemm_ D N N P U 500 500 500 1 0 500 500 1 0 500
sgemm_ S N N U P 1 1 1 1 0 1 1 1 0 1
sgemm_ S N N U P 2 2 2 1 0 2 2 1 0 2
sgemm_ S N N U P 3 3 3 1 0 3 3 1 0 3
sgemm_ S N N U P 4 4 4 1 0 4 4 1 0 4
sgemm_ S N N U P 5 5 5 1 0 5 5 1 0 5
sgemm_ S N N U P 6 6 6 1 0 6 6 1 0 6
sgemm_ S N N U P 7 7 7 1 0 7 7 1 0 7
sgemm_ S N N U P 8 8 8 1 0 8 8 1 0 8
sgemm_ S N N U P 9 9 9 1 0 9 9 1 0 9
sgemm_ S N N U P 10 10 10 1 0 10 10 1 0 10
sgemm_ S N N U P 20 20 20 1 0 20 20 1 0 20
sgemm_ S N N U P 30 30 30 1 0 30 30 1 0 30
sgemm_ S N N U P 40 40 40 1 0 40 40 1 0 40
sgemm_ S N N U P 50 50 50 1 0 50 50 1 0 50
sgemm_ S N N U P 60 60 60 1 0 60 60 1 0 60
sgemm_ S N N U P 70 70 70 1 0 70 70 1 0 70
sgemm_ S N N U P 80 80 80 1 0 80 80 1 0 80
sgemm_ S N N U P 90 90 90 1 0 90 90 1 0 90
sgemm_ S N N U P 100 100 100 1 0 100 100 1 0 100
sgemm_ S N N U P 200 200 200 1 0 200 200 1 0 200
sgemm_ S N N U P 300 300 300 1 0 300 300 1 0 300
sgemm_ S N N U P 400 400 400 1 0 400 400 1 0 400
sgemm_ S N N U P 500 500 500 1 0 500 500 1 0 500
dgemm_ D N N U P 1 1 1 1 0 1 1 1 0 1
dgemm_ D N N U P 2 2 2 1 0 2 2 1 0 2
dgemm_ D N N U P 3 3 3 1 0 3 3 1 0 3
dgemm_ D N N U P 4 4 4 1 0 4 4 1 0 4
dgemm_ D N N U P 5 5 5 1 0 5 5 1 0 5
dgemm_ D N N U P 6 6 6 1 0 6 6 1 0 6
dgemm_ D N N U P 7 7 7 1 0 7 7 1 0 7
dgemm_ D N N U P 8 8 8 1 0 8 8 1 0 8
dgemm_ D N N U P 9 9 9 1 0 9 9 1 0 9
dgemm_ D N N U P 10 10 10 1 0 10 10 1 0 10
dgemm_ D N N U P 20 20 20 1 0 20 20 1 0 20
dgemm_ D N N U P 30 30 30 1 0 30 30 1 0 30
dgemm_ D N N U P 40 40 40 1 0 40 40 1 0 40
dgemm_ D N N U P 50 50 50 1 0 50 50 1 0 50
dgemm_ D N N U P 60 60 60 1 0 60 60 1 0 60
dgemm_ D N N U P 70 70 70 1 0 70 70 1 0 70
dgemm_ D N N U P 80 80 80 1 0 80 80 1 0 80
dgemm_ D N N U P 90 90 90 1 0 90 90 1 0 90
dgemm_ D N N U P 100 100 100 1 0 100 100 1 0 100
dgemm_ D N N U P 200 200 200 1 0 200 200 1 0 200
dgemm_ D N N U P 300 300 300 1 0 300 300 1 0 300
dgemm_ D N N U P 400 400 400 1 0 400 400 1 0 400
dgemm_ D N N U P 500 500 500 1 0 500 500 1 0 500

View File

@@ -1,13 +1,133 @@
##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.##
##Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.##
set(F2C_LIB "libf2c")
# Comments:
# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
# the second case because CONFIG_NAME is not yet set.
if(NOT DEFINED BLIS_INSTALL_PATH)
set(DIST_PATH ${CMAKE_BINARY_DIR})
set(LIB_PATH ${DIST_PATH}/lib/${BLIS_CONFIG_FAMILY})
set(INC_PATH ${DIST_PATH}/include/${BLIS_CONFIG_FAMILY})
else()
set(LIB_PATH ${BLIS_INSTALL_PATH}/lib)
set(INC_PATH ${BLIS_INSTALL_PATH}/include/blis)
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/f2c)
# Include the corresponding make_defs.cmake that holds the required compiler options.
include(${CMAKE_SOURCE_DIR}/config/${BLIS_CONFIG_FAMILY}/make_defs.cmake)
# Generate F2C library
add_library("${F2C_LIB}" STATIC )
set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C)
# Create a static library using the sources in f2c directory.
file(GLOB f2c_sources LIST_DIRECTORIES false ${CMAKE_CURRENT_SOURCE_DIR}/f2c/*.c)
add_library(f2c STATIC ${f2c_sources})
target_compile_options(f2c
PRIVATE
# load-var-for,COPTFLAGS
${COPTFLAGS}
# get-noopt-cflags-for
${CDBGFLAGS}
${CWARNFLAGS}
${CPICFLAGS}
${CMISCFLAGS}
${CLANGFLAGS}
# Suppress maybe-uninitialized and parentheses warnings; stop at the first error
-Wno-maybe-uninitialized -Wno-parentheses -Wfatal-errors
)
target_compile_definitions(f2c
PRIVATE
# in get-noopt-cflags-for
${VERS_DEF}
${CPPROCFLAGS}
-DHAVE_BLIS_H
)
target_include_directories(f2c
BEFORE
PRIVATE
# Add local header paths
${CMAKE_CURRENT_SOURCE_DIR}/f2c
# and the path to blis.h
${INC_PATH}
)
target_link_libraries(f2c PRIVATE ${LDFLAGS})
if(THREADING_MODEL STREQUAL "openmp")
target_link_libraries(f2c PRIVATE OpenMP::OpenMP_C)
endif()
# Put this target under the blastest-targets folder so that it is grouped with the other blastest targets in the IDE.
set_target_properties(f2c PROPERTIES FOLDER blastest-targets)
add_dependencies(f2c flat-header)
# Gather all local source files.
file(GLOB blastest_sources LIST_DIRECTORIES false ${CMAKE_CURRENT_SOURCE_DIR}/src/*.c)
list(TRANSFORM blastest_sources REPLACE ${CMAKE_CURRENT_SOURCE_DIR}/src/ "")
add_subdirectory(f2c)
add_subdirectory(src)
# Create one executable for each of the sources.
foreach(source ${blastest_sources})
    # Derive the executable name from the source file name by dropping ".c".
    string(REPLACE .c "" exec_name ${source})
    add_executable(${exec_name}.x src/${source})
    target_compile_options(${exec_name}.x
                            PRIVATE
                            # load-var-for,COPTFLAGS
                            ${COPTFLAGS}
                            # get-noopt-cflags-for
                            ${CDBGFLAGS}
                            ${CWARNFLAGS}
                            ${CPICFLAGS}
                            ${CMISCFLAGS}
                            ${CLANGFLAGS}
                            # Suppress parentheses and maybe-uninitialized warnings
                            -Wno-parentheses -Wno-maybe-uninitialized
                        )
    target_compile_definitions(${exec_name}.x
                                PRIVATE
                                # in get-noopt-cflags-for
                                ${VERS_DEF}
                                ${CPPROCFLAGS}
                                -DHAVE_BLIS_H
                            )
    target_include_directories(${exec_name}.x
                            BEFORE
                            PRIVATE
                            # Add local header paths
                            ${CMAKE_CURRENT_SOURCE_DIR}/f2c
                            # and the path to blis.h
                            ${INC_PATH}
                        )
    target_link_libraries(${exec_name}.x PRIVATE f2c libblis ${LDFLAGS})
    if(THREADING_MODEL STREQUAL "openmp")
        target_link_libraries(${exec_name}.x PRIVATE OpenMP::OpenMP_C)
    endif()
    # RUNTIME_OUTPUT_DIRECTORY is the target property; CMAKE_RUNTIME_OUTPUT_DIRECTORY
    # is only the variable that initializes it, so setting it as a property has no effect.
    set_target_properties(${exec_name}.x PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    # Put this target under the blastest-targets folder so that it is grouped with the others in the IDE.
    set_target_properties(${exec_name}.x PROPERTIES FOLDER blastest-targets)
    # Add a target for running the tests. Rules are different for level-1 APIs, compared to levels 2 and 3.
    # Both operands are quoted so that if() compares the literal name and pattern.
    if("${exec_name}" MATCHES "1")
        # Level-1 drivers take no input file; their standard output is redirected to out.<name>.
        add_custom_target(run-${exec_name}
                            COMMAND ${exec_name}.x > out.${exec_name}
                            COMMENT "Running ${exec_name}.x with output redirected to out.${exec_name}"
                            DEPENDS ${exec_name}.x
                            BYPRODUCTS ${CMAKE_BINARY_DIR}/out.${exec_name}
                            WORKING_DIRECTORY $<TARGET_FILE_DIR:libblis>
                            VERBATIM
                        )
    else()# name has 2 or 3
        # Level-2/3 drivers read their parameters from input/<name>.in on standard input.
        add_custom_target(run-${exec_name}
                            COMMAND ${exec_name}.x < ${CMAKE_CURRENT_SOURCE_DIR}/input/${exec_name}.in
                            COMMENT "Running ${exec_name}.x with input ${CMAKE_CURRENT_SOURCE_DIR}/input/${exec_name}.in and output saved to out.${exec_name}"
                            DEPENDS ${exec_name}.x
                            BYPRODUCTS ${CMAKE_BINARY_DIR}/out.${exec_name}
                            WORKING_DIRECTORY $<TARGET_FILE_DIR:libblis>
                            VERBATIM
                        )
    endif()
    # Put this target under the blastest-targets folder so that it is grouped with the others in the IDE.
    set_target_properties(run-${exec_name} PROPERTIES FOLDER blastest-targets)
    # Collect the run targets; the list is consumed after the loop.
    list(APPEND test_executables "run-${exec_name}")
endforeach()
add_custom_target(testblas DEPENDS ${test_executables})
add_custom_target(checkblas
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/build/cmake/check-blastest.py "."
DEPENDS testblas
WORKING_DIRECTORY $<TARGET_FILE_DIR:libblis>
)
# Put these targets under the blastest-targets folder so that they appear together in the IDE.
set_target_properties(testblas checkblas PROPERTIES FOLDER blastest-targets)

View File

@@ -1,59 +0,0 @@
##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.##
target_sources("${F2C_LIB}"
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/abs.c
${CMAKE_CURRENT_SOURCE_DIR}/acos.c
${CMAKE_CURRENT_SOURCE_DIR}/asin.c
${CMAKE_CURRENT_SOURCE_DIR}/atan.c
${CMAKE_CURRENT_SOURCE_DIR}/atn2.c
${CMAKE_CURRENT_SOURCE_DIR}/close.c
${CMAKE_CURRENT_SOURCE_DIR}/cnjg.c
${CMAKE_CURRENT_SOURCE_DIR}/cos.c
${CMAKE_CURRENT_SOURCE_DIR}/cosh.c
${CMAKE_CURRENT_SOURCE_DIR}/dim.c
${CMAKE_CURRENT_SOURCE_DIR}/div.c
${CMAKE_CURRENT_SOURCE_DIR}/dolio.c
${CMAKE_CURRENT_SOURCE_DIR}/endfile.c
${CMAKE_CURRENT_SOURCE_DIR}/epsilon.c
${CMAKE_CURRENT_SOURCE_DIR}/err.c
${CMAKE_CURRENT_SOURCE_DIR}/exit_.c
${CMAKE_CURRENT_SOURCE_DIR}/exp.c
${CMAKE_CURRENT_SOURCE_DIR}/fmt.c
${CMAKE_CURRENT_SOURCE_DIR}/fmtlib.c
${CMAKE_CURRENT_SOURCE_DIR}/h_dnnt.c
${CMAKE_CURRENT_SOURCE_DIR}/hl_cmp.c
${CMAKE_CURRENT_SOURCE_DIR}/i_dnnt.c
${CMAKE_CURRENT_SOURCE_DIR}/i_len.c
${CMAKE_CURRENT_SOURCE_DIR}/imag.c
${CMAKE_CURRENT_SOURCE_DIR}/int.c
${CMAKE_CURRENT_SOURCE_DIR}/l_cmp.c
${CMAKE_CURRENT_SOURCE_DIR}/lg10.c
${CMAKE_CURRENT_SOURCE_DIR}/log.c
${CMAKE_CURRENT_SOURCE_DIR}/lread.c
${CMAKE_CURRENT_SOURCE_DIR}/lwrite.c
${CMAKE_CURRENT_SOURCE_DIR}/mod.c
${CMAKE_CURRENT_SOURCE_DIR}/nint.c
${CMAKE_CURRENT_SOURCE_DIR}/open.c
${CMAKE_CURRENT_SOURCE_DIR}/pow.c
${CMAKE_CURRENT_SOURCE_DIR}/prod.c
${CMAKE_CURRENT_SOURCE_DIR}/rdfmt.c
${CMAKE_CURRENT_SOURCE_DIR}/rewind.c
${CMAKE_CURRENT_SOURCE_DIR}/rsfe.c
${CMAKE_CURRENT_SOURCE_DIR}/s_cmp.c
${CMAKE_CURRENT_SOURCE_DIR}/s_copy.c
${CMAKE_CURRENT_SOURCE_DIR}/s_stop.c
${CMAKE_CURRENT_SOURCE_DIR}/sfe.c
${CMAKE_CURRENT_SOURCE_DIR}/sig_die.c
${CMAKE_CURRENT_SOURCE_DIR}/sign.c
${CMAKE_CURRENT_SOURCE_DIR}/sin.c
${CMAKE_CURRENT_SOURCE_DIR}/sinh.c
${CMAKE_CURRENT_SOURCE_DIR}/sqrt.c
${CMAKE_CURRENT_SOURCE_DIR}/tan.c
${CMAKE_CURRENT_SOURCE_DIR}/tanh.c
${CMAKE_CURRENT_SOURCE_DIR}/util.c
${CMAKE_CURRENT_SOURCE_DIR}/wref.c
${CMAKE_CURRENT_SOURCE_DIR}/wrtfmt.c
${CMAKE_CURRENT_SOURCE_DIR}/wsfe.c
${CMAKE_CURRENT_SOURCE_DIR}/wsle.c
)

View File

@@ -28,6 +28,7 @@ use or performance of this software.
#include <unistd.h>
#endif
#ifdef _MSC_VER
#include <io.h>
#define access _access
#endif
#include "f2c.h"

View File

@@ -1,37 +0,0 @@
##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.##
add_executable(cblat1 cblat1.c)
target_link_libraries(cblat1 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
add_executable(cblat2 cblat2.c)
target_link_libraries(cblat2 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
add_executable(cblat3 cblat3.c)
target_link_libraries(cblat3 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
add_executable(dblat1 dblat1.c)
target_link_libraries(dblat1 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
add_executable(dblat2 dblat2.c)
target_link_libraries(dblat2 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
add_executable(dblat3 dblat3.c)
target_link_libraries(dblat3 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
add_executable(sblat1 sblat1.c)
target_link_libraries(sblat1 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
add_executable(sblat2 sblat2.c)
target_link_libraries(sblat2 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
add_executable(sblat3 sblat3.c)
target_link_libraries(sblat3 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
add_executable(zblat1 zblat1.c)
target_link_libraries(zblat1 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
add_executable(zblat2 zblat2.c)
target_link_libraries(zblat2 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )
add_executable(zblat3 zblat3.c)
target_link_libraries(zblat3 PRIVATE "${F2C_LIB}" "${LIB_NAME}.lib" )

View File

@@ -1,4 +1,4 @@
"""Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved"""
"""Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved."""
import subprocess
import sys

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -1,58 +0,0 @@
/*
* Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
*/
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H
#cmakedefine AOCL_DYNAMIC
#cmakedefine AOCL_BLIS_ZEN
#cmakedefine BLIS_ENABLE_OPENMP
#cmakedefine BLIS_ENABLE_JRIR_SLAB
#cmakedefine BLIS_ENABLE_JRIR_RR
#cmakedefine BLIS_ENABLE_PBA_POOLS
#cmakedefine BLIS_ENABLE_SBA_POOLS
#cmakedefine BLIS_ENABLE_MEM_TRACING
#cmakedefine BLIS_INT_TYPE_SIZE @INT_TYPE_SIZE@
#cmakedefine BLIS_BLAS_INT_TYPE_SIZE @BLAS_INT_TYPE_SIZE@
#cmakedefine BLIS_ENABLE_BLAS
#cmakedefine BLIS_ENABLE_CBLAS
#cmakedefine BLIS_ENABLE_MIXED_DT
#cmakedefine BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#cmakedefine BLIS_ENABLE_SUP_HANDLING
#cmakedefine BLIS_ENABLE_MEMKIND
#cmakedefine BLIS_ENABLE_TRSM_PREINVERSION
#cmakedefine BLIS_ENABLE_PRAGMA_OMP_SIMD
#cmakedefine BLIS_ENABLE_SANDBOX
#cmakedefine BLIS_ENABLE_SHARED
#cmakedefine BLIS_ENABLE_COMPLEX_RETURN_INTEL
#cmakedefine DISABLE_BLIS_ARCH_TYPE
#cmakedefine DISABLE_BLIS_MODEL_TYPE
#cmakedefine __blis_arch_type_name "@rename_blis_arch_type@"
#cmakedefine __blis_model_type_name "@rename_blis_model_type@"
#endif

View File

@@ -1,4 +1,4 @@
"""Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All Rights Reserved"""
"""Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved."""
################################################################################
# This file is used for mirroring the refkernels folder data into zen, zen2,   #

Some files were not shown because too many files have changed in this diff Show More