Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
ca22e28
Rename sgemm_direct_sme1.S to sgemm_direct_sme1_2VLx2VL.S
martin-frbg Aug 18, 2025
22c6607
Use ASMNAME to get symbol name from build system; leave x18 unused as…
martin-frbg Aug 18, 2025
89898fc
Add sgemm_direct_performant for switching between direct and regular …
martin-frbg Aug 18, 2025
08a0032
Build symbol name from build system variables
martin-frbg Aug 18, 2025
53d3bb5
Get symbol name from build system; change b.first to b.mi for AppleCl…
martin-frbg Aug 18, 2025
731f4dd
Add VORTEXM4 settings
martin-frbg Aug 18, 2025
e82bcd2
Update ARM64 sgemm_direct object generation
martin-frbg Aug 18, 2025
0203657
Add sgemm_direct_performant for ARM64
martin-frbg Aug 18, 2025
de91afd
Move SGEMM_DIRECT after the CBLAS parameter check and add sgemm_direc…
martin-frbg Aug 18, 2025
202a7a0
Separate VORTEXM4 from VORTEX and ARMV9SME
martin-frbg Aug 18, 2025
e76c390
Add sgemm_direct_performant for ARM64
martin-frbg Aug 18, 2025
ef0b883
Add sgemm_direct_performant for ARM64
martin-frbg Aug 18, 2025
ccfd017
Enable SME on MacOS and add VORTEXM4 to DYNAMIC_ARCH list
martin-frbg Aug 18, 2025
b0a00fb
Add minimal compiler flags for VORTEXM4
martin-frbg Aug 18, 2025
3097046
Add VORTEXM4 target
martin-frbg Aug 18, 2025
4e2a8c1
Split VORTEXM4 from VORTEX target due to SME support
martin-frbg Aug 18, 2025
18f9582
Add VORTEXM4
martin-frbg Aug 18, 2025
ca542f3
Add VORTEXM4
martin-frbg Aug 18, 2025
a4f5fec
Add compiler options for VORTEXM4
martin-frbg Aug 18, 2025
c794d0a
Add VORTEXM4
martin-frbg Aug 18, 2025
4328c91
relax requirements in compiler SME capability check
martin-frbg Aug 18, 2025
426b5f2
Add compiler options for VORTEXM4
martin-frbg Aug 18, 2025
0bc19a1
Update SME kernel details
martin-frbg Aug 18, 2025
bf98e44
Add VORTEXM4 to DYNAMIC_ARCH list
martin-frbg Aug 18, 2025
4609732
Relax version number requirement for AppleClang
martin-frbg Aug 18, 2025
05dbb54
Delete misplaced file
martin-frbg Aug 19, 2025
107c883
Update SME-related kernels
martin-frbg Aug 19, 2025
501728a
adjust register 20 accesses to 21 after moving x18
martin-frbg Aug 20, 2025
edaa73f
Hide the local 2VLx2VL symbol as static is insufficient for this with…
martin-frbg Aug 20, 2025
1ee8879
Add VORTEXM4
martin-frbg Aug 20, 2025
7f89c6f
SME-based direct sgemm currently requires leading dimensions to be sa…
martin-frbg Aug 23, 2025
8e50b8d
Add d8 to d15 to clobber lists as the code does not expressly save them
martin-frbg Aug 23, 2025
b4fc09e
Add registers d8 to d15 to clobber lists as the code does not express…
martin-frbg Aug 23, 2025
1b88c9c
remove debugging printouts
martin-frbg Aug 24, 2025
2b5d8c7
remove debugging printout
martin-frbg Aug 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Makefile.arm64
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,11 @@ FCOMMON_OPT += -march=armv8.3-a
endif
endif

# VORTEXM4 target: compile both C and Fortran sources for armv8.4-a with the
# SME extension enabled (same -march string appended to each option list).
ifeq ($(CORE), VORTEXM4)
CCOMMON_OPT += -march=armv8.4-a+sme
FCOMMON_OPT += -march=armv8.4-a+sme
endif

ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
Expand Down
3 changes: 2 additions & 1 deletion Makefile.system
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ ifndef MACOSX_DEPLOYMENT_TARGET
ifeq ($(ARCH), arm64)
export MACOSX_DEPLOYMENT_TARGET=11.0
export NO_SVE = 1
export NO_SME = 1
# export NO_SME = 1
else
export MACOSX_DEPLOYMENT_TARGET=10.8
endif
Expand Down Expand Up @@ -723,6 +723,7 @@ DYNAMIC_CORE += A64FX
endif
ifneq ($(NO_SME), 1)
DYNAMIC_CORE += ARMV9SME
DYNAMIC_CORE += VORTEXM4
endif
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
Expand Down
1 change: 1 addition & 0 deletions TargetList.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ THUNDERX2T99
TSV110
THUNDERX3T110
VORTEX
VORTEXM4
A64FX
ARMV8SVE
ARMV9SME
Expand Down
6 changes: 3 additions & 3 deletions cmake/arch.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ if (DYNAMIC_ARCH)
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
endif ()
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME VORTEXM4)
endif()
elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
endif ()
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19 OR (${CMAKE_C_COMPILER_ID} MATCHES AppleClang AND ${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 17) ) # SME ACLE supported in LLVM >= 19 and AppleClang >= 17
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME VORTEXM4)
endif()
endif ()
if (DYNAMIC_LIST)
Expand Down
10 changes: 10 additions & 0 deletions cmake/cc.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,16 @@ if (${CORE} STREQUAL ARMV9SME)
endif ()
endif ()

# VORTEXM4: pick the C compiler architecture flag for a single-target build.
# Nothing is added under DYNAMIC_ARCH. Otherwise the NVC compiler (unless
# NO_SVE is set) gets -tp=host, and every other compiler gets
# -march=armv8.4-a+sme.
# The operands are quoted so that an empty or unset CORE / compiler id yields a
# plain "not equal" instead of a malformed if() expression at configure time.
if ("${CORE}" STREQUAL "VORTEXM4")
  if (NOT DYNAMIC_ARCH)
    if ("${CMAKE_C_COMPILER_ID}" STREQUAL "NVC" AND NOT NO_SVE)
      set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host")
    else ()
      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sme")
    endif ()
  endif ()
endif ()

if (${CORE} STREQUAL CORTEXA510)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve")
Expand Down
2 changes: 1 addition & 1 deletion cmake/prebuild.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1252,7 +1252,7 @@ endif ()
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "VORTEX")
elseif ("${TCORE}" STREQUAL "VORTEX" OR "${TCORE}" STREQUAL "VORTEXM4")
file(APPEND ${TARGET_CONF_TEMP}
"#define ARMV8\n"
"#define L1_CODE_SIZE\t32768\n"
Expand Down
3 changes: 3 additions & 0 deletions cmake/system.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,9 @@ if (${TARGET} STREQUAL NEOVERSEV1)
if (${TARGET} STREQUAL ARMV9SME)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3")
endif()
# VORTEXM4: append the SME-enabled architecture flag and -O3 to the kernel
# compile definitions.
# The operands are quoted so that an empty or unset TARGET compares as
# "not equal" instead of producing a malformed if() expression.
if ("${TARGET}" STREQUAL "VORTEXM4")
  set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sme -O3")
endif ()
if (${TARGET} STREQUAL A64FX)
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx")
Expand Down
2 changes: 1 addition & 1 deletion cmake/system_check.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ endif()
if (ARM64)
if (NOT NO_SME)
file(WRITE ${PROJECT_BINARY_DIR}/sme.c ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n")
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv9-a+sve2+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME)
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv8.4-a+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME)
if (NO_SME EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME")
endif()
Expand Down
1 change: 1 addition & 0 deletions common_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ int (*shgemm_otcopy )(BLASLONG, BLASLONG, hfloat16 *, BLASLONG, hfloat16 *);
#ifdef ARCH_ARM64
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
void (*sgemm_direct_alpha_beta) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float, float * , BLASLONG);
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
#endif


Expand Down
2 changes: 1 addition & 1 deletion common_s.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@
#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant
#define SGEMM_DIRECT gotoblas -> sgemm_direct
#elif ARCH_ARM64
#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant
#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant
#define SGEMM_DIRECT gotoblas -> sgemm_direct
#define SGEMM_DIRECT_ALPHA_BETA gotoblas -> sgemm_direct_alpha_beta
#endif
Expand Down
29 changes: 26 additions & 3 deletions cpuid_arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ size_t length64=sizeof(value64);
#define CPU_AMPERE1 25
// Apple
#define CPU_VORTEX 13
#define CPU_VORTEXM4 26
// Fujitsu
#define CPU_A64FX 15
// Phytium
Expand Down Expand Up @@ -113,7 +114,8 @@ static char *cpuname[] = {
"FT2000",
"CORTEXA76",
"NEOVERSEV2",
"AMPERE1"
"AMPERE1",
"VORTEXM4",
};

static char *cpuname_lower[] = {
Expand Down Expand Up @@ -143,7 +145,7 @@ static char *cpuname_lower[] = {
"cortexa76",
"neoversev2",
"ampere1",
"ampere1a"
"vortexm4"
};

static int cpulowperf=0;
Expand Down Expand Up @@ -400,7 +402,7 @@ int detect(void)
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
if (value64 == 2271604202) return CPU_VORTEX; //A16/M3
if (value64 == 1867590060) return CPU_VORTEX; //M4
if (value64 == 1867590060) return CPU_VORTEXM4; //M4
#else
#ifdef OS_WINDOWS
HKEY reghandle;
Expand Down Expand Up @@ -740,6 +742,27 @@ void get_cpuconfig(void)
length64 = sizeof(value64);
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
printf("#define L2_SIZE %lld \n",value64);
#endif
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_VORTEXM4:
printf("#define VORTEXM4 \n");
printf("#define HAVE_SME 1 \n");
#ifdef __APPLE__
length64 = sizeof(value64);
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_SIZE %lld \n",value64);
length64 = sizeof(value64);
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_LINESIZE %lld \n",value64);
printf("#define L1_DATA_LINESIZE %lld \n",value64);
length64 = sizeof(value64);
sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
printf("#define L1_DATA_SIZE %lld \n",value64);
length64 = sizeof(value64);
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
printf("#define L2_SIZE %lld \n",value64);
#endif
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
Expand Down
40 changes: 27 additions & 13 deletions driver/others/dynamic_arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@ extern gotoblas_t gotoblas_ARMV9SME;
#else
#define gotoblas_ARMV9SME gotoblas_ARMV8
#endif
/*
 * VORTEXM4 kernel table selection.
 * When DYN_VORTEXM4 is defined, the dedicated gotoblas_VORTEXM4 table is
 * declared here and provided elsewhere. Otherwise the name is aliased to
 * gotoblas_ARMV8, the same fallback used for gotoblas_ARMV9SME.
 * A leftover debugging "#error" used to sit in the fallback branch; it stopped
 * every build without DYN_VORTEXM4 and made the alias unreachable.
 */
#ifdef DYN_VORTEXM4
extern gotoblas_t gotoblas_VORTEXM4;
#else
#define gotoblas_VORTEXM4 gotoblas_ARMV8
#endif
#ifdef DYN_CORTEXA55
extern gotoblas_t gotoblas_CORTEXA55;
#else
Expand Down Expand Up @@ -155,17 +161,22 @@ extern gotoblas_t gotoblas_NEOVERSEV1;
extern gotoblas_t gotoblas_NEOVERSEN2;
extern gotoblas_t gotoblas_ARMV8SVE;
extern gotoblas_t gotoblas_A64FX;
#ifndef NO_SME
extern gotoblas_t gotoblas_ARMV9SME;
#else
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE
#endif
#else
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#define gotoblas_A64FX gotoblas_ARMV8
#define gotoblas_ARMV9SME gotoblas_ARMV8
#endif
#ifndef NO_SME
extern gotoblas_t gotoblas_ARMV9SME;
extern gotoblas_t gotoblas_VORTEXM4;
#else
#ifndef NO_SVE
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE
#else
#define gotoblas_ARMV9SME gotoblas_NEOVERSEN1
#endif
#define gotoblas_VORTEXM4 gotoblas_NEOVERSEN1
#endif

extern gotoblas_t gotoblas_THUNDERX3T110;
Expand All @@ -176,7 +187,7 @@ extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"

#define NUM_CORETYPES 19
#define NUM_CORETYPES 20

/*
* In case asm/hwcap.h is outdated on the build system, make sure
Expand Down Expand Up @@ -216,6 +227,7 @@ static char *corename[] = {
"armv8sve",
"a64fx",
"armv9sme",
"vortexm4",
"unknown"
};

Expand All @@ -239,6 +251,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_ARMV8SVE) return corename[16];
if (gotoblas == &gotoblas_A64FX) return corename[17];
if (gotoblas == &gotoblas_ARMV9SME) return corename[18];
if (gotoblas == &gotoblas_VORTEXM4) return corename[19];
return corename[NUM_CORETYPES];
}

Expand Down Expand Up @@ -277,6 +290,7 @@ static gotoblas_t *force_coretype(char *coretype) {
case 16: return (&gotoblas_ARMV8SVE);
case 17: return (&gotoblas_A64FX);
case 18: return (&gotoblas_ARMV9SME);
case 19: return (&gotoblas_VORTEXM4);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
Expand All @@ -288,11 +302,11 @@ static gotoblas_t *get_coretype(void) {
char coremsg[128];

#if defined (OS_DARWIN)
//future #if !defined(NO_SME)
// if (support_sme1()) {
// return &gotoblas_ARMV9SME;
// }
// #endif
#if !defined(NO_SME)
if (support_sme1()) {
return &gotoblas_VORTEXM4;
}
#endif
return &gotoblas_NEOVERSEN1;
#endif

Expand Down Expand Up @@ -463,7 +477,7 @@ static gotoblas_t *get_coretype(void) {
}
break;
case 0x61: // Apple
//future if (support_sme1()) return &gotoblas_ARMV9SME;
if (support_sme1()) return &gotoblas_VORTEXM4;
return &gotoblas_NEOVERSEN1;
break;
default:
Expand Down
14 changes: 14 additions & 0 deletions getarch.c
Original file line number Diff line number Diff line change
Expand Up @@ -1654,6 +1654,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "VORTEX"
#endif

/*
 * Forced-target description for VORTEXM4, active when FORCE_VORTEXM4 is
 * defined. It sets FORCE and supplies the architecture / sub-architecture
 * names, the kernel subdirectory, the library and core names, and ARCHCONFIG,
 * the string of -D options describing the core. The cache and TLB figures in
 * ARCHCONFIG are fixed constants here, not values probed from the machine;
 * HAVE_SME is the SME-specific flag in the list.
 */
#ifdef FORCE_VORTEXM4
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "VORTEXM4"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DVORTEXM4 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SME -DARMV8"
#define LIBNAME "vortexm4"
#define CORENAME "VORTEXM4"
#endif

#ifdef FORCE_A64FX
#define ARMV8
#define FORCE
Expand Down
55 changes: 31 additions & 24 deletions interface/gemm.c
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ void NAME(char *TRANSA, char *TRANSB,

int transa, transb, nrowa, nrowb;
blasint info;
int order = -1;

char transA, transB;
IFLOAT *buffer;
Expand Down Expand Up @@ -424,30 +425,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS

PRINT_DEBUG_CNAME;

#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16)
#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_avx512() )
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
#endif
#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_sme1())
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}else if (order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) {
SGEMM_DIRECT_ALPHA_BETA(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
return;
}
#endif
#endif

#ifndef COMPLEX
args.alpha = (void *)&alpha;
args.beta = (void *)&beta;
Expand Down Expand Up @@ -564,6 +541,36 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
return;
}


if ((args.m == 0) || (args.n == 0)) return;
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16)
#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_avx512() )
#endif
if (order == CblasRowMajor && beta == 0 && alpha == 1.0 && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
#endif
#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (strcmp(gotoblas_corename(), "armv9sme") == 0 || strcmp(gotoblas_corename(), "vortexm4") == 0)
// if (support_sme1())
#endif
if (order == CblasRowMajor && m==lda && n ==ldb && k==ldc && beta == 0 && alpha == 1.0 && TransA == CblasNoTrans && TransB == CblasNoTrans&& SGEMM_DIRECT_PERFORMANT(m,n,k)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For RowMajor, shouldn't the leading dimension check be (lda==k && ldb==n && ldc==n) ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

normally yes but arguments have already been reshuffled at this point (I think - I'll recheck when I get back to this later this week)

SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
else
if (order == CblasRowMajor && m==lda && n==ldb && k==ldc && TransA == CblasNoTrans && TransB == CblasNoTrans&& SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT_ALPHA_BETA(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
return;
}

#endif
#endif

#endif

#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
Expand Down
Loading
Loading