Refactor and extend run-time CPU feature detection on Arm

1) Overhaul the Arm CPU feature detection code, taking inspiration
   from similar recent changes in libaom.
2) Add neon_dotprod and neon_i8mm arch options in the configure,
   build and unit test files, adding appropriate conditional options
   where necessary.
3) Soft-enable run-time CPU feature detection by default for both 32-
   bit and 64-bit Arm platforms.

Change-Id: I3f13317d88324acc5753394351188baa8d18a261
This commit is contained in:
Jonathan Wright
2023-08-19 12:45:36 +01:00
parent 7ee16bc178
commit 148d1085f7
11 changed files with 378 additions and 167 deletions
+6
View File
@@ -143,6 +143,12 @@ $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
$(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl
$(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl
# AARCH64
$(BUILD_PFX)%_neon_dotprod.c.d: CFLAGS += -march=armv8.2-a+dotprod
$(BUILD_PFX)%_neon_dotprod.c.o: CFLAGS += -march=armv8.2-a+dotprod
$(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm
$(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm
# POWER
$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx
$(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx
+15 -2
View File
@@ -973,10 +973,23 @@ process_common_toolchain() {
# Process architecture variants
case ${toolchain} in
arm*)
# on arm, isa versions are supersets
soft_enable runtime_cpu_detect
# Arm ISA extensions are treated as supersets.
case ${tgt_isa} in
arm64|armv8)
soft_enable neon
for ext in ${ARCH_EXT_LIST_AARCH64}; do
# Disable higher order extensions to simplify dependencies.
if [ "$disable_exts" = "yes" ]; then
if ! disabled $ext; then
RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
disable_feature $ext
fi
elif disabled $ext; then
disable_exts="yes"
else
soft_enable $ext
fi
done
;;
armv7|armv7s)
soft_enable neon
+1 -1
View File
@@ -487,7 +487,7 @@ if ($opts{arch} eq 'x86') {
@ALL_ARCHS = filter(qw/neon_asm neon/);
arm;
} elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
@ALL_ARCHS = filter(qw/neon/);
@ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm/);
@REQUIRES = filter(qw/neon/);
&require(@REQUIRES);
arm;
Vendored
+8 -1
View File
@@ -252,6 +252,13 @@ ARCH_LIST="
ppc
loongarch
"
ARCH_EXT_LIST_AARCH64="
neon
neon_dotprod
neon_i8mm
"
ARCH_EXT_LIST_X86="
mmx
sse
@@ -271,8 +278,8 @@ ARCH_EXT_LIST_LOONGSON="
"
ARCH_EXT_LIST="
neon
neon_asm
${ARCH_EXT_LIST_AARCH64}
mips32
dspr2
+22 -2
View File
@@ -12,6 +12,9 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#if VPX_ARCH_ARM
#include "vpx_ports/arm.h"
#endif
#if VPX_ARCH_X86 || VPX_ARCH_X86_64
#include "vpx_ports/x86.h"
#endif
@@ -26,7 +29,7 @@ extern void vpx_dsp_rtcd();
extern void vpx_scale_rtcd();
}
#if VPX_ARCH_X86 || VPX_ARCH_X86_64
#if (!CONFIG_SHARED && VPX_ARCH_ARM) || VPX_ARCH_X86 || VPX_ARCH_X86_64
static void append_negative_gtest_filter(const char *str) {
std::string filter = ::testing::FLAGS_gtest_filter;
// Negative patterns begin with one '-' followed by a ':' separated list.
@@ -34,11 +37,28 @@ static void append_negative_gtest_filter(const char *str) {
filter += str;
::testing::FLAGS_gtest_filter = filter;
}
#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64
#endif // (!CONFIG_SHARED && VPX_ARCH_ARM) || VPX_ARCH_X86 || VPX_ARCH_X86_64
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
#if !CONFIG_SHARED
#if VPX_ARCH_AARCH64
const int caps = arm_cpu_caps();
if (!(caps & HAS_NEON_DOTPROD)) {
append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*");
}
if (!(caps & HAS_NEON_I8MM)) {
append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*");
}
#elif VPX_ARCH_ARM
const int caps = arm_cpu_caps();
if (!(caps & HAS_NEON)) {
append_negative_gtest_filter(":NEON.*:NEON/*");
}
#endif // VPX_ARCH_ARM
#endif // !CONFIG_SHARED
#if VPX_ARCH_X86 || VPX_ARCH_X86_64
const int simd_caps = x86_simd_caps();
if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*");
+89
View File
@@ -0,0 +1,89 @@
/*
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
// Feature detection code for Armv7-A / AArch32.
#include "arm_cpudetect.h"
#if !CONFIG_RUNTIME_CPU_DETECT
// Reports the SIMD capabilities that were baked in at build time.
//
// With run-time CPU detection disabled nothing is probed: the RTCD tables do
// not exist and the functions are called statically, so the returned HAS_*
// bitmask only describes what was compiled in.
static int arm_get_cpu_caps(void) {
  int flags = 0;
  // Neon code can come from intrinsics (HAVE_NEON) or from assembly
  // (HAVE_NEON_ASM); either one means Neon is in use. This matches the
  // condition used by the run-time detection paths for AArch32.
#if HAVE_NEON || HAVE_NEON_ASM
  flags |= HAS_NEON;
#endif  // HAVE_NEON || HAVE_NEON_ASM
  return flags;
}
#elif defined(_MSC_VER) // end !CONFIG_RUNTIME_CPU_DETECT
// Probes for Neon on MSVC targets by executing a Neon instruction and
// catching the illegal-instruction exception if the CPU rejects it.
//
// Returns a bitmask of HAS_* flags; HAS_NEON is set only when Neon code was
// compiled in (HAVE_NEON or HAVE_NEON_ASM) and the probe instruction ran
// without faulting.
static int arm_get_cpu_caps(void) {
int flags = 0;
#if HAVE_NEON || HAVE_NEON_ASM
// MSVC has no inline __asm support for Arm, but it does let you __emit
// instructions via their assembled hex code.
// All of these instructions should be essentially nops.
__try {
// VORR q0,q0,q0
__emit(0xF2200150);
// Only reached when the instruction above did not raise an exception.
flags |= HAS_NEON;
} __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
// Ignore exception: the flag simply stays clear. Only
// EXCEPTION_ILLEGAL_INSTRUCTION is handled by this filter.
}
#endif // HAVE_NEON || HAVE_NEON_ASM
return flags;
}
#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
// Asks the Android cpu-features library whether the CPU implements Neon.
//
// Returns HAS_NEON when Neon code was compiled in and the library reports
// ANDROID_CPU_ARM_FEATURE_NEON; otherwise returns 0.
static int arm_get_cpu_caps(void) {
#if HAVE_NEON || HAVE_NEON_ASM
  const uint64_t cpu_features = android_getCpuFeatures();
  return (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) ? HAS_NEON : 0;
#else
  // No Neon code in this build, so there is nothing to advertise.
  return 0;
#endif  // HAVE_NEON || HAVE_NEON_ASM
}
#elif defined(__linux__) // end defined(ANDROID_USE_CPU_FEATURES_LIB)
#include <sys/auxv.h>
// Define hwcap values ourselves: building with an old auxv header where these
// hwcap values are not defined should not prevent features from being enabled.
#define VPX_AARCH32_HWCAP_NEON (1 << 12)
// Detects Neon on AArch32 Linux from the auxiliary vector's AT_HWCAP word.
//
// Returns a bitmask of HAS_* flags; HAS_NEON is set only when Neon code was
// compiled in (HAVE_NEON or HAVE_NEON_ASM) and the hwcap Neon bit is set.
static int arm_get_cpu_caps(void) {
  int flags = 0;
#if HAVE_NEON || HAVE_NEON_ASM
  // Read AT_HWCAP only where it is consumed, so builds without Neon neither
  // make the call nor carry an unused local.
  const unsigned long hwcap = getauxval(AT_HWCAP);
  if (hwcap & VPX_AARCH32_HWCAP_NEON) {
    flags |= HAS_NEON;
  }
#endif  // HAVE_NEON || HAVE_NEON_ASM
  return flags;
}
#else // end __linux__
#error \
"Runtime CPU detection selected, but no CPU detection method available" \
"for your platform. Rerun configure with --disable-runtime-cpu-detect."
#endif
// Returns the HAS_* capability bitmask for the current CPU.
//
// If arm_cpu_env_flags() reports an override, the flags it produced are
// returned verbatim. Otherwise the detected capabilities are ANDed with the
// mask from arm_cpu_env_mask().
int arm_cpu_caps(void) {
  int env_flags = 0;
  const int overridden = arm_cpu_env_flags(&env_flags);
  if (!overridden) {
    return arm_get_cpu_caps() & arm_cpu_env_mask();
  }
  return env_flags;
}
+173
View File
@@ -0,0 +1,173 @@
/*
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "arm_cpudetect.h"
#if defined(__APPLE__)
#include <sys/sysctl.h>
#endif
#if !CONFIG_RUNTIME_CPU_DETECT
// Compile-time capability report used when run-time CPU detection is off.
//
// Nothing is probed here: the RTCD tables do not exist and the functions are
// called statically, so the result cannot change which code runs.
static int arm_get_cpu_caps(void) {
#if HAVE_NEON
  return HAS_NEON;
#else
  return 0;
#endif  // HAVE_NEON
}
#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT
// sysctlbyname() parameter documentation for instruction set characteristics:
// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
// Looks up a sysctl entry by name and returns its value.
//
// feature: sysctl name to query (e.g. an "hw.optional.arm.*" key).
// Returns the value written by sysctlbyname(), or 0 when the call fails.
static INLINE int64_t have_feature(const char *feature) {
  int64_t value = 0;
  size_t value_size = sizeof(value);
  const int rc = sysctlbyname(feature, &value, &value_size, NULL, 0);
  return rc == 0 ? value : 0;
}
// Builds the HAS_* capability bitmask on Apple platforms.
//
// Neon is reported whenever it is compiled in; the dot-product and i8mm
// extensions are reported only when compiled in and the matching sysctl
// entry is non-zero.
static int arm_get_cpu_caps(void) {
  int caps = 0;
#if HAVE_NEON
  caps |= HAS_NEON;
#endif  // HAVE_NEON
#if HAVE_NEON_DOTPROD
  caps |= have_feature("hw.optional.arm.FEAT_DotProd") ? HAS_NEON_DOTPROD : 0;
#endif  // HAVE_NEON_DOTPROD
#if HAVE_NEON_I8MM
  caps |= have_feature("hw.optional.arm.FEAT_I8MM") ? HAS_NEON_I8MM : 0;
#endif  // HAVE_NEON_I8MM
  return caps;
}
#elif defined(_MSC_VER) // end __APPLE__
// Builds the HAS_* capability bitmask on Windows (MSVC) AArch64 targets.
//
// IsProcessorFeaturePresent() parameter documentation:
// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters
static int arm_get_cpu_caps(void) {
  int caps = 0;
#if HAVE_NEON
  // Neon is mandatory in Armv8.0-A, so no query is needed.
  caps |= HAS_NEON;
#endif  // HAVE_NEON
  // Support for PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK
  // 20348, supported by Windows 11 and Windows Server 2022. With an older SDK
  // the constant is missing and dot-product is never reported.
#if HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
  if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
    caps |= HAS_NEON_DOTPROD;
  }
#endif  // HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
  // No I8MM feature detection available on Windows at time of writing.
  return caps;
}
#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
// Capability report for builds that use the Android cpu-features library.
//
// Only Neon is reported on this path, and only when it is compiled in; no
// library query is made.
static int arm_get_cpu_caps(void) {
#if HAVE_NEON
  // Neon is mandatory in Armv8.0-A.
  return HAS_NEON;
#else
  return 0;
#endif  // HAVE_NEON
}
#elif defined(__linux__) // end defined(ANDROID_USE_CPU_FEATURES_LIB)
#include <sys/auxv.h>
// Define hwcap values ourselves: building with an old auxv header where these
// hwcap values are not defined should not prevent features from being enabled.
#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20)
#define VPX_AARCH64_HWCAP2_I8MM (1 << 13)
// Detects AArch64 SIMD extensions on Linux from the auxiliary vector.
//
// Returns a bitmask of HAS_* flags. Each extension is reported only when it
// was compiled in and, for the optional ones, when the matching hwcap bit is
// set. AT_HWCAP / AT_HWCAP2 are read only inside the feature guards that use
// them, so builds with an extension disabled carry no unused local and make
// no needless getauxval() call.
static int arm_get_cpu_caps(void) {
  int flags = 0;
#if HAVE_NEON
  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
#endif  // HAVE_NEON
#if HAVE_NEON_DOTPROD
  if (getauxval(AT_HWCAP) & VPX_AARCH64_HWCAP_ASIMDDP) {
    flags |= HAS_NEON_DOTPROD;
  }
#endif  // HAVE_NEON_DOTPROD
#if HAVE_NEON_I8MM
  if (getauxval(AT_HWCAP2) & VPX_AARCH64_HWCAP2_I8MM) {
    flags |= HAS_NEON_I8MM;
  }
#endif  // HAVE_NEON_I8MM
  return flags;
}
#elif defined(__Fuchsia__) // end __linux__
#include <zircon/features.h>
#include <zircon/syscalls.h>
// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282.
#ifndef ZX_ARM64_FEATURE_ISA_I8MM
#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19))
#endif
// Builds the HAS_* capability bitmask on Fuchsia.
//
// Neon is reported whenever it is compiled in. The optional extensions are
// reported only when zx_system_get_features() succeeds and sets the matching
// feature bit; if the query fails, only the Neon flag (if any) is returned.
static int arm_get_cpu_caps(void) {
  int caps = 0;
  uint32_t cpu_features;
  zx_status_t query_status;
#if HAVE_NEON
  caps |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
#endif  // HAVE_NEON
  query_status = zx_system_get_features(ZX_FEATURE_KIND_CPU, &cpu_features);
  if (query_status == ZX_OK) {
#if HAVE_NEON_DOTPROD
    if ((cpu_features & ZX_ARM64_FEATURE_ISA_DP) != 0) {
      caps |= HAS_NEON_DOTPROD;
    }
#endif  // HAVE_NEON_DOTPROD
#if HAVE_NEON_I8MM
    if ((cpu_features & ZX_ARM64_FEATURE_ISA_I8MM) != 0) {
      caps |= HAS_NEON_I8MM;
    }
#endif  // HAVE_NEON_I8MM
  }
  return caps;
}
#else // end __Fuchsia__
#error \
"Runtime CPU detection selected, but no CPU detection method available" \
"for your platform. Rerun configure with --disable-runtime-cpu-detect."
#endif
// Returns the HAS_* capability bitmask for the current CPU.
//
// If arm_cpu_env_flags() reports an override, its flags are used; otherwise
// the detected capabilities are ANDed with arm_cpu_env_mask(). In both cases
// HAS_NEON_I8MM is cleared when HAS_NEON_DOTPROD is absent.
int arm_cpu_caps(void) {
  int caps = 0;
  const int have_override = arm_cpu_env_flags(&caps);
  if (!have_override) {
    caps = arm_get_cpu_caps() & arm_cpu_env_mask();
  }
  // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available.
  if ((caps & HAS_NEON_DOTPROD) == 0) {
    caps &= ~HAS_NEON_I8MM;
  }
  return caps;
}
+6 -6
View File
@@ -17,12 +17,12 @@
extern "C" {
#endif
/*ARMv5TE "Enhanced DSP" instructions.*/
#define HAS_EDSP 0x01
/*ARMv6 "Parallel" or "Media" instructions.*/
#define HAS_MEDIA 0x02
/*ARMv7 optional NEON instructions.*/
#define HAS_NEON 0x04
// Armv7-A optional Neon instructions, mandatory from Armv8.0-A.
#define HAS_NEON (1 << 0)
// Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A.
#define HAS_NEON_DOTPROD (1 << 1)
// Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A.
#define HAS_NEON_I8MM (1 << 2)
int arm_cpu_caps(void);
-154
View File
@@ -1,154 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include <string.h>
#include "./vpx_config.h"
#include "vpx_ports/arm.h"
#ifdef WINAPI_FAMILY
#include <winapifamily.h>
#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#define getenv(x) NULL
#endif
#endif
/* Reads an explicit capability override from the VPX_SIMD_CAPS environment
 * variable.
 *
 * flags: receives the parsed value (strtol, base auto-detected) when the
 *        variable is set and non-empty, and 0 otherwise.
 * Returns 0 when an override was read, -1 when there is none.
 */
static int arm_cpu_env_flags(int *flags) {
  const char *const caps_str = getenv("VPX_SIMD_CAPS");
  if (caps_str == NULL || caps_str[0] == '\0') {
    *flags = 0;
    return -1;
  }
  *flags = (int)strtol(caps_str, NULL, 0);
  return 0;
}
/* Reads the capability mask from the VPX_SIMD_CAPS_MASK environment variable.
 *
 * Returns the parsed value (strtol, base auto-detected) when the variable is
 * set and non-empty, or ~0 (all bits set) otherwise.
 */
static int arm_cpu_env_mask(void) {
  const char *const mask_str = getenv("VPX_SIMD_CAPS_MASK");
  if (mask_str != NULL && mask_str[0] != '\0') {
    return (int)strtol(mask_str, NULL, 0);
  }
  return ~0;
}
#if !CONFIG_RUNTIME_CPU_DETECT
/* Returns the HAS_* capability bitmask when run-time detection is disabled.
 *
 * This function should actually be a no-op. There is no way to adjust any of
 * these because the RTCD tables do not exist: the functions are called
 * statically.
 */
int arm_cpu_caps(void) {
  int caps;
  int caps_mask;
  /* A zero return means an environment override was read: use it verbatim. */
  if (arm_cpu_env_flags(&caps) == 0) {
    return caps;
  }
  /* No override: arm_cpu_env_flags() left caps at 0. */
  caps_mask = arm_cpu_env_mask();
#if HAVE_NEON || HAVE_NEON_ASM
  caps |= HAS_NEON;
#endif /* HAVE_NEON || HAVE_NEON_ASM */
  return caps & caps_mask;
}
#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef WIN32_EXTRA_LEAN
#define WIN32_EXTRA_LEAN
#endif
#include <windows.h>
/* Returns the HAS_* capability bitmask on MSVC targets.
 *
 * An override read by arm_cpu_env_flags() is returned verbatim. Otherwise
 * Neon is probed by executing a Neon instruction and catching the
 * illegal-instruction exception, and the result is ANDed with the mask from
 * arm_cpu_env_mask().
 */
int arm_cpu_caps(void) {
int flags;
int mask;
/* arm_cpu_env_flags() returns 0 when an override was read. */
if (!arm_cpu_env_flags(&flags)) {
return flags;
}
mask = arm_cpu_env_mask();
/* MSVC has no inline __asm support for ARM, but it does let you __emit
 * instructions via their assembled hex code.
 * All of these instructions should be essentially nops.
 */
#if HAVE_NEON || HAVE_NEON_ASM
/* Skip the probe entirely when the mask would discard HAS_NEON anyway. */
if (mask & HAS_NEON) {
__try {
/*VORR q0,q0,q0*/
__emit(0xF2200150);
/* Only reached when the instruction above did not raise an exception. */
flags |= HAS_NEON;
} __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
/*Ignore exception.*/
}
}
#endif /* HAVE_NEON || HAVE_NEON_ASM */
return flags & mask;
}
#elif defined(__ANDROID__) /* end _MSC_VER */
#include <cpu-features.h>
/* Returns the HAS_* capability bitmask on Android.
 *
 * An override read by arm_cpu_env_flags() is returned verbatim. Otherwise
 * Neon is reported when it is compiled in and android_getCpuFeatures() sets
 * ANDROID_CPU_ARM_FEATURE_NEON; the result is ANDed with arm_cpu_env_mask().
 */
int arm_cpu_caps(void) {
  int caps;
  if (arm_cpu_env_flags(&caps) == 0) {
    return caps;
  }
  {
    const int caps_mask = arm_cpu_env_mask();
    const uint64_t cpu_features = android_getCpuFeatures();
    (void)cpu_features; /* Unused when Neon is compiled out. */
#if HAVE_NEON || HAVE_NEON_ASM
    if ((cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) != 0) {
      caps |= HAS_NEON;
    }
#endif /* HAVE_NEON || HAVE_NEON_ASM */
    return caps & caps_mask;
  }
}
#elif defined(__linux__) /* end __ANDROID__ */
#include <stdio.h>
/* Returns the HAS_* capability bitmask on Linux by scanning /proc/cpuinfo.
 *
 * An override read by arm_cpu_env_flags() is returned verbatim. Otherwise
 * each "Features" line is searched for a " neon" entry, and the result is
 * ANDed with the mask from arm_cpu_env_mask(). If /proc/cpuinfo cannot be
 * opened, no capability is reported.
 */
int arm_cpu_caps(void) {
  int caps;
  int caps_mask;
  FILE *cpuinfo;

  if (arm_cpu_env_flags(&caps) == 0) {
    return caps;
  }
  caps_mask = arm_cpu_env_mask();
  /* Reading /proc/self/auxv would be easier, but that doesn't work reliably
   * on Android.
   * This also means that detection will fail in Scratchbox.
   */
  cpuinfo = fopen("/proc/cpuinfo", "r");
  if (cpuinfo == NULL) {
    return caps & caps_mask;
  }
  {
    /* 512 should be enough for anybody (it's even enough for all the flags
     * that x86 has accumulated... so far).
     */
    char line[512];
    while (fgets(line, 511, cpuinfo) != NULL) {
#if HAVE_NEON || HAVE_NEON_ASM
      const char *neon;
      if (memcmp(line, "Features", 8) != 0) {
        continue;
      }
      neon = strstr(line, " neon");
      /* Require a delimiter after the match so only a whole "neon" entry
       * counts.
       */
      if (neon != NULL && (neon[5] == ' ' || neon[5] == '\n')) {
        caps |= HAS_NEON;
      }
#endif /* HAVE_NEON || HAVE_NEON_ASM */
    }
  }
  fclose(cpuinfo);
  return caps & caps_mask;
}
#else /* end __linux__ */
#error \
"--enable-runtime-cpu-detect selected, but no CPU detection method " \
"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
#endif
+52
View File
@@ -0,0 +1,52 @@
/*
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include <string.h>
#include "vpx_config.h"
#include "vpx_ports/arm.h"
#if defined(_MSC_VER)
#undef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#undef WIN32_EXTRA_LEAN
#define WIN32_EXTRA_LEAN
#include <windows.h>
#endif
#ifdef WINAPI_FAMILY
#include <winapifamily.h>
#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#define getenv(x) NULL
#endif
#endif
#if defined(__ANDROID__) && (__ANDROID_API__ < 18)
#define ANDROID_USE_CPU_FEATURES_LIB 1
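// Android API levels below 18 lack getauxval(), so the NDK cpu-features
// library is used for them instead.
// getauxval() is used when targeting (64-bit) Android with API level >= 18:
// it is supported since Android API level 18 (Android 4.3), and the first
// Android version with 64-bit support was Android 5.x (API level 21).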
#include <cpu-features.h>
#endif
// Reads an explicit capability override from the VPX_SIMD_CAPS environment
// variable.
//
// flags: receives the parsed value (strtol, base auto-detected) when the
//        variable is set and non-empty; left untouched otherwise.
// Returns 1 when an override was read, 0 when there is none.
static INLINE int arm_cpu_env_flags(int *flags) {
  const char *const override = getenv("VPX_SIMD_CAPS");
  if (override == NULL || override[0] == '\0') {
    return 0;
  }
  *flags = (int)strtol(override, NULL, 0);
  return 1;
}
// Reads the capability mask from the VPX_SIMD_CAPS_MASK environment variable.
//
// Returns the parsed value (strtol, base auto-detected) when the variable is
// set and non-empty, or ~0 (all bits set) otherwise.
static INLINE int arm_cpu_env_mask(void) {
  const char *const mask_str = getenv("VPX_SIMD_CAPS_MASK");
  if (mask_str != NULL && mask_str[0] != '\0') {
    return (int)strtol(mask_str, NULL, 0);
  }
  return ~0;
}
+6 -1
View File
@@ -36,7 +36,12 @@ PORTS_SRCS-yes += x86.h
PORTS_SRCS-yes += x86_abi_support.asm
endif
PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.c
ifeq ($(VPX_ARCH_AARCH64),yes)
PORTS_SRCS-yes += aarch64_cpudetect.c
else
PORTS_SRCS-$(VPX_ARCH_ARM) += aarch32_cpudetect.c
endif
PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.h
PORTS_SRCS-$(VPX_ARCH_ARM) += arm.h
PORTS_SRCS-$(VPX_ARCH_PPC) += ppc_cpudetect.c