vpx_dsp,neon: simplify __ARM_FEATURE_DOTPROD check

Only check that the macro is defined; its value has no effect.

From the Arm C Language Extensions (ACLE) specification, https://arm-software.github.io/acle/main/acle.html:

5.5.7.7.  Dot Product extension
  __ARM_FEATURE_DOTPROD is defined if the dot product data manipulation
  instructions are supported and the vector intrinsics are available.
  Note that this implies:
    - __ARM_NEON == 1

Change-Id: I164fe121ccefda99050a9b6a99738a2b518520f3
This commit is contained in:
James Zern
2022-09-02 12:17:20 -07:00
parent 281dfae835
commit 447e275880
5 changed files with 34 additions and 47 deletions
+9 -12
View File
@@ -237,8 +237,7 @@ void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint32x4_t *const sum) {
@@ -270,7 +269,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
#else
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint16x8_t *const sum) {
@@ -305,7 +304,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
sad_512_pel_final_neon(sum, sad_array);
}
#endif
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -327,8 +326,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -386,7 +384,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64);
}
#else
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -444,12 +442,11 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
sad_2048_pel_final_neon(sum, sad_array);
}
#endif
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -554,7 +551,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
#else
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -649,4 +646,4 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
sad_4096_pel_final_neon(sum, sad_array);
}
#endif
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+16 -24
View File
@@ -21,8 +21,7 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8);
const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1));
return horizontal_add_uint32x4(dp);
@@ -40,8 +39,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg);
const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1));
return horizontal_add_uint32x4(prod);
@@ -54,8 +52,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
uint32x4_t prod = vdupq_n_u32(0);
const uint8x16_t ones = vdupq_n_u8(1);
const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride);
@@ -88,8 +85,7 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const uint8_t *second_pred) {
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
uint32x4_t prod = vdupq_n_u32(0);
const uint8x16_t ones = vdupq_n_u8(1);
const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride);
@@ -126,8 +122,7 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
#endif
}
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -182,7 +177,7 @@ static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
return horizontal_add_uint32x2(prod); \
}
#else
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -233,14 +228,13 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint16x8(abs); \
}
#endif
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
SAD8XN(4)
SAD8XN(8)
SAD8XN(16)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -294,7 +288,7 @@ static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint32x4(prod); \
}
#else
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -348,14 +342,13 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint16x8(abs); \
}
#endif
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
SAD16XN(8)
SAD16XN(16)
SAD16XN(32)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -420,7 +413,7 @@ static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
return horizontal_add_uint32x4(prod); \
}
#else
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -484,14 +477,13 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint16x8(abs); \
}
#endif
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
SAD32XN(16)
SAD32XN(32)
SAD32XN(64)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -559,7 +551,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
}
return prod;
}
#else
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -637,7 +629,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
return vpadalq_u16(sum, abs_1);
}
}
#endif
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
#define SAD64XN(n) \
uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+4 -4
View File
@@ -111,7 +111,7 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride,
*sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32));
}
#else
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
// The variance helper functions use int16_t for sum. 8 values are accumulated
// and then added (at which point they expand up to int32_t). To avoid overflow,
@@ -254,7 +254,7 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride,
vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
}
#endif
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
@@ -421,7 +421,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
return vget_lane_u32(sse, 0);
}
#else
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
@@ -518,4 +518,4 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse));
}
#endif
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+3 -4
View File
@@ -31,8 +31,7 @@
// instructions. This optimization is much faster in speed unit test, but slowed
// down the whole decoder by 5%.
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
@@ -764,7 +763,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
#else
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
const uint8x8_t s0, const uint8x8_t s1,
@@ -1694,4 +1693,4 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
#endif
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+2 -3
View File
@@ -72,8 +72,7 @@ static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
*s7 = vld1q_u8(s);
}
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
(__ARM_FEATURE_DOTPROD == 1)
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo,
const int8x16_t samples_hi,
@@ -171,7 +170,7 @@ static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples,
return vqrshrun_n_s16(sum, 7);
}
#endif
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,