vpx_dsp,neon: simplify __ARM_FEATURE_DOTPROD check
Only check that the macro is defined; its value doesn't have any effect. From https://arm-software.github.io/acle/main/acle.html, section 5.5.7.7 ("Dot Product extension"): "__ARM_FEATURE_DOTPROD is defined if the dot product data manipulation instructions are supported and the vector intrinsics are available. Note that this implies: __ARM_NEON == 1."
Change-Id: I164fe121ccefda99050a9b6a99738a2b518520f3
This commit is contained in:
@@ -237,8 +237,7 @@ void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
|
||||
uint32x4_t *const sum) {
|
||||
@@ -270,7 +269,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
|
||||
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
|
||||
}
|
||||
|
||||
#else
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
|
||||
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
|
||||
uint16x8_t *const sum) {
|
||||
@@ -305,7 +304,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
|
||||
sad_512_pel_final_neon(sum, sad_array);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
@@ -327,8 +326,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
@@ -386,7 +384,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64);
|
||||
}
|
||||
|
||||
#else
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
|
||||
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
@@ -444,12 +442,11 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
sad_2048_pel_final_neon(sum, sad_array);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
@@ -554,7 +551,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
|
||||
}
|
||||
|
||||
#else
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
|
||||
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
@@ -649,4 +646,4 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
sad_4096_pel_final_neon(sum, sad_array);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
+16
-24
@@ -21,8 +21,7 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *ref_ptr, int ref_stride) {
|
||||
const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
|
||||
const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8);
|
||||
const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1));
|
||||
return horizontal_add_uint32x4(dp);
|
||||
@@ -40,8 +39,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
|
||||
const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
|
||||
const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg);
|
||||
const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1));
|
||||
return horizontal_add_uint32x4(prod);
|
||||
@@ -54,8 +52,7 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
|
||||
|
||||
uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *ref_ptr, int ref_stride) {
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
uint32x4_t prod = vdupq_n_u32(0);
|
||||
const uint8x16_t ones = vdupq_n_u8(1);
|
||||
const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride);
|
||||
@@ -88,8 +85,7 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
|
||||
uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *ref_ptr, int ref_stride,
|
||||
const uint8_t *second_pred) {
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
uint32x4_t prod = vdupq_n_u32(0);
|
||||
const uint8x16_t ones = vdupq_n_u8(1);
|
||||
const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride);
|
||||
@@ -126,8 +122,7 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *ref_ptr, int ref_stride,
|
||||
const int height) {
|
||||
@@ -182,7 +177,7 @@ static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
|
||||
return horizontal_add_uint32x2(prod); \
|
||||
}
|
||||
|
||||
#else
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *ref_ptr, int ref_stride,
|
||||
const int height) {
|
||||
@@ -233,14 +228,13 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
|
||||
sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
|
||||
return horizontal_add_uint16x8(abs); \
|
||||
}
|
||||
#endif
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
SAD8XN(4)
|
||||
SAD8XN(8)
|
||||
SAD8XN(16)
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *ref_ptr, int ref_stride,
|
||||
const int height) {
|
||||
@@ -294,7 +288,7 @@ static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
|
||||
sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
|
||||
return horizontal_add_uint32x4(prod); \
|
||||
}
|
||||
#else
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *ref_ptr, int ref_stride,
|
||||
const int height) {
|
||||
@@ -348,14 +342,13 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
|
||||
sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
|
||||
return horizontal_add_uint16x8(abs); \
|
||||
}
|
||||
#endif
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
SAD16XN(8)
|
||||
SAD16XN(16)
|
||||
SAD16XN(32)
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *ref_ptr, int ref_stride,
|
||||
const int height) {
|
||||
@@ -420,7 +413,7 @@ static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
|
||||
return horizontal_add_uint32x4(prod); \
|
||||
}
|
||||
|
||||
#else
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *ref_ptr, int ref_stride,
|
||||
const int height) {
|
||||
@@ -484,14 +477,13 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
|
||||
sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
|
||||
return horizontal_add_uint16x8(abs); \
|
||||
}
|
||||
#endif
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
SAD32XN(16)
|
||||
SAD32XN(32)
|
||||
SAD32XN(64)
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *ref_ptr, int ref_stride,
|
||||
const int height) {
|
||||
@@ -559,7 +551,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
|
||||
}
|
||||
return prod;
|
||||
}
|
||||
#else
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *ref_ptr, int ref_stride,
|
||||
const int height) {
|
||||
@@ -637,7 +629,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
|
||||
return vpadalq_u16(sum, abs_1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
#define SAD64XN(n) \
|
||||
uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \
|
||||
|
||||
@@ -111,7 +111,7 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride,
|
||||
*sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32));
|
||||
}
|
||||
|
||||
#else
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
|
||||
// The variance helper functions use int16_t for sum. 8 values are accumulated
|
||||
// and then added (at which point they expand up to int32_t). To avoid overflow,
|
||||
@@ -254,7 +254,7 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride,
|
||||
vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *ref_ptr, int ref_stride,
|
||||
@@ -421,7 +421,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
|
||||
return vget_lane_u32(sse, 0);
|
||||
}
|
||||
|
||||
#else
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
|
||||
unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
|
||||
const unsigned char *ref_ptr, int ref_stride,
|
||||
@@ -518,4 +518,4 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
|
||||
return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse));
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
@@ -31,8 +31,7 @@
|
||||
// instructions. This optimization is much faster in speed unit test, but slowed
|
||||
// down the whole decoder by 5%.
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
|
||||
0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
|
||||
4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
|
||||
@@ -764,7 +763,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
|
||||
static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
|
||||
const uint8x8_t s0, const uint8x8_t s1,
|
||||
@@ -1694,4 +1693,4 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
@@ -72,8 +72,7 @@ static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
|
||||
*s7 = vld1q_u8(s);
|
||||
}
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
|
||||
(__ARM_FEATURE_DOTPROD == 1)
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo,
|
||||
const int8x16_t samples_hi,
|
||||
@@ -171,7 +170,7 @@ static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples,
|
||||
return vqrshrun_n_s16(sum, 7);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
|
||||
const int16x4_t s2, const int16x4_t s3,
|
||||
|
||||
Reference in New Issue
Block a user