Index: source/libvpx/vpx_dsp/arm/sad_neon.c |
diff --git a/source/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c b/source/libvpx/vpx_dsp/arm/sad_neon.c |
similarity index 65% |
rename from source/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c |
rename to source/libvpx/vpx_dsp/arm/sad_neon.c |
index c4cd856804da7ec7e3f42f8df2b562f27bc62cbf..173f08ac3c3e202764a8dd01a43a9b8877d08289 100644 |
--- a/source/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c |
+++ b/source/libvpx/vpx_dsp/arm/sad_neon.c |
@@ -9,11 +9,113 @@ |
*/ |
#include <arm_neon.h> |
-#include "./vp9_rtcd.h" |
+ |
#include "./vpx_config.h" |
#include "vpx/vpx_integer.h" |
/*
 * Sum of absolute differences for one 8-wide, 16-tall block (NEON).
 *
 * src_ptr / ref_ptr   : top-left pixel of each block.
 * src_stride/ref_stride: byte pitch between consecutive rows.
 * Returns sum over all 128 pixel pairs of |src - ref|.
 */
unsigned int vpx_sad8x16_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  uint16x8_t acc = vdupq_n_u16(0);  /* per-lane 16-bit SAD accumulator */
  int row;

  /* Widening absolute-difference accumulate, one 8-pixel row per step.
   * Worst case per lane is 16 * 255 = 4080, well within uint16_t. */
  for (row = 0; row < 16; row++) {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    acc = vabal_u8(acc, s, r);
  }

  /* Horizontal reduction: fold the eight 16-bit lanes down to a scalar.
   * fold16 lanes peak at 2 * 4080, still inside uint16_t. */
  {
    const uint16x4_t fold16 = vadd_u16(vget_low_u16(acc), vget_high_u16(acc));
    const uint32x2_t fold32 = vpaddl_u16(fold16);
    const uint64x1_t fold64 = vpaddl_u32(fold32);
    return vget_lane_u32(vreinterpret_u32_u64(fold64), 0);
  }
}
+ |
/*
 * Sum of absolute differences for one 4x4 block (NEON).
 *
 * NOTE(review): each row is fetched as a full 8-byte vector; the upper
 * four bytes are computed but discarded because only the low four
 * 16-bit lanes are folded into the result.  This assumes at least 8
 * readable bytes per row — presumably the libvpx block buffers are
 * padded; confirm against callers.
 */
unsigned int vpx_sad4x4_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  uint16x8_t acc = vdupq_n_u16(0);  /* per-lane 16-bit SAD accumulator */
  int row;

  for (row = 0; row < 4; row++) {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    acc = vabal_u8(acc, s, r);
  }

  /* Only lanes 0-3 correspond to the 4-wide block; reduce the low half. */
  {
    const uint32x2_t fold32 = vpaddl_u16(vget_low_u16(acc));
    const uint64x1_t fold64 = vpaddl_u32(fold32);
    return vget_lane_u32(vreinterpret_u32_u64(fold64), 0);
  }
}
+ |
/*
 * Sum of absolute differences for one 16-wide, 8-tall block (NEON).
 *
 * Processes a full 16-byte row per iteration, splitting it into two
 * 8-lane widening accumulators that are merged at the end.
 * Returns sum over all 128 pixel pairs of |src - ref|.
 */
unsigned int vpx_sad16x8_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  uint16x8_t acc_lo = vdupq_n_u16(0);  /* lanes for bytes 0-7 of each row */
  uint16x8_t acc_hi = vdupq_n_u16(0);  /* lanes for bytes 8-15 of each row */
  int row;

  for (row = 0; row < 8; row++) {
    const uint8x16_t s = vld1q_u8(src_ptr);
    const uint8x16_t r = vld1q_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    acc_lo = vabal_u8(acc_lo, vget_low_u8(s), vget_low_u8(r));
    acc_hi = vabal_u8(acc_hi, vget_high_u8(s), vget_high_u8(r));
  }

  /* Merge and reduce.  Per-lane worst case after the merge is
   * 2 * 8 * 255 = 4080; the pairwise fold below stays within range. */
  {
    const uint16x8_t acc = vaddq_u16(acc_lo, acc_hi);
    const uint16x4_t fold16 = vadd_u16(vget_low_u16(acc), vget_high_u16(acc));
    const uint32x2_t fold32 = vpaddl_u16(fold16);
    const uint64x1_t fold64 = vpaddl_u32(fold32);
    return vget_lane_u32(vreinterpret_u32_u64(fold64), 0);
  }
}
+ |
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, |
const uint16x8_t vec_hi) { |
const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), |
@@ -34,7 +136,7 @@ static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { |
return vget_lane_u32(c, 0); |
} |
-unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride, |
+unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, |
const uint8_t *ref, int ref_stride) { |
int i; |
uint16x8_t vec_accum_lo = vdupq_n_u16(0); |
@@ -70,7 +172,7 @@ unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride, |
return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); |
} |
-unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride, |
+unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride, |
const uint8_t *ref, int ref_stride) { |
int i; |
uint16x8_t vec_accum_lo = vdupq_n_u16(0); |
@@ -95,7 +197,7 @@ unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride, |
return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); |
} |
-unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride, |
+unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, |
const uint8_t *ref, int ref_stride) { |
int i; |
uint16x8_t vec_accum_lo = vdupq_n_u16(0); |
@@ -114,7 +216,7 @@ unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride, |
return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); |
} |
-unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride, |
+unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride, |
const uint8_t *ref, int ref_stride) { |
int i; |
uint16x8_t vec_accum = vdupq_n_u16(0); |