| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
| 12 #include "./vp9_rtcd.h" | 12 |
| 13 #include "./vpx_config.h" | 13 #include "./vpx_config.h" |
| 14 | 14 |
| 15 #include "vpx/vpx_integer.h" | 15 #include "vpx/vpx_integer.h" |
| 16 | 16 |
/*
 * Sum of absolute differences for an 8x16 block.
 *
 * src_ptr/ref_ptr: top-left of the source and reference blocks.
 * src_stride/ref_stride: byte pitch between rows.
 * Returns sum(|src - ref|) over all 8x16 pixels.
 */
unsigned int vpx_sad8x16_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  int row;
  /* Eight 16-bit per-column accumulators; 16 rows * 255 max diff
   * stays well below 65535, so no lane can overflow. */
  uint16x8_t sad_u16 = vdupq_n_u16(0);

  for (row = 0; row < 16; ++row) {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sad_u16 = vabal_u8(sad_u16, s, r);
  }

  /* Widen pairwise and fold the two 64-bit halves to one scalar. */
  {
    const uint32x4_t sad_u32 = vpaddlq_u16(sad_u16);
    const uint64x2_t sad_u64 = vpaddlq_u32(sad_u32);
    const uint32x2_t total =
        vadd_u32(vreinterpret_u32_u64(vget_low_u64(sad_u64)),
                 vreinterpret_u32_u64(vget_high_u64(sad_u64)));
    return vget_lane_u32(total, 0);
  }
}
| 50 |
/*
 * Sum of absolute differences for a 4x4 block.
 *
 * NOTE: each vld1_u8 reads 8 bytes per row even though only the first
 * 4 belong to the block (callers must tolerate the 4-byte over-read,
 * as the original implementation already required).  The surplus
 * differences land in lanes 4-7 of the accumulator, which the
 * reduction below deliberately ignores.
 */
unsigned int vpx_sad4x4_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  int row;
  uint16x8_t sad_u16 = vdupq_n_u16(0);

  for (row = 0; row < 4; ++row) {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sad_u16 = vabal_u8(sad_u16, s, r);
  }

  /* Sum only lanes 0-3: the four real columns of the block. */
  {
    const uint32x2_t pair_u32 = vpaddl_u16(vget_low_u16(sad_u16));
    const uint64x1_t sum_u64 = vpaddl_u32(pair_u32);
    return vget_lane_u32(vreinterpret_u32_u64(sum_u64), 0);
  }
}
| 81 |
/*
 * Sum of absolute differences for a 16x8 block.
 *
 * Keeps two uint16x8 accumulators (low/high 8 columns) and merges them
 * before the final horizontal reduction.
 */
unsigned int vpx_sad16x8_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  int row;
  uint16x8_t sad_lo = vdupq_n_u16(0);
  uint16x8_t sad_hi = vdupq_n_u16(0);

  for (row = 0; row < 8; ++row) {
    const uint8x16_t s = vld1q_u8(src_ptr);
    const uint8x16_t r = vld1q_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sad_lo = vabal_u8(sad_lo, vget_low_u8(s), vget_low_u8(r));
    sad_hi = vabal_u8(sad_hi, vget_high_u8(s), vget_high_u8(r));
  }

  /* Merge the halves, then widen pairwise and fold to one scalar.
   * 8 rows * 255 max diff per lane cannot overflow 16 bits. */
  {
    const uint16x8_t sad_u16 = vaddq_u16(sad_lo, sad_hi);
    const uint32x4_t sad_u32 = vpaddlq_u16(sad_u16);
    const uint64x2_t sad_u64 = vpaddlq_u32(sad_u32);
    const uint32x2_t total =
        vadd_u32(vreinterpret_u32_u64(vget_low_u64(sad_u64)),
                 vreinterpret_u32_u64(vget_high_u64(sad_u64)));
    return vget_lane_u32(total, 0);
  }
}
| 118 |
/* Reduce two uint16x8 accumulators to one 32-bit scalar sum, widening
 * to 32 bits first so the merge itself cannot overflow 16 bits. */
static inline unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t wide_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t wide_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t sum_u32 = vaddq_u32(wide_lo, wide_hi);
  const uint64x2_t sum_u64 = vpaddlq_u32(sum_u32);
  const uint32x2_t folded =
      vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum_u64)),
               vreinterpret_u32_u64(vget_high_u64(sum_u64)));
  return vget_lane_u32(folded, 0);
}
/* Reduce one uint16x8 accumulator to a single 32-bit scalar sum. */
static inline unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  const uint32x4_t sum_u32 = vpaddlq_u16(vec_16x8);
  const uint64x2_t sum_u64 = vpaddlq_u32(sum_u32);
  const uint32x2_t folded =
      vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum_u64)),
               vreinterpret_u32_u64(vget_high_u64(sum_u64)));
  return vget_lane_u32(folded, 0);
}
| 36 | 138 |
| 37 unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride, | 139 unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, |
| 38 const uint8_t *ref, int ref_stride) { | 140 const uint8_t *ref, int ref_stride) { |
| 39 int i; | 141 int i; |
| 40 uint16x8_t vec_accum_lo = vdupq_n_u16(0); | 142 uint16x8_t vec_accum_lo = vdupq_n_u16(0); |
| 41 uint16x8_t vec_accum_hi = vdupq_n_u16(0); | 143 uint16x8_t vec_accum_hi = vdupq_n_u16(0); |
| 42 for (i = 0; i < 64; ++i) { | 144 for (i = 0; i < 64; ++i) { |
| 43 const uint8x16_t vec_src_00 = vld1q_u8(src); | 145 const uint8x16_t vec_src_00 = vld1q_u8(src); |
| 44 const uint8x16_t vec_src_16 = vld1q_u8(src + 16); | 146 const uint8x16_t vec_src_16 = vld1q_u8(src + 16); |
| 45 const uint8x16_t vec_src_32 = vld1q_u8(src + 32); | 147 const uint8x16_t vec_src_32 = vld1q_u8(src + 32); |
| 46 const uint8x16_t vec_src_48 = vld1q_u8(src + 48); | 148 const uint8x16_t vec_src_48 = vld1q_u8(src + 48); |
| 47 const uint8x16_t vec_ref_00 = vld1q_u8(ref); | 149 const uint8x16_t vec_ref_00 = vld1q_u8(ref); |
| (...skipping 15 matching lines...) Expand all Loading... |
| 63 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), | 165 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), |
| 64 vget_high_u8(vec_ref_32)); | 166 vget_high_u8(vec_ref_32)); |
| 65 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), | 167 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), |
| 66 vget_low_u8(vec_ref_48)); | 168 vget_low_u8(vec_ref_48)); |
| 67 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), | 169 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), |
| 68 vget_high_u8(vec_ref_48)); | 170 vget_high_u8(vec_ref_48)); |
| 69 } | 171 } |
| 70 return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); | 172 return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); |
| 71 } | 173 } |
| 72 | 174 |
/*
 * Sum of absolute differences for a 32x32 block.
 *
 * Processes each row as two 16-byte loads, accumulating into a low/high
 * pair of uint16x8 registers; the final reduction (same math as
 * horizontal_add_16x8) is inlined here.
 */
unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int row;
  uint16x8_t acc_lo = vdupq_n_u16(0);
  uint16x8_t acc_hi = vdupq_n_u16(0);

  for (row = 0; row < 32; ++row) {
    const uint8x16_t s0 = vld1q_u8(src);
    const uint8x16_t s1 = vld1q_u8(src + 16);
    const uint8x16_t r0 = vld1q_u8(ref);
    const uint8x16_t r1 = vld1q_u8(ref + 16);
    src += src_stride;
    ref += ref_stride;
    acc_lo = vabal_u8(acc_lo, vget_low_u8(s0), vget_low_u8(r0));
    acc_hi = vabal_u8(acc_hi, vget_high_u8(s0), vget_high_u8(r0));
    acc_lo = vabal_u8(acc_lo, vget_low_u8(s1), vget_low_u8(r1));
    acc_hi = vabal_u8(acc_hi, vget_high_u8(s1), vget_high_u8(r1));
  }

  /* Merge, widen pairwise, and fold to one scalar.  Worst case per lane
   * after the merge is 32 rows * 4 diffs * 255 = 32640 < 65536. */
  {
    const uint16x8_t acc = vaddq_u16(acc_lo, acc_hi);
    const uint32x4_t sum_u32 = vpaddlq_u16(acc);
    const uint64x2_t sum_u64 = vpaddlq_u32(sum_u32);
    const uint32x2_t folded =
        vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum_u64)),
                 vreinterpret_u32_u64(vget_high_u64(sum_u64)));
    return vget_lane_u32(folded, 0);
  }
}
| 97 | 199 |
/*
 * Sum of absolute differences for a 16x16 block.
 *
 * One 16-byte load per row, split across a low/high accumulator pair;
 * the final reduction (same math as horizontal_add_16x8) is inlined.
 */
unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int row;
  uint16x8_t acc_lo = vdupq_n_u16(0);
  uint16x8_t acc_hi = vdupq_n_u16(0);

  for (row = 0; row < 16; ++row) {
    const uint8x16_t s = vld1q_u8(src);
    const uint8x16_t r = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    acc_lo = vabal_u8(acc_lo, vget_low_u8(s), vget_low_u8(r));
    acc_hi = vabal_u8(acc_hi, vget_high_u8(s), vget_high_u8(r));
  }

  /* Merge, widen pairwise, and fold to one scalar. */
  {
    const uint16x8_t acc = vaddq_u16(acc_lo, acc_hi);
    const uint32x4_t sum_u32 = vpaddlq_u16(acc);
    const uint64x2_t sum_u64 = vpaddlq_u32(sum_u32);
    const uint32x2_t folded =
        vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum_u64)),
                 vreinterpret_u32_u64(vget_high_u64(sum_u64)));
    return vget_lane_u32(folded, 0);
  }
}
| 116 | 218 |
/*
 * Sum of absolute differences for an 8x8 block.
 *
 * One 8-byte load per row into a single uint16x8 accumulator; the
 * final reduction (same math as horizontal_add_16x8) is inlined.
 */
unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int row;
  uint16x8_t acc = vdupq_n_u16(0);

  for (row = 0; row < 8; ++row) {
    const uint8x8_t s = vld1_u8(src);
    const uint8x8_t r = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;
    acc = vabal_u8(acc, s, r);
  }

  /* Widen pairwise and fold the two 64-bit halves to one scalar. */
  {
    const uint32x4_t sum_u32 = vpaddlq_u16(acc);
    const uint64x2_t sum_u64 = vpaddlq_u32(sum_u32);
    const uint32x2_t folded =
        vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum_u64)),
                 vreinterpret_u32_u64(vget_high_u64(sum_u64)));
    return vget_lane_u32(folded, 0);
  }
}
| OLD | NEW |