/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"

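// Horizontal-add helpers: sum all lanes of a vector into one scalar. The
// pairwise adds widen to 64 bits; the final vadd_s32 adds the low 32 bits of
// the two 64-bit halves, which is exact as long as the total fits in 32 bits
// (true for every caller in this file).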
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
  const int32x4_t a = vpaddlq_s16(v_16x8);
  const int64x2_t b = vpaddlq_s32(a);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
}

static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
  const int64x2_t b = vpaddlq_s32(v_32x4);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
}

// w * h must be at most 1024 (e.g. 64x16), or the 16-bit lanes of the local
// accumulator v_sum may overflow: each lane accumulates w * h / 8 pixel
// differences, each of magnitude up to 255.
static void variance_neon_w8(const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             int w, int h, uint32_t *sse, int *sum) {
  int i, j;
  int16x8_t v_sum = vdupq_n_s16(0);
  int32x4_t v_sse_lo = vdupq_n_s32(0);
  int32x4_t v_sse_hi = vdupq_n_s32(0);

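  // For each 8-pixel chunk: vsubl_u8 wraps modulo 2^16, so reinterpreting the
  // widened result as signed recovers the true difference in [-255, 255]. The
  // differences accumulate in v_sum; their squares accumulate in the two
  // 32-bit accumulators via vmlal_s16.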
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; j += 8) {
      const uint8x8_t v_a = vld1_u8(&a[j]);
      const uint8x8_t v_b = vld1_u8(&b[j]);
      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
      v_sum = vaddq_s16(v_sum, sv_diff);
      v_sse_lo = vmlal_s16(v_sse_lo,
                           vget_low_s16(sv_diff),
                           vget_low_s16(sv_diff));
      v_sse_hi = vmlal_s16(v_sse_hi,
                           vget_high_s16(sv_diff),
                           vget_high_s16(sv_diff));
    }
    a += a_stride;
    b += b_stride;
  }

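  // Reduce the vector accumulators to the scalar outputs: the sum of signed
  // differences and the combined sum of squared differences.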
  *sum = horizontal_add_s16x8(v_sum);
  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}

void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
                        const uint8_t *b, int b_stride,
                        unsigned int *sse, int *sum) {
  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
}

void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
                          const uint8_t *b, int b_stride,
                          unsigned int *sse, int *sum) {
  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
}

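// Variance is computed as SSE - sum^2 / (w * h). The division is implemented
// as a right shift because w * h is always a power of two here, and sum is
// widened to int64_t before squaring so sum * sum cannot overflow.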
unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride,
                                  unsigned int *sse) {
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
  return *sse - (((int64_t)sum * sum) >> 6);  // >> 6 = / (8 * 8)
}

unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
  return *sse - (((int64_t)sum * sum) >> 8);  // >> 8 = / (16 * 16)
}

unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / (32 * 32)
}

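// Blocks larger than 1024 pixels are split into several calls so that each
// call stays within the accumulation limit of variance_neon_w8: 32x64 and
// 64x32 use two passes, 64x64 uses four. The partial sums and SSEs are
// combined before the final subtraction.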
unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum1, sum2;
  uint32_t sse1, sse2;
  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
  variance_neon_w8(a + (32 * a_stride), a_stride,
                   b + (32 * b_stride), b_stride, 32, 32,
                   &sse2, &sum2);
  *sse = sse1 + sse2;
  sum1 += sum2;
  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / (32 * 64)
}

unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum1, sum2;
  uint32_t sse1, sse2;
  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
  variance_neon_w8(a + (16 * a_stride), a_stride,
                   b + (16 * b_stride), b_stride, 64, 16,
                   &sse2, &sum2);
  *sse = sse1 + sse2;
  sum1 += sum2;
  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / (64 * 32)
}

unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum1, sum2;
  uint32_t sse1, sse2;

  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
  variance_neon_w8(a + (16 * a_stride), a_stride,
                   b + (16 * b_stride), b_stride, 64, 16,
                   &sse2, &sum2);
  sse1 += sse2;
  sum1 += sum2;

  variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
                   b + (16 * 2 * b_stride), b_stride,
                   64, 16, &sse2, &sum2);
  sse1 += sse2;
  sum1 += sum2;

  variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
                   b + (16 * 3 * b_stride), b_stride,
                   64, 16, &sse2, &sum2);
  *sse = sse1 + sse2;
  sum1 += sum2;
  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / (64 * 64)
}

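// The functions below keep the register-style variable names (q8s32, d22s16,
// ...) of the hand-written assembly they appear to be ported from.
// vpx_variance16x8_neon processes two 16-pixel rows per iteration: q8s32
// accumulates the signed differences, q9s32/q10s32 accumulate the squared
// differences, and the next rows are prefetched with __builtin_prefetch.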
unsigned int vpx_variance16x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
  int i;
  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
  uint32x2_t d0u32, d10u32;
  int64x1_t d0s64, d1s64;
  uint8x16_t q0u8, q1u8, q2u8, q3u8;
  uint16x8_t q11u16, q12u16, q13u16, q14u16;
  int32x4_t q8s32, q9s32, q10s32;
  int64x2_t q0s64, q1s64, q5s64;

  q8s32 = vdupq_n_s32(0);
  q9s32 = vdupq_n_s32(0);
  q10s32 = vdupq_n_s32(0);

  for (i = 0; i < 4; i++) {
    q0u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    q1u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    __builtin_prefetch(src_ptr);

    q2u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;
    q3u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;
    __builtin_prefetch(ref_ptr);

    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
    q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
    q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
  }

  q10s32 = vaddq_s32(q10s32, q9s32);
  q0s64 = vpaddlq_s32(q8s32);
  q1s64 = vpaddlq_s32(q10s32);

  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                    vreinterpret_s32_s64(d0s64));
  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

  return vget_lane_u32(d0u32, 0);
}

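// Same structure as vpx_variance16x8_neon, but with 8-pixel-wide rows: two
// rows are loaded per iteration and the same sum / sum-of-squares accumulators
// are used. The final shift by 7 divides sum^2 by 8 * 16 = 128.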
unsigned int vpx_variance8x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
  int i;
  uint8x8_t d0u8, d2u8, d4u8, d6u8;
  int16x4_t d22s16, d23s16, d24s16, d25s16;
  uint32x2_t d0u32, d10u32;
  int64x1_t d0s64, d1s64;
  uint16x8_t q11u16, q12u16;
  int32x4_t q8s32, q9s32, q10s32;
  int64x2_t q0s64, q1s64, q5s64;

  q8s32 = vdupq_n_s32(0);
  q9s32 = vdupq_n_s32(0);
  q10s32 = vdupq_n_s32(0);

  for (i = 0; i < 8; i++) {
    d0u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d2u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    __builtin_prefetch(src_ptr);

    d4u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d6u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    __builtin_prefetch(ref_ptr);

    q11u16 = vsubl_u8(d0u8, d4u8);
    q12u16 = vsubl_u8(d2u8, d6u8);

    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
  }

  q10s32 = vaddq_s32(q10s32, q9s32);
  q0s64 = vpaddlq_s32(q8s32);
  q1s64 = vpaddlq_s32(q10s32);

  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                    vreinterpret_s32_s64(d0s64));
  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

  return vget_lane_u32(d0u32, 0);
}

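// vpx_mse16x16_neon returns the sum of squared errors only; unlike the
// variance functions above, no mean term (sum^2 / N) is subtracted, so the
// signed difference sum is never accumulated.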
unsigned int vpx_mse16x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
  int i;
  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
  int64x1_t d0s64;
  uint8x16_t q0u8, q1u8, q2u8, q3u8;
  int32x4_t q7s32, q8s32, q9s32, q10s32;
  uint16x8_t q11u16, q12u16, q13u16, q14u16;
  int64x2_t q1s64;

  q7s32 = vdupq_n_s32(0);
  q8s32 = vdupq_n_s32(0);
  q9s32 = vdupq_n_s32(0);
  q10s32 = vdupq_n_s32(0);

  for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
    q0u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    q1u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    q2u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;
    q3u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;

    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
    q8s32 = vmlal_s16(q8s32, d23s16, d23s16);

    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
    q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
    q8s32 = vmlal_s16(q8s32, d27s16, d27s16);

    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
  }

  q7s32 = vaddq_s32(q7s32, q8s32);
  q9s32 = vaddq_s32(q9s32, q10s32);
  q10s32 = vaddq_s32(q7s32, q9s32);

  q1s64 = vpaddlq_s32(q10s32);
  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}

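// vpx_get4x4sse_cs_neon computes the SSE of a 4x4 block. Note that vld1_u8
// loads eight bytes per row even though only the first four pixels of each
// row contribute to the result (vget_low_s16 keeps the low four differences).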
unsigned int vpx_get4x4sse_cs_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride) {
  int16x4_t d22s16, d24s16, d26s16, d28s16;
  int64x1_t d0s64;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
  int32x4_t q7s32, q8s32, q9s32, q10s32;
  uint16x8_t q11u16, q12u16, q13u16, q14u16;
  int64x2_t q1s64;

  d0u8 = vld1_u8(src_ptr);
  src_ptr += source_stride;
  d4u8 = vld1_u8(ref_ptr);
  ref_ptr += recon_stride;
  d1u8 = vld1_u8(src_ptr);
  src_ptr += source_stride;
  d5u8 = vld1_u8(ref_ptr);
  ref_ptr += recon_stride;
  d2u8 = vld1_u8(src_ptr);
  src_ptr += source_stride;
  d6u8 = vld1_u8(ref_ptr);
  ref_ptr += recon_stride;
  d3u8 = vld1_u8(src_ptr);
  src_ptr += source_stride;
  d7u8 = vld1_u8(ref_ptr);
  ref_ptr += recon_stride;

  q11u16 = vsubl_u8(d0u8, d4u8);
  q12u16 = vsubl_u8(d1u8, d5u8);
  q13u16 = vsubl_u8(d2u8, d6u8);
  q14u16 = vsubl_u8(d3u8, d7u8);

  d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
  d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
  d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
  d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));

  q7s32 = vmull_s16(d22s16, d22s16);
  q8s32 = vmull_s16(d24s16, d24s16);
  q9s32 = vmull_s16(d26s16, d26s16);
  q10s32 = vmull_s16(d28s16, d28s16);

  q7s32 = vaddq_s32(q7s32, q8s32);
  q9s32 = vaddq_s32(q9s32, q10s32);
  q9s32 = vaddq_s32(q7s32, q9s32);

  q1s64 = vpaddlq_s32(q9s32);
  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}