/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "vpx_ports/mem.h"

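/* Each function below returns the variance of an N x M block:
 * variance = SSE - sum * sum / (N * M), where SSE is the sum of squared
 * differences and sum is the sum of differences between src and ref.
 * The final division is performed with a right shift by log2(N * M).
 */
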
unsigned int vp8_variance16x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

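    // Each iteration consumes two 16-pixel rows; 8 iterations cover 16 rows.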
    for (i = 0; i < 8; i++) {
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

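        // Widening subtract: the differences can be negative, so the u16
        // results are reinterpreted as s16 (two's complement) below.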
        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

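        // q8s32 accumulates the sum of differences; q9s32/q10s32 accumulate
        // the sum of squared differences.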
        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

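    // Reduce the accumulators to a scalar sum (d0s64) and SSE (d1s64).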
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

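    // variance = SSE - sum * sum / 256; the shift by 8 divides by the
    // 16 * 16 = 256 pixels in the block.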
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance16x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

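    // variance = SSE - sum * sum / 128 (16 * 8 = 128 pixels, shift by 7).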
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance8x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d2u8, d4u8, d6u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

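    // 8-pixel rows fit in 64-bit d registers; two rows per iteration.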
    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d2u8, d6u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

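    // variance = SSE - sum * sum / 128 (8 * 16 = 128 pixels, shift by 7).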
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance8x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

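    // Four 8-pixel rows per iteration; two iterations cover the 8x8 block.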
    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d1u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d3u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d5u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d7u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d1u8, d5u8);
        q13u16 = vsubl_u8(d2u8, d6u8);
        q14u16 = vsubl_u8(d3u8, d7u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

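    // variance = SSE - sum * sum / 64 (8 * 8 = 64 pixels, shift by 6).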
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}