/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

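// Widen the two 16-bit partial-SAD accumulators to 32 bits, add them
// together and horizontally reduce the result to a single 32-bit sum.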
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
                                        vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
                                        vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
// and vec_sum_ref_hi.
static void sad_neon_64(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8x16_t vec_src_32,
                        const uint8x16_t vec_src_48,
                        const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);

  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
                             vget_low_u8(vec_ref_00));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
                             vget_high_u8(vec_ref_00));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
                             vget_low_u8(vec_ref_16));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
                             vget_high_u8(vec_ref_16));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
                             vget_low_u8(vec_ref_32));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
                             vget_high_u8(vec_ref_32));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
                             vget_low_u8(vec_ref_48));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
                             vget_high_u8(vec_ref_48));
}

// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
static void sad_neon_32(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);

  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
                             vget_low_u8(vec_ref_00));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
                             vget_high_u8(vec_ref_00));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
                             vget_low_u8(vec_ref_16));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
                             vget_high_u8(vec_ref_16));
}

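// Compute the SAD of a 64x64 source block against each of the four 64x64
// reference blocks in ref[], writing one sum per reference into res[0..3].
// Each 16-bit accumulator lane sums at most 4 * 255 * 64 = 65280, so the
// uint16x8_t accumulators cannot overflow.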
void vp9_sad64x64x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t* const ref[4], int ref_stride,
                          unsigned int *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);

    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
                &vec_sum_ref3_lo, &vec_sum_ref3_hi);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}

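// Compute the SAD of a 32x32 source block against each of the four 32x32
// reference blocks in ref[], writing one sum per reference into res[0..3].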
void vp9_sad32x32x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t* const ref[4], int ref_stride,
                          unsigned int *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);

    sad_neon_32(vec_src_00, vec_src_16, ref0,
                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref1,
                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref2,
                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref3,
                &vec_sum_ref3_lo, &vec_sum_ref3_hi);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}

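// Compute the SAD of a 16x16 source block against each of the four 16x16
// reference blocks in ref[], writing one sum per reference into res[0..3].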
void vp9_sad16x16x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t* const ref[4], int ref_stride,
                          unsigned int *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
    const uint8x16_t vec_ref3 = vld1q_u8(ref3);

    vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref0));
    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref0));
    vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref1));
    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref1));
    vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref2));
    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref2));
    vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref3));
    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref3));

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}