OLD | NEW |
| 1 // Copyright 2012 Google Inc. All Rights Reserved. |
| 2 // |
| 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- |
| 9 // |
| 10 // ARM NEON version of speed-critical encoding functions. |
| 11 // |
| 12 // adapted from libvpx (http://www.webmproject.org/code/) |
| 13 |
| 14 #include "./dsp.h" |
| 15 |
| 16 #if defined(WEBP_USE_NEON) |
| 17 |
| 18 #include <assert.h> |
| 19 |
| 20 #include "./neon.h" |
| 21 #include "../enc/vp8enci.h" |
| 22 |
| 23 //------------------------------------------------------------------------------ |
| 24 // Transforms (Paragraph 14.4) |
| 25 |
| 26 // Inverse transform. |
| 27 // This code is pretty much the same as TransformOne in dec_neon.c, except that |
| 28 // the reconstruction reads from *ref. See the comments there for algorithmic explanations. |
| 29 |
| 30 static const int16_t kC1 = 20091; |
| 31 static const int16_t kC2 = 17734; // half of the real constant (35468), because vqdmulh doubles the product. See dec_neon.c. |
| 32 |
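Note (reviewer addition, not part of the patch): 20091 and 17734 are the VP8 inverse-transform multipliers in Q16 fixed point, the second stored halved because vqdmulh doubles its product. In scalar terms, the multiplies used below amount to roughly the following sketch (names are illustrative):

    /* Illustrative helpers only, mirroring the portable decoder math. */
    static int MulC1(int x) { return x + ((x * 20091) >> 16); }  /* ~ x * sqrt(2)*cos(pi/8) */
    static int MulC2(int x) { return (x * 35468) >> 16; }        /* ~ x * sqrt(2)*sin(pi/8) */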
| 33 // This code works but is *slower* than the inline-asm version below |
| 34 // (with gcc-4.6), so it is disabled for now. Later, it will be conditional on |
| 35 // the USE_INTRINSICS define. |
| 36 // With gcc-4.8, it is a little faster than the inline assembly. |
| 37 #if defined(USE_INTRINSICS) |
| 38 |
| 39 // Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t. |
| 40 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { |
| 41 return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); |
| 42 } |
| 43 |
| 44 // Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result |
| 45 // in the corresponding rows of 'dst'. |
| 46 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, |
| 47 const int16x8_t dst01, |
| 48 const int16x8_t dst23) { |
| 49 // Unsigned saturate to 8b. |
| 50 const uint8x8_t dst01_u8 = vqmovun_s16(dst01); |
| 51 const uint8x8_t dst23_u8 = vqmovun_s16(dst23); |
| 52 |
| 53 // Store the results. |
| 54 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0); |
| 55 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1); |
| 56 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0); |
| 57 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1); |
| 58 } |
| 59 |
| 60 static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, |
| 61 const uint8_t* const ref, uint8_t* const dst) { |
| 62 uint32x2_t dst01 = vdup_n_u32(0); |
| 63 uint32x2_t dst23 = vdup_n_u32(0); |
| 64 |
| 65 // Load the reference pixels. |
| 66 dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0); |
| 67 dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0); |
| 68 dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1); |
| 69 dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1); |
| 70 |
| 71 { |
| 72 // Convert to 16b. |
| 73 const int16x8_t dst01_s16 = ConvertU8ToS16(dst01); |
| 74 const int16x8_t dst23_s16 = ConvertU8ToS16(dst23); |
| 75 |
| 76 // Descale with rounding and add to the reference: ref + ((row + 4) >> 3). |
| 77 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); |
| 78 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); |
| 79 // Saturate to 8b and store the result. |
| 80 SaturateAndStore4x4(dst, out01, out23); |
| 81 } |
| 82 } |
| 83 |
| 84 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, |
| 85 int16x8x2_t* const out) { |
| 86 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 |
| 87 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 |
| 88 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... |
| 89 // b0 d0 b1 d1 b2 d2 ... |
| 90 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); |
| 91 } |
| 92 |
| 93 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { |
| 94 // {rows} = in0 | in4 |
| 95 // in8 | in12 |
| 96 // B1 = in4 | in12 |
| 97 const int16x8_t B1 = |
| 98 vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1])); |
| 99 // C0 = kC1 * in4 | kC1 * in12 |
| 100 // C1 = kC2 * in4 | kC2 * in12 |
| 101 const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1); |
| 102 const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2); |
| 103 const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]), |
| 104 vget_low_s16(rows->val[1])); // in0 + in8 |
| 105 const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]), |
| 106 vget_low_s16(rows->val[1])); // in0 - in8 |
| 107 // c = kC2 * in4 - kC1 * in12 |
| 108 // d = kC1 * in4 + kC2 * in12 |
| 109 const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0)); |
| 110 const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1)); |
| 111 const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b |
| 112 const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c |
| 113 const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c |
| 114 const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c |
| 115 const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp)); |
| 116 Transpose8x2(E0, E1, rows); |
| 117 } |
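As a reading aid (reviewer sketch, not part of the patch), one TransformPass() computes the following per column, with MulC1/MulC2 as in the note above; Transpose8x2() provides the transpose that lets the second pass reuse the same code:

    /* Scalar sketch of one pass; illustrative only. */
    static void TransformPassRef(const int16_t* in, int16_t* tmp) {
      int i;
      for (i = 0; i < 4; ++i, ++in, tmp += 4) {    /* one column at a time */
        const int a = in[0] + in[8];
        const int b = in[0] - in[8];
        const int c = MulC2(in[4]) - MulC1(in[12]);
        const int d = MulC1(in[4]) + MulC2(in[12]);
        tmp[0] = (int16_t)(a + d);                 /* output comes out transposed */
        tmp[1] = (int16_t)(b + c);
        tmp[2] = (int16_t)(b - c);
        tmp[3] = (int16_t)(a - d);
      }
    }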
| 118 |
| 119 static void ITransformOne(const uint8_t* ref, |
| 120 const int16_t* in, uint8_t* dst) { |
| 121 int16x8x2_t rows; |
| 122 INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); |
| 123 TransformPass(&rows); |
| 124 TransformPass(&rows); |
| 125 Add4x4(rows.val[0], rows.val[1], ref, dst); |
| 126 } |
| 127 |
| 128 #else |
| 129 |
| 130 static void ITransformOne(const uint8_t* ref, |
| 131 const int16_t* in, uint8_t* dst) { |
| 132 const int kBPS = BPS; |
| 133 const int16_t kC1C2[] = { kC1, kC2, 0, 0 }; |
| 134 |
| 135 __asm__ volatile ( |
| 136 "vld1.16 {q1, q2}, [%[in]] \n" |
| 137 "vld1.16 {d0}, [%[kC1C2]] \n" |
| 138 |
| 139 // d2: in[0] |
| 140 // d3: in[8] |
| 141 // d4: in[4] |
| 142 // d5: in[12] |
| 143 "vswp d3, d4 \n" |
| 144 |
| 145 // q8 = {in[4], in[12]} * kC1 * 2 >> 16 |
| 146 // q9 = {in[4], in[12]} * kC2 >> 16 |
| 147 "vqdmulh.s16 q8, q2, d0[0] \n" |
| 148 "vqdmulh.s16 q9, q2, d0[1] \n" |
| 149 |
| 150 // d22 = a = in[0] + in[8] |
| 151 // d23 = b = in[0] - in[8] |
| 152 "vqadd.s16 d22, d2, d3 \n" |
| 153 "vqsub.s16 d23, d2, d3 \n" |
| 154 |
| 155 // q8 = in[4]/[12] * kC1 >> 16 |
| 156 "vshr.s16 q8, q8, #1 \n" |
| 157 |
| 158 // Add {in[4], in[12]} back after the multiplication. |
| 159 "vqadd.s16 q8, q2, q8 \n" |
| 160 |
| 161 // d20 = c = in[4]*kC2 - in[12]*kC1 |
| 162 // d21 = d = in[4]*kC1 + in[12]*kC2 |
| 163 "vqsub.s16 d20, d18, d17 \n" |
| 164 "vqadd.s16 d21, d19, d16 \n" |
| 165 |
| 166 // d2 = tmp[0] = a + d |
| 167 // d3 = tmp[1] = b + c |
| 168 // d4 = tmp[2] = b - c |
| 169 // d5 = tmp[3] = a - d |
| 170 "vqadd.s16 d2, d22, d21 \n" |
| 171 "vqadd.s16 d3, d23, d20 \n" |
| 172 "vqsub.s16 d4, d23, d20 \n" |
| 173 "vqsub.s16 d5, d22, d21 \n" |
| 174 |
| 175 "vzip.16 q1, q2 \n" |
| 176 "vzip.16 q1, q2 \n" |
| 177 |
| 178 "vswp d3, d4 \n" |
| 179 |
| 180 // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 |
| 181 // q9 = {tmp[4], tmp[12]} * kC2 >> 16 |
| 182 "vqdmulh.s16 q8, q2, d0[0] \n" |
| 183 "vqdmulh.s16 q9, q2, d0[1] \n" |
| 184 |
| 185 // d22 = a = tmp[0] + tmp[8] |
| 186 // d23 = b = tmp[0] - tmp[8] |
| 187 "vqadd.s16 d22, d2, d3 \n" |
| 188 "vqsub.s16 d23, d2, d3 \n" |
| 189 |
| 190 "vshr.s16 q8, q8, #1 \n" |
| 191 "vqadd.s16 q8, q2, q8 \n" |
| 192 |
| 193 // d20 = c = tmp[4]*kC2 - tmp[12]*kC1 |
| 194 // d21 = d = tmp[4]*kC1 + tmp[12]*kC2 |
| 195 "vqsub.s16 d20, d18, d17 \n" |
| 196 "vqadd.s16 d21, d19, d16 \n" |
| 197 |
| 198 // d2 = tmp[0] = a + d |
| 199 // d3 = tmp[1] = b + c |
| 200 // d4 = tmp[2] = b - c |
| 201 // d5 = tmp[3] = a - d |
| 202 "vqadd.s16 d2, d22, d21 \n" |
| 203 "vqadd.s16 d3, d23, d20 \n" |
| 204 "vqsub.s16 d4, d23, d20 \n" |
| 205 "vqsub.s16 d5, d22, d21 \n" |
| 206 |
| 207 "vld1.32 d6[0], [%[ref]], %[kBPS] \n" |
| 208 "vld1.32 d6[1], [%[ref]], %[kBPS] \n" |
| 209 "vld1.32 d7[0], [%[ref]], %[kBPS] \n" |
| 210 "vld1.32 d7[1], [%[ref]], %[kBPS] \n" |
| 211 |
| 212 "sub %[ref], %[ref], %[kBPS], lsl #2 \n" |
| 213 |
| 214 // ((val) + 4) >> 3 |
| 215 "vrshr.s16 d2, d2, #3 \n" |
| 216 "vrshr.s16 d3, d3, #3 \n" |
| 217 "vrshr.s16 d4, d4, #3 \n" |
| 218 "vrshr.s16 d5, d5, #3 \n" |
| 219 |
| 220 "vzip.16 q1, q2 \n" |
| 221 "vzip.16 q1, q2 \n" |
| 222 |
| 223 // Must accumulate before saturating |
| 224 "vmovl.u8 q8, d6 \n" |
| 225 "vmovl.u8 q9, d7 \n" |
| 226 |
| 227 "vqadd.s16 q1, q1, q8 \n" |
| 228 "vqadd.s16 q2, q2, q9 \n" |
| 229 |
| 230 "vqmovun.s16 d0, q1 \n" |
| 231 "vqmovun.s16 d1, q2 \n" |
| 232 |
| 233 "vst1.32 d0[0], [%[dst]], %[kBPS] \n" |
| 234 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" |
| 235 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" |
| 236 "vst1.32 d1[1], [%[dst]] \n" |
| 237 |
| 238 : [in] "+r"(in), [dst] "+r"(dst) // modified registers |
| 239 : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants |
| 240 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered |
| 241 ); |
| 242 } |
| 243 |
| 244 #endif // USE_INTRINSICS |
| 245 |
| 246 static void ITransform(const uint8_t* ref, |
| 247 const int16_t* in, uint8_t* dst, int do_two) { |
| 248 ITransformOne(ref, in, dst); |
| 249 if (do_two) { |
| 250 ITransformOne(ref + 4, in + 16, dst + 4); |
| 251 } |
| 252 } |
| 253 |
| 254 // Load all 4x4 pixels into a single uint8x16_t variable. |
| 255 static uint8x16_t Load4x4(const uint8_t* src) { |
| 256 uint32x4_t out = vdupq_n_u32(0); |
| 257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); |
| 258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); |
| 259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); |
| 260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); |
| 261 return vreinterpretq_u8_u32(out); |
| 262 } |
| 263 |
| 264 // Forward transform. |
| 265 |
| 266 #if defined(USE_INTRINSICS) |
| 267 |
| 268 static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B, |
| 269 const int16x4_t C, const int16x4_t D, |
| 270 int16x8_t* const out01, |
| 271 int16x8_t* const out32) { |
| 272 const int16x4x2_t AB = vtrn_s16(A, B); |
| 273 const int16x4x2_t CD = vtrn_s16(C, D); |
| 274 const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]), |
| 275 vreinterpret_s32_s16(CD.val[0])); |
| 276 const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]), |
| 277 vreinterpret_s32_s16(CD.val[1])); |
| 278 *out01 = vreinterpretq_s16_s64( |
| 279 vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]), |
| 280 vreinterpret_s64_s32(tmp13.val[0]))); |
| 281 *out32 = vreinterpretq_s16_s64( |
| 282 vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]), |
| 283 vreinterpret_s64_s32(tmp02.val[1]))); |
| 284 } |
| 285 |
| 286 static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a, |
| 287 const uint8x8_t b) { |
| 288 return vreinterpretq_s16_u16(vsubl_u8(a, b)); |
| 289 } |
| 290 |
| 291 static void FTransform(const uint8_t* src, const uint8_t* ref, |
| 292 int16_t* out) { |
| 293 int16x8_t d0d1, d3d2; // working 4x4 int16 variables |
| 294 { |
| 295 const uint8x16_t S0 = Load4x4(src); |
| 296 const uint8x16_t R0 = Load4x4(ref); |
| 297 const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0)); |
| 298 const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0)); |
| 299 const int16x4_t D0 = vget_low_s16(D0D1); |
| 300 const int16x4_t D1 = vget_high_s16(D0D1); |
| 301 const int16x4_t D2 = vget_low_s16(D2D3); |
| 302 const int16x4_t D3 = vget_high_s16(D2D3); |
| 303 Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2); |
| 304 } |
| 305 { // 1st pass |
| 306 const int32x4_t kCst937 = vdupq_n_s32(937); |
| 307 const int32x4_t kCst1812 = vdupq_n_s32(1812); |
| 308 const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1) |
| 309 const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2) |
| 310 const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3); |
| 311 const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2), |
| 312 vget_high_s16(a0a1_2)); |
| 313 const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2), |
| 314 vget_high_s16(a0a1_2)); |
| 315 const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217); |
| 316 const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217); |
| 317 const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352); |
| 318 const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352); |
| 319 const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9); |
| 320 const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9); |
| 321 Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2); |
| 322 } |
| 323 { // 2nd pass |
| 324 // the (1<<16) in the bias implements '+ (a3 != 0)' as 1 - (a3 == 0): the vceq mask (0 or -1) is added back below |
| 325 const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16)); |
| 326 const int32x4_t kCst51000 = vdupq_n_s32(51000); |
| 327 const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1) |
| 328 const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2) |
| 329 const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7)); |
| 330 const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4); |
| 331 const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4); |
| 332 const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217); |
| 333 const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217); |
| 334 const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352); |
| 335 const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352); |
| 336 const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000); |
| 337 const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000); |
| 338 const int16x4_t a3_eq_0 = |
| 339 vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0))); |
| 340 const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0); |
| 341 vst1_s16(out + 0, out0); |
| 342 vst1_s16(out + 4, out1); |
| 343 vst1_s16(out + 8, out2); |
| 344 vst1_s16(out + 12, out3); |
| 345 } |
| 346 } |
| 347 |
| 348 #else |
| 349 |
| 350 // adapted from vp8/encoder/arm/neon/shortfdct_neon.asm |
| 351 static const int16_t kCoeff16[] = { |
| 352 5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217 |
| 353 }; |
| 354 static const int32_t kCoeff32[] = { |
| 355 1812, 1812, 1812, 1812, |
| 356 937, 937, 937, 937, |
| 357 12000, 12000, 12000, 12000, |
| 358 51000, 51000, 51000, 51000 |
| 359 }; |
| 360 |
| 361 static void FTransform(const uint8_t* src, const uint8_t* ref, |
| 362 int16_t* out) { |
| 363 const int kBPS = BPS; |
| 364 const uint8_t* src_ptr = src; |
| 365 const uint8_t* ref_ptr = ref; |
| 366 const int16_t* coeff16 = kCoeff16; |
| 367 const int32_t* coeff32 = kCoeff32; |
| 368 |
| 369 __asm__ volatile ( |
| 370 // load src into q4, q5 in high half |
| 371 "vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n" |
| 372 "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n" |
| 373 "vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n" |
| 374 "vld1.8 {d11}, [%[src_ptr]] \n" |
| 375 |
| 376 // load ref into q6, q7 in high half |
| 377 "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n" |
| 378 "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n" |
| 379 "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n" |
| 380 "vld1.8 {d15}, [%[ref_ptr]] \n" |
| 381 |
| 382 // Pack the high values into q4 and q6 |
| 383 "vtrn.32 q4, q5 \n" |
| 384 "vtrn.32 q6, q7 \n" |
| 385 |
| 386 // d[0-3] = src - ref |
| 387 "vsubl.u8 q0, d8, d12 \n" |
| 388 "vsubl.u8 q1, d9, d13 \n" |
| 389 |
| 390 // load coeff16 into q8(d16=5352, d17=2217) |
| 391 "vld1.16 {q8}, [%[coeff16]] \n" |
| 392 |
| 393 // load the first half of coeff32 into q9 = 1812, q10 = 937 |
| 394 "vld1.32 {q9, q10}, [%[coeff32]]! \n" |
| 395 |
| 396 // load the second half of coeff32 into q11 = 12000, q12 = 51000 |
| 397 "vld1.32 {q11,q12}, [%[coeff32]] \n" |
| 398 |
| 399 // part 1 |
| 400 // Transpose. Register dN is the same as dN in C |
| 401 "vtrn.32 d0, d2 \n" |
| 402 "vtrn.32 d1, d3 \n" |
| 403 "vtrn.16 d0, d1 \n" |
| 404 "vtrn.16 d2, d3 \n" |
| 405 |
| 406 "vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3 |
| 407 "vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2 |
| 408 "vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2 |
| 409 "vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3 |
| 410 |
| 411 "vadd.s16 d0, d4, d5 \n" // a0 + a1 |
| 412 "vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3 |
| 413 "vsub.s16 d2, d4, d5 \n" // a0 - a1 |
| 414 "vshl.s16 d2, d2, #3 \n" // (temp[2+i*4] = (a0-a1) << 3 |
| 415 |
| 416 "vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812 |
| 417 "vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937 |
| 418 "vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812 |
| 419 "vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352 |
| 420 |
| 421 // temp[1+i*4] = (a2*2217 + a3*5352 + 1812) >> 9 |
| 422 // temp[3+i*4] = (a3*2217 + 937 - a2*5352) >> 9 |
| 423 "vshrn.s32 d1, q9, #9 \n" |
| 424 "vshrn.s32 d3, q10, #9 \n" |
| 425 |
| 426 // part 2 |
| 427 // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] |
| 428 "vtrn.32 d0, d2 \n" |
| 429 "vtrn.32 d1, d3 \n" |
| 430 "vtrn.16 d0, d1 \n" |
| 431 "vtrn.16 d2, d3 \n" |
| 432 |
| 433 "vmov.s16 d26, #7 \n" |
| 434 |
| 435 "vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12] |
| 436 "vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8] |
| 437 "vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8] |
| 438 "vadd.s16 d4, d4, d26 \n" // a1 + 7 |
| 439 "vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12] |
| 440 |
| 441 "vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7 |
| 442 "vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7 |
| 443 |
| 444 "vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000 |
| 445 "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000 |
| 446 |
| 447 "vceq.s16 d4, d7, #0 \n" |
| 448 |
| 449 "vshr.s16 d0, d0, #4 \n" |
| 450 "vshr.s16 d2, d2, #4 \n" |
| 451 |
| 452 "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000 |
| 453 "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000 |
| 454 |
| 455 "vmvn d4, d4 \n" // !(d1 == 0) |
| 456 // op[4] = (c1*2217 + d1*5352 + 12000)>>16 |
| 457 "vshrn.s32 d1, q11, #16 \n" |
| 458 // op[4] += (d1!=0) |
| 459 "vsub.s16 d1, d1, d4 \n" |
| 460 // op[12]= (d1*2217 - c1*5352 + 51000)>>16 |
| 461 "vshrn.s32 d3, q12, #16 \n" |
| 462 |
| 463 // store the result in the 'out' array |
| 464 "vst1.16 {q0, q1}, [%[out]] \n" |
| 465 : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr), |
| 466 [coeff32] "+r"(coeff32) // modified registers |
| 467 : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16), |
| 468 [out] "r"(out) // constants |
| 469 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", |
| 470 "q10", "q11", "q12", "q13" // clobbered |
| 471 ); |
| 472 } |
| 473 |
| 474 #endif |
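For reference while reviewing either FTransform path above, the per-4x4-block computation is, paraphrasing the portable encoder code (sketch only; the constants match the 5352/2217/1812/937/12000/51000 values used in both kernels):

    /* Paraphrased scalar reference; for review only, not part of this file. */
    static void FTransformRef(const uint8_t* src, const uint8_t* ref, int16_t* out) {
      int i;
      int tmp[16];
      for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {     /* horizontal pass */
        const int d0 = src[0] - ref[0];
        const int d1 = src[1] - ref[1];
        const int d2 = src[2] - ref[2];
        const int d3 = src[3] - ref[3];
        const int a0 = d0 + d3, a1 = d1 + d2;
        const int a2 = d1 - d2, a3 = d0 - d3;
        tmp[0 + i * 4] = (a0 + a1) * 8;
        tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
        tmp[2 + i * 4] = (a0 - a1) * 8;
        tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
      }
      for (i = 0; i < 4; ++i) {                             /* vertical pass */
        const int a0 = tmp[0 + i] + tmp[12 + i];
        const int a1 = tmp[4 + i] + tmp[ 8 + i];
        const int a2 = tmp[4 + i] - tmp[ 8 + i];
        const int a3 = tmp[0 + i] - tmp[12 + i];
        out[0 + i]  = (a0 + a1 + 7) >> 4;
        out[4 + i]  = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
        out[8 + i]  = (a0 - a1 + 7) >> 4;
        out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;
      }
    }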
| 475 |
| 476 #define LOAD_LANE_16b(VALUE, LANE) do { \ |
| 477 (VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \ |
| 478 src += stride; \ |
| 479 } while (0) |
| 480 |
| 481 static void FTransformWHT(const int16_t* src, int16_t* out) { |
| 482 const int stride = 16; |
| 483 const int16x4_t zero = vdup_n_s16(0); |
| 484 int32x4x4_t tmp0; |
| 485 int16x4x4_t in; |
| 486 INIT_VECTOR4(in, zero, zero, zero, zero); |
| 487 LOAD_LANE_16b(in.val[0], 0); |
| 488 LOAD_LANE_16b(in.val[1], 0); |
| 489 LOAD_LANE_16b(in.val[2], 0); |
| 490 LOAD_LANE_16b(in.val[3], 0); |
| 491 LOAD_LANE_16b(in.val[0], 1); |
| 492 LOAD_LANE_16b(in.val[1], 1); |
| 493 LOAD_LANE_16b(in.val[2], 1); |
| 494 LOAD_LANE_16b(in.val[3], 1); |
| 495 LOAD_LANE_16b(in.val[0], 2); |
| 496 LOAD_LANE_16b(in.val[1], 2); |
| 497 LOAD_LANE_16b(in.val[2], 2); |
| 498 LOAD_LANE_16b(in.val[3], 2); |
| 499 LOAD_LANE_16b(in.val[0], 3); |
| 500 LOAD_LANE_16b(in.val[1], 3); |
| 501 LOAD_LANE_16b(in.val[2], 3); |
| 502 LOAD_LANE_16b(in.val[3], 3); |
| 503 |
| 504 { |
| 505 // a0 = in[0 * 16] + in[2 * 16] |
| 506 // a1 = in[1 * 16] + in[3 * 16] |
| 507 // a2 = in[1 * 16] - in[3 * 16] |
| 508 // a3 = in[0 * 16] - in[2 * 16] |
| 509 const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]); |
| 510 const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]); |
| 511 const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]); |
| 512 const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]); |
| 513 tmp0.val[0] = vaddq_s32(a0, a1); |
| 514 tmp0.val[1] = vaddq_s32(a3, a2); |
| 515 tmp0.val[2] = vsubq_s32(a3, a2); |
| 516 tmp0.val[3] = vsubq_s32(a0, a1); |
| 517 } |
| 518 { |
| 519 const int32x4x4_t tmp1 = Transpose4x4(tmp0); |
| 520 // a0 = tmp[0 + i] + tmp[ 8 + i] |
| 521 // a1 = tmp[4 + i] + tmp[12 + i] |
| 522 // a2 = tmp[4 + i] - tmp[12 + i] |
| 523 // a3 = tmp[0 + i] - tmp[ 8 + i] |
| 524 const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]); |
| 525 const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]); |
| 526 const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]); |
| 527 const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]); |
| 528 const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1 |
| 529 const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1 |
| 530 const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1 |
| 531 const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1 |
| 532 const int16x4_t out0 = vmovn_s32(b0); |
| 533 const int16x4_t out1 = vmovn_s32(b1); |
| 534 const int16x4_t out2 = vmovn_s32(b2); |
| 535 const int16x4_t out3 = vmovn_s32(b3); |
| 536 |
| 537 vst1_s16(out + 0, out0); |
| 538 vst1_s16(out + 4, out1); |
| 539 vst1_s16(out + 8, out2); |
| 540 vst1_s16(out + 12, out3); |
| 541 } |
| 542 } |
| 543 #undef LOAD_LANE_16b |
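Reviewer note on the loads above (an assumption drawn from how 'src' advances): after the sixteen LOAD_LANE_16b() calls, lane j of in.val[k] holds src[16 * k + 64 * j], presumably the per-block DC coefficients stored 16 apart, so the arithmetic matches a scalar version reading in[0 * 16] .. in[3 * 16] with the row pointer stepping by 64. In plain C the load pattern is:

    /* Sketch of the load pattern only. */
    int16_t lanes[4][4];            /* lanes[k][j] mirrors lane j of in.val[k] */
    int j, k;
    for (j = 0; j < 4; ++j) {
      for (k = 0; k < 4; ++k) {
        lanes[k][j] = src[16 * k + 64 * j];
      }
    }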
| 544 |
| 545 //------------------------------------------------------------------------------ |
| 546 // Texture distortion |
| 547 // |
| 548 // We try to match the spectral content (weighted) between source and |
| 549 // reconstructed samples. |
| 550 |
| 551 // This code works but is *slower* than the inline-asm version below |
| 552 // (with gcc-4.6), so it is disabled for now. Later, it will be conditional on |
| 553 // the USE_INTRINSICS define. |
| 554 // With gcc-4.8, it is only slightly slower than the inline assembly. |
| 555 #if defined(USE_INTRINSICS) |
| 556 |
| 557 // Zero-extends a uint16x4_t 'v' to an int32x4_t. |
| 558 static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) { |
| 559 return vreinterpretq_s32_u32(vmovl_u16(v)); |
| 560 } |
| 561 |
| 562 // Does a regular 4x4 transpose followed by a swap of the upper halves of the |
| 563 // two inner rows, to restore the source order of the differences, |
| 564 // i.e., a0 - a1 | a3 - a2. |
| 565 static WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) { |
| 566 int32x4x4_t out = Transpose4x4(rows); |
| 567 // restore source order in the columns containing differences. |
| 568 const int32x2_t r1h = vget_high_s32(out.val[1]); |
| 569 const int32x2_t r2h = vget_high_s32(out.val[2]); |
| 570 out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h); |
| 571 out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h); |
| 572 return out; |
| 573 } |
| 574 |
| 575 static WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1, |
| 576 const uint8x8_t r2r3) { |
| 577 // a0 = in[0] + in[2] | a1 = in[1] + in[3] |
| 578 const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3); |
| 579 // a3 = in[0] - in[2] | a2 = in[1] - in[3] |
| 580 const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3); |
| 581 const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1)); // a0 + a1 |
| 582 const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2)); // a3 + a2 |
| 583 // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations. |
| 584 // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 |
| 585 // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 |
| 586 const int16x8x2_t transpose = |
| 587 vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2)); |
| 588 // tmp[3] = a0 - a1 | tmp[2] = a3 - a2 |
| 589 const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]), |
| 590 vget_low_s16(transpose.val[1])); |
| 591 const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]), |
| 592 vget_high_s16(transpose.val[1])); |
| 593 // [0]: tmp[3] [1]: tmp[2] |
| 594 const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2); |
| 595 const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } }; |
| 596 return res; |
| 597 } |
| 598 |
| 599 static WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) { |
| 600 // a0 = tmp[0 + i] + tmp[8 + i]; |
| 601 const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]); |
| 602 // a1 = tmp[4 + i] + tmp[12+ i]; |
| 603 const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]); |
| 604 // a2 = tmp[4 + i] - tmp[12+ i]; |
| 605 const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]); |
| 606 // a3 = tmp[0 + i] - tmp[8 + i]; |
| 607 const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]); |
| 608 const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1)); // abs(a0 + a1) |
| 609 const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2)); // abs(a3 + a2) |
| 610 const int32x4_t b2 = vabdq_s32(a3, a2); // abs(a3 - a2) |
| 611 const int32x4_t b3 = vabdq_s32(a0, a1); // abs(a0 - a1) |
| 612 const int32x4x4_t res = { { b0, b1, b2, b3 } }; |
| 613 return res; |
| 614 } |
| 615 |
| 616 // Calculate the weighted sum of the rows in 'b'. |
| 617 static WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b, |
| 618 const int32x4_t w0, const int32x4_t w1, |
| 619 const int32x4_t w2, const int32x4_t w3) { |
| 620 const int32x4_t s0 = vmulq_s32(w0, b.val[0]); |
| 621 const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]); |
| 622 const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]); |
| 623 const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]); |
| 624 const int64x2_t sum1 = vpaddlq_s32(s3); |
| 625 const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1)); |
| 626 return sum2; |
| 627 } |
| 628 |
| 629 #define LOAD_LANE_32b(src, VALUE, LANE) \ |
| 630 (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE)) |
| 631 |
| 632 // Hadamard transform |
| 633 // Returns the weighted sum of the absolute value of transformed coefficients. |
| 634 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, |
| 635 const uint16_t* const w) { |
| 636 uint32x4_t d0d1 = { 0, 0, 0, 0 }; |
| 637 uint32x4_t d2d3 = { 0, 0, 0, 0 }; |
| 638 LOAD_LANE_32b(a + 0 * BPS, d0d1, 0); // a00 a01 a02 a03 |
| 639 LOAD_LANE_32b(a + 1 * BPS, d0d1, 1); // a10 a11 a12 a13 |
| 640 LOAD_LANE_32b(b + 0 * BPS, d0d1, 2); // b00 b01 b02 b03 |
| 641 LOAD_LANE_32b(b + 1 * BPS, d0d1, 3); // b10 b11 b12 b13 |
| 642 LOAD_LANE_32b(a + 2 * BPS, d2d3, 0); // a20 a21 a22 a23 |
| 643 LOAD_LANE_32b(a + 3 * BPS, d2d3, 1); // a30 a31 a32 a33 |
| 644 LOAD_LANE_32b(b + 2 * BPS, d2d3, 2); // b20 b21 b22 b23 |
| 645 LOAD_LANE_32b(b + 3 * BPS, d2d3, 3); // b30 b31 b32 b33 |
| 646 |
| 647 { |
| 648 // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31 |
| 649 // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33 |
| 650 const uint16x8x2_t tmp = |
| 651 vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3)); |
| 652 const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]); |
| 653 const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]); |
| 654 const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8), |
| 655 vget_low_u8(d2d3u8)); |
| 656 const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8), |
| 657 vget_high_u8(d2d3u8)); |
| 658 const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a); |
| 659 const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b); |
| 660 const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a); |
| 661 const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b); |
| 662 const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0)); |
| 663 const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4)); |
| 664 const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8)); |
| 665 const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12)); |
| 666 const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3); |
| 667 const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3); |
| 668 const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1), |
| 669 vreinterpret_s32_s64(sum2)); |
| 670 const int32x2_t res = vshr_n_s32(diff, 5); |
| 671 return vget_lane_s32(res, 0); |
| 672 } |
| 673 } |
| 674 |
| 675 #undef LOAD_LANE_32b |
| 676 |
| 677 #else |
| 678 |
| 679 // Hadamard transform |
| 680 // Returns the weighted sum of the absolute value of transformed coefficients. |
| 681 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, |
| 682 const uint16_t* const w) { |
| 683 const int kBPS = BPS; |
| 684 const uint8_t* A = a; |
| 685 const uint8_t* B = b; |
| 686 const uint16_t* W = w; |
| 687 int sum; |
| 688 __asm__ volatile ( |
| 689 "vld1.32 d0[0], [%[a]], %[kBPS] \n" |
| 690 "vld1.32 d0[1], [%[a]], %[kBPS] \n" |
| 691 "vld1.32 d2[0], [%[a]], %[kBPS] \n" |
| 692 "vld1.32 d2[1], [%[a]] \n" |
| 693 |
| 694 "vld1.32 d1[0], [%[b]], %[kBPS] \n" |
| 695 "vld1.32 d1[1], [%[b]], %[kBPS] \n" |
| 696 "vld1.32 d3[0], [%[b]], %[kBPS] \n" |
| 697 "vld1.32 d3[1], [%[b]] \n" |
| 698 |
| 699 // a d0/d2, b d1/d3 |
| 700 // d0/d1: 01 01 01 01 |
| 701 // d2/d3: 23 23 23 23 |
| 702 // But: it goes 01 45 23 67 |
| 703 // Notice the middle values are transposed |
| 704 "vtrn.16 q0, q1 \n" |
| 705 |
| 706 // {a0, a1} = {in[0] + in[2], in[1] + in[3]} |
| 707 "vaddl.u8 q2, d0, d2 \n" |
| 708 "vaddl.u8 q10, d1, d3 \n" |
| 709 // {a3, a2} = {in[0] - in[2], in[1] - in[3]} |
| 710 "vsubl.u8 q3, d0, d2 \n" |
| 711 "vsubl.u8 q11, d1, d3 \n" |
| 712 |
| 713 // tmp[0] = a0 + a1 |
| 714 "vpaddl.s16 q0, q2 \n" |
| 715 "vpaddl.s16 q8, q10 \n" |
| 716 |
| 717 // tmp[1] = a3 + a2 |
| 718 "vpaddl.s16 q1, q3 \n" |
| 719 "vpaddl.s16 q9, q11 \n" |
| 720 |
| 721 // No pair subtract |
| 722 // q2 = {a0, a3} |
| 723 // q3 = {a1, a2} |
| 724 "vtrn.16 q2, q3 \n" |
| 725 "vtrn.16 q10, q11 \n" |
| 726 |
| 727 // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2} |
| 728 "vsubl.s16 q12, d4, d6 \n" |
| 729 "vsubl.s16 q13, d5, d7 \n" |
| 730 "vsubl.s16 q14, d20, d22 \n" |
| 731 "vsubl.s16 q15, d21, d23 \n" |
| 732 |
| 733 // separate tmp[3] and tmp[2] |
| 734 // q12 = tmp[3] |
| 735 // q13 = tmp[2] |
| 736 "vtrn.32 q12, q13 \n" |
| 737 "vtrn.32 q14, q15 \n" |
| 738 |
| 739 // Transpose tmp for a |
| 740 "vswp d1, d26 \n" // vtrn.64 |
| 741 "vswp d3, d24 \n" // vtrn.64 |
| 742 "vtrn.32 q0, q1 \n" |
| 743 "vtrn.32 q13, q12 \n" |
| 744 |
| 745 // Transpose tmp for b |
| 746 "vswp d17, d30 \n" // vtrn.64 |
| 747 "vswp d19, d28 \n" // vtrn.64 |
| 748 "vtrn.32 q8, q9 \n" |
| 749 "vtrn.32 q15, q14 \n" |
| 750 |
| 751 // The first Q register is a, the second b. |
| 752 // q0/8 tmp[0-3] |
| 753 // q13/15 tmp[4-7] |
| 754 // q1/9 tmp[8-11] |
| 755 // q12/14 tmp[12-15] |
| 756 |
| 757 // These are still in 01 45 23 67 order. We fix it easily in the addition |
| 758 // case, but the subtractions propagate the swapped order. |
| 759 "vswp d3, d27 \n" |
| 760 "vswp d19, d31 \n" |
| 761 |
| 762 // a0 = tmp[0] + tmp[8] |
| 763 "vadd.s32 q2, q0, q1 \n" |
| 764 "vadd.s32 q3, q8, q9 \n" |
| 765 |
| 766 // a1 = tmp[4] + tmp[12] |
| 767 "vadd.s32 q10, q13, q12 \n" |
| 768 "vadd.s32 q11, q15, q14 \n" |
| 769 |
| 770 // a2 = tmp[4] - tmp[12] |
| 771 "vsub.s32 q13, q13, q12 \n" |
| 772 "vsub.s32 q15, q15, q14 \n" |
| 773 |
| 774 // a3 = tmp[0] - tmp[8] |
| 775 "vsub.s32 q0, q0, q1 \n" |
| 776 "vsub.s32 q8, q8, q9 \n" |
| 777 |
| 778 // b0 = a0 + a1 |
| 779 "vadd.s32 q1, q2, q10 \n" |
| 780 "vadd.s32 q9, q3, q11 \n" |
| 781 |
| 782 // b1 = a3 + a2 |
| 783 "vadd.s32 q12, q0, q13 \n" |
| 784 "vadd.s32 q14, q8, q15 \n" |
| 785 |
| 786 // b2 = a3 - a2 |
| 787 "vsub.s32 q0, q0, q13 \n" |
| 788 "vsub.s32 q8, q8, q15 \n" |
| 789 |
| 790 // b3 = a0 - a1 |
| 791 "vsub.s32 q2, q2, q10 \n" |
| 792 "vsub.s32 q3, q3, q11 \n" |
| 793 |
| 794 "vld1.64 {q10, q11}, [%[w]] \n" |
| 795 |
| 796 // abs(b0) |
| 797 "vabs.s32 q1, q1 \n" |
| 798 "vabs.s32 q9, q9 \n" |
| 799 // abs(b1) |
| 800 "vabs.s32 q12, q12 \n" |
| 801 "vabs.s32 q14, q14 \n" |
| 802 // abs(b2) |
| 803 "vabs.s32 q0, q0 \n" |
| 804 "vabs.s32 q8, q8 \n" |
| 805 // abs(b3) |
| 806 "vabs.s32 q2, q2 \n" |
| 807 "vabs.s32 q3, q3 \n" |
| 808 |
| 809 // expand w before using. |
| 810 "vmovl.u16 q13, d20 \n" |
| 811 "vmovl.u16 q15, d21 \n" |
| 812 |
| 813 // w[0] * abs(b0) |
| 814 "vmul.u32 q1, q1, q13 \n" |
| 815 "vmul.u32 q9, q9, q13 \n" |
| 816 |
| 817 // w[4] * abs(b1) |
| 818 "vmla.u32 q1, q12, q15 \n" |
| 819 "vmla.u32 q9, q14, q15 \n" |
| 820 |
| 821 // expand w before using. |
| 822 "vmovl.u16 q13, d22 \n" |
| 823 "vmovl.u16 q15, d23 \n" |
| 824 |
| 825 // w[8] * abs(b2) |
| 826 "vmla.u32 q1, q0, q13 \n" |
| 827 "vmla.u32 q9, q8, q13 \n" |
| 828 |
| 829 // w[12] * abs(b3) |
| 830 "vmla.u32 q1, q2, q15 \n" |
| 831 "vmla.u32 q9, q3, q15 \n" |
| 832 |
| 833 // Sum the arrays |
| 834 "vpaddl.u32 q1, q1 \n" |
| 835 "vpaddl.u32 q9, q9 \n" |
| 836 "vadd.u64 d2, d3 \n" |
| 837 "vadd.u64 d18, d19 \n" |
| 838 |
| 839 // Hadamard transform needs 4 bits of extra precision (2 bits in each |
| 840 // direction) of dynamic range. Weights w[] are at most 16 bits, so the |
| 841 // maximum precision for coeff is 8 bits of input + 4 bits of Hadamard |
| 842 // transform + 16 bits for w[] + 2 bits of abs() summation. |
| 843 // |
| 844 // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is |
| 845 // A-OK. |
| 846 |
| 847 // sum2 - sum1 |
| 848 "vsub.u32 d0, d2, d18 \n" |
| 849 // abs(sum2 - sum1) |
| 850 "vabs.s32 d0, d0 \n" |
| 851 // abs(sum2 - sum1) >> 5 |
| 852 "vshr.u32 d0, #5 \n" |
| 853 |
| 854 // It would be better to move the value straight into r0 but I'm not |
| 855 // entirely sure how this works with inline assembly. |
| 856 "vmov.32 %[sum], d0[0] \n" |
| 857 |
| 858 : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W) |
| 859 : [kBPS] "r"(kBPS) |
| 860 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", |
| 861 "q10", "q11", "q12", "q13", "q14", "q15" // clobbered |
| 862 ); |
| 863 |
| 864 return sum; |
| 865 } |
| 866 |
| 867 #endif // USE_INTRINSICS |
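For context on both Disto4x4 paths (a paraphrase of the portable code, shown for review only): each 4x4 block goes through a Hadamard-like transform, the absolute transformed coefficients are weighted by w[] and summed, and the score is the scaled difference between the two blocks:

    /* Sketch only, paraphrasing the portable TTransform; abs() is from <stdlib.h>. */
    static int TTransformRef(const uint8_t* in, const uint16_t* w) {
      int sum = 0;
      int tmp[16];
      int i;
      for (i = 0; i < 4; ++i, in += BPS) {                  /* horizontal pass */
        const int a0 = in[0] + in[2], a1 = in[1] + in[3];
        const int a2 = in[1] - in[3], a3 = in[0] - in[2];
        tmp[0 + i * 4] = a0 + a1;
        tmp[1 + i * 4] = a3 + a2;
        tmp[2 + i * 4] = a3 - a2;
        tmp[3 + i * 4] = a0 - a1;
      }
      for (i = 0; i < 4; ++i) {                             /* vertical pass */
        const int a0 = tmp[0 + i] + tmp[8 + i],  a1 = tmp[4 + i] + tmp[12 + i];
        const int a2 = tmp[4 + i] - tmp[12 + i], a3 = tmp[0 + i] - tmp[8 + i];
        sum += w[ 0 + i] * abs(a0 + a1) + w[ 4 + i] * abs(a3 + a2)
             + w[ 8 + i] * abs(a3 - a2) + w[12 + i] * abs(a0 - a1);
      }
      return sum;
    }
    /* Disto4x4(a, b, w) is then roughly abs(TTransformRef(b, w) - TTransformRef(a, w)) >> 5. */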
| 868 |
| 869 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, |
| 870 const uint16_t* const w) { |
| 871 int D = 0; |
| 872 int x, y; |
| 873 for (y = 0; y < 16 * BPS; y += 4 * BPS) { |
| 874 for (x = 0; x < 16; x += 4) { |
| 875 D += Disto4x4(a + x + y, b + x + y, w); |
| 876 } |
| 877 } |
| 878 return D; |
| 879 } |
| 880 |
| 881 //------------------------------------------------------------------------------ |
| 882 |
| 883 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, |
| 884 int start_block, int end_block, |
| 885 VP8Histogram* const histo) { |
| 886 const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH); |
| 887 int j; |
| 888 for (j = start_block; j < end_block; ++j) { |
| 889 int16_t out[16]; |
| 890 FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); |
| 891 { |
| 892 int k; |
| 893 const int16x8_t a0 = vld1q_s16(out + 0); |
| 894 const int16x8_t b0 = vld1q_s16(out + 8); |
| 895 const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0)); |
| 896 const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0)); |
| 897 const uint16x8_t a2 = vshrq_n_u16(a1, 3); |
| 898 const uint16x8_t b2 = vshrq_n_u16(b1, 3); |
| 899 const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh); |
| 900 const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh); |
| 901 vst1q_s16(out + 0, vreinterpretq_s16_u16(a3)); |
| 902 vst1q_s16(out + 8, vreinterpretq_s16_u16(b3)); |
| 903 // Convert coefficients to bin indices. |
| 904 for (k = 0; k < 16; ++k) { |
| 905 histo->distribution[out[k]]++; |
| 906 } |
| 907 } |
| 908 } |
| 909 } |
| 910 |
| 911 //------------------------------------------------------------------------------ |
| 912 |
| 913 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, |
| 914 const uint8_t* const b, |
| 915 uint32x4_t* const sum) { |
| 916 const uint8x16_t a0 = vld1q_u8(a); |
| 917 const uint8x16_t b0 = vld1q_u8(b); |
| 918 const uint8x16_t abs_diff = vabdq_u8(a0, b0); |
| 919 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); |
| 920 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); |
| 921 *sum = vpadalq_u16(*sum, prod); // pair-wise add and accumulate |
| 922 } |
| 923 |
| 924 // Horizontal sum of all four uint32_t values in 'sum'. |
| 925 static int SumToInt(uint32x4_t sum) { |
| 926 const uint64x2_t sum2 = vpaddlq_u32(sum); |
| 927 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); |
| 928 return (int)sum3; |
| 929 } |
| 930 |
| 931 static int SSE16x16(const uint8_t* a, const uint8_t* b) { |
| 932 uint32x4_t sum = vdupq_n_u32(0); |
| 933 int y; |
| 934 for (y = 0; y < 16; ++y) { |
| 935 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); |
| 936 } |
| 937 return SumToInt(sum); |
| 938 } |
| 939 |
| 940 static int SSE16x8(const uint8_t* a, const uint8_t* b) { |
| 941 uint32x4_t sum = vdupq_n_u32(0); |
| 942 int y; |
| 943 for (y = 0; y < 8; ++y) { |
| 944 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); |
| 945 } |
| 946 return SumToInt(sum); |
| 947 } |
| 948 |
| 949 static int SSE8x8(const uint8_t* a, const uint8_t* b) { |
| 950 uint32x4_t sum = vdupq_n_u32(0); |
| 951 int y; |
| 952 for (y = 0; y < 8; ++y) { |
| 953 const uint8x8_t a0 = vld1_u8(a + y * BPS); |
| 954 const uint8x8_t b0 = vld1_u8(b + y * BPS); |
| 955 const uint8x8_t abs_diff = vabd_u8(a0, b0); |
| 956 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); |
| 957 sum = vpadalq_u16(sum, prod); |
| 958 } |
| 959 return SumToInt(sum); |
| 960 } |
| 961 |
| 962 static int SSE4x4(const uint8_t* a, const uint8_t* b) { |
| 963 const uint8x16_t a0 = Load4x4(a); |
| 964 const uint8x16_t b0 = Load4x4(b); |
| 965 const uint8x16_t abs_diff = vabdq_u8(a0, b0); |
| 966 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); |
| 967 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); |
| 968 return SumToInt(vpaddlq_u16(prod)); |
| 969 } |
| 970 |
| 971 //------------------------------------------------------------------------------ |
| 972 |
| 973 // Compilation with gcc-4.6.x is problematic for now. |
| 974 #if !defined(WORK_AROUND_GCC) |
| 975 |
| 976 static int16x8_t Quantize(int16_t* const in, |
| 977 const VP8Matrix* const mtx, int offset) { |
| 978 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); |
| 979 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); |
| 980 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); |
| 981 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]); |
| 982 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]); |
| 983 |
| 984 const int16x8_t a = vld1q_s16(in + offset); // in |
| 985 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in) |
| 986 const int16x8_t sign = vshrq_n_s16(a, 15); // sign |
| 987 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen |
| 988 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq)); |
| 989 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq)); |
| 990 const uint32x4_t m2 = vhaddq_u32(m0, bias0); |
| 991 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1 |
| 992 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16), |
| 993 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1 |
| 994 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL)); |
| 995 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign); |
| 996 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign |
| 997 const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q)); |
| 998 vst1q_s16(in + offset, c4); |
| 999 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1 |
| 1000 return c3; |
| 1001 } |
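For comparison (a paraphrase of the portable quantizer, shown for review only), each lane above performs the following per-coefficient step, with QFIX == 17; the vhaddq halving-add folds one bit of that shift before the narrowing shift by 16:

    /* Scalar sketch of one coefficient; illustrative only. */
    static int QuantizeCoeffRef(int16_t* const in, const VP8Matrix* const mtx, int j) {
      const int sign = (in[j] < 0);
      const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
      int level = (int)((coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX);
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
      in[j] = level * mtx->q_[j];   /* dequantized value, written back */
      return level;                 /* quantized level, stored in zigzag order */
    }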
| 1002 |
| 1003 static const uint8_t kShuffles[4][8] = { |
| 1004 { 0, 1, 2, 3, 8, 9, 16, 17 }, |
| 1005 { 10, 11, 4, 5, 6, 7, 12, 13 }, |
| 1006 { 18, 19, 24, 25, 26, 27, 20, 21 }, |
| 1007 { 14, 15, 22, 23, 28, 29, 30, 31 } |
| 1008 }; |
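Reviewer note (a derivation, assuming the usual VP8 zigzag scan used by the portable quantizer): since coefficient k of out0|out1 occupies bytes 2*k and 2*k+1 of the concatenated 32-byte table, kShuffles[] is just the zigzag order expressed as byte indices:

    /* Sketch: how the table above can be regenerated. */
    static const uint8_t kZigzagRef[16] = {
      0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
    };
    /* kShuffles[i][2 * j + 0] == 2 * kZigzagRef[4 * i + j]     */
    /* kShuffles[i][2 * j + 1] == 2 * kZigzagRef[4 * i + j] + 1 */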
| 1009 |
| 1010 static int QuantizeBlock(int16_t in[16], int16_t out[16], |
| 1011 const VP8Matrix* const mtx) { |
| 1012 const int16x8_t out0 = Quantize(in, mtx, 0); |
| 1013 const int16x8_t out1 = Quantize(in, mtx, 8); |
| 1014 uint8x8x4_t shuffles; |
| 1015 // vtbl4_u8 is marked unavailable for iOS arm64, use wider versions there. |
| 1016 #if defined(__APPLE__) && defined(__aarch64__) && \ |
| 1017 defined(__apple_build_version__) |
| 1018 uint8x16x2_t all_out; |
| 1019 INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1)); |
| 1020 INIT_VECTOR4(shuffles, |
| 1021 vtbl2q_u8(all_out, vld1_u8(kShuffles[0])), |
| 1022 vtbl2q_u8(all_out, vld1_u8(kShuffles[1])), |
| 1023 vtbl2q_u8(all_out, vld1_u8(kShuffles[2])), |
| 1024 vtbl2q_u8(all_out, vld1_u8(kShuffles[3]))); |
| 1025 #else |
| 1026 uint8x8x4_t all_out; |
| 1027 INIT_VECTOR4(all_out, |
| 1028 vreinterpret_u8_s16(vget_low_s16(out0)), |
| 1029 vreinterpret_u8_s16(vget_high_s16(out0)), |
| 1030 vreinterpret_u8_s16(vget_low_s16(out1)), |
| 1031 vreinterpret_u8_s16(vget_high_s16(out1))); |
| 1032 INIT_VECTOR4(shuffles, |
| 1033 vtbl4_u8(all_out, vld1_u8(kShuffles[0])), |
| 1034 vtbl4_u8(all_out, vld1_u8(kShuffles[1])), |
| 1035 vtbl4_u8(all_out, vld1_u8(kShuffles[2])), |
| 1036 vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); |
| 1037 #endif |
| 1038 // Zigzag reordering |
| 1039 vst1_u8((uint8_t*)(out + 0), shuffles.val[0]); |
| 1040 vst1_u8((uint8_t*)(out + 4), shuffles.val[1]); |
| 1041 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]); |
| 1042 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]); |
| 1043 // test zeros |
| 1044 if (*(uint64_t*)(out + 0) != 0) return 1; |
| 1045 if (*(uint64_t*)(out + 4) != 0) return 1; |
| 1046 if (*(uint64_t*)(out + 8) != 0) return 1; |
| 1047 if (*(uint64_t*)(out + 12) != 0) return 1; |
| 1048 return 0; |
| 1049 } |
| 1050 |
| 1051 #endif // !WORK_AROUND_GCC |
| 1052 |
| 1053 #endif // WEBP_USE_NEON |
| 1054 |
| 1055 //------------------------------------------------------------------------------ |
| 1056 // Entry point |
| 1057 |
| 1058 extern void VP8EncDspInitNEON(void); |
| 1059 |
| 1060 void VP8EncDspInitNEON(void) { |
| 1061 #if defined(WEBP_USE_NEON) |
| 1062 VP8ITransform = ITransform; |
| 1063 VP8FTransform = FTransform; |
| 1064 |
| 1065 VP8FTransformWHT = FTransformWHT; |
| 1066 |
| 1067 VP8TDisto4x4 = Disto4x4; |
| 1068 VP8TDisto16x16 = Disto16x16; |
| 1069 VP8CollectHistogram = CollectHistogram; |
| 1070 VP8SSE16x16 = SSE16x16; |
| 1071 VP8SSE16x8 = SSE16x8; |
| 1072 VP8SSE8x8 = SSE8x8; |
| 1073 VP8SSE4x4 = SSE4x4; |
| 1074 #if !defined(WORK_AROUND_GCC) |
| 1075 VP8EncQuantizeBlock = QuantizeBlock; |
| 1076 #endif |
| 1077 #endif // WEBP_USE_NEON |
| 1078 } |