OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include <arm_neon.h> |
| 12 |
| 13 #include "./vpx_config.h" |
| 14 |
// Fixed-point DCT twiddle factors: cospi_k_64 == round(16384 * cos(k*pi/64)),
// i.e. the cosine scaled by 2^14.  DO_BUTTERFLY() multiplies by these and
// rounds the products back down with a >> 14.  They are pure lookup
// constants, so declare them const.
static const int16_t cospi_1_64 = 16364;
static const int16_t cospi_2_64 = 16305;
static const int16_t cospi_3_64 = 16207;
static const int16_t cospi_4_64 = 16069;
static const int16_t cospi_5_64 = 15893;
static const int16_t cospi_6_64 = 15679;
static const int16_t cospi_7_64 = 15426;
static const int16_t cospi_8_64 = 15137;
static const int16_t cospi_9_64 = 14811;
static const int16_t cospi_10_64 = 14449;
static const int16_t cospi_11_64 = 14053;
static const int16_t cospi_12_64 = 13623;
static const int16_t cospi_13_64 = 13160;
static const int16_t cospi_14_64 = 12665;
static const int16_t cospi_15_64 = 12140;
static const int16_t cospi_16_64 = 11585;
static const int16_t cospi_17_64 = 11003;
static const int16_t cospi_18_64 = 10394;
static const int16_t cospi_19_64 = 9760;
static const int16_t cospi_20_64 = 9102;
static const int16_t cospi_21_64 = 8423;
static const int16_t cospi_22_64 = 7723;
static const int16_t cospi_23_64 = 7005;
static const int16_t cospi_24_64 = 6270;
static const int16_t cospi_25_64 = 5520;
static const int16_t cospi_26_64 = 4756;
static const int16_t cospi_27_64 = 3981;
static const int16_t cospi_28_64 = 3196;
static const int16_t cospi_29_64 = 2404;
static const int16_t cospi_30_64 = 1606;
static const int16_t cospi_31_64 = 804;
| 46 |
// Load rows `first` and `second` (8 lanes each) of the transposed 32x8
// coefficient slice into the fixed q14/q13 locals used by DO_BUTTERFLY_STD.
// The `prev` argument is unused; it carries over the previously accessed row
// index from the original assembly as call-site documentation.
#define LOAD_FROM_TRANSPOSED(prev, first, second) \
    q14s16 = vld1q_s16(trans_buf + first * 8); \
    q13s16 = vld1q_s16(trans_buf + second * 8);

// Load rows `first` and `second` of the 32x32 `out` buffer (row stride 32)
// into qA/qB.  `prev` is unused bookkeeping, as above.
#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
    qA = vld1q_s16(out + first * 32); \
    qB = vld1q_s16(out + second * 32);

// Store qA/qB into rows `first` and `second` of the 32x32 `out` buffer.
#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
    vst1q_s16(out + first * 32, qA); \
    vst1q_s16(out + second * 32, qB);
| 58 |
// Round four IDCT result rows (q6..q9), add them to the destination pixels
// around the vertical center of the block, and store the saturated u8
// results.  The macro forwards the fixed q-register locals of the caller.
// Renamed from __STORE_COMBINE_CENTER_RESULTS: identifiers starting with a
// double underscore are reserved for the implementation (C11 7.1.3).
#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
    store_combine_center_results(r10, r9, stride, \
                                 q6s16, q7s16, q8s16, q9s16);
// p1 starts at the upper-center row and walks down; p2 starts at the
// lower-center row and walks up.  Each vld1_s16/vst1_s16 on the (cast)
// pixel pointer moves 64 bits, i.e. eight 8-bit pixels; the int16x4_t is
// only a transport container and is reinterpreted to u8 before use.
static INLINE void store_combine_center_results(
    uint8_t *p1,
    uint8_t *p2,
    int stride,
    int16x8_t q6s16,
    int16x8_t q7s16,
    int16x8_t q8s16,
    int16x8_t q9s16) {
  int16x4_t d8s16, d9s16, d10s16, d11s16;

  // Load two destination rows through each pointer.
  d8s16 = vld1_s16((int16_t *)p1);
  p1 += stride;
  d11s16 = vld1_s16((int16_t *)p2);
  p2 -= stride;
  d9s16 = vld1_s16((int16_t *)p1);
  d10s16 = vld1_s16((int16_t *)p2);

  // Final IDCT rounding: (x + 32) >> 6.
  q7s16 = vrshrq_n_s16(q7s16, 6);
  q8s16 = vrshrq_n_s16(q8s16, 6);
  q9s16 = vrshrq_n_s16(q9s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);

  // Widen the u8 pixels to 16 bits and accumulate the residual.
  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
                                         vreinterpret_u8_s16(d9s16)));
  q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
                                         vreinterpret_u8_s16(d10s16)));
  q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
                                         vreinterpret_u8_s16(d11s16)));
  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
                                         vreinterpret_u8_s16(d8s16)));

  // Saturate back to u8.
  d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
  d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
  d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
  d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));

  // Store in the reverse walking order so each pointer ends where it began.
  vst1_s16((int16_t *)p1, d9s16);
  p1 -= stride;
  vst1_s16((int16_t *)p2, d10s16);
  p2 += stride;
  vst1_s16((int16_t *)p1, d8s16);
  vst1_s16((int16_t *)p2, d11s16);
}
| 106 |
// Round four IDCT result rows (q4..q7), add them to the destination pixels
// at the outer edges of the block (top rows via p1, bottom rows via p2),
// and store the saturated u8 results.
// Fixes: removed the stray ';' that followed the macro parameter list
// (it injected an empty statement at the start of every expansion), and
// renamed the reserved __-prefixed helper identifier (C11 7.1.3).
#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
    store_combine_extreme_results(r7, r6, stride, \
                                  q4s16, q5s16, q6s16, q7s16);
// Mirrors store_combine_center_results: p1 walks down from the top edge,
// p2 walks up from the bottom edge.  64-bit s16 loads/stores are used to
// move eight 8-bit pixels at a time.
static INLINE void store_combine_extreme_results(
    uint8_t *p1,
    uint8_t *p2,
    int stride,
    int16x8_t q4s16,
    int16x8_t q5s16,
    int16x8_t q6s16,
    int16x8_t q7s16) {
  int16x4_t d4s16, d5s16, d6s16, d7s16;

  // Load two destination rows through each pointer.
  d4s16 = vld1_s16((int16_t *)p1);
  p1 += stride;
  d7s16 = vld1_s16((int16_t *)p2);
  p2 -= stride;
  d5s16 = vld1_s16((int16_t *)p1);
  d6s16 = vld1_s16((int16_t *)p2);

  // Final IDCT rounding: (x + 32) >> 6.
  q5s16 = vrshrq_n_s16(q5s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);
  q7s16 = vrshrq_n_s16(q7s16, 6);
  q4s16 = vrshrq_n_s16(q4s16, 6);

  // Widen the u8 pixels to 16 bits and accumulate the residual.
  q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
                                         vreinterpret_u8_s16(d5s16)));
  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
                                         vreinterpret_u8_s16(d6s16)));
  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
                                         vreinterpret_u8_s16(d7s16)));
  q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
                                         vreinterpret_u8_s16(d4s16)));

  // Saturate back to u8.
  d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
  d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
  d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
  d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));

  vst1_s16((int16_t *)p1, d5s16);
  p1 -= stride;
  vst1_s16((int16_t *)p2, d6s16);
  p2 += stride;
  vst1_s16((int16_t *)p2, d7s16);
  vst1_s16((int16_t *)p1, d4s16);
}
| 154 |
// Standard-form butterfly on the fixed q14/q13 locals loaded by
// LOAD_FROM_TRANSPOSED.
#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
    DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
// Fixed-point rotation (DCT butterfly):
//   *qAs16 = round((q14s16 * first_const  - q13s16 * second_const) >> 14)
//   *qBs16 = round((q14s16 * second_const + q13s16 * first_const)  >> 14)
// Each 8-lane input is processed as two 4-lane halves because vmull_s16
// widens 4x s16 -> 4x s32; vqrshrn_n_s32 rounds, narrows and saturates
// the 32-bit products back to s16.
static INLINE void DO_BUTTERFLY(
    int16x8_t q14s16,
    int16x8_t q13s16,
    int16_t first_const,
    int16_t second_const,
    int16x8_t *qAs16,
    int16x8_t *qBs16) {
  int16x4_t d30s16, d31s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
  int16x4_t dCs16, dDs16, dAs16, dBs16;

  // Split inputs into low/high 4-lane halves.
  dCs16 = vget_low_s16(q14s16);
  dDs16 = vget_high_s16(q14s16);
  dAs16 = vget_low_s16(q13s16);
  dBs16 = vget_high_s16(q13s16);

  // Broadcast the two twiddle constants.
  d30s16 = vdup_n_s16(first_const);
  d31s16 = vdup_n_s16(second_const);

  // qA halves: q14*first - q13*second (widened to 32 bits).
  q8s32 = vmull_s16(dCs16, d30s16);
  q10s32 = vmull_s16(dAs16, d31s16);
  q9s32 = vmull_s16(dDs16, d30s16);
  q11s32 = vmull_s16(dBs16, d31s16);
  q12s32 = vmull_s16(dCs16, d31s16);

  q8s32 = vsubq_s32(q8s32, q10s32);
  q9s32 = vsubq_s32(q9s32, q11s32);

  // qB halves: q14*second + q13*first.
  q10s32 = vmull_s16(dDs16, d31s16);
  q11s32 = vmull_s16(dAs16, d30s16);
  q15s32 = vmull_s16(dBs16, d30s16);

  q11s32 = vaddq_s32(q12s32, q11s32);
  q10s32 = vaddq_s32(q10s32, q15s32);

  // Round, narrow (>> 14 with saturation) and recombine the halves.
  *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
                        vqrshrn_n_s32(q9s32, 14));
  *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
                        vqrshrn_n_s32(q10s32, 14));
  return;
}
| 198 |
// Transpose a 32-column x 8-row slice of `input` (row stride 32) into
// `t_buf`, processed as four 8x8 sub-blocks.  Each 8x8 block is transposed
// with the classic NEON three-step scheme: a d-register swap emulated by
// vcombine (the "vswp" comments map back to the original assembly),
// followed by 32-bit then 16-bit vtrn interleaves.  The transposed rows are
// written to t_buf sequentially, 8 values per row (t_buf advances 32*8
// int16 in total).
static INLINE void idct32_transpose_pair(
    int16_t *input,
    int16_t *t_buf) {
  int16_t *in;
  int i;
  const int stride = 32;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

  for (i = 0; i < 4; i++, input += 8) {  // one 8x8 sub-block per iteration
    // Load the eight rows of this 8-column sub-block.
    in = input;
    q8s16 = vld1q_s16(in);
    in += stride;
    q9s16 = vld1q_s16(in);
    in += stride;
    q10s16 = vld1q_s16(in);
    in += stride;
    q11s16 = vld1q_s16(in);
    in += stride;
    q12s16 = vld1q_s16(in);
    in += stride;
    q13s16 = vld1q_s16(in);
    in += stride;
    q14s16 = vld1q_s16(in);
    in += stride;
    q15s16 = vld1q_s16(in);

    d16s16 = vget_low_s16(q8s16);
    d17s16 = vget_high_s16(q8s16);
    d18s16 = vget_low_s16(q9s16);
    d19s16 = vget_high_s16(q9s16);
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);
    d30s16 = vget_low_s16(q15s16);
    d31s16 = vget_high_s16(q15s16);

    // Step 1: swap 64-bit halves across the 4-row boundary.
    q8s16 = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
    q9s16 = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
    q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
    q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
    q12s16 = vcombine_s16(d17s16, d25s16);
    q13s16 = vcombine_s16(d19s16, d27s16);
    q14s16 = vcombine_s16(d21s16, d29s16);
    q15s16 = vcombine_s16(d23s16, d31s16);

    // Step 2: transpose 2x2 blocks of 32-bit pairs.
    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
                        vreinterpretq_s32_s16(q10s16));
    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
                        vreinterpretq_s32_s16(q11s16));
    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
                        vreinterpretq_s32_s16(q14s16));
    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
                        vreinterpretq_s32_s16(q15s16));

    // Step 3: transpose adjacent 16-bit lanes.
    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

    // Write the eight transposed rows contiguously.
    vst1q_s16(t_buf, q0x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q0x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q1x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q1x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q2x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q2x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q3x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q3x2s16.val[1]);
    t_buf += 8;
  }
  return;
}
| 292 |
// First-pass epilogue: combine the stage intermediates in q2..q15 with the
// band results previously stored in `out` (32x32 buffer, row stride 32)
// and write all 32 rows of the current 8-column slice back to `out`.
// Each add/sub quartet forms sum rows (stored near the top) and difference
// rows (stored mirrored near the bottom).  Statement order matters: the
// LOAD/STORE macros reuse q0..q9 as scratch throughout; the macros' first
// argument is only a bookkeeping hint from the original assembly.
static INLINE void idct32_bands_end_1st_pass(
    int16_t *out,
    int16x8_t q2s16,
    int16x8_t q3s16,
    int16x8_t q6s16,
    int16x8_t q7s16,
    int16x8_t q8s16,
    int16x8_t q9s16,
    int16x8_t q10s16,
    int16x8_t q11s16,
    int16x8_t q12s16,
    int16x8_t q13s16,
    int16x8_t q14s16,
    int16x8_t q15s16) {
  int16x8_t q0s16, q1s16, q4s16, q5s16;

  STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
  STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);

  // rows 0/1 and 30/31
  LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
  STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);

  // rows 12/13 and 18/19
  LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);

  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
  STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);

  // rows 2/3 and 28/29
  LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
  STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);

  // rows 10/11 and 20/21
  LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);

  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
  STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);

  // rows 4/5 and 26/27
  LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
  STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);

  // rows 8/9 and 22/23
  LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);

  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
  STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);

  // rows 6/7 and 24/25
  LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
  STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
  return;
}
| 387 |
// Second-pass epilogue: same band-combining structure as the first pass,
// but instead of writing back to `out`, each sum/difference quartet is
// rounded and added to the destination pixels via the STORE_COMBINE macros.
// r7/r6 walk inward from the top/bottom edges, r10/r9 walk outward from
// the vertical center; all four advance by two rows (str2) per quartet.
// The STORE_COMBINE macros implicitly consume q4..q9, so statement order
// is significant.
static INLINE void idct32_bands_end_2nd_pass(
    int16_t *out,
    uint8_t *dest,
    int stride,
    int16x8_t q2s16,
    int16x8_t q3s16,
    int16x8_t q6s16,
    int16x8_t q7s16,
    int16x8_t q8s16,
    int16x8_t q9s16,
    int16x8_t q10s16,
    int16x8_t q11s16,
    int16x8_t q12s16,
    int16x8_t q13s16,
    int16x8_t q14s16,
    int16x8_t q15s16) {
  uint8_t *r6 = dest + 31 * stride;   // bottom edge, walks up
  uint8_t *r7 = dest/* + 0 * stride*/;  // top edge, walks down
  uint8_t *r9 = dest + 15 * stride;   // upper center, walks up
  uint8_t *r10 = dest + 16 * stride;  // lower center, walks down
  int str2 = stride << 1;             // two rows per store pair
  int16x8_t q0s16, q1s16, q4s16, q5s16;

  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2; r9 -= str2;

  // rows 0/1 and 30/31
  LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2; r6 -= str2;

  // rows 12/13 and 18/19
  LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);

  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2; r9 -= str2;

  // rows 2/3 and 28/29
  LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2; r6 -= str2;

  // rows 10/11 and 20/21
  LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);

  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2; r9 -= str2;

  // rows 4/5 and 26/27
  LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2; r6 -= str2;

  // rows 8/9 and 22/23
  LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);

  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);

  // rows 6/7 and 24/25 (final quartet; no pointer advance needed)
  LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  return;
}
| 487 |
// Full 32x32 inverse DCT plus reconstruction for one block of 1024
// coefficients:
//   pass 1: transform the rows of `input` into pass1[];
//   pass 2: transform the columns (pass1 re-read via the same transpose)
//           into pass2[], adding the rounded result to the destination
//           pixels at `dest` (byte stride `stride`).
// The q0s16..q15s16 locals mirror the q0-q15 registers of the original
// assembly; LOAD_FROM_TRANSPOSED / DO_BUTTERFLY_STD / the STORE_COMBINE
// macros read and write these fixed names implicitly, so the exact
// statement order is significant throughout.  The stage numbering in the
// comments follows the reference vp9 idct32 flow graph.
void vp9_idct32x32_1024_add_neon(
    int16_t *input,
    uint8_t *dest,
    int stride) {
  int i, idct32_pass_loop;
  int16_t trans_buf[32 * 8];  // one transposed 32x8 slice at a time
  int16_t pass1[32 * 32];
  int16_t pass2[32 * 32];
  int16_t *out;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;

  for (idct32_pass_loop = 0, out = pass1;
       idct32_pass_loop < 2;
       idct32_pass_loop++,
       input = pass1,  // the input of pass2 is the result of pass1
       out = pass2) {
    for (i = 0;
         i < 4; i++,
         input += 32 * 8, out += 8) {  // idct32_bands_loop
      idct32_transpose_pair(input, trans_buf);

      // -----------------------------------------
      // BLOCK A: 16-19,28-31
      // -----------------------------------------
      // generate 16,17,30,31
      // part of stage 1
      LOAD_FROM_TRANSPOSED(0, 1, 31)
      DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(31, 17, 15)
      DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
      // part of stage 2
      q4s16 = vaddq_s16(q0s16, q1s16);
      q13s16 = vsubq_s16(q0s16, q1s16);
      q6s16 = vaddq_s16(q2s16, q3s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      // part of stage 3
      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)

      // generate 18,19,28,29
      // part of stage 1
      LOAD_FROM_TRANSPOSED(15, 9, 23)
      DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(23, 25, 7)
      DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
      // part of stage 2
      q13s16 = vsubq_s16(q3s16, q2s16);
      q3s16 = vaddq_s16(q3s16, q2s16);
      q14s16 = vsubq_s16(q1s16, q0s16);
      q2s16 = vaddq_s16(q1s16, q0s16);
      // part of stage 3
      DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
      // part of stage 4
      q8s16 = vaddq_s16(q4s16, q2s16);
      q9s16 = vaddq_s16(q5s16, q0s16);
      q10s16 = vaddq_s16(q7s16, q1s16);
      q15s16 = vaddq_s16(q6s16, q3s16);
      q13s16 = vsubq_s16(q5s16, q0s16);
      q14s16 = vsubq_s16(q7s16, q1s16);
      STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
      STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
      STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
      // part of stage 4
      q13s16 = vsubq_s16(q4s16, q2s16);
      q14s16 = vsubq_s16(q6s16, q3s16);
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
      STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)

      // -----------------------------------------
      // BLOCK B: 20-23,24-27
      // -----------------------------------------
      // generate 20,21,26,27
      // part of stage 1
      LOAD_FROM_TRANSPOSED(7, 5, 27)
      DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(27, 21, 11)
      DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
      // part of stage 2
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 3
      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)

      // generate 22,23,24,25
      // part of stage 1
      LOAD_FROM_TRANSPOSED(11, 13, 19)
      DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(19, 29, 3)
      DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
      // part of stage 2
      q14s16 = vsubq_s16(q4s16, q5s16);
      q5s16 = vaddq_s16(q4s16, q5s16);
      q13s16 = vsubq_s16(q6s16, q7s16);
      q6s16 = vaddq_s16(q6s16, q7s16);
      // part of stage 3
      DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
      // part of stage 4
      q10s16 = vaddq_s16(q7s16, q1s16);
      q11s16 = vaddq_s16(q5s16, q0s16);
      q12s16 = vaddq_s16(q6s16, q2s16);
      q15s16 = vaddq_s16(q4s16, q3s16);
      // part of stage 6
      LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
      q8s16 = vaddq_s16(q14s16, q11s16);
      q9s16 = vaddq_s16(q13s16, q10s16);
      q13s16 = vsubq_s16(q13s16, q10s16);
      q11s16 = vsubq_s16(q14s16, q11s16);
      STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
      LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
      q8s16 = vsubq_s16(q9s16, q12s16);
      q10s16 = vaddq_s16(q14s16, q15s16);
      q14s16 = vsubq_s16(q14s16, q15s16);
      q12s16 = vaddq_s16(q9s16, q12s16);
      STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
      // part of stage 7
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
      // reuse the q11/q8 differences for the second cospi_16 butterfly
      q13s16 = q11s16;
      q14s16 = q8s16;
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
      // part of stage 4
      q14s16 = vsubq_s16(q5s16, q0s16);
      q13s16 = vsubq_s16(q6s16, q2s16);
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
      q14s16 = vsubq_s16(q7s16, q1s16);
      q13s16 = vsubq_s16(q4s16, q3s16);
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
      // part of stage 6
      LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
      q8s16 = vaddq_s16(q14s16, q1s16);
      q9s16 = vaddq_s16(q13s16, q6s16);
      q13s16 = vsubq_s16(q13s16, q6s16);
      q1s16 = vsubq_s16(q14s16, q1s16);
      STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
      LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
      q14s16 = vsubq_s16(q8s16, q5s16);
      q10s16 = vaddq_s16(q8s16, q5s16);
      q11s16 = vaddq_s16(q9s16, q0s16);
      q0s16 = vsubq_s16(q9s16, q0s16);
      STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
      // part of stage 7
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
      DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
                   &q1s16, &q0s16);
      STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)

      // -----------------------------------------
      // BLOCK C: 8-10,11-15
      // -----------------------------------------
      // generate 8,9,14,15
      // part of stage 2
      LOAD_FROM_TRANSPOSED(3, 2, 30)
      DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(30, 18, 14)
      DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
      // part of stage 3
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 4
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)

      // generate 10,11,12,13
      // part of stage 2
      LOAD_FROM_TRANSPOSED(14, 10, 22)
      DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(22, 26, 6)
      DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
      // part of stage 3
      q14s16 = vsubq_s16(q4s16, q5s16);
      q5s16 = vaddq_s16(q4s16, q5s16);
      q13s16 = vsubq_s16(q6s16, q7s16);
      q6s16 = vaddq_s16(q6s16, q7s16);
      // part of stage 4
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
      // part of stage 5
      q8s16 = vaddq_s16(q0s16, q5s16);
      q9s16 = vaddq_s16(q1s16, q7s16);
      q13s16 = vsubq_s16(q1s16, q7s16);
      q14s16 = vsubq_s16(q3s16, q4s16);
      q10s16 = vaddq_s16(q3s16, q4s16);
      q15s16 = vaddq_s16(q2s16, q6s16);
      STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
      STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
      // part of stage 6
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
      q13s16 = vsubq_s16(q0s16, q5s16);
      q14s16 = vsubq_s16(q2s16, q6s16);
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)

      // -----------------------------------------
      // BLOCK D: 0-3,4-7
      // -----------------------------------------
      // generate 4,5,6,7
      // part of stage 3
      LOAD_FROM_TRANSPOSED(6, 4, 28)
      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(28, 20, 12)
      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
      // part of stage 4
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)

      // generate 0,1,2,3
      // part of stage 4
      LOAD_FROM_TRANSPOSED(12, 0, 16)
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(16, 8, 24)
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
      // part of stage 5
      q4s16 = vaddq_s16(q7s16, q6s16);
      q7s16 = vsubq_s16(q7s16, q6s16);
      q6s16 = vsubq_s16(q5s16, q14s16);
      q5s16 = vaddq_s16(q5s16, q14s16);
      // part of stage 6
      q8s16 = vaddq_s16(q4s16, q2s16);
      q9s16 = vaddq_s16(q5s16, q3s16);
      q10s16 = vaddq_s16(q6s16, q1s16);
      q11s16 = vaddq_s16(q7s16, q0s16);
      q12s16 = vsubq_s16(q7s16, q0s16);
      q13s16 = vsubq_s16(q6s16, q1s16);
      q14s16 = vsubq_s16(q5s16, q3s16);
      q15s16 = vsubq_s16(q4s16, q2s16);
      // part of stage 7
      LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
      q2s16 = vaddq_s16(q8s16, q1s16);
      q3s16 = vaddq_s16(q9s16, q0s16);
      q4s16 = vsubq_s16(q9s16, q0s16);
      q5s16 = vsubq_s16(q8s16, q1s16);
      LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
      q8s16 = vaddq_s16(q4s16, q1s16);
      q9s16 = vaddq_s16(q5s16, q0s16);
      q6s16 = vsubq_s16(q5s16, q0s16);
      q7s16 = vsubq_s16(q4s16, q1s16);

      // Hand the remaining band combination to the pass-specific epilogue:
      // pass 1 writes back to `out`; pass 2 reconstructs into `dest`.
      if (idct32_pass_loop == 0) {
        idct32_bands_end_1st_pass(out,
                                  q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
                                  q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
      } else {
        idct32_bands_end_2nd_pass(out, dest, stride,
                                  q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
                                  q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
        dest += 8;  // advance to the next 8-pixel column slice
      }
    }
  }
  return;
}
OLD | NEW |