Chromium Code Reviews

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.c

Issue 812033011: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 5 years, 11 months ago
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"

static const int16_t cospi_1_64 = 16364;
static const int16_t cospi_2_64 = 16305;
static const int16_t cospi_3_64 = 16207;
static const int16_t cospi_4_64 = 16069;
static const int16_t cospi_5_64 = 15893;
static const int16_t cospi_6_64 = 15679;
static const int16_t cospi_7_64 = 15426;
static const int16_t cospi_8_64 = 15137;
static const int16_t cospi_9_64 = 14811;
static const int16_t cospi_10_64 = 14449;
static const int16_t cospi_11_64 = 14053;
static const int16_t cospi_12_64 = 13623;
static const int16_t cospi_13_64 = 13160;
static const int16_t cospi_14_64 = 12665;
static const int16_t cospi_15_64 = 12140;
static const int16_t cospi_16_64 = 11585;
static const int16_t cospi_17_64 = 11003;
static const int16_t cospi_18_64 = 10394;
static const int16_t cospi_19_64 = 9760;
static const int16_t cospi_20_64 = 9102;
static const int16_t cospi_21_64 = 8423;
static const int16_t cospi_22_64 = 7723;
static const int16_t cospi_23_64 = 7005;
static const int16_t cospi_24_64 = 6270;
static const int16_t cospi_25_64 = 5520;
static const int16_t cospi_26_64 = 4756;
static const int16_t cospi_27_64 = 3981;
static const int16_t cospi_28_64 = 3196;
static const int16_t cospi_29_64 = 2404;
static const int16_t cospi_30_64 = 1606;
static const int16_t cospi_31_64 = 804;

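/* The table above is the usual VP9 fixed-point cosine table:
 * cospi_k_64 == round(cos(k * pi / 64) * 2^14). A standalone
 * sanity-check sketch (illustrative only, not compiled into the
 * library; assumes M_PI is available from <math.h>): */
#if 0
#include <math.h>
static int16_t cospi_ref(int k) {
  /* e.g. k == 16: cos(pi/4) * 16384 ~= 11585 == cospi_16_64 */
  return (int16_t)lrint(cos(k * M_PI / 64.0) * 16384.0);
}
#endif
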
#define LOAD_FROM_TRANSPOSED(prev, first, second) \
  q14s16 = vld1q_s16(trans_buf + first * 8); \
  q13s16 = vld1q_s16(trans_buf + second * 8);

#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
  qA = vld1q_s16(out + first * 32); \
  qB = vld1q_s16(out + second * 32);

#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
  vst1q_s16(out + first * 32, qA); \
  vst1q_s16(out + second * 32, qB);

#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
                                 q6s16, q7s16, q8s16, q9s16);
static INLINE void __STORE_COMBINE_CENTER_RESULTS(
    uint8_t *p1,
    uint8_t *p2,
    int stride,
    int16x8_t q6s16,
    int16x8_t q7s16,
    int16x8_t q8s16,
    int16x8_t q9s16) {
  int16x4_t d8s16, d9s16, d10s16, d11s16;

  d8s16 = vld1_s16((int16_t *)p1);
  p1 += stride;
  d11s16 = vld1_s16((int16_t *)p2);
  p2 -= stride;
  d9s16 = vld1_s16((int16_t *)p1);
  d10s16 = vld1_s16((int16_t *)p2);

  q7s16 = vrshrq_n_s16(q7s16, 6);
  q8s16 = vrshrq_n_s16(q8s16, 6);
  q9s16 = vrshrq_n_s16(q9s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);

  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
                                         vreinterpret_u8_s16(d9s16)));
  q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
                                         vreinterpret_u8_s16(d10s16)));
  q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
                                         vreinterpret_u8_s16(d11s16)));
  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
                                         vreinterpret_u8_s16(d8s16)));

  d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
  d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
  d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
  d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));

  vst1_s16((int16_t *)p1, d9s16);
  p1 -= stride;
  vst1_s16((int16_t *)p2, d10s16);
  p2 += stride;
  vst1_s16((int16_t *)p1, d8s16);
  vst1_s16((int16_t *)p2, d11s16);
}

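/* STORE_COMBINE_CENTER_RESULTS above (and STORE_COMBINE_EXTREME_RESULTS
 * below) perform the same reconstruction step: rounding-shift the 16-bit
 * residual down by 6 bits, add the predicted pixels already in dest, and
 * saturate back to 8 bits. Scalar equivalent for one 8-pixel row
 * (illustrative sketch only): */
#if 0
static void add_residual_row_c(const int16_t *res, uint8_t *pred) {
  int i, v;
  for (i = 0; i < 8; i++) {
    /* vrshrq_n_s16(x, 6) rounds: (x + 32) >> 6; vqmovun_s16 clamps. */
    v = ((res[i] + 32) >> 6) + pred[i];
    pred[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}
#endif
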
#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
  __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
                                  q4s16, q5s16, q6s16, q7s16);
static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
    uint8_t *p1,
    uint8_t *p2,
    int stride,
    int16x8_t q4s16,
    int16x8_t q5s16,
    int16x8_t q6s16,
    int16x8_t q7s16) {
  int16x4_t d4s16, d5s16, d6s16, d7s16;

  d4s16 = vld1_s16((int16_t *)p1);
  p1 += stride;
  d7s16 = vld1_s16((int16_t *)p2);
  p2 -= stride;
  d5s16 = vld1_s16((int16_t *)p1);
  d6s16 = vld1_s16((int16_t *)p2);

  q5s16 = vrshrq_n_s16(q5s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);
  q7s16 = vrshrq_n_s16(q7s16, 6);
  q4s16 = vrshrq_n_s16(q4s16, 6);

  q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
                                         vreinterpret_u8_s16(d5s16)));
  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
                                         vreinterpret_u8_s16(d6s16)));
  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
                                         vreinterpret_u8_s16(d7s16)));
  q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
                                         vreinterpret_u8_s16(d4s16)));

  d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
  d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
  d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
  d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));

  vst1_s16((int16_t *)p1, d5s16);
  p1 -= stride;
  vst1_s16((int16_t *)p2, d6s16);
  p2 += stride;
  vst1_s16((int16_t *)p2, d7s16);
  vst1_s16((int16_t *)p1, d4s16);
}

#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
  DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
static INLINE void DO_BUTTERFLY(
    int16x8_t q14s16,
    int16x8_t q13s16,
    int16_t first_const,
    int16_t second_const,
    int16x8_t *qAs16,
    int16x8_t *qBs16) {
  int16x4_t d30s16, d31s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
  int16x4_t dCs16, dDs16, dAs16, dBs16;

  dCs16 = vget_low_s16(q14s16);
  dDs16 = vget_high_s16(q14s16);
  dAs16 = vget_low_s16(q13s16);
  dBs16 = vget_high_s16(q13s16);

  d30s16 = vdup_n_s16(first_const);
  d31s16 = vdup_n_s16(second_const);

  q8s32 = vmull_s16(dCs16, d30s16);
  q10s32 = vmull_s16(dAs16, d31s16);
  q9s32 = vmull_s16(dDs16, d30s16);
  q11s32 = vmull_s16(dBs16, d31s16);
  q12s32 = vmull_s16(dCs16, d31s16);

  q8s32 = vsubq_s32(q8s32, q10s32);
  q9s32 = vsubq_s32(q9s32, q11s32);

  q10s32 = vmull_s16(dDs16, d31s16);
  q11s32 = vmull_s16(dAs16, d30s16);
  q15s32 = vmull_s16(dBs16, d30s16);

  q11s32 = vaddq_s32(q12s32, q11s32);
  q10s32 = vaddq_s32(q10s32, q15s32);

  *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
                        vqrshrn_n_s32(q9s32, 14));
  *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
                        vqrshrn_n_s32(q10s32, 14));
}

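/* DO_BUTTERFLY above is the standard VP9 14-bit fixed-point rotation:
 * with inputs a (q14s16), b (q13s16) and constants c1, c2 it computes
 *   qA = round((a * c1 - b * c2) / 2^14)
 *   qB = round((a * c2 + b * c1) / 2^14)
 * widening to 32 bits so the products cannot overflow. Scalar sketch
 * for one lane (illustrative; the NEON version additionally saturates
 * on the narrowing shift): */
#if 0
static void butterfly_c(int16_t a, int16_t b, int16_t c1, int16_t c2,
                        int16_t *out_a, int16_t *out_b) {
  /* (1 << 13) is the rounding constant for the >> 14, matching
   * vmull_s16 + vqrshrn_n_s32(x, 14). */
  *out_a = (int16_t)(((int32_t)a * c1 - (int32_t)b * c2 + (1 << 13)) >> 14);
  *out_b = (int16_t)(((int32_t)a * c2 + (int32_t)b * c1 + (1 << 13)) >> 14);
}
#endif
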
static INLINE void idct32_transpose_pair(
    int16_t *input,
    int16_t *t_buf) {
  int16_t *in;
  int i;
  const int stride = 32;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

  for (i = 0; i < 4; i++, input += 8) {
    in = input;
    q8s16 = vld1q_s16(in);
    in += stride;
    q9s16 = vld1q_s16(in);
    in += stride;
    q10s16 = vld1q_s16(in);
    in += stride;
    q11s16 = vld1q_s16(in);
    in += stride;
    q12s16 = vld1q_s16(in);
    in += stride;
    q13s16 = vld1q_s16(in);
    in += stride;
    q14s16 = vld1q_s16(in);
    in += stride;
    q15s16 = vld1q_s16(in);

    d16s16 = vget_low_s16(q8s16);
    d17s16 = vget_high_s16(q8s16);
    d18s16 = vget_low_s16(q9s16);
    d19s16 = vget_high_s16(q9s16);
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);
    d30s16 = vget_low_s16(q15s16);
    d31s16 = vget_high_s16(q15s16);

    q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
    q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
    q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
    q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
    q12s16 = vcombine_s16(d17s16, d25s16);
    q13s16 = vcombine_s16(d19s16, d27s16);
    q14s16 = vcombine_s16(d21s16, d29s16);
    q15s16 = vcombine_s16(d23s16, d31s16);

    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
                        vreinterpretq_s32_s16(q10s16));
    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
                        vreinterpretq_s32_s16(q11s16));
    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
                        vreinterpretq_s32_s16(q14s16));
    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
                        vreinterpretq_s32_s16(q15s16));

    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

    vst1q_s16(t_buf, q0x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q0x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q1x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q1x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q2x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q2x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q3x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q3x2s16.val[1]);
    t_buf += 8;
  }
}

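/* idct32_transpose_pair above transposes an 8-row by 32-column strip of
 * input into t_buf, one 8x8 tile at a time (vswp of the high/low
 * d-registers, then 32-bit and 16-bit vtrn), so a later pass can read
 * columns back as rows. Scalar reference (illustrative sketch): */
#if 0
static void transpose_pair_c(const int16_t *input, int16_t *t_buf) {
  int r, c;
  for (c = 0; c < 32; c++)
    for (r = 0; r < 8; r++)
      t_buf[c * 8 + r] = input[r * 32 + c];
}
#endif
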
static INLINE void idct32_bands_end_1st_pass(
    int16_t *out,
    int16x8_t q2s16,
    int16x8_t q3s16,
    int16x8_t q6s16,
    int16x8_t q7s16,
    int16x8_t q8s16,
    int16x8_t q9s16,
    int16x8_t q10s16,
    int16x8_t q11s16,
    int16x8_t q12s16,
    int16x8_t q13s16,
    int16x8_t q14s16,
    int16x8_t q15s16) {
  int16x8_t q0s16, q1s16, q4s16, q5s16;

  STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
  STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);

  LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
  STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);

  LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);

  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
  STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);

  LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
  STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);

  LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);

  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
  STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);

  LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
  STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);

  LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);

  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
  STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);

  LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
  STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
}

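/* Each load/add/sub quartet above is the final idct32 stage applied to
 * a symmetric pair of rows: rows i and 31 - i combine as
 *   out[i]      = step[i] + step[31 - i]
 *   out[31 - i] = step[i] - step[31 - i].
 * Scalar sketch of the whole recombination (illustrative only): */
#if 0
static void idct32_final_stage_c(const int16_t *step, int16_t *out32) {
  int i;
  for (i = 0; i < 16; i++) {
    out32[i] = step[i] + step[31 - i];
    out32[31 - i] = step[i] - step[31 - i];
  }
}
#endif
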
static INLINE void idct32_bands_end_2nd_pass(
    int16_t *out,
    uint8_t *dest,
    int stride,
    int16x8_t q2s16,
    int16x8_t q3s16,
    int16x8_t q6s16,
    int16x8_t q7s16,
    int16x8_t q8s16,
    int16x8_t q9s16,
    int16x8_t q10s16,
    int16x8_t q11s16,
    int16x8_t q12s16,
    int16x8_t q13s16,
    int16x8_t q14s16,
    int16x8_t q15s16) {
  uint8_t *r6 = dest + 31 * stride;
  uint8_t *r7 = dest/* + 0 * stride*/;
  uint8_t *r9 = dest + 15 * stride;
  uint8_t *r10 = dest + 16 * stride;
  int str2 = stride << 1;
  int16x8_t q0s16, q1s16, q4s16, q5s16;

  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2; r9 -= str2;

  LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2; r6 -= str2;

  LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);

  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2; r9 -= str2;

  LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2; r6 -= str2;

  LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);

  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2; r9 -= str2;

  LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2; r6 -= str2;

  LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);

  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);

  LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
}

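/* Note on idct32_bands_end_2nd_pass above: it reconstructs rows from
 * four cursors at once. r7 walks down from row 0 and r6 up from row 31
 * (the "extreme" rows), while r10 walks down from row 16 and r9 up
 * from row 15 (the "center" rows). Each store helper covers two
 * adjacent rows, hence the str2 (= 2 * stride) steps between calls. */
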
void vp9_idct32x32_1024_add_neon(
    int16_t *input,
    uint8_t *dest,
    int stride) {
  int i, idct32_pass_loop;
  int16_t trans_buf[32 * 8];
  int16_t pass1[32 * 32];
  int16_t pass2[32 * 32];
  int16_t *out;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;

  for (idct32_pass_loop = 0, out = pass1;
       idct32_pass_loop < 2;
       idct32_pass_loop++,
       input = pass1,  // the input of pass2 is the result of pass1
       out = pass2) {
    for (i = 0;
         i < 4; i++,
         input += 32 * 8, out += 8) {  // idct32_bands_loop
      idct32_transpose_pair(input, trans_buf);

      // -----------------------------------------
      // BLOCK A: 16-19,28-31
      // -----------------------------------------
      // generate 16,17,30,31
      // part of stage 1
      LOAD_FROM_TRANSPOSED(0, 1, 31)
      DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(31, 17, 15)
      DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
      // part of stage 2
      q4s16 = vaddq_s16(q0s16, q1s16);
      q13s16 = vsubq_s16(q0s16, q1s16);
      q6s16 = vaddq_s16(q2s16, q3s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      // part of stage 3
      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)

      // generate 18,19,28,29
      // part of stage 1
      LOAD_FROM_TRANSPOSED(15, 9, 23)
      DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(23, 25, 7)
      DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
      // part of stage 2
      q13s16 = vsubq_s16(q3s16, q2s16);
      q3s16 = vaddq_s16(q3s16, q2s16);
      q14s16 = vsubq_s16(q1s16, q0s16);
      q2s16 = vaddq_s16(q1s16, q0s16);
      // part of stage 3
      DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
      // part of stage 4
      q8s16 = vaddq_s16(q4s16, q2s16);
      q9s16 = vaddq_s16(q5s16, q0s16);
      q10s16 = vaddq_s16(q7s16, q1s16);
      q15s16 = vaddq_s16(q6s16, q3s16);
      q13s16 = vsubq_s16(q5s16, q0s16);
      q14s16 = vsubq_s16(q7s16, q1s16);
      STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
      STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
      STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
      // part of stage 4
      q13s16 = vsubq_s16(q4s16, q2s16);
      q14s16 = vsubq_s16(q6s16, q3s16);
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
      STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)

      // -----------------------------------------
      // BLOCK B: 20-23,24-27
      // -----------------------------------------
      // generate 20,21,26,27
      // part of stage 1
      LOAD_FROM_TRANSPOSED(7, 5, 27)
      DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(27, 21, 11)
      DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
      // part of stage 2
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 3
      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)

      // generate 22,23,24,25
      // part of stage 1
      LOAD_FROM_TRANSPOSED(11, 13, 19)
      DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(19, 29, 3)
      DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
      // part of stage 2
      q14s16 = vsubq_s16(q4s16, q5s16);
      q5s16 = vaddq_s16(q4s16, q5s16);
      q13s16 = vsubq_s16(q6s16, q7s16);
      q6s16 = vaddq_s16(q6s16, q7s16);
      // part of stage 3
      DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
      // part of stage 4
      q10s16 = vaddq_s16(q7s16, q1s16);
      q11s16 = vaddq_s16(q5s16, q0s16);
      q12s16 = vaddq_s16(q6s16, q2s16);
      q15s16 = vaddq_s16(q4s16, q3s16);
      // part of stage 6
      LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
      q8s16 = vaddq_s16(q14s16, q11s16);
      q9s16 = vaddq_s16(q13s16, q10s16);
      q13s16 = vsubq_s16(q13s16, q10s16);
      q11s16 = vsubq_s16(q14s16, q11s16);
      STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
      LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
      q8s16 = vsubq_s16(q9s16, q12s16);
      q10s16 = vaddq_s16(q14s16, q15s16);
      q14s16 = vsubq_s16(q14s16, q15s16);
      q12s16 = vaddq_s16(q9s16, q12s16);
      STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
      // part of stage 7
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
      q13s16 = q11s16;
      q14s16 = q8s16;
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
      // part of stage 4
      q14s16 = vsubq_s16(q5s16, q0s16);
      q13s16 = vsubq_s16(q6s16, q2s16);
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
      q14s16 = vsubq_s16(q7s16, q1s16);
      q13s16 = vsubq_s16(q4s16, q3s16);
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
      // part of stage 6
      LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
      q8s16 = vaddq_s16(q14s16, q1s16);
      q9s16 = vaddq_s16(q13s16, q6s16);
      q13s16 = vsubq_s16(q13s16, q6s16);
      q1s16 = vsubq_s16(q14s16, q1s16);
      STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
      LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
      q14s16 = vsubq_s16(q8s16, q5s16);
      q10s16 = vaddq_s16(q8s16, q5s16);
      q11s16 = vaddq_s16(q9s16, q0s16);
      q0s16 = vsubq_s16(q9s16, q0s16);
      STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
      // part of stage 7
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
      DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
                   &q1s16, &q0s16);
      STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)

      // -----------------------------------------
      // BLOCK C: 8-11,12-15
      // -----------------------------------------
      // generate 8,9,14,15
      // part of stage 2
      LOAD_FROM_TRANSPOSED(3, 2, 30)
      DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(30, 18, 14)
      DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
      // part of stage 3
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 4
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)

      // generate 10,11,12,13
      // part of stage 2
      LOAD_FROM_TRANSPOSED(14, 10, 22)
      DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(22, 26, 6)
      DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
      // part of stage 3
      q14s16 = vsubq_s16(q4s16, q5s16);
      q5s16 = vaddq_s16(q4s16, q5s16);
      q13s16 = vsubq_s16(q6s16, q7s16);
      q6s16 = vaddq_s16(q6s16, q7s16);
      // part of stage 4
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
      // part of stage 5
      q8s16 = vaddq_s16(q0s16, q5s16);
      q9s16 = vaddq_s16(q1s16, q7s16);
      q13s16 = vsubq_s16(q1s16, q7s16);
      q14s16 = vsubq_s16(q3s16, q4s16);
      q10s16 = vaddq_s16(q3s16, q4s16);
      q15s16 = vaddq_s16(q2s16, q6s16);
      STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
      STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
      // part of stage 6
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
      q13s16 = vsubq_s16(q0s16, q5s16);
      q14s16 = vsubq_s16(q2s16, q6s16);
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)

      // -----------------------------------------
      // BLOCK D: 0-3,4-7
      // -----------------------------------------
      // generate 4,5,6,7
      // part of stage 3
      LOAD_FROM_TRANSPOSED(6, 4, 28)
      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(28, 20, 12)
      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
      // part of stage 4
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)

      // generate 0,1,2,3
      // part of stage 4
      LOAD_FROM_TRANSPOSED(12, 0, 16)
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(16, 8, 24)
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
      // part of stage 5
      q4s16 = vaddq_s16(q7s16, q6s16);
      q7s16 = vsubq_s16(q7s16, q6s16);
      q6s16 = vsubq_s16(q5s16, q14s16);
      q5s16 = vaddq_s16(q5s16, q14s16);
      // part of stage 6
      q8s16 = vaddq_s16(q4s16, q2s16);
      q9s16 = vaddq_s16(q5s16, q3s16);
      q10s16 = vaddq_s16(q6s16, q1s16);
      q11s16 = vaddq_s16(q7s16, q0s16);
      q12s16 = vsubq_s16(q7s16, q0s16);
      q13s16 = vsubq_s16(q6s16, q1s16);
      q14s16 = vsubq_s16(q5s16, q3s16);
      q15s16 = vsubq_s16(q4s16, q2s16);
      // part of stage 7
      LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
      q2s16 = vaddq_s16(q8s16, q1s16);
      q3s16 = vaddq_s16(q9s16, q0s16);
      q4s16 = vsubq_s16(q9s16, q0s16);
      q5s16 = vsubq_s16(q8s16, q1s16);
      LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
      q8s16 = vaddq_s16(q4s16, q1s16);
      q9s16 = vaddq_s16(q5s16, q0s16);
      q6s16 = vsubq_s16(q5s16, q0s16);
      q7s16 = vsubq_s16(q4s16, q1s16);

      if (idct32_pass_loop == 0) {
        idct32_bands_end_1st_pass(out,
                                  q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
                                  q10s16, q11s16, q12s16, q13s16, q14s16,
                                  q15s16);
      } else {
        idct32_bands_end_2nd_pass(out, dest, stride,
                                  q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
                                  q10s16, q11s16, q12s16, q13s16, q14s16,
                                  q15s16);
        dest += 8;
      }
    }
  }
}
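
/* Expected calling convention (illustrative sketch; example_usage is a
 * hypothetical caller, not part of this file): input holds the 32x32
 * dequantized coefficients in row-major order, and dest/stride describe
 * the prediction block the reconstructed residual is added into. */
#if 0
static void example_usage(int16_t *coeffs /* 32 * 32 values */,
                          uint8_t *frame, int frame_stride) {
  vp9_idct32x32_1024_add_neon(coeffs, frame, frame_stride);
}
#endif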