Chromium Code Reviews

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.c

Issue 812033011: libvpx: Pull from upstream (Closed)
Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 5 years, 11 months ago
/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"

static int16_t cospi_2_64 = 16305;
static int16_t cospi_4_64 = 16069;
static int16_t cospi_6_64 = 15679;
static int16_t cospi_8_64 = 15137;
static int16_t cospi_10_64 = 14449;
static int16_t cospi_12_64 = 13623;
static int16_t cospi_14_64 = 12665;
static int16_t cospi_16_64 = 11585;
static int16_t cospi_18_64 = 10394;
static int16_t cospi_20_64 = 9102;
static int16_t cospi_22_64 = 7723;
static int16_t cospi_24_64 = 6270;
static int16_t cospi_26_64 = 4756;
static int16_t cospi_28_64 = 3196;
static int16_t cospi_30_64 = 1606;

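These cospi_k_64 tables are VP9's DCT twiddle factors in Q14 fixed point: each value is round(cos(k * pi / 64) * 2^14), e.g. cospi_16_64 = round(16384 * cos(pi / 4)) = 11585. A stand-alone sanity check (illustrative only, not part of the upstream file):

#include <math.h>
#include <stdio.h>

/* Illustrative check: regenerate the Q14 constants above. */
int main(void) {
    const double kPi = 3.14159265358979323846;
    int k;
    for (k = 2; k <= 30; k += 2)
        printf("cospi_%d_64 = %d\n", k,
               (int)lround(cos(k * kPi / 64.0) * 16384.0));
    return 0;
}
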
static INLINE void TRANSPOSE8X8(
        int16x8_t *q8s16,
        int16x8_t *q9s16,
        int16x8_t *q10s16,
        int16x8_t *q11s16,
        int16x8_t *q12s16,
        int16x8_t *q13s16,
        int16x8_t *q14s16,
        int16x8_t *q15s16) {
    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

    d16s16 = vget_low_s16(*q8s16);
    d17s16 = vget_high_s16(*q8s16);
    d18s16 = vget_low_s16(*q9s16);
    d19s16 = vget_high_s16(*q9s16);
    d20s16 = vget_low_s16(*q10s16);
    d21s16 = vget_high_s16(*q10s16);
    d22s16 = vget_low_s16(*q11s16);
    d23s16 = vget_high_s16(*q11s16);
    d24s16 = vget_low_s16(*q12s16);
    d25s16 = vget_high_s16(*q12s16);
    d26s16 = vget_low_s16(*q13s16);
    d27s16 = vget_high_s16(*q13s16);
    d28s16 = vget_low_s16(*q14s16);
    d29s16 = vget_high_s16(*q14s16);
    d30s16 = vget_low_s16(*q15s16);
    d31s16 = vget_high_s16(*q15s16);

    *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
    *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
    *q12s16 = vcombine_s16(d17s16, d25s16);
    *q13s16 = vcombine_s16(d19s16, d27s16);
    *q14s16 = vcombine_s16(d21s16, d29s16);
    *q15s16 = vcombine_s16(d23s16, d31s16);

    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
                        vreinterpretq_s32_s16(*q10s16));
    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
                        vreinterpretq_s32_s16(*q11s16));
    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
                        vreinterpretq_s32_s16(*q14s16));
    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
                        vreinterpretq_s32_s16(*q15s16));

    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

    *q8s16 = q0x2s16.val[0];
    *q9s16 = q0x2s16.val[1];
    *q10s16 = q1x2s16.val[0];
    *q11s16 = q1x2s16.val[1];
    *q12s16 = q2x2s16.val[0];
    *q13s16 = q2x2s16.val[1];
    *q14s16 = q3x2s16.val[0];
    *q15s16 = q3x2s16.val[1];
    return;
}
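
TRANSPOSE8X8() is an in-register 8x8 transpose built from three element widths: the vcombine_s16() calls swap 64-bit halves between registers (the vswp comments preserve the names of the original assembly instructions), vtrnq_s32() then exchanges 32-bit pairs, and vtrnq_s16() finishes at 16-bit granularity. A scalar reference for the result it produces (an illustrative sketch, not upstream code):

#include <stdint.h>

/* Illustrative scalar equivalent: rows[i][j] ends up as rows[j][i]. */
static void transpose8x8_ref(int16_t rows[8][8]) {
    int i, j;
    for (i = 0; i < 8; ++i) {
        for (j = i + 1; j < 8; ++j) {
            const int16_t t = rows[i][j];
            rows[i][j] = rows[j][i];
            rows[j][i] = t;
        }
    }
}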

void vp9_idct16x16_256_add_neon_pass1(
        int16_t *in,
        int16_t *out,
        int output_stride) {
    int16x4_t d0s16, d1s16, d2s16, d3s16;
    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
    uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
    int16x8x2_t q0x2s16;

    q0x2s16 = vld2q_s16(in);
    q8s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q9s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q10s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q11s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q12s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q13s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q14s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q15s16 = q0x2s16.val[0];

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    d16s16 = vget_low_s16(q8s16);
    d17s16 = vget_high_s16(q8s16);
    d18s16 = vget_low_s16(q9s16);
    d19s16 = vget_high_s16(q9s16);
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);
    d30s16 = vget_low_s16(q15s16);
    d31s16 = vget_high_s16(q15s16);

    // stage 3
    d0s16 = vdup_n_s16(cospi_28_64);
    d1s16 = vdup_n_s16(cospi_4_64);

    q2s32 = vmull_s16(d18s16, d0s16);
    q3s32 = vmull_s16(d19s16, d0s16);
    q5s32 = vmull_s16(d18s16, d1s16);
    q6s32 = vmull_s16(d19s16, d1s16);

    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
    q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
    q6s32 = vmlal_s16(q6s32, d31s16, d0s16);

    d2s16 = vdup_n_s16(cospi_12_64);
    d3s16 = vdup_n_s16(cospi_20_64);

    d8s16 = vqrshrn_n_s32(q2s32, 14);
    d9s16 = vqrshrn_n_s32(q3s32, 14);
    d14s16 = vqrshrn_n_s32(q5s32, 14);
    d15s16 = vqrshrn_n_s32(q6s32, 14);
    q4s16 = vcombine_s16(d8s16, d9s16);
    q7s16 = vcombine_s16(d14s16, d15s16);

    q2s32 = vmull_s16(d26s16, d2s16);
    q3s32 = vmull_s16(d27s16, d2s16);
    q9s32 = vmull_s16(d26s16, d3s16);
    q15s32 = vmull_s16(d27s16, d3s16);

    q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
    q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
    q15s32 = vmlal_s16(q15s32, d23s16, d2s16);

    d10s16 = vqrshrn_n_s32(q2s32, 14);
    d11s16 = vqrshrn_n_s32(q3s32, 14);
    d12s16 = vqrshrn_n_s32(q9s32, 14);
    d13s16 = vqrshrn_n_s32(q15s32, 14);
    q5s16 = vcombine_s16(d10s16, d11s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    // stage 4
    d30s16 = vdup_n_s16(cospi_16_64);

    q2s32 = vmull_s16(d16s16, d30s16);
    q11s32 = vmull_s16(d17s16, d30s16);
    q0s32 = vmull_s16(d24s16, d30s16);
    q1s32 = vmull_s16(d25s16, d30s16);

    d30s16 = vdup_n_s16(cospi_24_64);
    d31s16 = vdup_n_s16(cospi_8_64);

    q3s32 = vaddq_s32(q2s32, q0s32);
    q12s32 = vaddq_s32(q11s32, q1s32);
    q13s32 = vsubq_s32(q2s32, q0s32);
    q1s32 = vsubq_s32(q11s32, q1s32);

    d16s16 = vqrshrn_n_s32(q3s32, 14);
    d17s16 = vqrshrn_n_s32(q12s32, 14);
    d18s16 = vqrshrn_n_s32(q13s32, 14);
    d19s16 = vqrshrn_n_s32(q1s32, 14);
    q8s16 = vcombine_s16(d16s16, d17s16);
    q9s16 = vcombine_s16(d18s16, d19s16);

    q0s32 = vmull_s16(d20s16, d31s16);
    q1s32 = vmull_s16(d21s16, d31s16);
    q12s32 = vmull_s16(d20s16, d30s16);
    q13s32 = vmull_s16(d21s16, d30s16);

    q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
    q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
    q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
    q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);

    d22s16 = vqrshrn_n_s32(q0s32, 14);
    d23s16 = vqrshrn_n_s32(q1s32, 14);
    d20s16 = vqrshrn_n_s32(q12s32, 14);
    d21s16 = vqrshrn_n_s32(q13s32, 14);
    q10s16 = vcombine_s16(d20s16, d21s16);
    q11s16 = vcombine_s16(d22s16, d23s16);

    q13s16 = vsubq_s16(q4s16, q5s16);
    q4s16 = vaddq_s16(q4s16, q5s16);
    q14s16 = vsubq_s16(q7s16, q6s16);
    q15s16 = vaddq_s16(q6s16, q7s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);

    // stage 5
    q0s16 = vaddq_s16(q8s16, q11s16);
    q1s16 = vaddq_s16(q9s16, q10s16);
    q2s16 = vsubq_s16(q9s16, q10s16);
    q3s16 = vsubq_s16(q8s16, q11s16);

    d16s16 = vdup_n_s16(cospi_16_64);

    q11s32 = vmull_s16(d26s16, d16s16);
    q12s32 = vmull_s16(d27s16, d16s16);
    q9s32 = vmull_s16(d28s16, d16s16);
    q10s32 = vmull_s16(d29s16, d16s16);

    q6s32 = vsubq_s32(q9s32, q11s32);
    q13s32 = vsubq_s32(q10s32, q12s32);
    q9s32 = vaddq_s32(q9s32, q11s32);
    q10s32 = vaddq_s32(q10s32, q12s32);

    d10s16 = vqrshrn_n_s32(q6s32, 14);
    d11s16 = vqrshrn_n_s32(q13s32, 14);
    d12s16 = vqrshrn_n_s32(q9s32, 14);
    d13s16 = vqrshrn_n_s32(q10s32, 14);
    q5s16 = vcombine_s16(d10s16, d11s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    // stage 6
    q8s16 = vaddq_s16(q0s16, q15s16);
    q9s16 = vaddq_s16(q1s16, q6s16);
    q10s16 = vaddq_s16(q2s16, q5s16);
    q11s16 = vaddq_s16(q3s16, q4s16);
    q12s16 = vsubq_s16(q3s16, q4s16);
    q13s16 = vsubq_s16(q2s16, q5s16);
    q14s16 = vsubq_s16(q1s16, q6s16);
    q15s16 = vsubq_s16(q0s16, q15s16);

    d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
    d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
    d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
    d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
    d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
    d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));

    // store the data
    output_stride >>= 1;  // output_stride / 2, out is int16_t
    vst1_u64((uint64_t *)out, d16u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d17u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d18u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d19u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d20u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d21u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d22u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d23u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d24u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d25u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d26u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d27u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d28u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d29u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d30u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d31u64);
    return;
}
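
Throughout this pass, each vmull_s16()/vmlal_s16()/vmlsl_s16() pair computes a 32-bit butterfly a * c1 +/- b * c2 against a pair of Q14 cosine constants, and vqrshrn_n_s32(x, 14) narrows the product back to int16_t with rounding and saturation. In scalar terms the narrowing step is (a sketch, on the assumption that it mirrors VP9's dct_const_round_shift()):

#include <stdint.h>

/* Scalar sketch of vqrshrn_n_s32(x, 14): round away the Q14 fraction
 * and saturate to the int16_t range. */
static int16_t round_shift_14(int32_t x) {
    const int32_t r = (x + (1 << 13)) >> 14;
    if (r > INT16_MAX) return INT16_MAX;
    if (r < INT16_MIN) return INT16_MIN;
    return (int16_t)r;
}

Note also that output_stride arrives in bytes: the >> 1 before the stores converts it to int16_t units, and each vst1_u64() then writes four coefficients.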

void vp9_idct16x16_256_add_neon_pass2(
        int16_t *src,
        int16_t *out,
        int16_t *pass1Output,
        int16_t skip_adding,
        uint8_t *dest,
        int dest_stride) {
    uint8_t *d;
    uint8x8_t d12u8, d13u8;
    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
    uint64x1_t d24u64, d25u64, d26u64, d27u64;
    int64x1_t d12s64, d13s64;
    uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
    uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
    int32x4_t q10s32, q11s32, q12s32, q13s32;
    int16x8x2_t q0x2s16;

    q0x2s16 = vld2q_s16(src);
    q8s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q9s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q10s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q11s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q12s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q13s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q14s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q15s16 = q0x2s16.val[0];

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    d16s16 = vget_low_s16(q8s16);
    d17s16 = vget_high_s16(q8s16);
    d18s16 = vget_low_s16(q9s16);
    d19s16 = vget_high_s16(q9s16);
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);
    d30s16 = vget_low_s16(q15s16);
    d31s16 = vget_high_s16(q15s16);

    // stage 3
    d12s16 = vdup_n_s16(cospi_30_64);
    d13s16 = vdup_n_s16(cospi_2_64);

    q2s32 = vmull_s16(d16s16, d12s16);
    q3s32 = vmull_s16(d17s16, d12s16);
    q1s32 = vmull_s16(d16s16, d13s16);
    q4s32 = vmull_s16(d17s16, d13s16);

    q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
    q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
    q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
    q4s32 = vmlal_s16(q4s32, d31s16, d12s16);

    d0s16 = vqrshrn_n_s32(q2s32, 14);
    d1s16 = vqrshrn_n_s32(q3s32, 14);
    d14s16 = vqrshrn_n_s32(q1s32, 14);
    d15s16 = vqrshrn_n_s32(q4s32, 14);
    q0s16 = vcombine_s16(d0s16, d1s16);
    q7s16 = vcombine_s16(d14s16, d15s16);

    d30s16 = vdup_n_s16(cospi_14_64);
    d31s16 = vdup_n_s16(cospi_18_64);

    q2s32 = vmull_s16(d24s16, d30s16);
    q3s32 = vmull_s16(d25s16, d30s16);
    q4s32 = vmull_s16(d24s16, d31s16);
    q5s32 = vmull_s16(d25s16, d31s16);

    q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
    q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
    q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
    q5s32 = vmlal_s16(q5s32, d23s16, d30s16);

    d2s16 = vqrshrn_n_s32(q2s32, 14);
    d3s16 = vqrshrn_n_s32(q3s32, 14);
    d12s16 = vqrshrn_n_s32(q4s32, 14);
    d13s16 = vqrshrn_n_s32(q5s32, 14);
    q1s16 = vcombine_s16(d2s16, d3s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    d30s16 = vdup_n_s16(cospi_22_64);
    d31s16 = vdup_n_s16(cospi_10_64);

    q11s32 = vmull_s16(d20s16, d30s16);
    q12s32 = vmull_s16(d21s16, d30s16);
    q4s32 = vmull_s16(d20s16, d31s16);
    q5s32 = vmull_s16(d21s16, d31s16);

    q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
    q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
    q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
    q5s32 = vmlal_s16(q5s32, d27s16, d30s16);

    d4s16 = vqrshrn_n_s32(q11s32, 14);
    d5s16 = vqrshrn_n_s32(q12s32, 14);
    d11s16 = vqrshrn_n_s32(q5s32, 14);
    d10s16 = vqrshrn_n_s32(q4s32, 14);
    q2s16 = vcombine_s16(d4s16, d5s16);
    q5s16 = vcombine_s16(d10s16, d11s16);

    d30s16 = vdup_n_s16(cospi_6_64);
    d31s16 = vdup_n_s16(cospi_26_64);

    q10s32 = vmull_s16(d28s16, d30s16);
    q11s32 = vmull_s16(d29s16, d30s16);
    q12s32 = vmull_s16(d28s16, d31s16);
    q13s32 = vmull_s16(d29s16, d31s16);

    q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
    q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
    q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
    q13s32 = vmlal_s16(q13s32, d19s16, d30s16);

    d6s16 = vqrshrn_n_s32(q10s32, 14);
    d7s16 = vqrshrn_n_s32(q11s32, 14);
    d8s16 = vqrshrn_n_s32(q12s32, 14);
    d9s16 = vqrshrn_n_s32(q13s32, 14);
    q3s16 = vcombine_s16(d6s16, d7s16);
    q4s16 = vcombine_s16(d8s16, d9s16);

    // stage 3
    q9s16 = vsubq_s16(q0s16, q1s16);
    q0s16 = vaddq_s16(q0s16, q1s16);
    q10s16 = vsubq_s16(q3s16, q2s16);
    q11s16 = vaddq_s16(q2s16, q3s16);
    q12s16 = vaddq_s16(q4s16, q5s16);
    q13s16 = vsubq_s16(q4s16, q5s16);
    q14s16 = vsubq_s16(q7s16, q6s16);
    q7s16 = vaddq_s16(q6s16, q7s16);

    // stage 4
    d18s16 = vget_low_s16(q9s16);
    d19s16 = vget_high_s16(q9s16);
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);

    d30s16 = vdup_n_s16(cospi_8_64);
    d31s16 = vdup_n_s16(cospi_24_64);

    q2s32 = vmull_s16(d18s16, d31s16);
    q3s32 = vmull_s16(d19s16, d31s16);
    q4s32 = vmull_s16(d28s16, d31s16);
    q5s32 = vmull_s16(d29s16, d31s16);

    q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
    q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
    q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
    q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);

    d12s16 = vqrshrn_n_s32(q2s32, 14);
    d13s16 = vqrshrn_n_s32(q3s32, 14);
    d2s16 = vqrshrn_n_s32(q4s32, 14);
    d3s16 = vqrshrn_n_s32(q5s32, 14);
    q1s16 = vcombine_s16(d2s16, d3s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    q3s16 = q11s16;
    q4s16 = q12s16;

    d30s16 = vdup_n_s16(-cospi_8_64);
    q11s32 = vmull_s16(d26s16, d30s16);
    q12s32 = vmull_s16(d27s16, d30s16);
    q8s32 = vmull_s16(d20s16, d30s16);
    q9s32 = vmull_s16(d21s16, d30s16);

    q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
    q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
    q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
    q9s32 = vmlal_s16(q9s32, d27s16, d31s16);

    d4s16 = vqrshrn_n_s32(q11s32, 14);
    d5s16 = vqrshrn_n_s32(q12s32, 14);
    d10s16 = vqrshrn_n_s32(q8s32, 14);
    d11s16 = vqrshrn_n_s32(q9s32, 14);
    q2s16 = vcombine_s16(d4s16, d5s16);
    q5s16 = vcombine_s16(d10s16, d11s16);

    // stage 5
    q8s16 = vaddq_s16(q0s16, q3s16);
    q9s16 = vaddq_s16(q1s16, q2s16);
    q10s16 = vsubq_s16(q1s16, q2s16);
    q11s16 = vsubq_s16(q0s16, q3s16);
    q12s16 = vsubq_s16(q7s16, q4s16);
    q13s16 = vsubq_s16(q6s16, q5s16);
    q14s16 = vaddq_s16(q6s16, q5s16);
    q15s16 = vaddq_s16(q7s16, q4s16);

    // stage 6
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);

    d14s16 = vdup_n_s16(cospi_16_64);

    q3s32 = vmull_s16(d26s16, d14s16);
    q4s32 = vmull_s16(d27s16, d14s16);
    q0s32 = vmull_s16(d20s16, d14s16);
    q1s32 = vmull_s16(d21s16, d14s16);

    q5s32 = vsubq_s32(q3s32, q0s32);
    q6s32 = vsubq_s32(q4s32, q1s32);
    q10s32 = vaddq_s32(q3s32, q0s32);
    q4s32 = vaddq_s32(q4s32, q1s32);

    d4s16 = vqrshrn_n_s32(q5s32, 14);
    d5s16 = vqrshrn_n_s32(q6s32, 14);
    d10s16 = vqrshrn_n_s32(q10s32, 14);
    d11s16 = vqrshrn_n_s32(q4s32, 14);
    q2s16 = vcombine_s16(d4s16, d5s16);
    q5s16 = vcombine_s16(d10s16, d11s16);

    q0s32 = vmull_s16(d22s16, d14s16);
    q1s32 = vmull_s16(d23s16, d14s16);
    q13s32 = vmull_s16(d24s16, d14s16);
    q6s32 = vmull_s16(d25s16, d14s16);

    q10s32 = vsubq_s32(q13s32, q0s32);
    q4s32 = vsubq_s32(q6s32, q1s32);
    q13s32 = vaddq_s32(q13s32, q0s32);
    q6s32 = vaddq_s32(q6s32, q1s32);

    d6s16 = vqrshrn_n_s32(q10s32, 14);
    d7s16 = vqrshrn_n_s32(q4s32, 14);
    d8s16 = vqrshrn_n_s32(q13s32, 14);
    d9s16 = vqrshrn_n_s32(q6s32, 14);
    q3s16 = vcombine_s16(d6s16, d7s16);
    q4s16 = vcombine_s16(d8s16, d9s16);

    // stage 7
    if (skip_adding != 0) {
        d = dest;
        // load the data in pass1
        q0s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q1s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        d13s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;

        q12s16 = vaddq_s16(q0s16, q15s16);
        q13s16 = vaddq_s16(q1s16, q14s16);
        q12s16 = vrshrq_n_s16(q12s16, 6);
        q13s16 = vrshrq_n_s16(q13s16, 6);
        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                          vreinterpret_u8_s64(d12s64));
        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                          vreinterpret_u8_s64(d13s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
        d += dest_stride;
        q14s16 = vsubq_s16(q1s16, q14s16);
        q15s16 = vsubq_s16(q0s16, q15s16);

        q10s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q11s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        d13s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q12s16 = vaddq_s16(q10s16, q5s16);
        q13s16 = vaddq_s16(q11s16, q4s16);
        q12s16 = vrshrq_n_s16(q12s16, 6);
        q13s16 = vrshrq_n_s16(q13s16, 6);
        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                          vreinterpret_u8_s64(d12s64));
        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                          vreinterpret_u8_s64(d13s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
        d += dest_stride;
        q4s16 = vsubq_s16(q11s16, q4s16);
        q5s16 = vsubq_s16(q10s16, q5s16);

        q0s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q1s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        d13s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q12s16 = vaddq_s16(q0s16, q3s16);
        q13s16 = vaddq_s16(q1s16, q2s16);
        q12s16 = vrshrq_n_s16(q12s16, 6);
        q13s16 = vrshrq_n_s16(q13s16, 6);
        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                          vreinterpret_u8_s64(d12s64));
        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                          vreinterpret_u8_s64(d13s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
        d += dest_stride;
        q2s16 = vsubq_s16(q1s16, q2s16);
        q3s16 = vsubq_s16(q0s16, q3s16);

        q10s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q11s16 = vld1q_s16(pass1Output);
        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        d13s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q12s16 = vaddq_s16(q10s16, q9s16);
        q13s16 = vaddq_s16(q11s16, q8s16);
        q12s16 = vrshrq_n_s16(q12s16, 6);
        q13s16 = vrshrq_n_s16(q13s16, 6);
        q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                          vreinterpret_u8_s64(d12s64));
        q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                          vreinterpret_u8_s64(d13s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
        d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
        d += dest_stride;
        q8s16 = vsubq_s16(q11s16, q8s16);
        q9s16 = vsubq_s16(q10s16, q9s16);

        // store the data out 8,9,10,11,12,13,14,15
        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q8s16 = vrshrq_n_s16(q8s16, 6);
        q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                         vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q9s16 = vrshrq_n_s16(q9s16, 6);
        q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                         vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q2s16 = vrshrq_n_s16(q2s16, 6);
        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
                         vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q3s16 = vrshrq_n_s16(q3s16, 6);
        q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
                         vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q4s16 = vrshrq_n_s16(q4s16, 6);
        q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
                         vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q5s16 = vrshrq_n_s16(q5s16, 6);
        q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
                         vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        dest += dest_stride;
        q14s16 = vrshrq_n_s16(q14s16, 6);
        q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
                          vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
        d += dest_stride;

        d12s64 = vld1_s64((int64_t *)dest);
        q15s16 = vrshrq_n_s16(q15s16, 6);
        q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
                          vreinterpret_u8_s64(d12s64));
        d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
        vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    } else {  // skip_adding_dest
        q0s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q1s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q12s16 = vaddq_s16(q0s16, q15s16);
        q13s16 = vaddq_s16(q1s16, q14s16);
        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
        vst1_u64((uint64_t *)out, d24u64);
        out += 4;
        vst1_u64((uint64_t *)out, d25u64);
        out += 12;
        vst1_u64((uint64_t *)out, d26u64);
        out += 4;
        vst1_u64((uint64_t *)out, d27u64);
        out += 12;
        q14s16 = vsubq_s16(q1s16, q14s16);
        q15s16 = vsubq_s16(q0s16, q15s16);

        q10s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q11s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q12s16 = vaddq_s16(q10s16, q5s16);
        q13s16 = vaddq_s16(q11s16, q4s16);
        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
        vst1_u64((uint64_t *)out, d24u64);
        out += 4;
        vst1_u64((uint64_t *)out, d25u64);
        out += 12;
        vst1_u64((uint64_t *)out, d26u64);
        out += 4;
        vst1_u64((uint64_t *)out, d27u64);
        out += 12;
        q4s16 = vsubq_s16(q11s16, q4s16);
        q5s16 = vsubq_s16(q10s16, q5s16);

        q0s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q1s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q12s16 = vaddq_s16(q0s16, q3s16);
        q13s16 = vaddq_s16(q1s16, q2s16);
        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
        vst1_u64((uint64_t *)out, d24u64);
        out += 4;
        vst1_u64((uint64_t *)out, d25u64);
        out += 12;
        vst1_u64((uint64_t *)out, d26u64);
        out += 4;
        vst1_u64((uint64_t *)out, d27u64);
        out += 12;
        q2s16 = vsubq_s16(q1s16, q2s16);
        q3s16 = vsubq_s16(q0s16, q3s16);

        q10s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q11s16 = vld1q_s16(pass1Output);
        pass1Output += 8;
        q12s16 = vaddq_s16(q10s16, q9s16);
        q13s16 = vaddq_s16(q11s16, q8s16);
        d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
        d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
        d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
        d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
        vst1_u64((uint64_t *)out, d24u64);
        out += 4;
        vst1_u64((uint64_t *)out, d25u64);
        out += 12;
        vst1_u64((uint64_t *)out, d26u64);
        out += 4;
        vst1_u64((uint64_t *)out, d27u64);
        out += 12;
        q8s16 = vsubq_s16(q11s16, q8s16);
        q9s16 = vsubq_s16(q10s16, q9s16);

        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
        out += 12;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
        out += 4;
        vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
    }
    return;
}
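
In stage 7, when skip_adding is non-zero (despite the name, this is the branch that does add into dest), each batch of residuals is rounded by vrshrq_n_s16(., 6), widened onto the eight predicted pixels with vaddw_u8(), and saturated back to bytes with vqmovun_s16(). Per pixel that amounts to (a scalar sketch, not upstream code):

#include <stdint.h>

/* Scalar sketch of the stage-7 reconstruction: round the residual by
 * (x + 32) >> 6, add the prediction, clamp to [0, 255]. */
static void add_residual_row_ref(const int16_t *res, uint8_t *dst) {
    int j;
    for (j = 0; j < 8; ++j) {
        const int v = ((res[j] + 32) >> 6) + dst[j];
        dst[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}

In the else branch the rows are written to out instead, where the alternating out += 4 / out += 12 steps place each eight-coefficient half into one side of a 16-element-wide row buffer.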

void vp9_idct16x16_10_add_neon_pass1(
        int16_t *in,
        int16_t *out,
        int output_stride) {
    int16x4_t d4s16;
    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
    uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
    int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int32x4_t q6s32, q9s32;
    int32x4_t q10s32, q11s32, q12s32, q15s32;
    int16x8x2_t q0x2s16;

    q0x2s16 = vld2q_s16(in);
    q8s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q9s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q10s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q11s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q12s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q13s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q14s16 = q0x2s16.val[0];
    in += 16;
    q0x2s16 = vld2q_s16(in);
    q15s16 = q0x2s16.val[0];

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    // stage 3
    q0s16 = vdupq_n_s16(cospi_28_64 * 2);
    q1s16 = vdupq_n_s16(cospi_4_64 * 2);

    q4s16 = vqrdmulhq_s16(q9s16, q0s16);
    q7s16 = vqrdmulhq_s16(q9s16, q1s16);

    // stage 4
    q1s16 = vdupq_n_s16(cospi_16_64 * 2);
    d4s16 = vdup_n_s16(cospi_16_64);

    q8s16 = vqrdmulhq_s16(q8s16, q1s16);

    d8s16 = vget_low_s16(q4s16);
    d9s16 = vget_high_s16(q4s16);
    d14s16 = vget_low_s16(q7s16);
    d15s16 = vget_high_s16(q7s16);
    q9s32 = vmull_s16(d14s16, d4s16);
    q10s32 = vmull_s16(d15s16, d4s16);
    q12s32 = vmull_s16(d9s16, d4s16);
    q11s32 = vmull_s16(d8s16, d4s16);

    q15s32 = vsubq_s32(q10s32, q12s32);
    q6s32 = vsubq_s32(q9s32, q11s32);
    q9s32 = vaddq_s32(q9s32, q11s32);
    q10s32 = vaddq_s32(q10s32, q12s32);

    d11s16 = vqrshrn_n_s32(q15s32, 14);
    d10s16 = vqrshrn_n_s32(q6s32, 14);
    d12s16 = vqrshrn_n_s32(q9s32, 14);
    d13s16 = vqrshrn_n_s32(q10s32, 14);
    q5s16 = vcombine_s16(d10s16, d11s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    // stage 6
    q2s16 = vaddq_s16(q8s16, q7s16);
    q9s16 = vaddq_s16(q8s16, q6s16);
    q10s16 = vaddq_s16(q8s16, q5s16);
    q11s16 = vaddq_s16(q8s16, q4s16);
    q12s16 = vsubq_s16(q8s16, q4s16);
    q13s16 = vsubq_s16(q8s16, q5s16);
    q14s16 = vsubq_s16(q8s16, q6s16);
    q15s16 = vsubq_s16(q8s16, q7s16);

    d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
    d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
    d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
    d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
    d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
    d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));

    // store the data
    output_stride >>= 1;  // output_stride / 2, out is int16_t
    vst1_u64((uint64_t *)out, d4u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d5u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d18u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d19u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d20u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d21u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d22u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d23u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d24u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d25u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d26u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d27u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d28u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d29u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d30u64);
    out += output_stride;
    vst1_u64((uint64_t *)out, d31u64);
    return;
}
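
The _10_ variant assumes only the first few coefficients are non-zero, so whole butterflies collapse to a single multiply, and the doubled constants (cospi_k_64 * 2) let one vqrdmulhq_s16() stand in for a vmull_s16()/vqrshrn_n_s32() pair: vqrdmulh computes saturate((2 * a * b + 2^15) >> 16), which with a doubled b equals (a * cospi + 2^13) >> 14, the same Q14 rounding used elsewhere in this file. A scalar sketch of the lane arithmetic (assuming the standard definition of the instruction):

#include <stdint.h>

/* Scalar sketch of one vqrdmulhq_s16 lane: saturating rounding doubling
 * multiply, returning the high half. */
static int16_t qrdmulh_ref(int16_t a, int16_t b) {
    const int32_t p = (2 * (int32_t)a * b + (1 << 15)) >> 16;
    if (p > INT16_MAX) return INT16_MAX;
    if (p < INT16_MIN) return INT16_MIN;
    return (int16_t)p;
}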

void vp9_idct16x16_10_add_neon_pass2(
        int16_t *src,
        int16_t *out,
        int16_t *pass1Output,
        int16_t skip_adding,
        uint8_t *dest,
        int dest_stride) {
    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
    int16x4_t d20s16, d21s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
    uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
    uint64x1_t d16u64, d17u64, d18u64, d19u64;
    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
    int32x4_t q10s32, q11s32, q12s32, q13s32;
    int16x8x2_t q0x2s16;
    (void)skip_adding;
    (void)dest;
    (void)dest_stride;

    q0x2s16 = vld2q_s16(src);
    q8s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q9s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q10s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q11s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q12s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q13s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q14s16 = q0x2s16.val[0];
    src += 16;
    q0x2s16 = vld2q_s16(src);
    q15s16 = q0x2s16.val[0];

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    // stage 3
    q6s16 = vdupq_n_s16(cospi_30_64 * 2);
    q0s16 = vqrdmulhq_s16(q8s16, q6s16);
    q6s16 = vdupq_n_s16(cospi_2_64 * 2);
    q7s16 = vqrdmulhq_s16(q8s16, q6s16);

    q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
    q14s16 = vdupq_n_s16(cospi_6_64 * 2);
    q3s16 = vqrdmulhq_s16(q9s16, q15s16);
    q4s16 = vqrdmulhq_s16(q9s16, q14s16);

    // stage 4
    d0s16 = vget_low_s16(q0s16);
    d1s16 = vget_high_s16(q0s16);
    d6s16 = vget_low_s16(q3s16);
    d7s16 = vget_high_s16(q3s16);
    d8s16 = vget_low_s16(q4s16);
    d9s16 = vget_high_s16(q4s16);
    d14s16 = vget_low_s16(q7s16);
    d15s16 = vget_high_s16(q7s16);

    d30s16 = vdup_n_s16(cospi_8_64);
    d31s16 = vdup_n_s16(cospi_24_64);

    q12s32 = vmull_s16(d14s16, d31s16);
    q5s32 = vmull_s16(d15s16, d31s16);
    q2s32 = vmull_s16(d0s16, d31s16);
    q11s32 = vmull_s16(d1s16, d31s16);

    q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
    q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
    q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
    q11s32 = vmlal_s16(q11s32, d15s16, d30s16);

    d2s16 = vqrshrn_n_s32(q12s32, 14);
    d3s16 = vqrshrn_n_s32(q5s32, 14);
    d12s16 = vqrshrn_n_s32(q2s32, 14);
    d13s16 = vqrshrn_n_s32(q11s32, 14);
    q1s16 = vcombine_s16(d2s16, d3s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    d30s16 = vdup_n_s16(-cospi_8_64);
    q10s32 = vmull_s16(d8s16, d30s16);
    q13s32 = vmull_s16(d9s16, d30s16);
    q8s32 = vmull_s16(d6s16, d30s16);
    q9s32 = vmull_s16(d7s16, d30s16);

    q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
    q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
    q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
    q9s32 = vmlal_s16(q9s32, d9s16, d31s16);

    d4s16 = vqrshrn_n_s32(q10s32, 14);
    d5s16 = vqrshrn_n_s32(q13s32, 14);
    d10s16 = vqrshrn_n_s32(q8s32, 14);
    d11s16 = vqrshrn_n_s32(q9s32, 14);
    q2s16 = vcombine_s16(d4s16, d5s16);
    q5s16 = vcombine_s16(d10s16, d11s16);

    // stage 5
    q8s16 = vaddq_s16(q0s16, q3s16);
    q9s16 = vaddq_s16(q1s16, q2s16);
    q10s16 = vsubq_s16(q1s16, q2s16);
    q11s16 = vsubq_s16(q0s16, q3s16);
    q12s16 = vsubq_s16(q7s16, q4s16);
    q13s16 = vsubq_s16(q6s16, q5s16);
    q14s16 = vaddq_s16(q6s16, q5s16);
    q15s16 = vaddq_s16(q7s16, q4s16);

    // stage 6
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);

    d14s16 = vdup_n_s16(cospi_16_64);
    q3s32 = vmull_s16(d26s16, d14s16);
    q4s32 = vmull_s16(d27s16, d14s16);
    q0s32 = vmull_s16(d20s16, d14s16);
    q1s32 = vmull_s16(d21s16, d14s16);

    q5s32 = vsubq_s32(q3s32, q0s32);
    q6s32 = vsubq_s32(q4s32, q1s32);
    q0s32 = vaddq_s32(q3s32, q0s32);
    q4s32 = vaddq_s32(q4s32, q1s32);

    d4s16 = vqrshrn_n_s32(q5s32, 14);
    d5s16 = vqrshrn_n_s32(q6s32, 14);
    d10s16 = vqrshrn_n_s32(q0s32, 14);
    d11s16 = vqrshrn_n_s32(q4s32, 14);
    q2s16 = vcombine_s16(d4s16, d5s16);
    q5s16 = vcombine_s16(d10s16, d11s16);

    q0s32 = vmull_s16(d22s16, d14s16);
    q1s32 = vmull_s16(d23s16, d14s16);
    q13s32 = vmull_s16(d24s16, d14s16);
    q6s32 = vmull_s16(d25s16, d14s16);

    q10s32 = vsubq_s32(q13s32, q0s32);
    q4s32 = vsubq_s32(q6s32, q1s32);
    q13s32 = vaddq_s32(q13s32, q0s32);
    q6s32 = vaddq_s32(q6s32, q1s32);

    d6s16 = vqrshrn_n_s32(q10s32, 14);
    d7s16 = vqrshrn_n_s32(q4s32, 14);
    d8s16 = vqrshrn_n_s32(q13s32, 14);
    d9s16 = vqrshrn_n_s32(q6s32, 14);
    q3s16 = vcombine_s16(d6s16, d7s16);
    q4s16 = vcombine_s16(d8s16, d9s16);

    // stage 7
    q0s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q1s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q12s16 = vaddq_s16(q0s16, q15s16);
    q13s16 = vaddq_s16(q1s16, q14s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q14s16 = vsubq_s16(q1s16, q14s16);
    q15s16 = vsubq_s16(q0s16, q15s16);

    q10s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q11s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q12s16 = vaddq_s16(q10s16, q5s16);
    q13s16 = vaddq_s16(q11s16, q4s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q4s16 = vsubq_s16(q11s16, q4s16);
    q5s16 = vsubq_s16(q10s16, q5s16);

    q0s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q1s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q12s16 = vaddq_s16(q0s16, q3s16);
    q13s16 = vaddq_s16(q1s16, q2s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q2s16 = vsubq_s16(q1s16, q2s16);
    q3s16 = vsubq_s16(q0s16, q3s16);

    q10s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q11s16 = vld1q_s16(pass1Output);
    q12s16 = vaddq_s16(q10s16, q9s16);
    q13s16 = vaddq_s16(q11s16, q8s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q8s16 = vsubq_s16(q11s16, q8s16);
    q9s16 = vsubq_s16(q10s16, q9s16);

    d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
    d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
    d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
    d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
    d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
    d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
    d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
    d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
    d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
    d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
    d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
    d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
    d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
    d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
    d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
    d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));

    vst1_u64((uint64_t *)out, d16u64);
    out += 4;
    vst1_u64((uint64_t *)out, d17u64);
    out += 12;
    vst1_u64((uint64_t *)out, d18u64);
    out += 4;
    vst1_u64((uint64_t *)out, d19u64);
    out += 12;
    vst1_u64((uint64_t *)out, d4u64);
    out += 4;
    vst1_u64((uint64_t *)out, d5u64);
    out += 12;
    vst1_u64((uint64_t *)out, d6u64);
    out += 4;
    vst1_u64((uint64_t *)out, d7u64);
    out += 12;
    vst1_u64((uint64_t *)out, d8u64);
    out += 4;
    vst1_u64((uint64_t *)out, d9u64);
    out += 12;
    vst1_u64((uint64_t *)out, d10u64);
    out += 4;
    vst1_u64((uint64_t *)out, d11u64);
    out += 12;
    vst1_u64((uint64_t *)out, d28u64);
    out += 4;
    vst1_u64((uint64_t *)out, d29u64);
    out += 12;
    vst1_u64((uint64_t *)out, d30u64);
    out += 4;
    vst1_u64((uint64_t *)out, d31u64);
    return;
}
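
Neither pass produces a finished block on its own: a full 16x16 inverse transform runs these functions twice, once over rows and once over columns. The real driver lives elsewhere in libvpx (vp9/common/arm/neon/vp9_idct16x16_neon.c); the sketch below only illustrates the calling pattern, and the scratch sizes and source offsets are assumptions, not the upstream code. Since vld2q_s16().val[0] keeps the even-indexed lanes of each load, pass1 consumes coefficients 0, 2, 4, ... of each row while pass2, pointed one element further along, consumes 1, 3, 5, ...; skip_adding == 0 stores combined rows to out, while skip_adding == 1 rounds, clips and accumulates into dest.

/* Hypothetical driver sketch (assumed offsets and scratch sizes). */
static void idct16x16_256_add_sketch(int16_t *input, uint8_t *dest,
                                     int dest_stride) {
    int16_t pass1_output[8 * 8];       /* assumed: packed even-half result */
    int16_t row_idct_output[16 * 16];  /* assumed: full row-pass result */

    /* Row pass, first 8 rows: even coefficients, then odd ones. */
    vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
    vp9_idct16x16_256_add_neon_pass2(input + 1, row_idct_output,
                                     pass1_output, 0, dest, dest_stride);

    /* Row pass, last 8 rows (assumed offsets). */
    vp9_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8);
    vp9_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
                                     pass1_output, 0, dest, dest_stride);

    /* Column pass over the row results; skip_adding == 1 adds into dest
     * and leaves out untouched. */
    vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
    vp9_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
                                     pass1_output, 1, dest, dest_stride);
    vp9_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16,
                                     pass1_output, 8);
    vp9_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
                                     row_idct_output, pass1_output, 1,
                                     dest + 8, dest_stride);
}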
