Chromium Code Reviews

Side by Side Diff: simd/jsimd_arm_neon.S

Issue 8720003: Update libjpeg-turbo to r722. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libjpeg_turbo/
Patch Set: '' Created 9 years ago
1 /* 1 /*
2 * ARM NEON optimizations for libjpeg-turbo 2 * ARM NEON optimizations for libjpeg-turbo
3 * 3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved. 5 * All rights reserved.
6 * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com> 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * 7 *
8 * This software is provided 'as-is', without any express or implied 8 * This software is provided 'as-is', without any express or implied
9 * warranty. In no event will the authors be held liable for any damages 9 * warranty. In no event will the authors be held liable for any damages
10 * arising from the use of this software. 10 * arising from the use of this software.
11 * 11 *
12 * Permission is granted to anyone to use this software for any purpose, 12 * Permission is granted to anyone to use this software for any purpose,
13 * including commercial applications, and to alter it and redistribute it 13 * including commercial applications, and to alter it and redistribute it
14 * freely, subject to the following restrictions: 14 * freely, subject to the following restrictions:
15 * 15 *
16 * 1. The origin of this software must not be misrepresented; you must not 16 * 1. The origin of this software must not be misrepresented; you must not
(...skipping 38 matching lines...)
55 .endm 55 .endm
56 56
57 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ 57 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
58 .macro transpose_4x4 x0, x1, x2, x3 58 .macro transpose_4x4 x0, x1, x2, x3
59 vtrn.16 \x0, \x1 59 vtrn.16 \x0, \x1
60 vtrn.16 \x2, \x3 60 vtrn.16 \x2, \x3
61 vtrn.32 \x0, \x2 61 vtrn.32 \x0, \x2
62 vtrn.32 \x1, \x3 62 vtrn.32 \x1, \x3
63 .endm 63 .endm
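The two vtrn.16 steps followed by two vtrn.32 steps amount to an ordinary 4x4 matrix transpose of the four 64-bit registers. A scalar C restatement of the same operation, for reference only:

    #include <stdint.h>

    /* Transpose a 4x4 block of 16-bit coefficients in place,
     * i.e. x[i][j] <-> x[j][i], matching the transpose_4x4 macro. */
    static void transpose_4x4_c(int16_t x[4][4])
    {
        for (int i = 0; i < 4; i++)
            for (int j = i + 1; j < 4; j++) {
                int16_t t = x[i][j];
                x[i][j] = x[j][i];
                x[j][i] = t;
            }
    }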
64 64
65 #define CENTERJSAMPLE 128
66
67 /*****************************************************************************/
68
69 /*
70 * Perform dequantization and inverse DCT on one block of coefficients.
71 *
72 * GLOBAL(void)
73 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
74 * JSAMPARRAY output_buf, JDIMENSION output_col)
75 */
76
77 #define FIX_0_298631336 (2446)
78 #define FIX_0_390180644 (3196)
79 #define FIX_0_541196100 (4433)
80 #define FIX_0_765366865 (6270)
81 #define FIX_0_899976223 (7373)
82 #define FIX_1_175875602 (9633)
83 #define FIX_1_501321110 (12299)
84 #define FIX_1_847759065 (15137)
85 #define FIX_1_961570560 (16069)
86 #define FIX_2_053119869 (16819)
87 #define FIX_2_562915447 (20995)
88 #define FIX_3_072711026 (25172)
89
90 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
91 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
92 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
93 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
94 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
95 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
96 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
97 #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
98
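These values follow the usual libjpeg 'islow' convention of CONST_BITS = 13 fixed point: each FIX_x constant is x scaled by 2^13 and rounded. A minimal C sketch of the encoding (standard library only):

    #include <stdio.h>

    #define CONST_BITS 13

    /* FIX(0.298631336) == 2446, FIX(1.175875602) == 9633, etc. */
    static long FIX(double x)
    {
        return (long)(x * (1L << CONST_BITS) + 0.5);
    }

    int main(void)
    {
        printf("%ld %ld\n", FIX(0.298631336), FIX(1.175875602)); /* 2446 9633 */
        return 0;
    }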
99 /*
100 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
101 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
102 */
103 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
104 { \
105 DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
106 INT32 q1, q2, q3, q4, q5, q6, q7; \
107 INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
108 \
109 /* 1-D iDCT input data */ \
110 row0 = xrow0; \
111 row1 = xrow1; \
112 row2 = xrow2; \
113 row3 = xrow3; \
114 row4 = xrow4; \
115 row5 = xrow5; \
116 row6 = xrow6; \
117 row7 = xrow7; \
118 \
119 q5 = row7 + row3; \
120 q4 = row5 + row1; \
121 q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
122 MULTIPLY(q4, FIX_1_175875602); \
123 q7 = MULTIPLY(q5, FIX_1_175875602) + \
124 MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
125 q2 = MULTIPLY(row2, FIX_0_541196100) + \
126 MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
127 q4 = q6; \
128 q3 = ((INT32) row0 - (INT32) row4) << 13; \
129 q6 += MULTIPLY(row5, -FIX_2_562915447) + \
130 MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
131 /* now we can use q1 (reloadable constants have been used up) */ \
132 q1 = q3 + q2; \
133 q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
134 MULTIPLY(row1, -FIX_0_899976223); \
135 q5 = q7; \
136 q1 = q1 + q6; \
137 q7 += MULTIPLY(row7, -FIX_0_899976223) + \
138 MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
139 \
140 /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
141 tmp11_plus_tmp2 = q1; \
142 row1 = 0; \
143 \
144 q1 = q1 - q6; \
145 q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
146 MULTIPLY(row3, -FIX_2_562915447); \
147 q1 = q1 - q6; \
148 q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
149 MULTIPLY(row6, FIX_0_541196100); \
150 q3 = q3 - q2; \
151 \
152 /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
153 tmp11_minus_tmp2 = q1; \
154 \
155 q1 = ((INT32) row0 + (INT32) row4) << 13; \
156 q2 = q1 + q6; \
157 q1 = q1 - q6; \
158 \
159 /* pick up the results */ \
160 tmp0 = q4; \
161 tmp1 = q5; \
162 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
163 tmp3 = q7; \
164 tmp10 = q2; \
165 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
166 tmp12 = q3; \
167 tmp13 = q1; \
168 }
169
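For reference, pass 1 of the NEON code below descales by CONST_BITS - PASS1_BITS = 11 bits (the vrshrn #11 instructions), while pass 2 descales by a total of 16 + 2 = 18 bits in two steps (vshrn #16, then vqrshrn #2 in the 8-bit epilogue). A small C sketch of that rounding descale, assuming arithmetic right shift of negative values:

    #include <stdint.h>

    #define CONST_BITS  13
    #define PASS1_BITS  2

    /* Round-to-nearest arithmetic right shift, as VRSHRN performs it. */
    static int32_t descale(int32_t x, int n)
    {
        return (x + (1 << (n - 1))) >> n;
    }

    static int16_t descale_pass1(int32_t x) { return (int16_t)descale(x, CONST_BITS - PASS1_BITS); }     /* >> 11 */
    static int16_t descale_pass2(int32_t x) { return (int16_t)descale(x, CONST_BITS + PASS1_BITS + 3); } /* >> 18 */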
170 #define XFIX_0_899976223 d0[0]
171 #define XFIX_0_541196100 d0[1]
172 #define XFIX_2_562915447 d0[2]
173 #define XFIX_0_298631336_MINUS_0_899976223 d0[3]
174 #define XFIX_1_501321110_MINUS_0_899976223 d1[0]
175 #define XFIX_2_053119869_MINUS_2_562915447 d1[1]
176 #define XFIX_0_541196100_PLUS_0_765366865 d1[2]
177 #define XFIX_1_175875602 d1[3]
178 #define XFIX_1_175875602_MINUS_0_390180644 d2[0]
179 #define XFIX_0_541196100_MINUS_1_847759065 d2[1]
180 #define XFIX_3_072711026_MINUS_2_562915447 d2[2]
181 #define XFIX_1_175875602_MINUS_1_961570560 d2[3]
182
183 .balign 16
184 jsimd_idct_islow_neon_consts:
185 .short FIX_0_899976223 /* d0[0] */
186 .short FIX_0_541196100 /* d0[1] */
187 .short FIX_2_562915447 /* d0[2] */
188 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
189 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
190 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
191 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
192 .short FIX_1_175875602 /* d1[3] */
193 /* reloadable constants */
194 .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
195 .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
196 .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
197 .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
198
199 asm_function jsimd_idct_islow_neon
200
201 DCT_TABLE .req r0
202 COEF_BLOCK .req r1
203 OUTPUT_BUF .req r2
204 OUTPUT_COL .req r3
205 TMP1 .req r0
206 TMP2 .req r1
207 TMP3 .req r2
208 TMP4 .req ip
209
210 ROW0L .req d16
211 ROW0R .req d17
212 ROW1L .req d18
213 ROW1R .req d19
214 ROW2L .req d20
215 ROW2R .req d21
216 ROW3L .req d22
217 ROW3R .req d23
218 ROW4L .req d24
219 ROW4R .req d25
220 ROW5L .req d26
221 ROW5R .req d27
222 ROW6L .req d28
223 ROW6R .req d29
224 ROW7L .req d30
225 ROW7R .req d31
226
227 /* Load and dequantize coefficients into NEON registers
228 * with the following allocation:
229 * 0 1 2 3 | 4 5 6 7
230 * ---------+--------
231 * 0 | d16 | d17 ( q8 )
232 * 1 | d18 | d19 ( q9 )
233 * 2 | d20 | d21 ( q10 )
234 * 3 | d22 | d23 ( q11 )
235 * 4 | d24 | d25 ( q12 )
236 * 5 | d26 | d27 ( q13 )
237 * 6 | d28 | d29 ( q14 )
238 * 7 | d30 | d31 ( q15 )
239 */
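The vmul.s16 instructions below perform the dequantization: each of the 64 coefficients is multiplied by the matching entry of the quantization table before the IDCT proper starts. A scalar sketch of the same step, with DCTELEM assumed to be int16_t:

    #include <stdint.h>

    typedef int16_t DCTELEM;

    /* out, coef_block and dct_table each hold 64 16-bit values in row order. */
    static void dequantize_block(DCTELEM out[64], const DCTELEM coef_block[64],
                                 const DCTELEM dct_table[64])
    {
        for (int i = 0; i < 64; i++)
            out[i] = (DCTELEM)(coef_block[i] * dct_table[i]);
    }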
240 adr ip, jsimd_idct_islow_neon_consts
241 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
242 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
243 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
244 vmul.s16 q8, q8, q0
245 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
246 vmul.s16 q9, q9, q1
247 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
248 vmul.s16 q10, q10, q2
249 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
250 vmul.s16 q11, q11, q3
251 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
252 vmul.s16 q12, q12, q0
253 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
254 vmul.s16 q14, q14, q2
255 vmul.s16 q13, q13, q1
256 vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
257 add ip, ip, #16
258 vmul.s16 q15, q15, q3
259 vpush {d8-d15} /* save NEON registers */
260 /* 1-D IDCT, pass 1, left 4x8 half */
261 vadd.s16 d4, ROW7L, ROW3L
262 vadd.s16 d5, ROW5L, ROW1L
263 vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
264 vmlal.s16 q6, d5, XFIX_1_175875602
265 vmull.s16 q7, d4, XFIX_1_175875602
266 /* Check for the zero coefficients in the right 4x8 half */
267 push {r4, r5}
268 vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
269 vsubl.s16 q3, ROW0L, ROW4L
270 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
271 vmull.s16 q2, ROW2L, XFIX_0_541196100
272 vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
273 orr r0, r4, r5
274 vmov q4, q6
275 vmlsl.s16 q6, ROW5L, XFIX_2_562915447
276 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
277 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
278 vshl.s32 q3, q3, #13
279 orr r0, r0, r4
280 vmlsl.s16 q4, ROW1L, XFIX_0_899976223
281 orr r0, r0, r5
282 vadd.s32 q1, q3, q2
283 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
284 vmov q5, q7
285 vadd.s32 q1, q1, q6
286 orr r0, r0, r4
287 vmlsl.s16 q7, ROW7L, XFIX_0_899976223
288 orr r0, r0, r5
289 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
290 vrshrn.s32 ROW1L, q1, #11
291 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
292 vsub.s32 q1, q1, q6
293 vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
294 orr r0, r0, r4
295 vmlsl.s16 q5, ROW3L, XFIX_2_562915447
296 orr r0, r0, r5
297 vsub.s32 q1, q1, q6
298 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
299 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
300 vmlal.s16 q6, ROW6L, XFIX_0_541196100
301 vsub.s32 q3, q3, q2
302 orr r0, r0, r4
303 vrshrn.s32 ROW6L, q1, #11
304 orr r0, r0, r5
305 vadd.s32 q1, q3, q5
306 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
307 vsub.s32 q3, q3, q5
308 vaddl.s16 q5, ROW0L, ROW4L
309 orr r0, r0, r4
310 vrshrn.s32 ROW2L, q1, #11
311 orr r0, r0, r5
312 vrshrn.s32 ROW5L, q3, #11
313 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
314 vshl.s32 q5, q5, #13
315 vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
316 orr r0, r0, r4
317 vadd.s32 q2, q5, q6
318 orrs r0, r0, r5
319 vsub.s32 q1, q5, q6
320 vadd.s32 q6, q2, q7
321 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
322 vsub.s32 q2, q2, q7
323 vadd.s32 q5, q1, q4
324 orr r0, r4, r5
325 vsub.s32 q3, q1, q4
326 pop {r4, r5}
327 vrshrn.s32 ROW7L, q2, #11
328 vrshrn.s32 ROW3L, q5, #11
329 vrshrn.s32 ROW0L, q6, #11
330 vrshrn.s32 ROW4L, q3, #11
331
332 beq 3f /* Go to do some special handling for the sparse right 4x8 half */
333
334 /* 1-D IDCT, pass 1, right 4x8 half */
335 vld1.s16 {d2}, [ip, :64] /* reload constants */
336 vadd.s16 d10, ROW7R, ROW3R
337 vadd.s16 d8, ROW5R, ROW1R
338 /* Transpose left 4x8 half */
339 vtrn.16 ROW6L, ROW7L
340 vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
341 vmlal.s16 q6, d8, XFIX_1_175875602
342 vtrn.16 ROW2L, ROW3L
343 vmull.s16 q7, d10, XFIX_1_175875602
344 vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
345 vtrn.16 ROW0L, ROW1L
346 vsubl.s16 q3, ROW0R, ROW4R
347 vmull.s16 q2, ROW2R, XFIX_0_541196100
348 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
349 vtrn.16 ROW4L, ROW5L
350 vmov q4, q6
351 vmlsl.s16 q6, ROW5R, XFIX_2_562915447
352 vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
353 vtrn.32 ROW1L, ROW3L
354 vshl.s32 q3, q3, #13
355 vmlsl.s16 q4, ROW1R, XFIX_0_899976223
356 vtrn.32 ROW4L, ROW6L
357 vadd.s32 q1, q3, q2
358 vmov q5, q7
359 vadd.s32 q1, q1, q6
360 vtrn.32 ROW0L, ROW2L
361 vmlsl.s16 q7, ROW7R, XFIX_0_899976223
362 vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
363 vrshrn.s32 ROW1R, q1, #11
364 vtrn.32 ROW5L, ROW7L
365 vsub.s32 q1, q1, q6
366 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
367 vmlsl.s16 q5, ROW3R, XFIX_2_562915447
368 vsub.s32 q1, q1, q6
369 vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
370 vmlal.s16 q6, ROW6R, XFIX_0_541196100
371 vsub.s32 q3, q3, q2
372 vrshrn.s32 ROW6R, q1, #11
373 vadd.s32 q1, q3, q5
374 vsub.s32 q3, q3, q5
375 vaddl.s16 q5, ROW0R, ROW4R
376 vrshrn.s32 ROW2R, q1, #11
377 vrshrn.s32 ROW5R, q3, #11
378 vshl.s32 q5, q5, #13
379 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
380 vadd.s32 q2, q5, q6
381 vsub.s32 q1, q5, q6
382 vadd.s32 q6, q2, q7
383 vsub.s32 q2, q2, q7
384 vadd.s32 q5, q1, q4
385 vsub.s32 q3, q1, q4
386 vrshrn.s32 ROW7R, q2, #11
387 vrshrn.s32 ROW3R, q5, #11
388 vrshrn.s32 ROW0R, q6, #11
389 vrshrn.s32 ROW4R, q3, #11
390 /* Transpose right 4x8 half */
391 vtrn.16 ROW6R, ROW7R
392 vtrn.16 ROW2R, ROW3R
393 vtrn.16 ROW0R, ROW1R
394 vtrn.16 ROW4R, ROW5R
395 vtrn.32 ROW1R, ROW3R
396 vtrn.32 ROW4R, ROW6R
397 vtrn.32 ROW0R, ROW2R
398 vtrn.32 ROW5R, ROW7R
399
400 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
401 vld1.s16 {d2}, [ip, :64] /* reload constants */
402 vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
403 vmlal.s16 q6, ROW1L, XFIX_1_175875602
404          vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
405 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
406 vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
407 vmlal.s16 q7, ROW3L, XFIX_1_175875602
408          vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
409 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
410 vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
411 vmull.s16 q2, ROW2L, XFIX_0_541196100
412          vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
413 vmov q4, q6
414 vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
415 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
416 vshl.s32 q3, q3, #13
417 vmlsl.s16 q4, ROW1L, XFIX_0_899976223
418 vadd.s32 q1, q3, q2
419 vmov q5, q7
420 vadd.s32 q1, q1, q6
421 vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
422 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
423 vshrn.s32 ROW1L, q1, #16
424 vsub.s32 q1, q1, q6
425          vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
426 vmlsl.s16 q5, ROW3L, XFIX_2_562915447
427 vsub.s32 q1, q1, q6
428 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
429 vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
430 vsub.s32 q3, q3, q2
431 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
432 vadd.s32 q1, q3, q5
433 vsub.s32 q3, q3, q5
434 vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
435 vshrn.s32 ROW2L, q1, #16
436 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
437 vshl.s32 q5, q5, #13
438          vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
439 vadd.s32 q2, q5, q6
440 vsub.s32 q1, q5, q6
441 vadd.s32 q6, q2, q7
442 vsub.s32 q2, q2, q7
443 vadd.s32 q5, q1, q4
444 vsub.s32 q3, q1, q4
445 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
446 vshrn.s32 ROW3L, q5, #16
447 vshrn.s32 ROW0L, q6, #16
448 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
449 /* 1-D IDCT, pass 2, right 4x8 half */
450 vld1.s16 {d2}, [ip, :64] /* reload constants */
451 vmull.s16 q6, ROW5R, XFIX_1_175875602
452 vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
453 vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
454          vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
455 vmull.s16 q7, ROW7R, XFIX_1_175875602
456 vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
457 vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
458          vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
459 vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
460 vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
461 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
462 vmov q4, q6
463 vmlsl.s16 q6, ROW5R, XFIX_2_562915447
464          vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
465 vshl.s32 q3, q3, #13
466 vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
467 vadd.s32 q1, q3, q2
468 vmov q5, q7
469 vadd.s32 q1, q1, q6
470 vmlsl.s16 q7, ROW7R, XFIX_0_899976223
471          vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
472 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
473 vsub.s32 q1, q1, q6
474 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
475 vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
476 vsub.s32 q1, q1, q6
477 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
478 vmlal.s16 q6, ROW6R, XFIX_0_541196100
479 vsub.s32 q3, q3, q2
480 vshrn.s32 ROW6R, q1, #16
481 vadd.s32 q1, q3, q5
482 vsub.s32 q3, q3, q5
483 vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
484 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
485 vshrn.s32 ROW5R, q3, #16
486 vshl.s32 q5, q5, #13
487 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
488 vadd.s32 q2, q5, q6
489 vsub.s32 q1, q5, q6
490 vadd.s32 q6, q2, q7
491 vsub.s32 q2, q2, q7
492 vadd.s32 q5, q1, q4
493 vsub.s32 q3, q1, q4
494 vshrn.s32 ROW7R, q2, #16
495 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
496 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
497 vshrn.s32 ROW4R, q3, #16
498
499 2: /* Descale to 8-bit and range limit */
500 vqrshrn.s16 d16, q8, #2
501 vqrshrn.s16 d17, q9, #2
502 vqrshrn.s16 d18, q10, #2
503 vqrshrn.s16 d19, q11, #2
504 vpop {d8-d15} /* restore NEON registers */
505 vqrshrn.s16 d20, q12, #2
506 /* Transpose the final 8-bit samples and do signed->unsigned conversion */
507 vtrn.16 q8, q9
508 vqrshrn.s16 d21, q13, #2
509 vqrshrn.s16 d22, q14, #2
510 vmov.u8 q0, #(CENTERJSAMPLE)
511 vqrshrn.s16 d23, q15, #2
512 vtrn.8 d16, d17
513 vtrn.8 d18, d19
514 vadd.u8 q8, q8, q0
515 vadd.u8 q9, q9, q0
516 vtrn.16 q10, q11
517 /* Store results to the output buffer */
518 ldmia OUTPUT_BUF!, {TMP1, TMP2}
519 add TMP1, TMP1, OUTPUT_COL
520 add TMP2, TMP2, OUTPUT_COL
521 vst1.8 {d16}, [TMP1]
522 vtrn.8 d20, d21
523 vst1.8 {d17}, [TMP2]
524 ldmia OUTPUT_BUF!, {TMP1, TMP2}
525 add TMP1, TMP1, OUTPUT_COL
526 add TMP2, TMP2, OUTPUT_COL
527 vst1.8 {d18}, [TMP1]
528 vadd.u8 q10, q10, q0
529 vst1.8 {d19}, [TMP2]
530 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
531 add TMP1, TMP1, OUTPUT_COL
532 add TMP2, TMP2, OUTPUT_COL
533 add TMP3, TMP3, OUTPUT_COL
534 add TMP4, TMP4, OUTPUT_COL
535 vtrn.8 d22, d23
536 vst1.8 {d20}, [TMP1]
537 vadd.u8 q11, q11, q0
538 vst1.8 {d21}, [TMP2]
539 vst1.8 {d22}, [TMP3]
540 vst1.8 {d23}, [TMP4]
541 bx lr
542
543 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
544
545 /* Transpose left 4x8 half */
546 vtrn.16 ROW6L, ROW7L
547 vtrn.16 ROW2L, ROW3L
548 vtrn.16 ROW0L, ROW1L
549 vtrn.16 ROW4L, ROW5L
550 vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
551 vtrn.32 ROW1L, ROW3L
552 vtrn.32 ROW4L, ROW6L
553 vtrn.32 ROW0L, ROW2L
554 vtrn.32 ROW5L, ROW7L
555
556 cmp r0, #0
557          beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
558
559 /* Only row 0 is non-zero for the right 4x8 half */
560 vdup.s16 ROW1R, ROW0R[1]
561 vdup.s16 ROW2R, ROW0R[2]
562 vdup.s16 ROW3R, ROW0R[3]
563 vdup.s16 ROW4R, ROW0R[0]
564 vdup.s16 ROW5R, ROW0R[1]
565 vdup.s16 ROW6R, ROW0R[2]
566 vdup.s16 ROW7R, ROW0R[3]
567 vdup.s16 ROW0R, ROW0R[0]
568 b 1b /* Go to 'normal' second pass */
569
570 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
571 vld1.s16 {d2}, [ip, :64] /* reload constants */
572 vmull.s16 q6, ROW1L, XFIX_1_175875602
573 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
574 vmull.s16 q7, ROW3L, XFIX_1_175875602
575 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
576 vmull.s16 q2, ROW2L, XFIX_0_541196100
577 vshll.s16 q3, ROW0L, #13
578 vmov q4, q6
579 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
580 vmlsl.s16 q4, ROW1L, XFIX_0_899976223
581 vadd.s32 q1, q3, q2
582 vmov q5, q7
583 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
584 vadd.s32 q1, q1, q6
585 vadd.s32 q6, q6, q6
586 vmlsl.s16 q5, ROW3L, XFIX_2_562915447
587 vshrn.s32 ROW1L, q1, #16
588 vsub.s32 q1, q1, q6
589 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
590 vsub.s32 q3, q3, q2
591 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
592 vadd.s32 q1, q3, q5
593 vsub.s32 q3, q3, q5
594 vshll.s16 q5, ROW0L, #13
595 vshrn.s32 ROW2L, q1, #16
596 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
597 vadd.s32 q2, q5, q6
598 vsub.s32 q1, q5, q6
599 vadd.s32 q6, q2, q7
600 vsub.s32 q2, q2, q7
601 vadd.s32 q5, q1, q4
602 vsub.s32 q3, q1, q4
603 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
604 vshrn.s32 ROW3L, q5, #16
605 vshrn.s32 ROW0L, q6, #16
606 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
607 /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
608 vld1.s16 {d2}, [ip, :64] /* reload constants */
609 vmull.s16 q6, ROW5L, XFIX_1_175875602
610 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
611 vmull.s16 q7, ROW7L, XFIX_1_175875602
612 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
613 vmull.s16 q2, ROW6L, XFIX_0_541196100
614 vshll.s16 q3, ROW4L, #13
615 vmov q4, q6
616 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
617 vmlsl.s16 q4, ROW5L, XFIX_0_899976223
618 vadd.s32 q1, q3, q2
619 vmov q5, q7
620 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
621 vadd.s32 q1, q1, q6
622 vadd.s32 q6, q6, q6
623 vmlsl.s16 q5, ROW7L, XFIX_2_562915447
624 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
625 vsub.s32 q1, q1, q6
626 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
627 vsub.s32 q3, q3, q2
628 vshrn.s32 ROW6R, q1, #16
629 vadd.s32 q1, q3, q5
630 vsub.s32 q3, q3, q5
631 vshll.s16 q5, ROW4L, #13
632 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
633 vshrn.s32 ROW5R, q3, #16
634 vadd.s32 q2, q5, q6
635 vsub.s32 q1, q5, q6
636 vadd.s32 q6, q2, q7
637 vsub.s32 q2, q2, q7
638 vadd.s32 q5, q1, q4
639 vsub.s32 q3, q1, q4
640 vshrn.s32 ROW7R, q2, #16
641 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
642 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
643 vshrn.s32 ROW4R, q3, #16
644 b 2b /* Go to epilogue */
645
646 .unreq DCT_TABLE
647 .unreq COEF_BLOCK
648 .unreq OUTPUT_BUF
649 .unreq OUTPUT_COL
650 .unreq TMP1
651 .unreq TMP2
652 .unreq TMP3
653 .unreq TMP4
654
655 .unreq ROW0L
656 .unreq ROW0R
657 .unreq ROW1L
658 .unreq ROW1R
659 .unreq ROW2L
660 .unreq ROW2R
661 .unreq ROW3L
662 .unreq ROW3R
663 .unreq ROW4L
664 .unreq ROW4R
665 .unreq ROW5L
666 .unreq ROW5R
667 .unreq ROW6L
668 .unreq ROW6R
669 .unreq ROW7L
670 .unreq ROW7R
671 .endfunc
672
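A note on the sparse path above: the ldrd/orr instructions interleaved into pass 1 OR together the right-half (columns 4-7) coefficients, keeping rows 1-7 separate from row 0, so that pass 2 can branch to the cheaper variants at labels 3 and 4 when those halves are all zero. A scalar restatement of the test (function name illustrative only):

    #include <stdint.h>

    /* Nonzero test over the column 4..7 coefficients of rows
     * first_row..last_row of an 8x8 block in row order. */
    static int right_half_nonzero(const int16_t coef[64], int first_row, int last_row)
    {
        unsigned acc = 0;
        for (int row = first_row; row <= last_row; row++)
            for (int col = 4; col < 8; col++)
                acc |= (uint16_t)coef[row * 8 + col];
        return acc != 0;
    }

    /* Rows 1..7 are tested first (the beq 3f), then row 0 alone (the beq 4f):
     *   right_half_nonzero(coef, 1, 7) and right_half_nonzero(coef, 0, 0)    */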
65 /*****************************************************************************/ 673 /*****************************************************************************/
66 674
67 /* 675 /*
68 * jsimd_idct_ifast_neon 676 * jsimd_idct_ifast_neon
69 * 677 *
70 * This function contains a fast, not so accurate integer implementation of 678 * This function contains a fast, not so accurate integer implementation of
71 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations 679 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
72 * and produces exactly the same output as IJG's original 'jpeg_idct_fast' 680 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
73 * function from jidctfst.c 681 * function from jidctfst.c
74 * 682 *
75 * TODO: a bit better instructions scheduling is needed. 683 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
684 * But in ARM NEON case some extra additions are required because VQDMULH
685 * instruction can't handle the constants larger than 1. So the expressions
686 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
687 * which introduces an extra addition. Overall, there are 6 extra additions
688 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
76 */ 689 */
77 690
78 #define XFIX_1_082392200 d0[0] 691 #define XFIX_1_082392200 d0[0]
79 #define XFIX_1_414213562 d0[1] 692 #define XFIX_1_414213562 d0[1]
80 #define XFIX_1_847759065 d0[2] 693 #define XFIX_1_847759065 d0[2]
81 #define XFIX_2_613125930 d0[3] 694 #define XFIX_2_613125930 d0[3]
82 695
83 .balign 16 696 .balign 16
84 jsimd_idct_ifast_neon_consts: 697 jsimd_idct_ifast_neon_consts:
85 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ 698 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
86 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ 699 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
87 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ 700 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
88 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ 701 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
89 702
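The four constants above store the fractional part of each AAN scale factor in Q15 (scale - 1.0, or scale - 2.0 in the case of 2.613125930), because VQDMULH computes roughly (a * b) >> 15 and cannot express multipliers of 1.0 or more, as the comment explains. A hedged scalar model of the trick for 1.082392200 ~ 277/256, with saturation omitted:

    #include <stdint.h>

    /* Scalar stand-in for VQDMULH.S16: high half of the doubled product. */
    static int16_t vqdmulh_s16(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b) >> 15);
    }

    /* x * 1.082392200 rewritten as x + x * 0.082392200, using the Q15
     * constant (277 * 128 - 256 * 128) from the table above. */
    static int16_t mul_1_082392200(int16_t x)
    {
        return (int16_t)(x + vqdmulh_s16(x, 277 * 128 - 256 * 128));
    }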
90 /* 1-D IDCT helper macro */
91
92 .macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \
93 t10, t11, t12, t13, t14
94
95 vsub.s16 \t10, \x0, \x4
96 vadd.s16 \x4, \x0, \x4
97 vswp.s16 \t10, \x0
98 vsub.s16 \t11, \x2, \x6
99 vadd.s16 \x6, \x2, \x6
100 vswp.s16 \t11, \x2
101 vsub.s16 \t10, \x3, \x5
102 vadd.s16 \x5, \x3, \x5
103 vswp.s16 \t10, \x3
104 vsub.s16 \t11, \x1, \x7
105 vadd.s16 \x7, \x1, \x7
106 vswp.s16 \t11, \x1
107
108 vqdmulh.s16 \t13, \x2, d0[1]
109 vadd.s16 \t12, \x3, \x3
110 vadd.s16 \x2, \x2, \t13
111 vqdmulh.s16 \t13, \x3, d0[3]
112 vsub.s16 \t10, \x1, \x3
113 vadd.s16 \t12, \t12, \t13
114 vqdmulh.s16 \t13, \t10, d0[2]
115 vsub.s16 \t11, \x7, \x5
116 vadd.s16 \t10, \t10, \t13
117 vqdmulh.s16 \t13, \t11, d0[1]
118 vadd.s16 \t11, \t11, \t13
119
120 vqdmulh.s16 \t13, \x1, d0[0]
121 vsub.s16 \x2, \x6, \x2
122 vsub.s16 \t14, \x0, \x2
123 vadd.s16 \x2, \x0, \x2
124 vadd.s16 \x0, \x4, \x6
125 vsub.s16 \x4, \x4, \x6
126 vadd.s16 \x1, \x1, \t13
127 vadd.s16 \t13, \x7, \x5
128 vsub.s16 \t12, \t13, \t12
129 vsub.s16 \t12, \t12, \t10
130 vadd.s16 \t11, \t12, \t11
131 vsub.s16 \t10, \x1, \t10
132 vadd.s16 \t10, \t10, \t11
133
134 vsub.s16 \x7, \x0, \t13
135 vadd.s16 \x0, \x0, \t13
136 vadd.s16 \x6, \t14, \t12
137 vsub.s16 \x1, \t14, \t12
138 vsub.s16 \x5, \x2, \t11
139 vadd.s16 \x2, \x2, \t11
140 vsub.s16 \x3, \x4, \t10
141 vadd.s16 \x4, \x4, \t10
142 .endm
143
144 asm_function jsimd_idct_ifast_neon 703 asm_function jsimd_idct_ifast_neon
145 704
146 DCT_TABLE .req r0 705 DCT_TABLE .req r0
147 COEF_BLOCK .req r1 706 COEF_BLOCK .req r1
148 OUTPUT_BUF .req r2 707 OUTPUT_BUF .req r2
149 OUTPUT_COL .req r3 708 OUTPUT_COL .req r3
150 TMP .req ip 709 TMP1 .req r0
151 710 TMP2 .req r1
152 vpush {d8-d15} 711 TMP3 .req r2
153 712 TMP4 .req ip
154 /* Load constants */ 713
155 adr TMP, jsimd_idct_ifast_neon_consts 714 /* Load and dequantize coefficients into NEON registers
156 vld1.16 {d0}, [TMP, :64] 715 * with the following allocation:
157
158 /* Load all COEF_BLOCK into NEON registers with the following allocation:
159 * 0 1 2 3 | 4 5 6 7 716 * 0 1 2 3 | 4 5 6 7
160 * ---------+-------- 717 * ---------+--------
161 * 0 | d4 | d5 718 * 0 | d16 | d17 ( q8 )
162 * 1 | d6 | d7 719 * 1 | d18 | d19 ( q9 )
163 * 2 | d8 | d9 720 * 2 | d20 | d21 ( q10 )
164 * 3 | d10 | d11 721 * 3 | d22 | d23 ( q11 )
165 * 4 | d12 | d13 722 * 4 | d24 | d25 ( q12 )
166 * 5 | d14 | d15 723 * 5 | d26 | d27 ( q13 )
167 * 6 | d16 | d17 724 * 6 | d28 | d29 ( q14 )
168 * 7 | d18 | d19 725 * 7 | d30 | d31 ( q15 )
169 */ 726 */
170 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]! 727 adr ip, jsimd_idct_ifast_neon_consts
171 vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]! 728 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
172 vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]! 729 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
173 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]! 730 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
174 /* Dequantize */ 731 vmul.s16 q8, q8, q0
175 vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! 732 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
176 vmul.s16 q2, q2, q10 733 vmul.s16 q9, q9, q1
177 vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]! 734 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
178 vmul.s16 q3, q3, q11 735 vmul.s16 q10, q10, q2
179 vmul.s16 q4, q4, q12 736 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
180 vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]! 737 vmul.s16 q11, q11, q3
181 vmul.s16 q5, q5, q13 738 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
182 vmul.s16 q6, q6, q14 739 vmul.s16 q12, q12, q0
183 vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! 740 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
184 vmul.s16 q7, q7, q15 741 vmul.s16 q14, q14, q2
185 vmul.s16 q8, q8, q10 742 vmul.s16 q13, q13, q1
186 vmul.s16 q9, q9, q11 743 vld1.16 {d0}, [ip, :64] /* load constants */
187 744 vmul.s16 q15, q15, q3
188 /* Pass 1 */ 745 vpush {d8-d13} /* save NEON registers */
189 idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 746 /* 1-D IDCT, pass 1 */
190 /* Transpose */ 747 vsub.s16 q2, q10, q14
191 transpose_4x4 d4, d6, d8, d10 748 vadd.s16 q14, q10, q14
192 transpose_4x4 d5, d7, d9, d11 749 vsub.s16 q1, q11, q13
193 transpose_4x4 d12, d14, d16, d18 750 vadd.s16 q13, q11, q13
194 transpose_4x4 d13, d15, d17, d19 751 vsub.s16 q5, q9, q15
195 vswp d12, d5 752 vadd.s16 q15, q9, q15
196 vswp d14, d7 753 vqdmulh.s16 q4, q2, XFIX_1_414213562
197 vswp d16, d9 754 vqdmulh.s16 q6, q1, XFIX_2_613125930
198 vswp d18, d11 755 vadd.s16 q3, q1, q1
199 756 vsub.s16 q1, q5, q1
200 /* Pass 2 */ 757 vadd.s16 q10, q2, q4
201 idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 758 vqdmulh.s16 q4, q1, XFIX_1_847759065
202 /* Transpose */ 759 vsub.s16 q2, q15, q13
203 transpose_4x4 d4, d6, d8, d10 760 vadd.s16 q3, q3, q6
204 transpose_4x4 d5, d7, d9, d11 761 vqdmulh.s16 q6, q2, XFIX_1_414213562
205 transpose_4x4 d12, d14, d16, d18 762 vadd.s16 q1, q1, q4
206 transpose_4x4 d13, d15, d17, d19 763 vqdmulh.s16 q4, q5, XFIX_1_082392200
207 vswp d12, d5 764 vsub.s16 q10, q10, q14
208 vswp d14, d7 765 vadd.s16 q2, q2, q6
209 vswp d16, d9 766 vsub.s16 q6, q8, q12
210 vswp d18, d11 767 vadd.s16 q12, q8, q12
211 768 vadd.s16 q9, q5, q4
212 /* Descale and range limit */ 769 vadd.s16 q5, q6, q10
213 vmov.s16 q15, #(0x80 << 5) 770 vsub.s16 q10, q6, q10
214 vqadd.s16 q2, q2, q15 771 vadd.s16 q6, q15, q13
215 vqadd.s16 q3, q3, q15 772 vadd.s16 q8, q12, q14
216 vqadd.s16 q4, q4, q15 773 vsub.s16 q3, q6, q3
217 vqadd.s16 q5, q5, q15 774 vsub.s16 q12, q12, q14
218 vqadd.s16 q6, q6, q15 775 vsub.s16 q3, q3, q1
219 vqadd.s16 q7, q7, q15 776 vsub.s16 q1, q9, q1
220 vqadd.s16 q8, q8, q15 777 vadd.s16 q2, q3, q2
221 vqadd.s16 q9, q9, q15 778 vsub.s16 q15, q8, q6
222 vqshrun.s16 d4, q2, #5 779 vadd.s16 q1, q1, q2
223 vqshrun.s16 d6, q3, #5 780 vadd.s16 q8, q8, q6
224 vqshrun.s16 d8, q4, #5 781 vadd.s16 q14, q5, q3
225 vqshrun.s16 d10, q5, #5 782 vsub.s16 q9, q5, q3
226 vqshrun.s16 d12, q6, #5 783 vsub.s16 q13, q10, q2
227 vqshrun.s16 d14, q7, #5 784 vadd.s16 q10, q10, q2
228 vqshrun.s16 d16, q8, #5 785 /* Transpose */
229 vqshrun.s16 d18, q9, #5 786 vtrn.16 q8, q9
230 787 vsub.s16 q11, q12, q1
231 /* Store results to the output buffer */ 788 vtrn.16 q14, q15
232 .irp x, d4, d6, d8, d10, d12, d14, d16, d18 789 vadd.s16 q12, q12, q1
233 ldr TMP, [OUTPUT_BUF], #4 790 vtrn.16 q10, q11
234 add TMP, TMP, OUTPUT_COL 791 vtrn.16 q12, q13
235 vst1.8 {\x}, [TMP]! 792 vtrn.32 q9, q11
236 .endr 793 vtrn.32 q12, q14
237 794 vtrn.32 q8, q10
238 vpop {d8-d15} 795 vtrn.32 q13, q15
796 vswp d28, d21
797 vswp d26, d19
798 /* 1-D IDCT, pass 2 */
799 vsub.s16 q2, q10, q14
800 vswp d30, d23
801 vadd.s16 q14, q10, q14
802 vswp d24, d17
803 vsub.s16 q1, q11, q13
804 vadd.s16 q13, q11, q13
805 vsub.s16 q5, q9, q15
806 vadd.s16 q15, q9, q15
807 vqdmulh.s16 q4, q2, XFIX_1_414213562
808 vqdmulh.s16 q6, q1, XFIX_2_613125930
809 vadd.s16 q3, q1, q1
810 vsub.s16 q1, q5, q1
811 vadd.s16 q10, q2, q4
812 vqdmulh.s16 q4, q1, XFIX_1_847759065
813 vsub.s16 q2, q15, q13
814 vadd.s16 q3, q3, q6
815 vqdmulh.s16 q6, q2, XFIX_1_414213562
816 vadd.s16 q1, q1, q4
817 vqdmulh.s16 q4, q5, XFIX_1_082392200
818 vsub.s16 q10, q10, q14
819 vadd.s16 q2, q2, q6
820 vsub.s16 q6, q8, q12
821 vadd.s16 q12, q8, q12
822 vadd.s16 q9, q5, q4
823 vadd.s16 q5, q6, q10
824 vsub.s16 q10, q6, q10
825 vadd.s16 q6, q15, q13
826 vadd.s16 q8, q12, q14
827 vsub.s16 q3, q6, q3
828 vsub.s16 q12, q12, q14
829 vsub.s16 q3, q3, q1
830 vsub.s16 q1, q9, q1
831 vadd.s16 q2, q3, q2
832 vsub.s16 q15, q8, q6
833 vadd.s16 q1, q1, q2
834 vadd.s16 q8, q8, q6
835 vadd.s16 q14, q5, q3
836 vsub.s16 q9, q5, q3
837 vsub.s16 q13, q10, q2
838 vpop {d8-d13} /* restore NEON registers */
839 vadd.s16 q10, q10, q2
840 vsub.s16 q11, q12, q1
841 vadd.s16 q12, q12, q1
842 /* Descale to 8-bit and range limit */
843 vmov.u8 q0, #0x80
844 vqshrn.s16 d16, q8, #5
845 vqshrn.s16 d17, q9, #5
846 vqshrn.s16 d18, q10, #5
847 vqshrn.s16 d19, q11, #5
848 vqshrn.s16 d20, q12, #5
849 vqshrn.s16 d21, q13, #5
850 vqshrn.s16 d22, q14, #5
851 vqshrn.s16 d23, q15, #5
852 vadd.u8 q8, q8, q0
853 vadd.u8 q9, q9, q0
854 vadd.u8 q10, q10, q0
855 vadd.u8 q11, q11, q0
856 /* Transpose the final 8-bit samples */
857 vtrn.16 q8, q9
858 vtrn.16 q10, q11
859 vtrn.32 q8, q10
860 vtrn.32 q9, q11
861 vtrn.8 d16, d17
862 vtrn.8 d18, d19
863 /* Store results to the output buffer */
864 ldmia OUTPUT_BUF!, {TMP1, TMP2}
865 add TMP1, TMP1, OUTPUT_COL
866 add TMP2, TMP2, OUTPUT_COL
867 vst1.8 {d16}, [TMP1]
868 vst1.8 {d17}, [TMP2]
869 ldmia OUTPUT_BUF!, {TMP1, TMP2}
870 add TMP1, TMP1, OUTPUT_COL
871 add TMP2, TMP2, OUTPUT_COL
872 vst1.8 {d18}, [TMP1]
873 vtrn.8 d20, d21
874 vst1.8 {d19}, [TMP2]
875 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
876 add TMP1, TMP1, OUTPUT_COL
877 add TMP2, TMP2, OUTPUT_COL
878 add TMP3, TMP3, OUTPUT_COL
879 add TMP4, TMP4, OUTPUT_COL
880 vst1.8 {d20}, [TMP1]
881 vtrn.8 d22, d23
882 vst1.8 {d21}, [TMP2]
883 vst1.8 {d22}, [TMP3]
884 vst1.8 {d23}, [TMP4]
239 bx lr 885 bx lr
240 886
241 .unreq DCT_TABLE 887 .unreq DCT_TABLE
242 .unreq COEF_BLOCK 888 .unreq COEF_BLOCK
243 .unreq OUTPUT_BUF 889 .unreq OUTPUT_BUF
244 .unreq OUTPUT_COL 890 .unreq OUTPUT_COL
245 .unreq TMP 891 .unreq TMP1
892 .unreq TMP2
893 .unreq TMP3
894 .unreq TMP4
246 .endfunc 895 .endfunc
247 896
248 .purgem idct_helper
249
250 /*****************************************************************************/ 897 /*****************************************************************************/
251 898
252 /* 899 /*
253 * jsimd_idct_4x4_neon 900 * jsimd_idct_4x4_neon
254 * 901 *
255 * This function contains inverse-DCT code for getting reduced-size 902 * This function contains inverse-DCT code for getting reduced-size
256 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations 903 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
257 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' 904 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
258 * function from jpeg-6b (jidctred.c). 905 * function from jpeg-6b (jidctred.c).
259 * 906 *
(...skipping 364 matching lines...)
624 * jsimd_ycc_extbgrx_convert_neon 1271 * jsimd_ycc_extbgrx_convert_neon
625 * jsimd_ycc_extxbgr_convert_neon 1272 * jsimd_ycc_extxbgr_convert_neon
626 * jsimd_ycc_extxrgb_convert_neon 1273 * jsimd_ycc_extxrgb_convert_neon
627 * 1274 *
628 * Colorspace conversion YCbCr -> RGB 1275 * Colorspace conversion YCbCr -> RGB
629 */ 1276 */
630 1277
631 1278
632 .macro do_load size 1279 .macro do_load size
633 .if \size == 8 1280 .if \size == 8
634 vld1.8 {d4}, [U]! 1281 vld1.8 {d4}, [U, :64]!
635 vld1.8 {d5}, [V]! 1282 vld1.8 {d5}, [V, :64]!
636 vld1.8 {d0}, [Y]! 1283 vld1.8 {d0}, [Y, :64]!
637 pld [Y, #64]
638 pld [U, #64] 1284 pld [U, #64]
639 pld [V, #64] 1285 pld [V, #64]
1286 pld [Y, #64]
640 .elseif \size == 4 1287 .elseif \size == 4
641 vld1.8 {d4[0]}, [U]! 1288 vld1.8 {d4[0]}, [U]!
642 vld1.8 {d4[1]}, [U]! 1289 vld1.8 {d4[1]}, [U]!
643 vld1.8 {d4[2]}, [U]! 1290 vld1.8 {d4[2]}, [U]!
644 vld1.8 {d4[3]}, [U]! 1291 vld1.8 {d4[3]}, [U]!
645 vld1.8 {d5[0]}, [V]! 1292 vld1.8 {d5[0]}, [V]!
646 vld1.8 {d5[1]}, [V]! 1293 vld1.8 {d5[1]}, [V]!
647 vld1.8 {d5[2]}, [V]! 1294 vld1.8 {d5[2]}, [V]!
648 vld1.8 {d5[3]}, [V]! 1295 vld1.8 {d5[3]}, [V]!
649 vld1.8 {d0[0]}, [Y]! 1296 vld1.8 {d0[0]}, [Y]!
(...skipping 49 matching lines...)
699 .else 1346 .else
700 .error unsupported macroblock size 1347 .error unsupported macroblock size
701 .endif 1348 .endif
702 .else 1349 .else
703 .error unsupported bpp 1350 .error unsupported bpp
704 .endif 1351 .endif
705 .endm 1352 .endm
706 1353
707 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs 1354 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
708 1355
709 .macro do_yuv_to_rgb 1356 /*
1357 * 2 stage pipelined YCbCr->RGB conversion
1358 */
1359
1360 .macro do_yuv_to_rgb_stage1
710 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ 1361 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
711 vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ 1362 vaddw.u8 q4, q1, d5 /* q2 = v - 128 */
712 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ 1363 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
713 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ 1364 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
714 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ 1365 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
715 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ 1366 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
716 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ 1367 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
717 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ 1368 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
718 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ 1369 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
719 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ 1370 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
1371 .endm
1372
1373 .macro do_yuv_to_rgb_stage2
720 vrshrn.s32 d20, q10, #15 1374 vrshrn.s32 d20, q10, #15
721 vrshrn.s32 d21, q11, #15 1375 vrshrn.s32 d21, q11, #15
722 vrshrn.s32 d24, q12, #14 1376 vrshrn.s32 d24, q12, #14
723 vrshrn.s32 d25, q13, #14 1377 vrshrn.s32 d25, q13, #14
724 vrshrn.s32 d28, q14, #14 1378 vrshrn.s32 d28, q14, #14
725 vrshrn.s32 d29, q15, #14 1379 vrshrn.s32 d29, q15, #14
726 vaddw.u8 q10, q10, d0 1380 vaddw.u8 q10, q10, d0
727 vaddw.u8 q12, q12, d0 1381 vaddw.u8 q12, q12, d0
728 vaddw.u8 q14, q14, d0 1382 vaddw.u8 q14, q14, d0
729 vqmovun.s16 d1\g_offs, q10 1383 vqmovun.s16 d1\g_offs, q10
730 vqmovun.s16 d1\r_offs, q12 1384 vqmovun.s16 d1\r_offs, q12
731 vqmovun.s16 d1\b_offs, q14 1385 vqmovun.s16 d1\b_offs, q14
732 .endm 1386 .endm
733 1387
1388 .macro do_yuv_to_rgb_stage2_store_load_stage1
1389 vld1.8 {d4}, [U, :64]!
1390 vrshrn.s32 d20, q10, #15
1391 vrshrn.s32 d21, q11, #15
1392 vrshrn.s32 d24, q12, #14
1393 vrshrn.s32 d25, q13, #14
1394 vrshrn.s32 d28, q14, #14
1395 vld1.8 {d5}, [V, :64]!
1396 vrshrn.s32 d29, q15, #14
1397 vaddw.u8 q10, q10, d0
1398 vaddw.u8 q12, q12, d0
1399 vaddw.u8 q14, q14, d0
1400 vqmovun.s16 d1\g_offs, q10
1401 vld1.8 {d0}, [Y, :64]!
1402 vqmovun.s16 d1\r_offs, q12
1403 pld [U, #64]
1404 pld [V, #64]
1405 pld [Y, #64]
1406 vqmovun.s16 d1\b_offs, q14
1407 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
1408 vaddw.u8 q4, q1, d5 /* q2 = v - 128 */
1409 do_store \bpp, 8
1410 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
1411 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
1412 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
1413 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
1414 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
1415 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
1416 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
1417 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
1418 .endm
1419
1420 .macro do_yuv_to_rgb
1421 do_yuv_to_rgb_stage1
1422 do_yuv_to_rgb_stage2
1423 .endm
1424
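The stage1/stage2 split exists purely for software pipelining: while the narrowing and stores of pixel group i finish, the loads and multiplies of group i+1 have already been issued. The inner loop further down follows this control flow; the C below is only a structural sketch with placeholder names standing in for the macros above:

    /* Placeholders for do_load 8, do_yuv_to_rgb_stage1/2, do_store and the
     * fused do_yuv_to_rgb_stage2_store_load_stage1 macro. */
    static void load8(void) {}
    static void stage1(void) {}
    static void stage2(void) {}
    static void store8(void) {}
    static void stage2_store_load_stage1(void) {}
    static void tail(int n) { (void)n; }   /* 4-, 2- and 1-pixel leftovers */

    static void convert_row(int width)
    {
        int n = width;
        if (n >= 8) {
            load8();
            stage1();                        /* prologue: start the first group */
            n -= 8;
            while (n >= 8) {
                stage2_store_load_stage1();  /* steady state: finish group i,   */
                n -= 8;                      /* load and start group i + 1      */
            }
            stage2();                        /* epilogue: drain the last group */
            store8();
        }
        tail(n);
    }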
734 /* Apple gas crashes on adrl, work around that by using adr. 1425 /* Apple gas crashes on adrl, work around that by using adr.
735 * But this requires a copy of these constants for each function. 1426 * But this requires a copy of these constants for each function.
736 */ 1427 */
737 1428
738 .balign 16 1429 .balign 16
739 jsimd_ycc_\colorid\()_neon_consts: 1430 jsimd_ycc_\colorid\()_neon_consts:
740 .short 0, 0, 0, 0 1431 .short 0, 0, 0, 0
741 .short 22971, -11277, -23401, 29033 1432 .short 22971, -11277, -23401, 29033
742 .short -128, -128, -128, -128 1433 .short -128, -128, -128, -128
743 .short -128, -128, -128, -128 1434 .short -128, -128, -128, -128
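The non-zero constants are the JFIF conversion factors in fixed point: 22971 and 29033 are approximately 1.40200 and 1.77200 times 2^14, while 11277 and 23401 are approximately 0.34414 and 0.71414 times 2^15, which is why the red/blue products are shifted right by 14 and the green term by 15. A per-pixel C sketch under those assumptions, assuming arithmetic right shift of negative values:

    #include <stdint.h>

    static uint8_t clamp_u8(int x) { return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x; }

    /* One pixel of YCbCr -> RGB with the fixed-point constants from the
     * table above; the +(1 << 13)/(1 << 14) terms model the VRSHRN rounding. */
    static void ycc_to_rgb(uint8_t y, uint8_t cb, uint8_t cr,
                           uint8_t *r, uint8_t *g, uint8_t *b)
    {
        int u = cb - 128, v = cr - 128;
        *r = clamp_u8(y + ((22971 * v + (1 << 13)) >> 14));
        *g = clamp_u8(y + ((-11277 * u - 23401 * v + (1 << 14)) >> 15));
        *b = clamp_u8(y + ((29033 * u + (1 << 13)) >> 14));
    }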
(...skipping 40 matching lines...)
784 0: 1475 0:
785 ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] 1476 ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
786 ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] 1477 ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
787 mov N, OUTPUT_WIDTH 1478 mov N, OUTPUT_WIDTH
788 ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] 1479 ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
789 add INPUT_ROW, INPUT_ROW, #1 1480 add INPUT_ROW, INPUT_ROW, #1
790 ldr RGB, [OUTPUT_BUF], #4 1481 ldr RGB, [OUTPUT_BUF], #4
791 1482
792 /* Inner loop over pixels */ 1483 /* Inner loop over pixels */
793 subs N, N, #8 1484 subs N, N, #8
1485 blt 3f
1486 do_load 8
1487 do_yuv_to_rgb_stage1
1488 subs N, N, #8
794 blt 2f 1489 blt 2f
795 1: 1490 1:
796 do_load 8 1491 do_yuv_to_rgb_stage2_store_load_stage1
797 do_yuv_to_rgb
798 do_store \bpp, 8
799 subs N, N, #8 1492 subs N, N, #8
800 bge 1b 1493 bge 1b
1494 2:
1495 do_yuv_to_rgb_stage2
1496 do_store \bpp, 8
801 tst N, #7 1497 tst N, #7
802 beq 8f 1498 beq 8f
803 2: 1499 3:
804 tst N, #4 1500 tst N, #4
805 beq 3f 1501 beq 3f
806 do_load 4 1502 do_load 4
807 3: 1503 3:
808 tst N, #2 1504 tst N, #2
809 beq 4f 1505 beq 4f
810 do_load 2 1506 do_load 2
811 4: 1507 4:
812 tst N, #1 1508 tst N, #1
813 beq 5f 1509 beq 5f
(...skipping 27 matching lines...)
841 .unreq INPUT_BUF1 1537 .unreq INPUT_BUF1
842 .unreq INPUT_BUF2 1538 .unreq INPUT_BUF2
843 .unreq RGB 1539 .unreq RGB
844 .unreq Y 1540 .unreq Y
845 .unreq U 1541 .unreq U
846 .unreq V 1542 .unreq V
847 .unreq N 1543 .unreq N
848 .endfunc 1544 .endfunc
849 1545
850 .purgem do_yuv_to_rgb 1546 .purgem do_yuv_to_rgb
1547 .purgem do_yuv_to_rgb_stage1
1548 .purgem do_yuv_to_rgb_stage2
1549 .purgem do_yuv_to_rgb_stage2_store_load_stage1
851 1550
852 .endm 1551 .endm
853 1552
854 /*--------------------------------- id ----- bpp R G B */ 1553 /*--------------------------------- id ----- bpp R G B */
855 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 1554 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
856 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 1555 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
857 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 1556 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
858 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 1557 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
859 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 1558 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
860 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 1559 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
861 1560
862 .purgem do_load 1561 .purgem do_load
863 .purgem do_store 1562 .purgem do_store
864 1563
865 /*****************************************************************************/ 1564 /*****************************************************************************/
1565
1566 /*
1567 * jsimd_extrgb_ycc_convert_neon
1568 * jsimd_extbgr_ycc_convert_neon
1569 * jsimd_extrgbx_ycc_convert_neon
1570 * jsimd_extbgrx_ycc_convert_neon
1571 * jsimd_extxbgr_ycc_convert_neon
1572 * jsimd_extxrgb_ycc_convert_neon
1573 *
1574 * Colorspace conversion RGB -> YCbCr
1575 */
1576
1577 .macro do_store size
1578 .if \size == 8
1579 vst1.8 {d20}, [Y]!
1580 vst1.8 {d21}, [U]!
1581 vst1.8 {d22}, [V]!
1582 .elseif \size == 4
1583 vst1.8 {d20[0]}, [Y]!
1584 vst1.8 {d20[1]}, [Y]!
1585 vst1.8 {d20[2]}, [Y]!
1586 vst1.8 {d20[3]}, [Y]!
1587 vst1.8 {d21[0]}, [U]!
1588 vst1.8 {d21[1]}, [U]!
1589 vst1.8 {d21[2]}, [U]!
1590 vst1.8 {d21[3]}, [U]!
1591 vst1.8 {d22[0]}, [V]!
1592 vst1.8 {d22[1]}, [V]!
1593 vst1.8 {d22[2]}, [V]!
1594 vst1.8 {d22[3]}, [V]!
1595 .elseif \size == 2
1596 vst1.8 {d20[4]}, [Y]!
1597 vst1.8 {d20[5]}, [Y]!
1598 vst1.8 {d21[4]}, [U]!
1599 vst1.8 {d21[5]}, [U]!
1600 vst1.8 {d22[4]}, [V]!
1601 vst1.8 {d22[5]}, [V]!
1602 .elseif \size == 1
1603 vst1.8 {d20[6]}, [Y]!
1604 vst1.8 {d21[6]}, [U]!
1605 vst1.8 {d22[6]}, [V]!
1606 .else
1607 .error unsupported macroblock size
1608 .endif
1609 .endm
1610
1611 .macro do_load bpp, size
1612 .if \bpp == 24
1613 .if \size == 8
1614 vld3.8 {d10, d11, d12}, [RGB]!
1615 pld [RGB, #128]
1616 .elseif \size == 4
1617 vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
1618 vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
1619 vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
1620 vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
1621 .elseif \size == 2
1622 vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
1623 vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
1624 .elseif \size == 1
1625 vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
1626 .else
1627 .error unsupported macroblock size
1628 .endif
1629 .elseif \bpp == 32
1630 .if \size == 8
1631 vld4.8 {d10, d11, d12, d13}, [RGB]!
1632 pld [RGB, #128]
1633 .elseif \size == 4
1634 vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1635 vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1636 vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1637 vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1638 .elseif \size == 2
1639 vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1640 vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1641 .elseif \size == 1
1642 vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1643 .else
1644 .error unsupported macroblock size
1645 .endif
1646 .else
1647 .error unsupported bpp
1648 .endif
1649 .endm
1650
1651 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1652
1653 /*
1654 * 2 stage pipelined RGB->YCbCr conversion
1655 */
1656
1657 .macro do_rgb_to_yuv_stage1
1658 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1659 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1660 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1661 vmull.u16 q7, d4, d0[0]
1662 vmlal.u16 q7, d6, d0[1]
1663 vmlal.u16 q7, d8, d0[2]
1664 vmull.u16 q8, d5, d0[0]
1665 vmlal.u16 q8, d7, d0[1]
1666 vmlal.u16 q8, d9, d0[2]
1667 vrev64.32 q9, q1
1668 vrev64.32 q13, q1
1669 vmlsl.u16 q9, d4, d0[3]
1670 vmlsl.u16 q9, d6, d1[0]
1671 vmlal.u16 q9, d8, d1[1]
1672 vmlsl.u16 q13, d5, d0[3]
1673 vmlsl.u16 q13, d7, d1[0]
1674 vmlal.u16 q13, d9, d1[1]
1675 vrev64.32 q14, q1
1676 vrev64.32 q15, q1
1677 vmlal.u16 q14, d4, d1[1]
1678 vmlsl.u16 q14, d6, d1[2]
1679 vmlsl.u16 q14, d8, d1[3]
1680 vmlal.u16 q15, d5, d1[1]
1681 vmlsl.u16 q15, d7, d1[2]
1682 vmlsl.u16 q15, d9, d1[3]
1683 .endm
1684
1685 .macro do_rgb_to_yuv_stage2
1686 vrshrn.u32 d20, q7, #16
1687 vrshrn.u32 d21, q8, #16
1688 vshrn.u32 d22, q9, #16
1689 vshrn.u32 d23, q13, #16
1690 vshrn.u32 d24, q14, #16
1691 vshrn.u32 d25, q15, #16
1692 vmovn.u16 d20, q10 /* d20 = y */
1693 vmovn.u16 d21, q11 /* d21 = u */
1694 vmovn.u16 d22, q12 /* d22 = v */
1695 .endm
1696
1697 .macro do_rgb_to_yuv
1698 do_rgb_to_yuv_stage1
1699 do_rgb_to_yuv_stage2
1700 .endm
1701
1702 .macro do_rgb_to_yuv_stage2_store_load_stage1
1703 vrshrn.u32 d20, q7, #16
1704 vrshrn.u32 d21, q8, #16
1705 vshrn.u32 d22, q9, #16
1706 vrev64.32 q9, q1
1707 vshrn.u32 d23, q13, #16
1708 vrev64.32 q13, q1
1709 vshrn.u32 d24, q14, #16
1710 vshrn.u32 d25, q15, #16
1711 do_load \bpp, 8
1712 vmovn.u16 d20, q10 /* d20 = y */
1713 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1714 vmovn.u16 d21, q11 /* d21 = u */
1715 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1716 vmovn.u16 d22, q12 /* d22 = v */
1717 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1718 vmull.u16 q7, d4, d0[0]
1719 vmlal.u16 q7, d6, d0[1]
1720 vmlal.u16 q7, d8, d0[2]
1721 vst1.8 {d20}, [Y]!
1722 vmull.u16 q8, d5, d0[0]
1723 vmlal.u16 q8, d7, d0[1]
1724 vmlal.u16 q8, d9, d0[2]
1725 vmlsl.u16 q9, d4, d0[3]
1726 vmlsl.u16 q9, d6, d1[0]
1727 vmlal.u16 q9, d8, d1[1]
1728 vst1.8 {d21}, [U]!
1729 vmlsl.u16 q13, d5, d0[3]
1730 vmlsl.u16 q13, d7, d1[0]
1731 vmlal.u16 q13, d9, d1[1]
1732 vrev64.32 q14, q1
1733 vrev64.32 q15, q1
1734 vmlal.u16 q14, d4, d1[1]
1735 vmlsl.u16 q14, d6, d1[2]
1736 vmlsl.u16 q14, d8, d1[3]
1737 vst1.8 {d22}, [V]!
1738 vmlal.u16 q15, d5, d1[1]
1739 vmlsl.u16 q15, d7, d1[2]
1740 vmlsl.u16 q15, d9, d1[3]
1741 .endm
1742
1743 .balign 16
1744 jsimd_\colorid\()_ycc_neon_consts:
1745 .short 19595, 38470, 7471, 11059
1746 .short 21709, 32768, 27439, 5329
1747 .short 32767, 128, 32767, 128
1748 .short 32767, 128, 32767, 128
1749
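These are the standard JFIF RGB -> YCbCr weights scaled by 2^16 (19595, 38470 and 7471 are approximately 0.29900, 0.58700 and 0.11400 times 65536, and so on), while the 32767/128 pairs seed the chroma accumulators with the +128 level shift plus a rounding bias. A per-pixel C sketch under those assumptions:

    #include <stdint.h>

    /* One pixel of RGB -> YCbCr with the Q16 weights from the table above.
     * The chroma accumulators start at 128 * 65536 + 32767, supplying both
     * the +128 offset and the rounding bias consumed by the final >> 16. */
    static void rgb_to_ycc(uint8_t r, uint8_t g, uint8_t b,
                           uint8_t *y, uint8_t *cb, uint8_t *cr)
    {
        uint32_t yy = 19595u * r + 38470u * g + 7471u * b;
        int32_t  u  = 128 * 65536 + 32767 - 11059 * r - 21709 * g + 32768 * b;
        int32_t  v  = 128 * 65536 + 32767 + 32768 * r - 27439 * g - 5329 * b;
        *y  = (uint8_t)((yy + 32768) >> 16);   /* rounded, no level shift */
        *cb = (uint8_t)(u >> 16);
        *cr = (uint8_t)(v >> 16);
    }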
1750 asm_function jsimd_\colorid\()_ycc_convert_neon
1751 OUTPUT_WIDTH .req r0
1752 INPUT_BUF .req r1
1753 OUTPUT_BUF .req r2
1754 OUTPUT_ROW .req r3
1755 NUM_ROWS .req r4
1756
1757 OUTPUT_BUF0 .req r5
1758 OUTPUT_BUF1 .req r6
1759 OUTPUT_BUF2 .req OUTPUT_BUF
1760
1761 RGB .req r7
1762 Y .req r8
1763 U .req r9
1764 V .req r10
1765 N .req ip
1766
1767 /* Load constants to d0, d1, d2, d3 */
1768 adr ip, jsimd_\colorid\()_ycc_neon_consts
1769 vld1.16 {d0, d1, d2, d3}, [ip, :128]
1770
1771 /* Save ARM registers and handle input arguments */
1772 push {r4, r5, r6, r7, r8, r9, r10, lr}
1773 ldr NUM_ROWS, [sp, #(4 * 8)]
1774 ldr OUTPUT_BUF0, [OUTPUT_BUF]
1775 ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
1776 ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
1777 .unreq OUTPUT_BUF
1778
1779 /* Save NEON registers */
1780 vpush {d8-d15}
1781
1782 /* Outer loop over scanlines */
1783 cmp NUM_ROWS, #1
1784 blt 9f
1785 0:
1786 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
1787 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
1788 mov N, OUTPUT_WIDTH
1789 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
1790 add OUTPUT_ROW, OUTPUT_ROW, #1
1791 ldr RGB, [INPUT_BUF], #4
1792
1793 /* Inner loop over pixels */
1794 subs N, N, #8
1795 blt 3f
1796 do_load \bpp, 8
1797 do_rgb_to_yuv_stage1
1798 subs N, N, #8
1799 blt 2f
1800 1:
1801 do_rgb_to_yuv_stage2_store_load_stage1
1802 subs N, N, #8
1803 bge 1b
1804 2:
1805 do_rgb_to_yuv_stage2
1806 do_store 8
1807 tst N, #7
1808 beq 8f
1809 3:
1810 tst N, #4
1811 beq 3f
1812 do_load \bpp, 4
1813 3:
1814 tst N, #2
1815 beq 4f
1816 do_load \bpp, 2
1817 4:
1818 tst N, #1
1819 beq 5f
1820 do_load \bpp, 1
1821 5:
1822 do_rgb_to_yuv
1823 tst N, #4
1824 beq 6f
1825 do_store 4
1826 6:
1827 tst N, #2
1828 beq 7f
1829 do_store 2
1830 7:
1831 tst N, #1
1832 beq 8f
1833 do_store 1
1834 8:
1835 subs NUM_ROWS, NUM_ROWS, #1
1836 bgt 0b
1837 9:
1838 /* Restore all registers and return */
1839 vpop {d8-d15}
1840 pop {r4, r5, r6, r7, r8, r9, r10, pc}
1841
1842 .unreq OUTPUT_WIDTH
1843 .unreq OUTPUT_ROW
1844 .unreq INPUT_BUF
1845 .unreq NUM_ROWS
1846 .unreq OUTPUT_BUF0
1847 .unreq OUTPUT_BUF1
1848 .unreq OUTPUT_BUF2
1849 .unreq RGB
1850 .unreq Y
1851 .unreq U
1852 .unreq V
1853 .unreq N
1854 .endfunc
1855
1856 .purgem do_rgb_to_yuv
1857 .purgem do_rgb_to_yuv_stage1
1858 .purgem do_rgb_to_yuv_stage2
1859 .purgem do_rgb_to_yuv_stage2_store_load_stage1
1860
1861 .endm
1862
1863 /*--------------------------------- id ----- bpp R G B */
1864 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
1865 generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
1866 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
1867 generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
1868 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
1869 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
1870
1871 .purgem do_load
1872 .purgem do_store
1873
1874 /*****************************************************************************/
1875
1876 /*
1877 * Load data into workspace, applying unsigned->signed conversion
1878 *
1879 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
1880 * rid of VST1.16 instructions
1881 */
1882
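jsimd_convsamp_neon is the level-shift step that runs before the forward DCT: each unsigned 8-bit sample becomes a signed DCTELEM centred on zero (the vsubl.u8 instructions against d0 = 128 below). A scalar equivalent, assuming JSAMPLE is uint8_t:

    #include <stdint.h>

    typedef int16_t DCTELEM;
    #define CENTERJSAMPLE 128

    /* sample_data holds 8 row pointers; start_col selects the 8-column
     * window; workspace receives the 64 level-shifted values row by row. */
    static void convsamp(const uint8_t *const sample_data[8],
                         unsigned start_col, DCTELEM workspace[64])
    {
        for (int row = 0; row < 8; row++) {
            const uint8_t *p = sample_data[row] + start_col;
            for (int col = 0; col < 8; col++)
                workspace[row * 8 + col] = (DCTELEM)p[col] - CENTERJSAMPLE;
        }
    }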
1883 asm_function jsimd_convsamp_neon
1884 SAMPLE_DATA .req r0
1885 START_COL .req r1
1886 WORKSPACE .req r2
1887 TMP1 .req r3
1888 TMP2 .req r4
1889 TMP3 .req r5
1890 TMP4 .req ip
1891
1892 push {r4, r5}
1893 vmov.u8 d0, #128
1894
1895 ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1896 add TMP1, TMP1, START_COL
1897 add TMP2, TMP2, START_COL
1898 add TMP3, TMP3, START_COL
1899 add TMP4, TMP4, START_COL
1900 vld1.8 {d16}, [TMP1]
1901 vsubl.u8 q8, d16, d0
1902 vld1.8 {d18}, [TMP2]
1903 vsubl.u8 q9, d18, d0
1904 vld1.8 {d20}, [TMP3]
1905 vsubl.u8 q10, d20, d0
1906 vld1.8 {d22}, [TMP4]
1907 ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1908 vsubl.u8 q11, d22, d0
1909 vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
1910 add TMP1, TMP1, START_COL
1911 add TMP2, TMP2, START_COL
1912 vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
1913 add TMP3, TMP3, START_COL
1914 add TMP4, TMP4, START_COL
1915 vld1.8 {d24}, [TMP1]
1916 vsubl.u8 q12, d24, d0
1917 vld1.8 {d26}, [TMP2]
1918 vsubl.u8 q13, d26, d0
1919 vld1.8 {d28}, [TMP3]
1920 vsubl.u8 q14, d28, d0
1921 vld1.8 {d30}, [TMP4]
1922 vsubl.u8 q15, d30, d0
1923 vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
1924 vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
1925 pop {r4, r5}
1926 bx lr
1927
1928 .unreq SAMPLE_DATA
1929 .unreq START_COL
1930 .unreq WORKSPACE
1931 .unreq TMP1
1932 .unreq TMP2
1933 .unreq TMP3
1934 .unreq TMP4
1935 .endfunc
1936
1937 /*****************************************************************************/
1938
1939 /*
1940 * jsimd_fdct_ifast_neon
1941 *
1942 * This function contains a fast, not so accurate integer implementation of
1943 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
1944 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
1945 * function from jfdctfst.c
1946 *
1947 * TODO: can be combined with 'jsimd_convsamp_neon' to get
1948 * rid of a bunch of VLD1.16 instructions
1949 */
1950
1951 #define XFIX_0_382683433 d0[0]
1952 #define XFIX_0_541196100 d0[1]
1953 #define XFIX_0_707106781 d0[2]
1954 #define XFIX_1_306562965 d0[3]
1955
1956 .balign 16
1957 jsimd_fdct_ifast_neon_consts:
1958 .short (98 * 128) /* XFIX_0_382683433 */
1959 .short (139 * 128) /* XFIX_0_541196100 */
1960 .short (181 * 128) /* XFIX_0_707106781 */
1961 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
1962
1963 asm_function jsimd_fdct_ifast_neon
1964
1965 DATA .req r0
1966 TMP .req ip
1967
1968 vpush {d8-d15}
1969
1970 /* Load constants */
1971 adr TMP, jsimd_fdct_ifast_neon_consts
1972 vld1.16 {d0}, [TMP, :64]
1973
1974 /* Load all DATA into NEON registers with the following allocation:
1975 * 0 1 2 3 | 4 5 6 7
1976 * ---------+--------
1977 * 0 | d16 | d17 | q8
1978 * 1 | d18 | d19 | q9
1979 * 2 | d20 | d21 | q10
1980 * 3 | d22 | d23 | q11
1981 * 4 | d24 | d25 | q12
1982 * 5 | d26 | d27 | q13
1983 * 6 | d28 | d29 | q14
1984 * 7 | d30 | d31 | q15
1985 */
1986
1987 vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
1988 vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
1989 vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
1990 vld1.16 {d28, d29, d30, d31}, [DATA, :128]
1991 sub DATA, DATA, #(128 - 32)
1992
1993 mov TMP, #2
1994 1:
1995 /* Transpose */
1996 vtrn.16 q12, q13
1997 vtrn.16 q10, q11
1998 vtrn.16 q8, q9
1999 vtrn.16 q14, q15
2000 vtrn.32 q9, q11
2001 vtrn.32 q13, q15
2002 vtrn.32 q8, q10
2003 vtrn.32 q12, q14
2004 vswp d30, d23
2005 vswp d24, d17
2006 vswp d26, d19
2007 /* 1-D FDCT */
2008 vadd.s16 q2, q11, q12
2009 vswp d28, d21
2010 vsub.s16 q12, q11, q12
2011 vsub.s16 q6, q10, q13
2012 vadd.s16 q10, q10, q13
2013 vsub.s16 q7, q9, q14
2014 vadd.s16 q9, q9, q14
2015 vsub.s16 q1, q8, q15
2016 vadd.s16 q8, q8, q15
2017 vsub.s16 q4, q9, q10
2018 vsub.s16 q5, q8, q2
2019 vadd.s16 q3, q9, q10
2020 vadd.s16 q4, q4, q5
2021 vadd.s16 q2, q8, q2
2022 vqdmulh.s16 q4, q4, XFIX_0_707106781
2023 vadd.s16 q11, q12, q6
2024 vadd.s16 q8, q2, q3
2025 vsub.s16 q12, q2, q3
2026 vadd.s16 q3, q6, q7
2027 vadd.s16 q7, q7, q1
2028 vqdmulh.s16 q3, q3, XFIX_0_707106781
2029 vsub.s16 q6, q11, q7
2030 vadd.s16 q10, q5, q4
2031 vqdmulh.s16 q6, q6, XFIX_0_382683433
2032 vsub.s16 q14, q5, q4
2033 vqdmulh.s16 q11, q11, XFIX_0_541196100
2034 vqdmulh.s16 q5, q7, XFIX_1_306562965
2035 vadd.s16 q4, q1, q3
2036 vsub.s16 q3, q1, q3
2037 vadd.s16 q7, q7, q6
2038 vadd.s16 q11, q11, q6
2039 vadd.s16 q7, q7, q5
2040 vadd.s16 q13, q3, q11
2041 vsub.s16 q11, q3, q11
2042 vadd.s16 q9, q4, q7
2043 vsub.s16 q15, q4, q7
2044 subs TMP, TMP, #1
2045 bne 1b
2046
2047 /* store results */
2048 vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
2049 vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
2050 vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
2051 vst1.16 {d28, d29, d30, d31}, [DATA, :128]
2052
2053 vpop {d8-d15}
2054 bx lr
2055
2056 .unreq DATA
2057 .unreq TMP
2058 .endfunc
2059
2060 /*****************************************************************************/
2061
2062 /*
2063 * GLOBAL(void)
2064 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
2065 * DCTELEM * workspace);
2066 *
2067  * Note: the code uses 2-stage pipelining in order to improve instruction
2068  * scheduling and eliminate stalls (this provides ~15% better
2069  * performance for this function on both ARM Cortex-A8 and
2070  * ARM Cortex-A9 when compared to the non-pipelined variant).
2071  * The instructions which belong to the second stage use different
2072  * indentation for better readability.
2073 */
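The quantizer avoids a per-coefficient division by working on magnitudes: add the precomputed correction, multiply by a 16-bit reciprocal, shift, then restore the sign with the XOR/subtract trick that the veor/vsub pairs implement. A hedged scalar sketch of one coefficient, using the same table layout the code indexes (reciprocals at offset 0, corrections at 64*2 bytes, shift counts at 64*6 bytes) and assuming arithmetic right shift of negative values:

    #include <stdint.h>

    typedef int16_t DCTELEM;

    static DCTELEM quantize_one(DCTELEM x, const DCTELEM *divisors, int i)
    {
        uint16_t recip = (uint16_t)divisors[i];         /* reciprocal        */
        uint16_t corr  = (uint16_t)divisors[i + 64];    /* rounding bias     */
        int      shift = divisors[i + 192];             /* post-shift amount */
        int      sign  = x >> 15;                       /* 0 or -1           */
        uint16_t mag   = (uint16_t)((x ^ sign) - sign); /* |x|               */
        int      q     = (int)(((uint32_t)(mag + corr) * recip) >> 16) >> shift;
        return (DCTELEM)((q ^ sign) - sign);            /* reapply the sign  */
    }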
2074 asm_function jsimd_quantize_neon
2075
2076 COEF_BLOCK .req r0
2077 DIVISORS .req r1
2078 WORKSPACE .req r2
2079
2080 RECIPROCAL .req DIVISORS
2081 CORRECTION .req r3
2082 SHIFT .req ip
2083 LOOP_COUNT .req r4
2084
2085 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
2086 vabs.s16 q12, q0
2087 add CORRECTION, DIVISORS, #(64 * 2)
2088 add SHIFT, DIVISORS, #(64 * 6)
2089 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
2090 vabs.s16 q13, q1
2091 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
2092 vadd.u16 q12, q12, q10 /* add correction */
2093 vadd.u16 q13, q13, q11
2094 vmull.u16 q10, d24, d16 /* multiply by reciprocal */
2095 vmull.u16 q11, d25, d17
2096 vmull.u16 q8, d26, d18
2097 vmull.u16 q9, d27, d19
2098 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
2099 vshrn.u32 d20, q10, #16
2100 vshrn.u32 d21, q11, #16
2101 vshrn.u32 d22, q8, #16
2102 vshrn.u32 d23, q9, #16
2103 vneg.s16 q12, q12
2104 vneg.s16 q13, q13
2105 vshr.s16 q2, q0, #15 /* extract sign */
2106 vshr.s16 q3, q1, #15
2107 vshl.u16 q14, q10, q12 /* shift */
2108 vshl.u16 q15, q11, q13
2109
2110 push {r4, r5}
2111 mov LOOP_COUNT, #3
2112 1:
2113 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
2114 veor.u16 q14, q14, q2 /* restore sign */
2115 vabs.s16 q12, q0
2116 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
2117 vabs.s16 q13, q1
2118 veor.u16 q15, q15, q3
2119 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
2120 vadd.u16 q12, q12, q10 /* add correction */
2121 vadd.u16 q13, q13, q11
2122 vmull.u16 q10, d24, d16 /* multiply by reciprocal */
2123 vmull.u16 q11, d25, d17
2124 vmull.u16 q8, d26, d18
2125 vmull.u16 q9, d27, d19
2126 vsub.u16 q14, q14, q2
2127 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
2128 vsub.u16 q15, q15, q3
2129 vshrn.u32 d20, q10, #16
2130 vshrn.u32 d21, q11, #16
2131 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2132 vshrn.u32 d22, q8, #16
2133 vshrn.u32 d23, q9, #16
2134 vneg.s16 q12, q12
2135 vneg.s16 q13, q13
2136 vshr.s16 q2, q0, #15 /* extract sign */
2137 vshr.s16 q3, q1, #15
2138 vshl.u16 q14, q10, q12 /* shift */
2139 vshl.u16 q15, q11, q13
2140 subs LOOP_COUNT, LOOP_COUNT, #1
2141 bne 1b
2142 pop {r4, r5}
2143
2144 veor.u16 q14, q14, q2 /* restore sign */
2145 veor.u16 q15, q15, q3
2146 vsub.u16 q14, q14, q2
2147 vsub.u16 q15, q15, q3
2148 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2149
2150 bx lr /* return */
2151
2152 .unreq COEF_BLOCK
2153 .unreq DIVISORS
2154 .unreq WORKSPACE
2155 .unreq RECIPROCAL
2156 .unreq CORRECTION
2157 .unreq SHIFT
2158 .unreq LOOP_COUNT
2159 .endfunc