Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(12)

Side by Side Diff: third_party/libjpeg_turbo/simd/jsimd_arm_neon.S

Issue 7554002: Updates libjpeg-turbo to 1.1.90 (r677) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/
Patch Set: '' Created 9 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « third_party/libjpeg_turbo/simd/jsimd_arm.c ('k') | third_party/libjpeg_turbo/simd/jsimd_i386.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 /*
2 * ARM NEON optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved.
6 * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
7 *
8 * This software is provided 'as-is', without any express or implied
9 * warranty. In no event will the authors be held liable for any damages
10 * arising from the use of this software.
11 *
12 * Permission is granted to anyone to use this software for any purpose,
13 * including commercial applications, and to alter it and redistribute it
14 * freely, subject to the following restrictions:
15 *
16 * 1. The origin of this software must not be misrepresented; you must not
17 * claim that you wrote the original software. If you use this software
18 * in a product, an acknowledgment in the product documentation would be
19 * appreciated but is not required.
20 * 2. Altered source versions must be plainly marked as such, and must not be
21 * misrepresented as being the original software.
22 * 3. This notice may not be removed or altered from any source distribution.
23 */
24
25 #if defined(__linux__) && defined(__ELF__)
26 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
27 #endif
28
29 .text /* everything below, including the constant tables, is emitted into .text */
30 .fpu neon /* accept NEON mnemonics */
31 .arch armv7a
32 .object_arch armv4 /* architecture recorded in the object file; differs from the one assembled for */
33 .arm /* ARM (not Thumb) instruction encoding */
34
35
36 #define RESPECT_STRICT_ALIGNMENT 1 /* non-zero: jsimd_idct_4x4_neon stores its output one byte at a time */
37
38 /*****************************************************************************/
39
40 /* Supplementary macro for setting function attributes: opens a .func scope, exports the symbol and emits the entry label for `fname` */
41 .macro asm_function fname
42 #ifdef __APPLE__
43 .func _\fname
44 .globl _\fname
45 _\fname: /* Apple builds: symbol gets a leading underscore */
46 #else
47 .func \fname
48 .global \fname
49 #ifdef __ELF__
50 .hidden \fname /* ELF only: hidden visibility */
51 .type \fname, %function /* ELF only: mark the symbol as a function */
52 #endif
53 \fname:
54 #endif
55 .endm /* callers close the scope themselves with .endfunc */
56
57 /* Transpose a block of 4x4 coefficients in four 64-bit registers (x0..x3 are the four rows of 16-bit elements; modified in place) */
58 .macro transpose_4x4 x0, x1, x2, x3
59 vtrn.16 \x0, \x1 /* transpose the 2x2 sub-blocks of 16-bit elements in rows 0/1 */
60 vtrn.16 \x2, \x3 /* ... and in rows 2/3 */
61 vtrn.32 \x0, \x2 /* then exchange the 32-bit halves between rows 0/2 */
62 vtrn.32 \x1, \x3 /* ... and between rows 1/3 */
63 .endm
64
65 /*****************************************************************************/
66
67 /*
68 * jsimd_idct_ifast_neon
69 *
70 * This function contains a fast, not so accurate integer implementation of
71 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
72 * and produces exactly the same output as IJG's original 'jpeg_idct_fast'
73 * function from jidctfst.c
74 *
75 * TODO: better instruction scheduling is needed.
76 */
77
78 #define XFIX_1_082392200 d0[0]
79 #define XFIX_1_414213562 d0[1]
80 #define XFIX_1_847759065 d0[2]
81 #define XFIX_2_613125930 d0[3]
82 /* The XFIX_* names record which lane of d0 holds each multiplier; the idct_helper macro below writes the lanes out as d0[n] directly. */
83 .balign 16
84 jsimd_idct_ifast_neon_consts: /* only the fractional part of each multiplier is stored; idct_helper adds the integer part (1x or 2x the input) back with vadd */
85 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
86 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
87 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
88 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
89
90 /* 1-D IDCT helper macro: x0..x7 are the eight inputs and are overwritten with the eight outputs; t10..t14 are scratch registers; d0 must hold jsimd_idct_ifast_neon_consts */
91
92 .macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \
93 t10, t11, t12, t13, t14
94 /* Butterflies: each vsub/vadd/vswp triple leaves the difference in the first register of the pair and the sum in the second */
95 vsub.s16 \t10, \x0, \x4
96 vadd.s16 \x4, \x0, \x4 /* x4 = x0 + x4 */
97 vswp.s16 \t10, \x0 /* x0 = old x0 - old x4 */
98 vsub.s16 \t11, \x2, \x6
99 vadd.s16 \x6, \x2, \x6 /* x6 = x2 + x6 */
100 vswp.s16 \t11, \x2 /* x2 = old x2 - old x6 */
101 vsub.s16 \t10, \x3, \x5
102 vadd.s16 \x5, \x3, \x5 /* x5 = x3 + x5 */
103 vswp.s16 \t10, \x3 /* x3 = old x3 - old x5 */
104 vsub.s16 \t11, \x1, \x7
105 vadd.s16 \x7, \x1, \x7 /* x7 = x1 + x7 */
106 vswp.s16 \t11, \x1 /* x1 = old x1 - old x7 */
107 /* Constant multiplies: vqdmulh yields input * fractional part, a vadd then supplies the integer part */
108 vqdmulh.s16 \t13, \x2, d0[1]
109 vadd.s16 \t12, \x3, \x3 /* t12 = 2 * x3 (integer part of 2.613) */
110 vadd.s16 \x2, \x2, \t13 /* x2 ~= 1.414 * x2 */
111 vqdmulh.s16 \t13, \x3, d0[3]
112 vsub.s16 \t10, \x1, \x3
113 vadd.s16 \t12, \t12, \t13 /* t12 ~= 2.613 * x3 */
114 vqdmulh.s16 \t13, \t10, d0[2]
115 vsub.s16 \t11, \x7, \x5
116 vadd.s16 \t10, \t10, \t13 /* t10 ~= 1.848 * (x1 - x3) */
117 vqdmulh.s16 \t13, \t11, d0[1]
118 vadd.s16 \t11, \t11, \t13 /* t11 ~= 1.414 * (x7 - x5) */
119
120 vqdmulh.s16 \t13, \x1, d0[0] /* fractional part of 1.082 * x1; added to x1 a few lines below */
121 vsub.s16 \x2, \x6, \x2
122 vsub.s16 \t14, \x0, \x2
123 vadd.s16 \x2, \x0, \x2
124 vadd.s16 \x0, \x4, \x6
125 vsub.s16 \x4, \x4, \x6
126 vadd.s16 \x1, \x1, \t13 /* x1 ~= 1.082 * x1 */
127 vadd.s16 \t13, \x7, \x5
128 vsub.s16 \t12, \t13, \t12
129 vsub.s16 \t12, \t12, \t10
130 vadd.s16 \t11, \t12, \t11
131 vsub.s16 \t10, \x1, \t10
132 vadd.s16 \t10, \t10, \t11
133 /* Final butterflies: the eight results are written back into x0..x7 */
134 vsub.s16 \x7, \x0, \t13
135 vadd.s16 \x0, \x0, \t13
136 vadd.s16 \x6, \t14, \t12
137 vsub.s16 \x1, \t14, \t12
138 vsub.s16 \x5, \x2, \t11
139 vadd.s16 \x2, \x2, \t11
140 vsub.s16 \x3, \x4, \t10
141 vadd.s16 \x4, \x4, \t10
142 .endm
143
144 asm_function jsimd_idct_ifast_neon
145 /* Dequantizes one 8x8 block of 16-bit coefficients, runs two 1-D IDCT passes over it and stores eight rows of eight bytes. */
146 DCT_TABLE .req r0 /* multipliers, one 16-bit value per coefficient */
147 COEF_BLOCK .req r1 /* 64 16-bit coefficients */
148 OUTPUT_BUF .req r2 /* array of row pointers */
149 OUTPUT_COL .req r3 /* byte offset added to every row pointer */
150 TMP .req ip
151
152 vpush {d8-d15} /* d8-d15 are overwritten below (callee-saved under the ARM procedure call standard) */
153
154 /* Load constants */
155 adr TMP, jsimd_idct_ifast_neon_consts
156 vld1.16 {d0}, [TMP, :64]
157
158 /* Load all COEF_BLOCK into NEON registers with the following allocation:
159 * 0 1 2 3 | 4 5 6 7
160 * ---------+--------
161 * 0 | d4 | d5
162 * 1 | d6 | d7
163 * 2 | d8 | d9
164 * 3 | d10 | d11
165 * 4 | d12 | d13
166 * 5 | d14 | d15
167 * 6 | d16 | d17
168 * 7 | d18 | d19
169 */
170 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]!
171 vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]!
172 vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]!
173 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]!
174 /* Dequantize: multiply every coefficient row by the matching DCT_TABLE row (table loads are interleaved with the multiplies) */
175 vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]!
176 vmul.s16 q2, q2, q10
177 vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]!
178 vmul.s16 q3, q3, q11
179 vmul.s16 q4, q4, q12
180 vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]!
181 vmul.s16 q5, q5, q13
182 vmul.s16 q6, q6, q14
183 vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! /* q10/q11 are reused for table rows 6 and 7 */
184 vmul.s16 q7, q7, q15
185 vmul.s16 q8, q8, q10
186 vmul.s16 q9, q9, q11
187
188 /* Pass 1 */
189 idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
190 /* Transpose the 8x8 block: four 4x4 transposes, then swap the two off-diagonal quadrants */
191 transpose_4x4 d4, d6, d8, d10
192 transpose_4x4 d5, d7, d9, d11
193 transpose_4x4 d12, d14, d16, d18
194 transpose_4x4 d13, d15, d17, d19
195 vswp d12, d5
196 vswp d14, d7
197 vswp d16, d9
198 vswp d18, d11
199
200 /* Pass 2 */
201 idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
202 /* Transpose again (same sequence as after pass 1) */
203 transpose_4x4 d4, d6, d8, d10
204 transpose_4x4 d5, d7, d9, d11
205 transpose_4x4 d12, d14, d16, d18
206 transpose_4x4 d13, d15, d17, d19
207 vswp d12, d5
208 vswp d14, d7
209 vswp d16, d9
210 vswp d18, d11
211
212 /* Descale and range limit */
213 vmov.s16 q15, #(0x80 << 5) /* 128 expressed at the pre-descale scale (5 fractional bits) */
214 vqadd.s16 q2, q2, q15
215 vqadd.s16 q3, q3, q15
216 vqadd.s16 q4, q4, q15
217 vqadd.s16 q5, q5, q15
218 vqadd.s16 q6, q6, q15
219 vqadd.s16 q7, q7, q15
220 vqadd.s16 q8, q8, q15
221 vqadd.s16 q9, q9, q15
222 vqshrun.s16 d4, q2, #5 /* shift right by 5 and narrow to unsigned 8-bit with saturation */
223 vqshrun.s16 d6, q3, #5
224 vqshrun.s16 d8, q4, #5
225 vqshrun.s16 d10, q5, #5
226 vqshrun.s16 d12, q6, #5
227 vqshrun.s16 d14, q7, #5
228 vqshrun.s16 d16, q8, #5
229 vqshrun.s16 d18, q9, #5
230
231 /* Store results to the output buffer */
232 .irp x, d4, d6, d8, d10, d12, d14, d16, d18
233 ldr TMP, [OUTPUT_BUF], #4 /* next row pointer */
234 add TMP, TMP, OUTPUT_COL
235 vst1.8 {\x}, [TMP]! /* one row of eight bytes */
236 .endr
237
238 vpop {d8-d15}
239 bx lr
240
241 .unreq DCT_TABLE
242 .unreq COEF_BLOCK
243 .unreq OUTPUT_BUF
244 .unreq OUTPUT_COL
245 .unreq TMP
246 .endfunc
247
248 .purgem idct_helper /* the macro name is redefined for the 4x4 IDCT further down */
249
250 /*****************************************************************************/
251
252 /*
253 * jsimd_idct_4x4_neon
254 *
255 * This function contains inverse-DCT code for getting reduced-size
256 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
257 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
258 * function from jpeg-6b (jidctred.c).
259 *
260 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
261 * requires far fewer arithmetic operations and hence should be faster.
262 * The primary purpose of this particular NEON optimized function is
263 * bit exact compatibility with jpeg-6b.
264 *
265 * TODO: slightly better instruction scheduling can be achieved by expanding
266 * idct_helper/transpose_4x4 macros and reordering instructions,
267 * but readability will suffer somewhat.
268 */
269
270 #define CONST_BITS 13 /* the FIX_* values below are the named constants multiplied by 2^13 and rounded */
271
272 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */
273 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */
274 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */
275 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */
276 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */
277 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */
278 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */
279 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */
280 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */
281 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */
282 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */
283 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */
284 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */
285 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */
286 /* Twelve 16-bit entries (24 bytes); the trailing comments name the NEON lane each entry occupies after the load in jsimd_idct_4x4_neon. */
287 .balign 16
288 jsimd_idct_4x4_neon_consts:
289 .short FIX_1_847759065 /* d0[0] */
290 .short -FIX_0_765366865 /* d0[1] */
291 .short -FIX_0_211164243 /* d0[2] */
292 .short FIX_1_451774981 /* d0[3] */
293 .short -FIX_2_172734803 /* d1[0] */
294 .short FIX_1_061594337 /* d1[1] */
295 .short -FIX_0_509795579 /* d1[2] */
296 .short -FIX_0_601344887 /* d1[3] */
297 .short FIX_0_899976223 /* d2[0] */
298 .short FIX_2_562915447 /* d2[1] */
299 .short 1 << (CONST_BITS+1) /* d2[2] */
300 .short 0 /* d2[3] */
301
302 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 /* 7 inputs -> 4 outputs, descaled by `shift`; needs d0-d2 = jsimd_idct_4x4_neon_consts; overwrites q10, q12-q15 */
303 vmull.s16 q14, \x4, d2[2] /* q14 = x4 * 2^(CONST_BITS+1) ... */
304 vmlal.s16 q14, \x8, d0[0] /* ... + x8 * d0[0] */
305 vmlal.s16 q14, \x14, d0[1] /* ... + x14 * d0[1] */
306 /* q13 = x16 * d1[2] + x12 * d1[3] + x10 * d2[0] + x6 * d2[1] */
307 vmull.s16 q13, \x16, d1[2]
308 vmlal.s16 q13, \x12, d1[3]
309 vmlal.s16 q13, \x10, d2[0]
310 vmlal.s16 q13, \x6, d2[1]
311 /* q15 = same x4 term as q14, but with the x8 and x14 products subtracted */
312 vmull.s16 q15, \x4, d2[2]
313 vmlsl.s16 q15, \x8, d0[0]
314 vmlsl.s16 q15, \x14, d0[1]
315 /* q12 = x16 * d0[2] + x12 * d0[3] + x10 * d1[0] + x6 * d1[1] */
316 vmull.s16 q12, \x16, d0[2]
317 vmlal.s16 q12, \x12, d0[3]
318 vmlal.s16 q12, \x10, d1[0]
319 vmlal.s16 q12, \x6, d1[1]
320
321 vadd.s32 q10, q14, q13 /* y26 source: q14 + q13 */
322 vsub.s32 q14, q14, q13 /* y29 source: q14 - q13 */
323 /* A narrowing shift of 32-bit lanes takes an immediate of at most 16, so larger shifts are done at full width and narrowed afterwards */
324 .if \shift > 16
325 vrshr.s32 q10, q10, #\shift
326 vrshr.s32 q14, q14, #\shift
327 vmovn.s32 \y26, q10
328 vmovn.s32 \y29, q14
329 .else
330 vrshrn.s32 \y26, q10, #\shift
331 vrshrn.s32 \y29, q14, #\shift
332 .endif
333
334 vadd.s32 q10, q15, q12 /* y27 source: q15 + q12 */
335 vsub.s32 q15, q15, q12 /* y28 source: q15 - q12 */
336
337 .if \shift > 16
338 vrshr.s32 q10, q10, #\shift
339 vrshr.s32 q15, q15, #\shift
340 vmovn.s32 \y27, q10
341 vmovn.s32 \y28, q15
342 .else
343 vrshrn.s32 \y27, q10, #\shift
344 vrshrn.s32 \y28, q15, #\shift
345 .endif
346
347 .endm
348
349 asm_function jsimd_idct_4x4_neon
350 /* Dequantizes an 8x8 coefficient block (row 4 is skipped) and stores a 4x4 block of bytes through four row pointers. */
351 DCT_TABLE .req r0 /* multipliers, one 16-bit value per coefficient */
352 COEF_BLOCK .req r1 /* 16-bit coefficients, 8 per row */
353 OUTPUT_BUF .req r2 /* array of row pointers */
354 OUTPUT_COL .req r3 /* byte offset added to every row pointer */
355 TMP1 .req r0 /* TMP1-TMP3 reuse r0-r2 once the arguments held there are no longer needed */
356 TMP2 .req r1
357 TMP3 .req r2
358 TMP4 .req ip
359
360 vpush {d8-d15} /* d8-d15 are overwritten below */
361
362 /* Load constants (d3 is just used for padding) */
363 adr TMP4, jsimd_idct_4x4_neon_consts
364 vld1.16 {d0, d1, d2, d3}, [TMP4, :128] /* NOTE(review): 32-byte load from a 24-byte table; d3 receives the 8 bytes that follow it */
365
366 /* Load all COEF_BLOCK into NEON registers with the following allocation:
367 * 0 1 2 3 | 4 5 6 7
368 * ---------+--------
369 * 0 | d4 | d5
370 * 1 | d6 | d7
371 * 2 | d8 | d9
372 * 3 | d10 | d11
373 * 4 | - | -
374 * 5 | d12 | d13
375 * 6 | d14 | d15
376 * 7 | d16 | d17
377 */
378 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
379 vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
380 add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 (8 coefficients x 2 bytes) */
381 vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
382 vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
383 /* dequantize */
384 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
385 vmul.s16 q2, q2, q9
386 vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
387 vmul.s16 q3, q3, q10
388 vmul.s16 q4, q4, q11
389 add DCT_TABLE, DCT_TABLE, #16 /* skip the table row that matches coefficient row 4 */
390 vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
391 vmul.s16 q5, q5, q12
392 vmul.s16 q6, q6, q13
393 vld1.16 {d30, d31}, [DCT_TABLE, :128]!
394 vmul.s16 q7, q7, q14
395 vmul.s16 q8, q8, q15
396
397 /* Pass 1: the left (even-numbered d registers) and right (odd-numbered) halves of the block are processed separately, results written in place with a 12-bit descale */
398 idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
399 transpose_4x4 d4, d6, d8, d10
400 idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
401 transpose_4x4 d5, d7, d9, d11
402
403 /* Pass 2: results go to d26-d29 (q13, q14) with a 19-bit descale */
404 idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
405 transpose_4x4 d26, d27, d28, d29
406
407 /* Range limit: add 128, then narrow to unsigned 8-bit with saturation */
408 vmov.u16 q15, #0x80
409 vadd.s16 q13, q13, q15
410 vadd.s16 q14, q14, q15
411 vqmovun.s16 d26, q13 /* d26 = rows 0 (bytes 0-3) and 1 (bytes 4-7) */
412 vqmovun.s16 d27, q14 /* d27 = rows 2 (bytes 0-3) and 3 (bytes 4-7) */
413
414 /* Store results to the output buffer */
415 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} /* fetch the four row pointers (this overwrites r0-r2 and ip) */
416 add TMP1, TMP1, OUTPUT_COL
417 add TMP2, TMP2, OUTPUT_COL
418 add TMP3, TMP3, OUTPUT_COL
419 add TMP4, TMP4, OUTPUT_COL
420
421 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
422 /* We can use much less instructions on little endian systems if the
423 * OS kernel is not configured to trap unaligned memory accesses
424 */
425 vst1.32 {d26[0]}, [TMP1]!
426 vst1.32 {d27[0]}, [TMP3]!
427 vst1.32 {d26[1]}, [TMP2]!
428 vst1.32 {d27[1]}, [TMP4]!
429 #else
430 vst1.8 {d26[0]}, [TMP1]! /* bytes 0-3 of d26 -> row 0, bytes 0-3 of d27 -> row 2, one byte per store */
431 vst1.8 {d27[0]}, [TMP3]!
432 vst1.8 {d26[1]}, [TMP1]!
433 vst1.8 {d27[1]}, [TMP3]!
434 vst1.8 {d26[2]}, [TMP1]!
435 vst1.8 {d27[2]}, [TMP3]!
436 vst1.8 {d26[3]}, [TMP1]!
437 vst1.8 {d27[3]}, [TMP3]!
438
439 vst1.8 {d26[4]}, [TMP2]! /* bytes 4-7 of d26 -> row 1, bytes 4-7 of d27 -> row 3 */
440 vst1.8 {d27[4]}, [TMP4]!
441 vst1.8 {d26[5]}, [TMP2]!
442 vst1.8 {d27[5]}, [TMP4]!
443 vst1.8 {d26[6]}, [TMP2]!
444 vst1.8 {d27[6]}, [TMP4]!
445 vst1.8 {d26[7]}, [TMP2]!
446 vst1.8 {d27[7]}, [TMP4]!
447 #endif
448
449 vpop {d8-d15}
450 bx lr
451
452 .unreq DCT_TABLE
453 .unreq COEF_BLOCK
454 .unreq OUTPUT_BUF
455 .unreq OUTPUT_COL
456 .unreq TMP1
457 .unreq TMP2
458 .unreq TMP3
459 .unreq TMP4
460 .endfunc
461
462 .purgem idct_helper /* the macro name is redefined for the 2x2 IDCT further down */
463
464 /*****************************************************************************/
465
466 /*
467 * jsimd_idct_2x2_neon
468 *
469 * This function contains inverse-DCT code for getting reduced-size
470 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
471 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
472 * function from jpeg-6b (jidctred.c).
473 *
474 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
475 * requires far fewer arithmetic operations and hence should be faster.
476 * The primary purpose of this particular NEON optimized function is
477 * bit exact compatibility with jpeg-6b.
478 */
479
480 .balign 8
481 jsimd_idct_2x2_neon_consts: /* four 16-bit multipliers, loaded into d0 by jsimd_idct_2x2_neon; FIX_* values are defined with the 4x4 constants above */
482 .short -FIX_0_720959822 /* d0[0] */
483 .short FIX_0_850430095 /* d0[1] */
484 .short -FIX_1_272758580 /* d0[2] */
485 .short FIX_3_624509785 /* d0[3] */
486
487 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 /* 5 inputs -> 2 outputs, descaled by `shift`; needs d0 = jsimd_idct_2x2_neon_consts; overwrites q10, q13, q14 */
488 vshll.s16 q14, \x4, #15 /* q14 = x4 widened to 32 bits and shifted left by 15 */
489 vmull.s16 q13, \x6, d0[3] /* q13 = x6 * d0[3] + x10 * d0[2] + x12 * d0[1] + x16 * d0[0] */
490 vmlal.s16 q13, \x10, d0[2]
491 vmlal.s16 q13, \x12, d0[1]
492 vmlal.s16 q13, \x16, d0[0]
493
494 vadd.s32 q10, q14, q13 /* y26 source: q14 + q13 */
495 vsub.s32 q14, q14, q13 /* y27 source: q14 - q13 */
496 /* A narrowing shift of 32-bit lanes takes an immediate of at most 16, so larger shifts are done at full width and narrowed afterwards */
497 .if \shift > 16
498 vrshr.s32 q10, q10, #\shift
499 vrshr.s32 q14, q14, #\shift
500 vmovn.s32 \y26, q10
501 vmovn.s32 \y27, q14
502 .else
503 vrshrn.s32 \y26, q10, #\shift
504 vrshrn.s32 \y27, q14, #\shift
505 .endif
506
507 .endm
508
509 asm_function jsimd_idct_2x2_neon
510 /* Dequantizes rows 0, 1, 3, 5 and 7 of an 8x8 coefficient block and stores a 2x2 block of bytes through two row pointers. */
511 DCT_TABLE .req r0 /* multipliers, one 16-bit value per coefficient */
512 COEF_BLOCK .req r1 /* 16-bit coefficients, 8 per row */
513 OUTPUT_BUF .req r2 /* array of row pointers */
514 OUTPUT_COL .req r3 /* byte offset added to every row pointer */
515 TMP1 .req r0 /* reuses r0 once DCT_TABLE is no longer needed */
516 TMP2 .req ip
517
518 vpush {d8-d15} /* d10-d13 are overwritten below */
519
520 /* Load constants */
521 adr TMP2, jsimd_idct_2x2_neon_consts
522 vld1.16 {d0}, [TMP2, :64]
523
524 /* Load all COEF_BLOCK into NEON registers with the following allocation:
525 * 0 1 2 3 | 4 5 6 7
526 * ---------+--------
527 * 0 | d4 | d5
528 * 1 | d6 | d7
529 * 2 | - | -
530 * 3 | d10 | d11
531 * 4 | - | -
532 * 5 | d12 | d13
533 * 6 | - | -
534 * 7 | d16 | d17
535 */
536 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
537 add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 2 (8 coefficients x 2 bytes) */
538 vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
539 add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 */
540 vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
541 add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 6 */
542 vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
543 /* Dequantize: the table is walked with the same row skips as the coefficient block */
544 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
545 vmul.s16 q2, q2, q9
546 vmul.s16 q3, q3, q10
547 add DCT_TABLE, DCT_TABLE, #16
548 vld1.16 {d24, d25}, [DCT_TABLE, :128]!
549 vmul.s16 q5, q5, q12
550 add DCT_TABLE, DCT_TABLE, #16
551 vld1.16 {d26, d27}, [DCT_TABLE, :128]!
552 vmul.s16 q6, q6, q13
553 add DCT_TABLE, DCT_TABLE, #16
554 vld1.16 {d30, d31}, [DCT_TABLE, :128]!
555 vmul.s16 q8, q8, q15
556
557 /* Pass 1: the macro-based form is disabled; the #else branch is the one that is assembled */
558 #if 0
559 idct_helper d4, d6, d10, d12, d16, 13, d4, d6
560 transpose_4x4 d4, d6, d8, d10
561 idct_helper d5, d7, d11, d13, d17, 13, d5, d7
562 transpose_4x4 d5, d7, d9, d11
563 #else
564 vmull.s16 q13, d6, d0[3] /* q13 = weighted sum of rows 1, 3, 5, 7 (columns 0-3) */
565 vmlal.s16 q13, d10, d0[2]
566 vmlal.s16 q13, d12, d0[1]
567 vmlal.s16 q13, d16, d0[0]
568 vmull.s16 q12, d7, d0[3] /* q12 = the same sum for columns 4-7 */
569 vmlal.s16 q12, d11, d0[2]
570 vmlal.s16 q12, d13, d0[1]
571 vmlal.s16 q12, d17, d0[0]
572 vshll.s16 q14, d4, #15 /* row 0, columns 0-3, shifted left by 15 */
573 vshll.s16 q15, d5, #15 /* row 0, columns 4-7, shifted left by 15 */
574 vadd.s32 q10, q14, q13
575 vsub.s32 q14, q14, q13
576 vrshrn.s32 d4, q10, #13 /* d4 = (row0 term + sum) >> 13, rounded */
577 vrshrn.s32 d6, q14, #13 /* d6 = (row0 term - sum) >> 13, rounded */
578 vadd.s32 q10, q15, q12
579 vsub.s32 q14, q15, q12
580 vrshrn.s32 d5, q10, #13
581 vrshrn.s32 d7, q14, #13
582 vtrn.16 q2, q3 /* rearrange the pass-1 results into the registers pass 2 reads (d4, d6, d7, d10, d11) */
583 vtrn.32 q3, q5
584 #endif
585
586 /* Pass 2: results go to d26 and d27 (the two halves of q13) with a 20-bit descale */
587 idct_helper d4, d6, d10, d7, d11, 20, d26, d27
588
589 /* Range limit: add 128, then narrow to unsigned 8-bit with saturation */
590 vmov.u16 q15, #0x80
591 vadd.s16 q13, q13, q15
592 vqmovun.s16 d26, q13 /* bytes 0-3 of d26 come from the old d26 */
593 vqmovun.s16 d27, q13 /* q13 = d26:d27 and d27 is still unnarrowed here, so bytes 4-7 of the new d27 come from the old d27 */
594
595 /* Store results to the output buffer */
596 ldmia OUTPUT_BUF, {TMP1, TMP2} /* fetch the two row pointers */
597 add TMP1, TMP1, OUTPUT_COL
598 add TMP2, TMP2, OUTPUT_COL
599
600 vst1.8 {d26[0]}, [TMP1]! /* row 0: one byte from d26, one from d27 */
601 vst1.8 {d27[4]}, [TMP1]!
602 vst1.8 {d26[1]}, [TMP2]! /* row 1 */
603 vst1.8 {d27[5]}, [TMP2]!
604
605 vpop {d8-d15}
606 bx lr
607
608 .unreq DCT_TABLE
609 .unreq COEF_BLOCK
610 .unreq OUTPUT_BUF
611 .unreq OUTPUT_COL
612 .unreq TMP1
613 .unreq TMP2
614 .endfunc
615
616 .purgem idct_helper
617
618 /*****************************************************************************/
619
620 /*
621 * jsimd_ycc_extrgb_convert_neon
622 * jsimd_ycc_extbgr_convert_neon
623 * jsimd_ycc_extrgbx_convert_neon
624 * jsimd_ycc_extbgrx_convert_neon
625 * jsimd_ycc_extxbgr_convert_neon
626 * jsimd_ycc_extxrgb_convert_neon
627 *
628 * Colorspace conversion YCbCr -> RGB
629 */
630
631
632 .macro do_load size /* load `size` pixels: U bytes into d4, V bytes into d5, Y bytes into d0; the U, V and Y pointers are post-incremented */
633 .if \size == 8 /* full group: all eight lanes, plus prefetch 64 bytes ahead */
634 vld1.8 {d4}, [U]!
635 vld1.8 {d5}, [V]!
636 vld1.8 {d0}, [Y]!
637 pld [Y, #64]
638 pld [U, #64]
639 pld [V, #64]
640 .elseif \size == 4 /* lanes 0-3, one byte per load */
641 vld1.8 {d4[0]}, [U]!
642 vld1.8 {d4[1]}, [U]!
643 vld1.8 {d4[2]}, [U]!
644 vld1.8 {d4[3]}, [U]!
645 vld1.8 {d5[0]}, [V]!
646 vld1.8 {d5[1]}, [V]!
647 vld1.8 {d5[2]}, [V]!
648 vld1.8 {d5[3]}, [V]!
649 vld1.8 {d0[0]}, [Y]!
650 vld1.8 {d0[1]}, [Y]!
651 vld1.8 {d0[2]}, [Y]!
652 vld1.8 {d0[3]}, [Y]!
653 .elseif \size == 2 /* lanes 4-5, so a 2-pixel piece can sit next to a 4-pixel piece */
654 vld1.8 {d4[4]}, [U]!
655 vld1.8 {d4[5]}, [U]!
656 vld1.8 {d5[4]}, [V]!
657 vld1.8 {d5[5]}, [V]!
658 vld1.8 {d0[4]}, [Y]!
659 vld1.8 {d0[5]}, [Y]!
660 .elseif \size == 1 /* lane 6 */
661 vld1.8 {d4[6]}, [U]!
662 vld1.8 {d5[6]}, [V]!
663 vld1.8 {d0[6]}, [Y]!
664 .else
665 .error unsupported macroblock size
666 .endif
667 .endm
668
669 .macro do_store bpp, size /* store `size` pixels from d10-d12 (24 bpp) or d10-d13 (32 bpp), interleaved per pixel; RGB is post-incremented */
670 .if \bpp == 24
671 .if \size == 8
672 vst3.8 {d10, d11, d12}, [RGB]!
673 .elseif \size == 4 /* lanes 0-3, matching do_load 4 */
674 vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
675 vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
676 vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
677 vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
678 .elseif \size == 2 /* lanes 4-5, matching do_load 2 */
679 vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
680 vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
681 .elseif \size == 1 /* lane 6, matching do_load 1 */
682 vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
683 .else
684 .error unsupported macroblock size
685 .endif
686 .elseif \bpp == 32 /* same lane layout as above, with d13 as the fourth byte of each pixel */
687 .if \size == 8
688 vst4.8 {d10, d11, d12, d13}, [RGB]!
689 .elseif \size == 4
690 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
691 vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
692 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
693 vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
694 .elseif \size == 2
695 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
696 vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
697 .elseif \size == 1
698 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
699 .else
700 .error unsupported macroblock size
701 .endif
702 .else
703 .error unsupported bpp
704 .endif
705 .endm
706
707 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
708 /* Emits jsimd_ycc_<colorid>_convert_neon plus its own constant table. bpp is 24 or 32; r_offs/g_offs/b_offs select which of d10-d13 receives each colour channel. */
709 .macro do_yuv_to_rgb
710 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
711 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
712 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
713 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
714 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
715 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
716 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
717 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
718 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
719 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
720 vrshrn.s32 d20, q10, #15 /* green term: its multipliers are scaled by 2^15 */
721 vrshrn.s32 d21, q11, #15
722 vrshrn.s32 d24, q12, #14 /* red term: its multiplier is scaled by 2^14 */
723 vrshrn.s32 d25, q13, #14
724 vrshrn.s32 d28, q14, #14 /* blue term: its multiplier is scaled by 2^14 */
725 vrshrn.s32 d29, q15, #14
726 vaddw.u8 q10, q10, d0 /* add the Y bytes loaded into d0 by do_load */
727 vaddw.u8 q12, q12, d0
728 vaddw.u8 q14, q14, d0
729 vqmovun.s16 d1\g_offs, q10 /* saturate to 8 bits; destination is d10 + g_offs */
730 vqmovun.s16 d1\r_offs, q12 /* destination is d10 + r_offs */
731 vqmovun.s16 d1\b_offs, q14 /* destination is d10 + b_offs */
732 .endm
733
734 /* Apple gas crashes on adrl, work around that by using adr.
735 * But this requires a copy of these constants for each function.
736 */
737
738 .balign 16
739 jsimd_ycc_\colorid\()_neon_consts:
740 .short 0, 0, 0, 0 /* -> d0 (padding; do_load later overwrites d0 with Y) */
741 .short 22971, -11277, -23401, 29033 /* -> d1[0..3]: the four multipliers */
742 .short -128, -128, -128, -128 /* -> d2 (low half of q1) */
743 .short -128, -128, -128, -128 /* -> d3 (high half of q1) */
744
745 asm_function jsimd_ycc_\colorid\()_convert_neon
746 OUTPUT_WIDTH .req r0 /* pixels per row */
747 INPUT_BUF .req r1 /* points to three plane pointers (loaded into INPUT_BUF0..2 below) */
748 INPUT_ROW .req r2 /* index of the first input row; incremented once per output row */
749 OUTPUT_BUF .req r3 /* array of output row pointers */
750 NUM_ROWS .req r4 /* fifth argument, read from the stack below */
751
752 INPUT_BUF0 .req r5
753 INPUT_BUF1 .req r6
754 INPUT_BUF2 .req INPUT_BUF
755
756 RGB .req r7
757 Y .req r8
758 U .req r9
759 V .req r10
760 N .req ip
761
762 /* Load constants to d1, d2, d3 (d0 is just used for padding) */
763 adr ip, jsimd_ycc_\colorid\()_neon_consts
764 vld1.16 {d0, d1, d2, d3}, [ip, :128]
765
766 /* Save ARM registers and handle input arguments */
767 push {r4, r5, r6, r7, r8, r9, r10, lr}
768 ldr NUM_ROWS, [sp, #(4 * 8)] /* first stack argument, now above the eight words just pushed */
769 ldr INPUT_BUF0, [INPUT_BUF]
770 ldr INPUT_BUF1, [INPUT_BUF, #4]
771 ldr INPUT_BUF2, [INPUT_BUF, #8] /* INPUT_BUF2 is r1 itself, so this load replaces INPUT_BUF */
772 .unreq INPUT_BUF
773
774 /* Save NEON registers */
775 vpush {d8-d15}
776
777 /* Initially set d10, d11, d12, d13 to 0xFF (the one register do_yuv_to_rgb never writes in 32-bpp formats keeps this value) */
778 vmov.u8 q5, #255
779 vmov.u8 q6, #255
780
781 /* Outer loop over scanlines */
782 cmp NUM_ROWS, #1
783 blt 9f /* nothing to do when NUM_ROWS < 1 */
784 0:
785 ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
786 ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
787 mov N, OUTPUT_WIDTH
788 ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
789 add INPUT_ROW, INPUT_ROW, #1
790 ldr RGB, [OUTPUT_BUF], #4
791
792 /* Inner loop over pixels */
793 subs N, N, #8
794 blt 2f /* fewer than 8 pixels in the row: go straight to the tail */
795 1:
796 do_load 8
797 do_yuv_to_rgb
798 do_store \bpp, 8
799 subs N, N, #8
800 bge 1b
801 tst N, #7 /* N is negative here; its low three bits are the number of pixels left */
802 beq 8f
803 2: /* Tail: gather 4, 2 and 1 pixels into lanes 0-3, 4-5 and 6, convert once, then store the same pieces */
804 tst N, #4
805 beq 3f
806 do_load 4
807 3:
808 tst N, #2
809 beq 4f
810 do_load 2
811 4:
812 tst N, #1
813 beq 5f
814 do_load 1
815 5:
816 do_yuv_to_rgb
817 tst N, #4
818 beq 6f
819 do_store \bpp, 4
820 6:
821 tst N, #2
822 beq 7f
823 do_store \bpp, 2
824 7:
825 tst N, #1
826 beq 8f
827 do_store \bpp, 1
828 8:
829 subs NUM_ROWS, NUM_ROWS, #1
830 bgt 0b
831 9:
832 /* Restore all registers and return */
833 vpop {d8-d15}
834 pop {r4, r5, r6, r7, r8, r9, r10, pc} /* popping the saved lr into pc returns to the caller */
835
836 .unreq OUTPUT_WIDTH
837 .unreq INPUT_ROW
838 .unreq OUTPUT_BUF
839 .unreq NUM_ROWS
840 .unreq INPUT_BUF0
841 .unreq INPUT_BUF1
842 .unreq INPUT_BUF2
843 .unreq RGB
844 .unreq Y
845 .unreq U
846 .unreq V
847 .unreq N
848 .endfunc
849
850 .purgem do_yuv_to_rgb /* redefined by the next expansion of this macro */
851
852 .endm
853
854 /*--------------------------------- id ----- bpp R G B (R/G/B = offset from d10 of the register holding that channel, i.e. its byte position within a stored pixel) */
855 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
856 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
857 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
858 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
859 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
860 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
861
862 .purgem do_load
863 .purgem do_store
864
865 /*****************************************************************************/
OLDNEW
« no previous file with comments | « third_party/libjpeg_turbo/simd/jsimd_arm.c ('k') | third_party/libjpeg_turbo/simd/jsimd_i386.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698