Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(89)

Side by Side Diff: simd/jsimd_arm64_neon.S

Issue 434123003: Add ARM64 SIMD support to libjpeg_turbo (Closed) Base URL: http://src.chromium.org/svn/trunk/deps/third_party/libjpeg_turbo
Patch Set: Add neon fixes Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « simd/jsimd_arm64.c ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 * ARMv8 NEON optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved.
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * Copyright (C) 2013-2014, Linaro Limited
8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
9 *
10 * This software is provided 'as-is', without any express or implied
11 * warranty. In no event will the authors be held liable for any damages
12 * arising from the use of this software.
13 *
14 * Permission is granted to anyone to use this software for any purpose,
15 * including commercial applications, and to alter it and redistribute it
16 * freely, subject to the following restrictions:
17 *
18 * 1. The origin of this software must not be misrepresented; you must not
19 * claim that you wrote the original software. If you use this software
20 * in a product, an acknowledgment in the product documentation would be
21 * appreciated but is not required.
22 * 2. Altered source versions must be plainly marked as such, and must not be
23 * misrepresented as being the original software.
24 * 3. This notice may not be removed or altered from any source distribution.
25 */
26
27 #if defined(__linux__) && defined(__ELF__)
28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
29 #endif
30
31 .text
32 .arch armv8-a+fp+simd
33
34
35 #define RESPECT_STRICT_ALIGNMENT 1
36
37
38 /*****************************************************************************/
39
40 /* Supplementary macro for setting function attributes */
41 .macro asm_function fname
/* Emit a global function label named \fname with platform-appropriate
 * decoration: Mach-O (Apple) requires a leading underscore; ELF targets
 * additionally hide the symbol from the dynamic table and tag it as a
 * function so linkers/debuggers treat it correctly. */
42 #ifdef __APPLE__
43 .globl _\fname
44 _\fname:
45 #else
46 .global \fname
47 #ifdef __ELF__
/* ELF only: keep the symbol out of the dynamic symbol table and mark it
 * as code (%function) for correct linking and unwinding metadata. */
48 .hidden \fname
49 .type \fname, %function
50 #endif
51 \fname:
52 #endif
53 .endm
54
55 /* Transpose elements of a single 128-bit register; \xi is a scratch register */
56 .macro transpose_single x0,x1,xi,xilen,literal
/* Copy element 0 of \x0 into scratch and element 1 into \x1, then use
 * trn1/trn2 to interleave; after this, \x0/\x1 hold the transposed pair. */
57 ins \xi\xilen[0], \x0\xilen[0]
58 ins \x1\xilen[0], \x0\xilen[1]
59 trn1 \x0\literal, \x0\literal, \x1\literal
60 trn2 \x1\literal, \xi\literal, \x1\literal
.endm
62
63 /* Transpose elements of 2 different registers; \xi is a scratch copy of \x0
 * so that trn2 can still read the original \x0 lanes after trn1 overwrites them */
64 .macro transpose x0,x1,xi,xilen,literal
65 mov \xi\xilen, \x0\xilen
66 trn1 \x0\literal, \x0\literal, \x1\literal
67 trn2 \x1\literal, \xi\literal, \x1\literal
68 .endm
69
70 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
/* 32-bit-element stage of the 4x4 transpose: swaps the off-diagonal 2x2
 * sub-blocks between (\x0,\x2) and (\x1,\x3). \xi is a scratch register. */
71 .macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
72 mov \xi\xilen, \x0\xilen
73 trn1 \x0\x0len, \x0\x0len, \x2\x2len
74 trn2 \x2\x2len, \xi\x0len, \x2\x2len
75 mov \xi\xilen, \x1\xilen
76 trn1 \x1\x1len, \x1\x1len, \x3\x3len
77 trn2 \x3\x3len, \xi\x1len, \x3\x3len
78 .endm
79
/* 16-bit-element stage of the 4x4 transpose: interleaves adjacent register
 * pairs (\x0,\x1) and (\x2,\x3). \xi is a scratch register.
 * NOTE(review): the trn2 lines reuse \x2len (and \xi\x0len / \xi\x1len)
 * instead of each operand's own len suffix; this is only correct when all
 * four len arguments are identical — which holds for the sole visible
 * caller transpose_4x4 (all .4h) — confirm before reusing with mixed lens. */
80 .macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen
81 mov \xi\xilen, \x0\xilen
82 trn1 \x0\x0len, \x0\x0len, \x1\x1len
83 trn2 \x1\x2len, \xi\x0len, \x1\x2len
84 mov \xi\xilen, \x2\xilen
85 trn1 \x2\x2len, \x2\x2len, \x3\x3len
86 trn2 \x3\x2len, \xi\x1len, \x3\x3len
87 .endm
88
/* Full 4x4 16-bit transpose of x0..x3: a 16-bit (.4h) interleave pass
 * followed by a 32-bit (.2s) sub-block swap. \x5 is a scratch register. */
89 .macro transpose_4x4 x0, x1, x2, x3,x5
90 transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
91 transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
92 .endm
93
94
95 #define CENTERJSAMPLE 128
96
97 /*****************************************************************************/
98
99 /*
100 * Perform dequantization and inverse DCT on one block of coefficients.
101 *
102 * GLOBAL(void)
103 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
104 * JSAMPARRAY output_buf, JDIMENSION output_col)
105 */
106
/* 13-bit fixed-point multipliers: FIX_a_b... = round(a.b... * 2^13),
 * e.g. 0.298631336 * 8192 ~= 2446. The difference/sum constants below
 * let pairs of products share one multiply in the butterfly network. */
107 #define FIX_0_298631336 (2446)
108 #define FIX_0_390180644 (3196)
109 #define FIX_0_541196100 (4433)
110 #define FIX_0_765366865 (6270)
111 #define FIX_0_899976223 (7373)
112 #define FIX_1_175875602 (9633)
113 #define FIX_1_501321110 (12299)
114 #define FIX_1_847759065 (15137)
115 #define FIX_1_961570560 (16069)
116 #define FIX_2_053119869 (16819)
117 #define FIX_2_562915447 (20995)
118 #define FIX_3_072711026 (25172)
119
120 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
121 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
122 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
123 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
124 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
125 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
126 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
127 #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
128
129 /*
130 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
131 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
132 */
/* Documentation-only reference for the NEON code below: it shows the
 * operation order and register pressure (q1..q7 model 7 live values) the
 * assembly follows. It appears unused (never invoked) in this chunk.
 * No comment lines may be inserted inside the macro body — every line of
 * a multi-line #define must end with a backslash. */
133 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
134 { \
135 DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
136 INT32 q1, q2, q3, q4, q5, q6, q7; \
137 INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
138 \
139 /* 1-D iDCT input data */ \
140 row0 = xrow0; \
141 row1 = xrow1; \
142 row2 = xrow2; \
143 row3 = xrow3; \
144 row4 = xrow4; \
145 row5 = xrow5; \
146 row6 = xrow6; \
147 row7 = xrow7; \
148 \
149 q5 = row7 + row3; \
150 q4 = row5 + row1; \
151 q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
152 MULTIPLY(q4, FIX_1_175875602); \
153 q7 = MULTIPLY(q5, FIX_1_175875602) + \
154 MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
155 q2 = MULTIPLY(row2, FIX_0_541196100) + \
156 MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
157 q4 = q6; \
158 q3 = ((INT32) row0 - (INT32) row4) << 13; \
159 q6 += MULTIPLY(row5, -FIX_2_562915447) + \
160 MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
161 /* now we can use q1 (reloadable constants have been used up) */ \
162 q1 = q3 + q2; \
163 q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
164 MULTIPLY(row1, -FIX_0_899976223); \
165 q5 = q7; \
166 q1 = q1 + q6; \
167 q7 += MULTIPLY(row7, -FIX_0_899976223) + \
168 MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
169 \
170 /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
171 tmp11_plus_tmp2 = q1; \
172 row1 = 0; \
173 \
174 q1 = q1 - q6; \
175 q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
176 MULTIPLY(row3, -FIX_2_562915447); \
177 q1 = q1 - q6; \
178 q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
179 MULTIPLY(row6, FIX_0_541196100); \
180 q3 = q3 - q2; \
181 \
182 /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
183 tmp11_minus_tmp2 = q1; \
184 \
185 q1 = ((INT32) row0 + (INT32) row4) << 13; \
186 q2 = q1 + q6; \
187 q1 = q1 - q6; \
188 \
189 /* pick up the results */ \
190 tmp0 = q4; \
191 tmp1 = q5; \
192 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
193 tmp3 = q7; \
194 tmp10 = q2; \
195 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
196 tmp12 = q3; \
197 tmp13 = q1; \
198 }
199
/* XFIX_* name the 16-bit lanes of v0-v2 after the constants table below is
 * loaded (ld1 {v0.4h-v3.4h} in the function body); the lane order here MUST
 * match the .short order in jsimd_idct_islow_neon_consts exactly. */
200 #define XFIX_0_899976223 v0.4h[0]
201 #define XFIX_0_541196100 v0.4h[1]
202 #define XFIX_2_562915447 v0.4h[2]
203 #define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3]
204 #define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0]
205 #define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1]
206 #define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2]
207 #define XFIX_1_175875602 v1.4h[3]
208 #define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0]
209 #define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1]
210 #define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2]
211 #define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3]
212
/* 16-byte alignment for efficient ld1 of the constant table */
213 .balign 16
214 jsimd_idct_islow_neon_consts:
215 .short FIX_0_899976223 /* d0[0] */
216 .short FIX_0_541196100 /* d0[1] */
217 .short FIX_2_562915447 /* d0[2] */
218 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
219 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
220 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
221 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
222 .short FIX_1_175875602 /* d1[3] */
223 /* reloadable constants (v2 is clobbered during the IDCT and re-fetched
 * via 'ld1 {v2.4h}, [x15]' at each pass) */
224 .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
225 .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
226 .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
227 .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
228
/*
 * void jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
 *                             JSAMPARRAY output_buf, JDIMENSION output_col)
 * Accurate (ISLOW) 8x8 inverse DCT with dequantization.
 * In:  x0 = dct_table (per-row quantization multipliers),
 *      x1 = coef_block (64 JCOEFs), x2 = output_buf, x3 = output_col.
 * TMP1..TMP3 alias x0..x2 — they may only be used after the corresponding
 * argument has been fully consumed. x15 holds the constants pointer.
 */
229 asm_function jsimd_idct_islow_neon
230
231 DCT_TABLE .req x0
232 COEF_BLOCK .req x1
233 OUTPUT_BUF .req x2
234 OUTPUT_COL .req x3
235 TMP1 .req x0
236 TMP2 .req x1
237 TMP3 .req x2
238 TMP4 .req x15
239
/* ROWnL/ROWnR = left (columns 0-3) / right (columns 4-7) halves of row n */
240 ROW0L .req v16
241 ROW0R .req v17
242 ROW1L .req v18
243 ROW1R .req v19
244 ROW2L .req v20
245 ROW2R .req v21
246 ROW3L .req v22
247 ROW3R .req v23
248 ROW4L .req v24
249 ROW4R .req v25
250 ROW5L .req v26
251 ROW5R .req v27
252 ROW6L .req v28
253 ROW6R .req v29
254 ROW7L .req v30
255 ROW7R .req v31
256 /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
/* NOTE(review): sp is dropped by 272 and then walked back up by the
 * post-incremented stores, so on completion sp equals its entry value and
 * the saved registers sit BELOW sp until the later 'sub sp, sp, 352'.
 * AAPCS64 defines no red zone — verify nothing (signal handlers etc.) can
 * clobber that region in this environment. */
257 sub sp, sp, 272
258 str x15, [sp], 16
259 adr x15, jsimd_idct_islow_neon_consts
260 st1 {v0.8b - v3.8b}, [sp], 32
261 st1 {v4.8b - v7.8b}, [sp], 32
262 st1 {v8.8b - v11.8b}, [sp], 32
263 st1 {v12.8b - v15.8b}, [sp], 32
264 st1 {v16.8b - v19.8b}, [sp], 32
265 st1 {v20.8b - v23.8b}, [sp], 32
266 st1 {v24.8b - v27.8b}, [sp], 32
267 st1 {v28.8b - v31.8b}, [sp], 32
/* Dequantize: multiply every 4-coefficient group by the matching group of
 * the quantization table, streaming both with post-incremented loads, then
 * pack each row's two 4h halves into one 128-bit register (q8..q15). */
268 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
269 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
270 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
271 mul v16.4h, v16.4h, v0.4h
272 mul v17.4h, v17.4h, v1.4h
273 ins v16.2d[1], v17.2d[0] /* 128 bit q8 */
274 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
275 mul v18.4h, v18.4h, v2.4h
276 mul v19.4h, v19.4h, v3.4h
277 ins v18.2d[1], v19.2d[0] /* 128 bit q9 */
278 ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
279 mul v20.4h, v20.4h, v4.4h
280 mul v21.4h, v21.4h, v5.4h
281 ins v20.2d[1], v21.2d[0] /* 128 bit q10 */
282 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
283 mul v22.4h, v22.4h, v6.4h
284 mul v23.4h, v23.4h, v7.4h
285 ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
/* last coefficient load has no post-increment: COEF_BLOCK stays at +96,
 * which the pass-1 ldp zero-scan offsets below rely on */
286 ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
287 mul v24.4h, v24.4h, v0.4h
288 mul v25.4h, v25.4h, v1.4h
289 ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
290 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
291 mul v28.4h, v28.4h, v4.4h
292 mul v29.4h, v29.4h, v5.4h
293 ins v28.2d[1], v29.2d[0] /* 128 bit q14 */
294 mul v26.4h, v26.4h, v2.4h
295 mul v27.4h, v27.4h, v3.4h
296 ins v26.2d[1], v27.2d[0] /* 128 bit q13 */
297 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */
298 add x15, x15, #16
299 mul v30.4h, v30.4h, v6.4h
300 mul v31.4h, v31.4h, v7.4h
301 ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
302 /* Go to the bottom of the stack */
303 sub sp, sp, 352
304 stp x4, x5, [sp], 16
305 st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
306 st1 {v12.4h - v15.4h}, [sp], 32
307 /* 1-D IDCT, pass 1, left 4x8 half */
/* Interleaved with the column math, the ldp/orr chain ORs together the
 * right halves (columns 4-7, i.e. bytes 2*(4+row*8) with COEF_BLOCK at
 * +96) of rows 1-7; x0 (DCT_TABLE is fully consumed by now) accumulates
 * the zero test used by the 'beq 3f' sparse-path branch below. */
308 add v4.4h, ROW7L.4h, ROW3L.4h
309 add v5.4h, ROW5L.4h, ROW1L.4h
310 smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
311 smlal v12.4s, v5.4h, XFIX_1_175875602
312 smull v14.4s, v4.4h, XFIX_1_175875602
313 /* Check for the zero coefficients in the right 4x8 half */
314 smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
315 ssubl v6.4s, ROW0L.4h, ROW4L.4h
316 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
317 smull v4.4s, ROW2L.4h, XFIX_0_541196100
318 smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
319 orr x0, x4, x5
320 mov v8.16b, v12.16b
321 smlsl v12.4s, ROW5L.4h, XFIX_2_562915447
322 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
323 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
324 shl v6.4s, v6.4s, #13
325 orr x0, x0, x4
326 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
327 orr x0, x0 , x5
328 add v2.4s, v6.4s, v4.4s
329 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
330 mov v10.16b, v14.16b
331 add v2.4s, v2.4s, v12.4s
332 orr x0, x0, x4
333 smlsl v14.4s, ROW7L.4h, XFIX_0_899976223
334 orr x0, x0, x5
335 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
/* pass-1 descale: rounding narrow by 11 (presumably CONST_BITS(13) -
 * PASS1_BITS(2) as in jidctint.c — confirm against the C version) */
336 rshrn ROW1L.4h, v2.4s, #11
337 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
338 sub v2.4s, v2.4s, v12.4s
339 smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
340 orr x0, x0, x4
341 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
342 orr x0, x0, x5
343 sub v2.4s, v2.4s, v12.4s
344 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
345 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
346 smlal v12.4s, ROW6L.4h, XFIX_0_541196100
347 sub v6.4s, v6.4s, v4.4s
348 orr x0, x0, x4
349 rshrn ROW6L.4h, v2.4s, #11
350 orr x0, x0, x5
351 add v2.4s, v6.4s, v10.4s
352 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
353 sub v6.4s, v6.4s, v10.4s
354 saddl v10.4s, ROW0L.4h, ROW4L.4h
355 orr x0, x0, x4
356 rshrn ROW2L.4h, v2.4s, #11
357 orr x0, x0, x5
358 rshrn ROW5L.4h, v6.4s, #11
359 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
360 shl v10.4s, v10.4s, #13
361 smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
362 orr x0, x0, x4
363 add v4.4s, v10.4s, v12.4s
364 orr x0, x0, x5
/* flags set here (rows 1-7 right-half zero test); the orr below does not
 * set flags, so 'beq 3f' still sees this comparison */
365 cmp x0, #0 /* orrs instruction removed */
366 sub v2.4s, v10.4s, v12.4s
367 add v12.4s, v4.4s, v14.4s
/* separately scan row 0's right half; result kept in x0 for the
 * 'cmp x0, #0' at label 3 to pick sparse vs row-0-only handling */
368 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
369 sub v4.4s, v4.4s, v14.4s
370 add v10.4s, v2.4s, v8.4s
371 orr x0, x4, x5
372 sub v6.4s, v2.4s, v8.4s
373 /* pop {x4, x5} */
374 sub sp, sp, 80
375 ldp x4, x5, [sp], 16
376 rshrn ROW7L.4h, v4.4s, #11
377 rshrn ROW3L.4h, v10.4s, #11
378 rshrn ROW0L.4h, v12.4s, #11
379 rshrn ROW4L.4h, v6.4s, #11
380
381 beq 3f /* Go to do some special handling for the sparse right 4x8 half */
382
383 /* 1-D IDCT, pass 1, right 4x8 half */
/* Same butterfly as the left half, operating on ROWnR; the transposes of
 * the already-finished left half are interleaved here to hide latency. */
384 ld1 {v2.4h}, [x15] /* reload constants */
385 add v10.4h, ROW7R.4h, ROW3R.4h
386 add v8.4h, ROW5R.4h, ROW1R.4h
387 /* Transpose ROW6L <-> ROW7L (v3 available free register) */
388 transpose ROW6L, ROW7L, v3, .16b, .4h
389 smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560
390 smlal v12.4s, v8.4h, XFIX_1_175875602
391 /* Transpose ROW2L <-> ROW3L (v3 available free register) */
392 transpose ROW2L, ROW3L, v3, .16b, .4h
393 smull v14.4s, v10.4h, XFIX_1_175875602
394 smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644
395 /* Transpose ROW0L <-> ROW1L (v3 available free register) */
396 transpose ROW0L, ROW1L, v3, .16b, .4h
397 ssubl v6.4s, ROW0R.4h, ROW4R.4h
398 smull v4.4s, ROW2R.4h, XFIX_0_541196100
399 smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
400 /* Transpose ROW4L <-> ROW5L (v3 available free register) */
401 transpose ROW4L, ROW5L, v3, .16b, .4h
402 mov v8.16b, v12.16b
403 smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
404 smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
405 /* Transpose ROW1L <-> ROW3L (v3 available free register) */
406 transpose ROW1L, ROW3L, v3, .16b, .2s
407 shl v6.4s, v6.4s, #13
408 smlsl v8.4s, ROW1R.4h, XFIX_0_899976223
409 /* Transpose ROW4L <-> ROW6L (v3 available free register) */
410 transpose ROW4L, ROW6L, v3, .16b, .2s
411 add v2.4s, v6.4s, v4.4s
412 mov v10.16b, v14.16b
413 add v2.4s, v2.4s, v12.4s
414 /* Transpose ROW0L <-> ROW2L (v3 available free register) */
415 transpose ROW0L, ROW2L, v3, .16b, .2s
416 smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
417 smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
418 rshrn ROW1R.4h, v2.4s, #11
419 /* Transpose ROW5L <-> ROW7L (v3 available free register) */
420 transpose ROW5L, ROW7L, v3, .16b, .2s
421 sub v2.4s, v2.4s, v12.4s
422 smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
423 smlsl v10.4s, ROW3R.4h, XFIX_2_562915447
424 sub v2.4s, v2.4s, v12.4s
425 smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
426 smlal v12.4s, ROW6R.4h, XFIX_0_541196100
427 sub v6.4s, v6.4s, v4.4s
428 rshrn ROW6R.4h, v2.4s, #11
429 add v2.4s, v6.4s, v10.4s
430 sub v6.4s, v6.4s, v10.4s
431 saddl v10.4s, ROW0R.4h, ROW4R.4h
432 rshrn ROW2R.4h, v2.4s, #11
433 rshrn ROW5R.4h, v6.4s, #11
434 shl v10.4s, v10.4s, #13
435 smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
436 add v4.4s, v10.4s, v12.4s
437 sub v2.4s, v10.4s, v12.4s
438 add v12.4s, v4.4s, v14.4s
439 sub v4.4s, v4.4s, v14.4s
440 add v10.4s, v2.4s, v8.4s
441 sub v6.4s, v2.4s, v8.4s
442 rshrn ROW7R.4h, v4.4s, #11
443 rshrn ROW3R.4h, v10.4s, #11
444 rshrn ROW0R.4h, v12.4s, #11
445 rshrn ROW4R.4h, v6.4s, #11
446 /* Transpose right 4x8 half */
447 transpose ROW6R, ROW7R, v3, .16b, .4h
448 transpose ROW2R, ROW3R, v3, .16b, .4h
449 transpose ROW0R, ROW1R, v3, .16b, .4h
450 transpose ROW4R, ROW5R, v3, .16b, .4h
451 transpose ROW1R, ROW3R, v3, .16b, .2s
452 transpose ROW4R, ROW6R, v3, .16b, .2s
453 transpose ROW0R, ROW2R, v3, .16b, .2s
454 transpose ROW5R, ROW7R, v3, .16b, .2s
455
/* Entered by fall-through or via 'b 1b' from the row-0-only sparse path.
 * After the pass-1 transposes the register/row mapping is permuted; the
 * '<->' comments record which logical row a physical register now holds. */
456 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
457 ld1 {v2.4h}, [x15] /* reload constants */
458 smull v12.4S, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
459 smlal v12.4s, ROW1L.4h, XFIX_1_175875602
460 smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
461 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
462 smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
463 smlal v14.4s, ROW3L.4h, XFIX_1_175875602
464 smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
465 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
466 ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
467 smull v4.4s, ROW2L.4h, XFIX_0_541196100
468 smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */
469 mov v8.16b, v12.16b
470 smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
471 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
472 shl v6.4s, v6.4s, #13
473 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
474 add v2.4s, v6.4s, v4.4s
475 mov v10.16b, v14.16b
476 add v2.4s, v2.4s, v12.4s
477 smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
478 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
/* pass-2 descale: narrow by 16; the remaining 2 bits are removed by the
 * final sqrshrn #2 in the epilogue */
479 shrn ROW1L.4h, v2.4s, #16
480 sub v2.4s, v2.4s, v12.4s
481 smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
482 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
483 sub v2.4s, v2.4s, v12.4s
484 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
485 smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
486 sub v6.4s, v6.4s, v4.4s
487 shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
488 add v2.4s, v6.4s, v10.4s
489 sub v6.4s, v6.4s, v10.4s
490 saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
491 shrn ROW2L.4h, v2.4s, #16
492 shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
493 shl v10.4s, v10.4s, #13
494 smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
495 add v4.4s, v10.4s, v12.4s
496 sub v2.4s, v10.4s, v12.4s
497 add v12.4s, v4.4s, v14.4s
498 sub v4.4s, v4.4s, v14.4s
499 add v10.4s, v2.4s, v8.4s
500 sub v6.4s, v2.4s, v8.4s
501 shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
502 shrn ROW3L.4h, v10.4s, #16
503 shrn ROW0L.4h, v12.4s, #16
504 shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
505 /* 1-D IDCT, pass 2, right 4x8 half */
506 ld1 {v2.4h}, [x15] /* reload constants */
507 smull v12.4s, ROW5R.4h, XFIX_1_175875602
508 smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
509 smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
510 smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
511 smull v14.4s, ROW7R.4h, XFIX_1_175875602
512 smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
513 smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
514 smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
515 ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
516 smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
517 smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
518 mov v8.16b, v12.16b
519 smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
520 smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
521 shl v6.4s, v6.4s, #13
522 smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
523 add v2.4s, v6.4s, v4.4s
524 mov v10.16b, v14.16b
525 add v2.4s, v2.4s, v12.4s
526 smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
527 smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
528 shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
529 sub v2.4s, v2.4s, v12.4s
530 smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
531 smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
532 sub v2.4s, v2.4s, v12.4s
533 smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */
534 smlal v12.4s, ROW6R.4h, XFIX_0_541196100
535 sub v6.4s, v6.4s, v4.4s
536 shrn ROW6R.4h, v2.4s, #16
537 add v2.4s, v6.4s, v10.4s
538 sub v6.4s, v6.4s, v10.4s
539 saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
540 shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
541 shrn ROW5R.4h, v6.4s, #16
542 shl v10.4s, v10.4s, #13
543 smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
544 add v4.4s, v10.4s, v12.4s
545 sub v2.4s, v10.4s, v12.4s
546 add v12.4s, v4.4s, v14.4s
547 sub v4.4s, v4.4s, v14.4s
548 add v10.4s, v2.4s, v8.4s
549 sub v6.4s, v2.4s, v8.4s
550 shrn ROW7R.4h, v4.4s, #16
551 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
552 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
553 shrn ROW4R.4h, v6.4s, #16
554
/* Epilogue (also reached via 'b 2b' from the sparse path): pack each row
 * into a 128-bit register, saturating-round-narrow by the remaining 2
 * bits (PASS1_BITS) to signed 8-bit, transpose the 8x8 byte block back to
 * row order, add CENTERJSAMPLE (128) for signed->unsigned, and store each
 * 8-byte row at output_buf[i] + output_col. Register restores mirror the
 * prologue saves. */
555 2: /* Descale to 8-bit and range limit */
556 ins v16.2d[1], v17.2d[0]
557 ins v18.2d[1], v19.2d[0]
558 ins v20.2d[1], v21.2d[0]
559 ins v22.2d[1], v23.2d[0]
560 sqrshrn v16.8b, v16.8h, #2
561 sqrshrn2 v16.16b, v18.8h, #2
562 sqrshrn v18.8b, v20.8h, #2
563 sqrshrn2 v18.16b, v22.8h, #2
564
565 /* vpop {v8.4h - v15.4h} */ /* restore NEON registers */
566 ld1 {v8.4h - v11.4h}, [sp], 32
567 ld1 {v12.4h - v15.4h}, [sp], 32
568 ins v24.2d[1], v25.2d[0]
569
570 sqrshrn v20.8b, v24.8h, #2
571 /* Transpose the final 8-bit samples and do signed->unsigned conversion */
572 /* trn1 v16.8h, v16.8h, v18.8h */
573 transpose v16, v18, v3, .16b, .8h
574 ins v26.2d[1], v27.2d[0]
575 ins v28.2d[1], v29.2d[0]
576 ins v30.2d[1], v31.2d[0]
577 sqrshrn2 v20.16b, v26.8h, #2
578 sqrshrn v22.8b, v28.8h, #2
579 movi v0.16b, #(CENTERJSAMPLE)
580 sqrshrn2 v22.16b, v30.8h, #2
581 transpose_single v16, v17, v3, .2d, .8b
582 transpose_single v18, v19, v3, .2d, .8b
583 add v16.8b, v16.8b, v0.8b
584 add v17.8b, v17.8b, v0.8b
585 add v18.8b, v18.8b, v0.8b
586 add v19.8b, v19.8b, v0.8b
587 transpose v20, v22, v3, .16b, .8h
588 /* Store results to the output buffer */
/* TMP1/TMP2 alias x0/x1 (DCT_TABLE/COEF_BLOCK) — safe, both are dead here */
589 ldp TMP1, TMP2, [OUTPUT_BUF], 16
590 add TMP1, TMP1, OUTPUT_COL
591 add TMP2, TMP2, OUTPUT_COL
592 st1 {v16.8b}, [TMP1]
593 transpose_single v20, v21, v3, .2d, .8b
594 st1 {v17.8b}, [TMP2]
595 ldp TMP1, TMP2, [OUTPUT_BUF], 16
596 add TMP1, TMP1, OUTPUT_COL
597 add TMP2, TMP2, OUTPUT_COL
598 st1 {v18.8b}, [TMP1]
599 add v20.8b, v20.8b, v0.8b
600 add v21.8b, v21.8b, v0.8b
601 st1 {v19.8b}, [TMP2]
602 ldp TMP1, TMP2, [OUTPUT_BUF], 16
603 ldp TMP3, TMP4, [OUTPUT_BUF]
604 add TMP1, TMP1, OUTPUT_COL
605 add TMP2, TMP2, OUTPUT_COL
606 add TMP3, TMP3, OUTPUT_COL
607 add TMP4, TMP4, OUTPUT_COL
608 transpose_single v22, v23, v3, .2d, .8b
609 st1 {v20.8b}, [TMP1]
610 add v22.8b, v22.8b, v0.8b
611 add v23.8b, v23.8b, v0.8b
612 st1 {v21.8b}, [TMP2]
613 st1 {v22.8b}, [TMP3]
614 st1 {v23.8b}, [TMP4]
615 ldr x15, [sp], 16
616 ld1 {v0.8b - v3.8b}, [sp], 32
617 ld1 {v4.8b - v7.8b}, [sp], 32
618 ld1 {v8.8b - v11.8b}, [sp], 32
619 ld1 {v12.8b - v15.8b}, [sp], 32
620 ld1 {v16.8b - v19.8b}, [sp], 32
621 ld1 {v20.8b - v23.8b}, [sp], 32
622 ld1 {v24.8b - v27.8b}, [sp], 32
623 ld1 {v28.8b - v31.8b}, [sp], 32
/* Use 'ret' (returns via x30) instead of 'blr x30': blr reaches the same
 * address but writes a new link value into x30 and is not recognized by
 * the CPU's return-address predictor, so every return would mispredict. */
624 ret
625
626 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
627
628 /* Transpose left 4x8 half */
629 transpose ROW6L, ROW7L, v3, .16b, .4h
630 transpose ROW2L, ROW3L, v3, .16b, .4h
631 transpose ROW0L, ROW1L, v3, .16b, .4h
632 transpose ROW4L, ROW5L, v3, .16b, .4h
/* rows 1-7 right-half were all zero, so pass 1 for the right half reduces
 * to scaling row 0 by 2^PASS1_BITS */
633 shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */
634 transpose ROW1L, ROW3L, v3, .16b, .2s
635 transpose ROW4L, ROW6L, v3, .16b, .2s
636 transpose ROW0L, ROW2L, v3, .16b, .2s
637 transpose ROW5L, ROW7L, v3, .16b, .2s
/* x0 still holds the row-0 right-half zero scan from pass 1 */
638 cmp x0, #0
639 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
640
641 /* Only row 0 is non-zero for the right 4x8 half */
/* After transposition each output row of the right half is a broadcast of
 * one row-0 coefficient; ROW0R[0] is duplicated last so its lanes survive */
642 dup ROW1R.4h, ROW0R.4h[1]
643 dup ROW2R.4h, ROW0R.4h[2]
644 dup ROW3R.4h, ROW0R.4h[3]
645 dup ROW4R.4h, ROW0R.4h[0]
646 dup ROW5R.4h, ROW0R.4h[1]
647 dup ROW6R.4h, ROW0R.4h[2]
648 dup ROW7R.4h, ROW0R.4h[3]
649 dup ROW0R.4h, ROW0R.4h[0]
650 b 1b /* Go to 'normal' second pass */
651
/* Sparse pass 2: rows 4-7 of the transposed block are zero, so all
 * smull/smlal terms involving the R registers drop out; 'add v12, v12,
 * v12' reconstructs the 2*q6 term that the normal path subtracts twice. */
652 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
653 ld1 {v2.4h}, [x15] /* reload constants */
654 smull v12.4s, ROW1L.4h, XFIX_1_175875602
655 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
656 smull v14.4s, ROW3L.4h, XFIX_1_175875602
657 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
658 smull v4.4s, ROW2L.4h, XFIX_0_541196100
659 sshll v6.4s, ROW0L.4h, #13
660 mov v8.16b, v12.16b
661 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
662 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
663 add v2.4s, v6.4s, v4.4s
664 mov v10.16b, v14.16b
665 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
666 add v2.4s, v2.4s, v12.4s
667 add v12.4s, v12.4s, v12.4s
668 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
669 shrn ROW1L.4h, v2.4s, #16
670 sub v2.4s, v2.4s, v12.4s
671 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
672 sub v6.4s, v6.4s, v4.4s
673 shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
674 add v2.4s, v6.4s, v10.4s
675 sub v6.4s, v6.4s, v10.4s
676 sshll v10.4s, ROW0L.4h, #13
677 shrn ROW2L.4h, v2.4s, #16
678 shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
679 add v4.4s, v10.4s, v12.4s
680 sub v2.4s, v10.4s, v12.4s
681 add v12.4s, v4.4s, v14.4s
682 sub v4.4s, v4.4s, v14.4s
683 add v10.4s, v2.4s, v8.4s
684 sub v6.4s, v2.4s, v8.4s
685 shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
686 shrn ROW3L.4h, v10.4s, #16
687 shrn ROW0L.4h, v12.4s, #16
688 shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
689 /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
690 ld1 {v2.4h}, [x15] /* reload constants */
691 smull v12.4s, ROW5L.4h, XFIX_1_175875602
692 smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
693 smull v14.4s, ROW7L.4h, XFIX_1_175875602
694 smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
695 smull v4.4s, ROW6L.4h, XFIX_0_541196100
696 sshll v6.4s, ROW4L.4h, #13
697 mov v8.16b, v12.16b
698 smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
699 smlsl v8.4s, ROW5L.4h, XFIX_0_899976223
700 add v2.4s, v6.4s, v4.4s
701 mov v10.16b, v14.16b
702 smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
703 add v2.4s, v2.4s, v12.4s
704 add v12.4s, v12.4s, v12.4s
705 smlsl v10.4s, ROW7L.4h, XFIX_2_562915447
706 shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
707 sub v2.4s, v2.4s, v12.4s
708 smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
709 sub v6.4s, v6.4s, v4.4s
710 shrn ROW6R.4h, v2.4s, #16
711 add v2.4s, v6.4s, v10.4s
712 sub v6.4s, v6.4s, v10.4s
713 sshll v10.4s, ROW4L.4h, #13
714 shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
715 shrn ROW5R.4h, v6.4s, #16
716 add v4.4s, v10.4s, v12.4s
717 sub v2.4s, v10.4s, v12.4s
718 add v12.4s, v4.4s, v14.4s
719 sub v4.4s, v4.4s, v14.4s
720 add v10.4s, v2.4s, v8.4s
721 sub v6.4s, v2.4s, v8.4s
722 shrn ROW7R.4h, v4.4s, #16
723 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
724 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
725 shrn ROW4R.4h, v6.4s, #16
726 b 2b /* Go to epilogue */
727
/* Release all symbolic register aliases so the names can be re-bound by
 * the next function in this file. */
728 .unreq DCT_TABLE
729 .unreq COEF_BLOCK
730 .unreq OUTPUT_BUF
731 .unreq OUTPUT_COL
732 .unreq TMP1
733 .unreq TMP2
734 .unreq TMP3
735 .unreq TMP4
736
737 .unreq ROW0L
738 .unreq ROW0R
739 .unreq ROW1L
740 .unreq ROW1R
741 .unreq ROW2L
742 .unreq ROW2R
743 .unreq ROW3L
744 .unreq ROW3R
745 .unreq ROW4L
746 .unreq ROW4R
747 .unreq ROW5L
748 .unreq ROW5R
749 .unreq ROW6L
750 .unreq ROW6R
751 .unreq ROW7L
752 .unreq ROW7R
753
754
755 /*****************************************************************************/
756
757 /*
758 * jsimd_idct_ifast_neon
759 *
760 * This function contains a fast, not so accurate integer implementation of
761 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
762 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
763 * function from jidctfst.c
764 *
765 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
766 * But in ARM NEON case some extra additions are required because VQDMULH
767 * instruction can't handle the constants larger than 1. So the expressions
768 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
769 * which introduces an extra addition. Overall, there are 6 extra additions
770 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
771 */
772
/* Lane names for the IFAST constants loaded from the table below; order
 * must match the .short order exactly. Each value is the multiplier's
 * fractional part in Q15 (e.g. (362-256)*128 = round(0.414213562*2^15)):
 * sqdmulh computes x*frac, and the integer part is restored with the
 * extra vadds described in the function's header comment. */
773 #define XFIX_1_082392200 v0.4h[0]
774 #define XFIX_1_414213562 v0.4h[1]
775 #define XFIX_1_847759065 v0.4h[2]
776 #define XFIX_2_613125930 v0.4h[3]
777
778 .balign 16
779 jsimd_idct_ifast_neon_consts:
780 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
781 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
782 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
783 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
784
785 asm_function jsimd_idct_ifast_neon
786
787 DCT_TABLE .req x0
788 COEF_BLOCK .req x1
789 OUTPUT_BUF .req x2
790 OUTPUT_COL .req x3
791 TMP1 .req x0
792 TMP2 .req x1
793 TMP3 .req x2
794 TMP4 .req x22
795 TMP5 .req x23
796
797 /* Load and dequantize coefficients into NEON registers
798 * with the following allocation:
799 * 0 1 2 3 | 4 5 6 7
800 * ---------+--------
801 * 0 | d16 | d17 ( v8.8h )
802 * 1 | d18 | d19 ( v9.8h )
803 * 2 | d20 | d21 ( v10.8h )
804 * 3 | d22 | d23 ( v11.8h )
805 * 4 | d24 | d25 ( v12.8h )
806 * 5 | d26 | d27 ( v13.8h )
807 * 6 | d28 | d29 ( v14.8h )
808 * 7 | d30 | d31 ( v15.8h )
809 */
810 /* Save NEON registers used in fast IDCT */
811 sub sp, sp, #176
812 stp x22, x23, [sp], 16
813 adr x23, jsimd_idct_ifast_neon_consts
814 st1 {v0.8b - v3.8b}, [sp], 32
815 st1 {v4.8b - v7.8b}, [sp], 32
816 st1 {v8.8b - v11.8b}, [sp], 32
817 st1 {v12.8b - v15.8b}, [sp], 32
818 st1 {v16.8b - v19.8b}, [sp], 32
819 ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
820 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
821 ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
822 mul v8.8h, v8.8h, v0.8h
823 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
824 mul v9.8h, v9.8h, v1.8h
825 ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32
826 mul v10.8h, v10.8h, v2.8h
827 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
828 mul v11.8h, v11.8h, v3.8h
829 ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32
830 mul v12.8h, v12.8h, v0.8h
831 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
832 mul v14.8h, v14.8h, v2.8h
833 mul v13.8h, v13.8h, v1.8h
834 ld1 {v0.4h}, [x23] /* load constants */
835 mul v15.8h, v15.8h, v3.8h
836
837 /* 1-D IDCT, pass 1 */
838 sub v2.8h, v10.8h, v14.8h
839 add v14.8h, v10.8h, v14.8h
840 sub v1.8h, v11.8h, v13.8h
841 add v13.8h, v11.8h, v13.8h
842 sub v5.8h, v9.8h, v15.8h
843 add v15.8h, v9.8h, v15.8h
844 sqdmulh v4.8h, v2.8h, XFIX_1_414213562
845 sqdmulh v6.8h, v1.8h, XFIX_2_613125930
846 add v3.8h, v1.8h, v1.8h
847 sub v1.8h, v5.8h, v1.8h
848 add v10.8h, v2.8h, v4.8h
849 sqdmulh v4.8h, v1.8h, XFIX_1_847759065
850 sub v2.8h, v15.8h, v13.8h
851 add v3.8h, v3.8h, v6.8h
852 sqdmulh v6.8h, v2.8h, XFIX_1_414213562
853 add v1.8h, v1.8h, v4.8h
854 sqdmulh v4.8h, v5.8h, XFIX_1_082392200
855 sub v10.8h, v10.8h, v14.8h
856 add v2.8h, v2.8h, v6.8h
857 sub v6.8h, v8.8h, v12.8h
858 add v12.8h, v8.8h, v12.8h
859 add v9.8h, v5.8h, v4.8h
860 add v5.8h, v6.8h, v10.8h
861 sub v10.8h, v6.8h, v10.8h
862 add v6.8h, v15.8h, v13.8h
863 add v8.8h, v12.8h, v14.8h
864 sub v3.8h, v6.8h, v3.8h
865 sub v12.8h, v12.8h, v14.8h
866 sub v3.8h, v3.8h, v1.8h
867 sub v1.8h, v9.8h, v1.8h
868 add v2.8h, v3.8h, v2.8h
869 sub v15.8h, v8.8h, v6.8h
870 add v1.8h, v1.8h, v2.8h
871 add v8.8h, v8.8h, v6.8h
872 add v14.8h, v5.8h, v3.8h
873 sub v9.8h, v5.8h, v3.8h
874 sub v13.8h, v10.8h, v2.8h
875 add v10.8h, v10.8h, v2.8h
876 /* Transpose q8-q9 */
877 mov v18.16b, v8.16b
878 trn1 v8.8h, v8.8h, v9.8h
879 trn2 v9.8h, v18.8h, v9.8h
880 sub v11.8h, v12.8h, v1.8h
881 /* Transpose q14-q15 */
882 mov v18.16b, v14.16b
883 trn1 v14.8h, v14.8h, v15.8h
884 trn2 v15.8h, v18.8h, v15.8h
885 add v12.8h, v12.8h, v1.8h
886 /* Transpose q10-q11 */
887 mov v18.16b, v10.16b
888 trn1 v10.8h, v10.8h, v11.8h
889 trn2 v11.8h, v18.8h, v11.8h
890 /* Transpose q12-q13 */
891 mov v18.16b, v12.16b
892 trn1 v12.8h, v12.8h, v13.8h
893 trn2 v13.8h, v18.8h, v13.8h
894 /* Transpose q9-q11 */
895 mov v18.16b, v9.16b
896 trn1 v9.4s, v9.4s, v11.4s
897 trn2 v11.4s, v18.4s, v11.4s
898 /* Transpose q12-q14 */
899 mov v18.16b, v12.16b
900 trn1 v12.4s, v12.4s, v14.4s
901 trn2 v14.4s, v18.4s, v14.4s
902 /* Transpose q8-q10 */
903 mov v18.16b, v8.16b
904 trn1 v8.4s, v8.4s, v10.4s
905 trn2 v10.4s, v18.4s, v10.4s
906 /* Transpose q13-q15 */
907 mov v18.16b, v13.16b
908 trn1 v13.4s, v13.4s, v15.4s
909 trn2 v15.4s, v18.4s, v15.4s
910 /* vswp v14.4h, v10-MSB.4h */
911 umov x22, v14.d[0]
912 ins v14.2d[0], v10.2d[1]
913 ins v10.2d[1], x22
914 /* vswp v13.4h, v9MSB.4h */
915
916 umov x22, v13.d[0]
917 ins v13.2d[0], v9.2d[1]
918 ins v9.2d[1], x22
919 /* 1-D IDCT, pass 2 */
920 sub v2.8h, v10.8h, v14.8h
921 /* vswp v15.4h, v11MSB.4h */
922 umov x22, v15.d[0]
923 ins v15.2d[0], v11.2d[1]
924 ins v11.2d[1], x22
925 add v14.8h, v10.8h, v14.8h
926 /* vswp v12.4h, v8-MSB.4h */
927 umov x22, v12.d[0]
928 ins v12.2d[0], v8.2d[1]
929 ins v8.2d[1], x22
930 sub v1.8h, v11.8h, v13.8h
931 add v13.8h, v11.8h, v13.8h
932 sub v5.8h, v9.8h, v15.8h
933 add v15.8h, v9.8h, v15.8h
934 sqdmulh v4.8h, v2.8h, XFIX_1_414213562
935 sqdmulh v6.8h, v1.8h, XFIX_2_613125930
936 add v3.8h, v1.8h, v1.8h
937 sub v1.8h, v5.8h, v1.8h
938 add v10.8h, v2.8h, v4.8h
939 sqdmulh v4.8h, v1.8h, XFIX_1_847759065
940 sub v2.8h, v15.8h, v13.8h
941 add v3.8h, v3.8h, v6.8h
942 sqdmulh v6.8h, v2.8h, XFIX_1_414213562
943 add v1.8h, v1.8h, v4.8h
944 sqdmulh v4.8h, v5.8h, XFIX_1_082392200
945 sub v10.8h, v10.8h, v14.8h
946 add v2.8h, v2.8h, v6.8h
947 sub v6.8h, v8.8h, v12.8h
948 add v12.8h, v8.8h, v12.8h
949 add v9.8h, v5.8h, v4.8h
950 add v5.8h, v6.8h, v10.8h
951 sub v10.8h, v6.8h, v10.8h
952 add v6.8h, v15.8h, v13.8h
953 add v8.8h, v12.8h, v14.8h
954 sub v3.8h, v6.8h, v3.8h
955 sub v12.8h, v12.8h, v14.8h
956 sub v3.8h, v3.8h, v1.8h
957 sub v1.8h, v9.8h, v1.8h
958 add v2.8h, v3.8h, v2.8h
959 sub v15.8h, v8.8h, v6.8h
960 add v1.8h, v1.8h, v2.8h
961 add v8.8h, v8.8h, v6.8h
962 add v14.8h, v5.8h, v3.8h
963 sub v9.8h, v5.8h, v3.8h
964 sub v13.8h, v10.8h, v2.8h
965 add v10.8h, v10.8h, v2.8h
966 sub v11.8h, v12.8h, v1.8h
967 add v12.8h, v12.8h, v1.8h
968 /* Descale to 8-bit and range limit */
969 movi v0.16b, #0x80
970 sqshrn v8.8b, v8.8h, #5
971 sqshrn2 v8.16b, v9.8h, #5
972 sqshrn v9.8b, v10.8h, #5
973 sqshrn2 v9.16b, v11.8h, #5
974 sqshrn v10.8b, v12.8h, #5
975 sqshrn2 v10.16b, v13.8h, #5
976 sqshrn v11.8b, v14.8h, #5
977 sqshrn2 v11.16b, v15.8h, #5
978 add v8.16b, v8.16b, v0.16b
979 add v9.16b, v9.16b, v0.16b
980 add v10.16b, v10.16b, v0.16b
981 add v11.16b, v11.16b, v0.16b
982 /* Transpose the final 8-bit samples */
983 /* Transpose q8-q9 */
984 mov v18.16b, v8.16b
985 trn1 v8.8h, v8.8h, v9.8h
986 trn2 v9.8h, v18.8h, v9.8h
987 /* Transpose q10-q11 */
988 mov v18.16b, v10.16b
989 trn1 v10.8h, v10.8h, v11.8h
990 trn2 v11.8h, v18.8h, v11.8h
991 /* Transpose q8-q10 */
992 mov v18.16b, v8.16b
993 trn1 v8.4s, v8.4s, v10.4s
994 trn2 v10.4s, v18.4s, v10.4s
995 /* Transpose q9-q11 */
996 mov v18.16b, v9.16b
997 trn1 v9.4s, v9.4s, v11.4s
998 trn2 v11.4s, v18.4s, v11.4s
999 /* make copy */
1000 ins v17.2d[0], v8.2d[1]
1001 /* Transpose d16-d17-msb */
1002 mov v18.16b, v8.16b
1003 trn1 v8.8b, v8.8b, v17.8b
1004 trn2 v17.8b, v18.8b, v17.8b
1005 /* make copy */
1006 ins v19.2d[0], v9.2d[1]
1007 mov v18.16b, v9.16b
1008 trn1 v9.8b, v9.8b, v19.8b
1009 trn2 v19.8b, v18.8b, v19.8b
1010 /* Store results to the output buffer */
1011 ldp TMP1, TMP2, [OUTPUT_BUF], 16
1012 add TMP1, TMP1, OUTPUT_COL
1013 add TMP2, TMP2, OUTPUT_COL
1014 st1 {v8.8b}, [TMP1]
1015 st1 {v17.8b}, [TMP2]
1016 ldp TMP1, TMP2, [OUTPUT_BUF], 16
1017 add TMP1, TMP1, OUTPUT_COL
1018 add TMP2, TMP2, OUTPUT_COL
1019 st1 {v9.8b}, [TMP1]
1020 /* make copy */
1021 ins v7.2d[0], v10.2d[1]
1022 mov v18.16b, v10.16b
1023 trn1 v10.8b, v10.8b, v7.8b
1024 trn2 v7.8b, v18.8b, v7.8b
1025 st1 {v19.8b}, [TMP2]
1026 ldp TMP1, TMP2, [OUTPUT_BUF], 16
1027 ldp TMP4, TMP5, [OUTPUT_BUF], 16
1028 add TMP1, TMP1, OUTPUT_COL
1029 add TMP2, TMP2, OUTPUT_COL
1030 add TMP4, TMP4, OUTPUT_COL
1031 add TMP5, TMP5, OUTPUT_COL
1032 st1 {v10.8b}, [TMP1]
1033 /* make copy */
1034 ins v16.2d[0], v11.2d[1]
1035 mov v18.16b, v11.16b
1036 trn1 v11.8b, v11.8b, v16.8b
1037 trn2 v16.8b, v18.8b, v16.8b
1038 st1 {v7.8b}, [TMP2]
1039 st1 {v11.8b}, [TMP4]
1040 st1 {v16.8b}, [TMP5]
1041 sub sp, sp, #176
1042 ldp x22, x23, [sp], 16
1043 ld1 {v0.8b - v3.8b}, [sp], 32
1044 ld1 {v4.8b - v7.8b}, [sp], 32
1045 ld1 {v8.8b - v11.8b}, [sp], 32
1046 ld1 {v12.8b - v15.8b}, [sp], 32
1047 ld1 {v16.8b - v19.8b}, [sp], 32
1048 blr x30
1049
1050 .unreq DCT_TABLE
1051 .unreq COEF_BLOCK
1052 .unreq OUTPUT_BUF
1053 .unreq OUTPUT_COL
1054 .unreq TMP1
1055 .unreq TMP2
1056 .unreq TMP3
1057 .unreq TMP4
1058
1059
1060 /*****************************************************************************/
1061
1062 /*
1063 * jsimd_idct_4x4_neon
1064 *
1065 * This function contains inverse-DCT code for getting reduced-size
1066 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
1067 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
1068 * function from jpeg-6b (jidctred.c).
1069 *
1070 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
1071 * requires much less arithmetic operations and hence should be faster.
1072 * The primary purpose of this particular NEON optimized function is
1073 * bit exact compatibility with jpeg-6b.
1074 *
1075 * TODO: a bit better instructions scheduling can be achieved by expanding
1076 * idct_helper/transpose_4x4 macros and reordering instructions,
1077 * but readability will suffer somewhat.
1078 */
1079
1080 #define CONST_BITS 13
1081
1082 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */
1083 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */
1084 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */
1085 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */
1086 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */
1087 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */
1088 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */
1089 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */
1090 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */
1091 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */
1092 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */
1093 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */
1094 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */
1095 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */
1096
1097 .balign 16
1098 jsimd_idct_4x4_neon_consts:
1099 .short FIX_1_847759065 /* v0.4h[0] */
1100 .short -FIX_0_765366865 /* v0.4h[1] */
1101 .short -FIX_0_211164243 /* v0.4h[2] */
1102 .short FIX_1_451774981 /* v0.4h[3] */
1103 .short -FIX_2_172734803 /* d1[0] */
1104 .short FIX_1_061594337 /* d1[1] */
1105 .short -FIX_0_509795579 /* d1[2] */
1106 .short -FIX_0_601344887 /* d1[3] */
1107 .short FIX_0_899976223 /* v2.4h[0] */
1108 .short FIX_2_562915447 /* v2.4h[1] */
1109 .short 1 << (CONST_BITS+1) /* v2.4h[2] */
1110 .short 0 /* v2.4h[3] */
1111
/*
 * idct_helper: one 4-point IDCT pass over four 16-bit columns.
 *   \x4..\x16 : input rows 0,1,2,3,5,6,7 of the coefficient block
 *               (row 4 is never needed for the 4x4 reduced IDCT)
 *   \shift    : descaling shift — 12 for pass 1, 19 for pass 2 (call sites)
 *   \y26..\y29: four 4x16-bit output rows
 * Uses v20, v24, v26, v28, v30 as 32-bit accumulators; the fixed-point
 * constants are preloaded into v0.4h-v2.4h from jsimd_idct_4x4_neon_consts.
 */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    smull           v28.4s, \x4, v2.4h[2]   /* row0 << (CONST_BITS+1)        */
    smlal           v28.4s, \x8, v0.4h[0]   /* + row2 * FIX_1_847759065      */
    smlal           v28.4s, \x14, v0.4h[1]  /* - row6 * FIX_0_765366865      */

    /* odd-part accumulator #1 (rows 1, 3, 5, 7) */
    smull           v26.4s, \x16, v1.4h[2]
    smlal           v26.4s, \x12, v1.4h[3]
    smlal           v26.4s, \x10, v2.4h[0]
    smlal           v26.4s, \x6, v2.4h[1]

    /* even part with the row2/row6 contribution subtracted */
    smull           v30.4s, \x4, v2.4h[2]
    smlsl           v30.4s, \x8, v0.4h[0]
    smlsl           v30.4s, \x14, v0.4h[1]

    /* odd-part accumulator #2 */
    smull           v24.4s, \x16, v0.4h[2]
    smlal           v24.4s, \x12, v0.4h[3]
    smlal           v24.4s, \x10, v1.4h[0]
    smlal           v24.4s, \x6, v1.4h[1]

    add             v20.4s, v28.4s, v26.4s
    sub             v28.4s, v28.4s, v26.4s

.if \shift > 16
    /* rshrn's immediate only encodes shifts up to 16:
     * round-shift in place first, then narrow separately. */
    srshr           v20.4s, v20.4s, #\shift
    srshr           v28.4s, v28.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y29, v28.4s
.else
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y29, v28.4s, #\shift
.endif

    add             v20.4s, v30.4s, v24.4s
    sub             v30.4s, v30.4s, v24.4s

.if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v30.4s, v30.4s, #\shift
    xtn             \y27, v20.4s
    xtn             \y28, v30.4s
.else
    rshrn           \y27, v20.4s, #\shift
    rshrn           \y28, v30.4s, #\shift
.endif

.endm
1158
/*
 * jsimd_idct_4x4_neon
 *
 * In (AAPCS64): x0 = dct_table, x1 = coef_block,
 *               x2 = output_buf, x3 = output_col
 * NOTE(review): argument meanings inferred from the alias names and the
 * body's load/store pattern — confirm against the jsimd_arm64.c caller.
 */
asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    /* x0-x2 are recycled as scratch after their inputs are consumed */
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    /* x15 is a caller-saved temporary in AAPCS64; it is spilled below anyway */
    TMP4            .req x15
1169
1170 /* Save all used NEON registers */
1171 sub sp, sp, 272
1172 str x15, [sp], 16
1173 /* Load constants (v3.4h is just used for padding) */
1174 adr TMP4, jsimd_idct_4x4_neon_consts
1175 st1 {v0.8b - v3.8b}, [sp], 32
1176 st1 {v4.8b - v7.8b}, [sp], 32
1177 st1 {v8.8b - v11.8b}, [sp], 32
1178 st1 {v12.8b - v15.8b}, [sp], 32
1179 st1 {v16.8b - v19.8b}, [sp], 32
1180 st1 {v20.8b - v23.8b}, [sp], 32
1181 st1 {v24.8b - v27.8b}, [sp], 32
1182 st1 {v28.8b - v31.8b}, [sp], 32
1183 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
1184
1185 /* Load all COEF_BLOCK into NEON registers with the following allocation:
1186 * 0 1 2 3 | 4 5 6 7
1187 * ---------+--------
1188 * 0 | v4.4h | v5.4h
1189 * 1 | v6.4h | v7.4h
1190 * 2 | v8.4h | v9.4h
1191 * 3 | v10.4h | v11.4h
1192 * 4 | - | -
1193 * 5 | v12.4h | v13.4h
1194 * 6 | v14.4h | v15.4h
1195 * 7 | v16.4h | v17.4h
1196 */
1197 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1198 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
1199 add COEF_BLOCK, COEF_BLOCK, #16
1200 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
1201 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
1202 /* dequantize */
1203 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1204 mul v4.4h, v4.4h, v18.4h
1205 mul v5.4h, v5.4h, v19.4h
1206 ins v4.2d[1], v5.2d[0] /* 128 bit q4 */
1207 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
1208 mul v6.4h, v6.4h, v20.4h
1209 mul v7.4h, v7.4h, v21.4h
1210 ins v6.2d[1], v7.2d[0] /* 128 bit q6 */
1211 mul v8.4h, v8.4h, v22.4h
1212 mul v9.4h, v9.4h, v23.4h
1213 ins v8.2d[1], v9.2d[0] /* 128 bit q8 */
1214 add DCT_TABLE, DCT_TABLE, #16
1215 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
1216 mul v10.4h, v10.4h, v24.4h
1217 mul v11.4h, v11.4h, v25.4h
1218 ins v10.2d[1], v11.2d[0] /* 128 bit q10 */
1219 mul v12.4h, v12.4h, v26.4h
1220 mul v13.4h, v13.4h, v27.4h
1221 ins v12.2d[1], v13.2d[0] /* 128 bit q12 */
1222 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
1223 mul v14.4h, v14.4h, v28.4h
1224 mul v15.4h, v15.4h, v29.4h
1225 ins v14.2d[1], v15.2d[0] /* 128 bit q14 */
1226 mul v16.4h, v16.4h, v30.4h
1227 mul v17.4h, v17.4h, v31.4h
1228 ins v16.2d[1], v17.2d[0] /* 128 bit q16 */
1229
1230 /* Pass 1 */
1231 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4. 4h, v6.4h, v8.4h, v10.4h
1232 transpose_4x4 v4, v6, v8, v10, v3
1233 ins v10.2d[1], v11.2d[0]
1234 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5. 4h, v7.4h, v9.4h, v11.4h
1235 transpose_4x4 v5, v7, v9, v11, v3
1236 ins v10.2d[1], v11.2d[0]
1237 /* Pass 2 */
1238 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4 h, v27.4h, v28.4h, v29.4h
1239 transpose_4x4 v26, v27, v28, v29, v3
1240
1241 /* Range limit */
1242 movi v30.8h, #0x80
1243 ins v26.2d[1], v27.2d[0]
1244 ins v28.2d[1], v29.2d[0]
1245 add v26.8h, v26.8h, v30.8h
1246 add v28.8h, v28.8h, v30.8h
1247 sqxtun v26.8b, v26.8h
1248 sqxtun v27.8b, v28.8h
1249
1250 /* Store results to the output buffer */
1251 ldp TMP1, TMP2, [OUTPUT_BUF], 16
1252 ldp TMP3, TMP4, [OUTPUT_BUF]
1253 add TMP1, TMP1, OUTPUT_COL
1254 add TMP2, TMP2, OUTPUT_COL
1255 add TMP3, TMP3, OUTPUT_COL
1256 add TMP4, TMP4, OUTPUT_COL
1257
1258 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
1259 /* We can use much less instructions on little endian systems if the
1260 * OS kernel is not configured to trap unaligned memory accesses
1261 */
1262 st1 {v26.s}[0], [TMP1], 4
1263 st1 {v27.s}[0], [TMP3], 4
1264 st1 {v26.s}[1], [TMP2], 4
1265 st1 {v27.s}[1], [TMP4], 4
1266 #else
1267 st1 {v26.b}[0], [TMP1], 1
1268 st1 {v27.b}[0], [TMP3], 1
1269 st1 {v26.b}[1], [TMP1], 1
1270 st1 {v27.b}[1], [TMP3], 1
1271 st1 {v26.b}[2], [TMP1], 1
1272 st1 {v27.b}[2], [TMP3], 1
1273 st1 {v26.b}[3], [TMP1], 1
1274 st1 {v27.b}[3], [TMP3], 1
1275
1276 st1 {v26.b}[4], [TMP2], 1
1277 st1 {v27.b}[4], [TMP4], 1
1278 st1 {v26.b}[5], [TMP2], 1
1279 st1 {v27.b}[5], [TMP4], 1
1280 st1 {v26.b}[6], [TMP2], 1
1281 st1 {v27.b}[6], [TMP4], 1
1282 st1 {v26.b}[7], [TMP2], 1
1283 st1 {v27.b}[7], [TMP4], 1
1284 #endif
1285
1286 /* vpop {v8.4h - v15.4h} ;not available */
1287 sub sp, sp, #272
1288 ldr x15, [sp], 16
1289 ld1 {v0.8b - v3.8b}, [sp], 32
1290 ld1 {v4.8b - v7.8b}, [sp], 32
1291 ld1 {v8.8b - v11.8b}, [sp], 32
1292 ld1 {v12.8b - v15.8b}, [sp], 32
1293 ld1 {v16.8b - v19.8b}, [sp], 32
1294 ld1 {v20.8b - v23.8b}, [sp], 32
1295 ld1 {v24.8b - v27.8b}, [sp], 32
1296 ld1 {v28.8b - v31.8b}, [sp], 32
1297 blr x30
1298
1299 .unreq DCT_TABLE
1300 .unreq COEF_BLOCK
1301 .unreq OUTPUT_BUF
1302 .unreq OUTPUT_COL
1303 .unreq TMP1
1304 .unreq TMP2
1305 .unreq TMP3
1306 .unreq TMP4
1307
1308 .purgem idct_helper
1309
1310
1311 /*****************************************************************************/
1312
1313 /*
1314 * jsimd_idct_2x2_neon
1315 *
1316 * This function contains inverse-DCT code for getting reduced-size
1317 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
1318 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1319 * function from jpeg-6b (jidctred.c).
1320 *
1321 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1322 * requires much less arithmetic operations and hence should be faster.
1323 * The primary purpose of this particular NEON optimized function is
1324 * bit exact compatibility with jpeg-6b.
1325 */
1326
1327 .balign 8
1328 jsimd_idct_2x2_neon_consts:
1329 .short -FIX_0_720959822 /* v14[0] */
1330 .short FIX_0_850430095 /* v14[1] */
1331 .short -FIX_1_272758580 /* v14[2] */
1332 .short FIX_3_624509785 /* v14[3] */
1333
/*
 * idct_helper: one 2-point IDCT pass over four 16-bit columns
 * (the 2x2 reduced IDCT only consumes rows 0, 1, 3, 5, 7 of the block).
 *   \x4       : DC row, scaled up by 2^15 via sshll
 *   \x6..\x16 : AC rows 1, 3, 5, 7
 *   \shift    : descaling shift — 13 for pass 1, 20 for pass 2 (call sites)
 *   \y26/\y27 : two 4x16-bit output rows
 * Clobbers v15, v20, v26; constants are preloaded in v14.4h.
 */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll           v15.4s, \x4, #15            /* tmp10 = dc << 15          */
    smull           v26.4s, \x6, v14.4h[3]      /* + row1 * FIX_3_624509785  */
    smlal           v26.4s, \x10, v14.4h[2]     /* - row3 * FIX_1_272758580  */
    smlal           v26.4s, \x12, v14.4h[1]     /* + row5 * FIX_0_850430095  */
    smlal           v26.4s, \x16, v14.4h[0]     /* - row7 * FIX_0_720959822  */

    add             v20.4s, v15.4s, v26.4s
    sub             v15.4s, v15.4s, v26.4s

.if \shift > 16
    /* rshrn's immediate only encodes shifts up to 16:
     * round-shift first, then narrow separately. */
    srshr           v20.4s, v20.4s, #\shift
    srshr           v15.4s, v15.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y27, v15.4s
.else
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y27, v15.4s, #\shift
.endif

.endm
1355
/*
 * jsimd_idct_2x2_neon
 *
 * In (AAPCS64): x0 = dct_table, x1 = coef_block,
 *               x2 = output_buf, x3 = output_col
 * NOTE(review): argument meanings inferred from the alias names and the
 * body's load/store pattern — confirm against the jsimd_arm64.c caller.
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    /* x15 is a caller-saved temporary in AAPCS64; it is spilled below anyway */
    TMP2            .req x15
1364
1365 /* vpush {v8.4h - v15.4h} ; not available */
1366 sub sp, sp, 208
1367 str x15, [sp], 16
1368
1369 /* Load constants */
1370 adr TMP2, jsimd_idct_2x2_neon_consts
1371 st1 {v4.8b - v7.8b}, [sp], 32
1372 st1 {v8.8b - v11.8b}, [sp], 32
1373 st1 {v12.8b - v15.8b}, [sp], 32
1374 st1 {v16.8b - v19.8b}, [sp], 32
1375 st1 {v21.8b - v22.8b}, [sp], 16
1376 st1 {v24.8b - v27.8b}, [sp], 32
1377 st1 {v30.8b - v31.8b}, [sp], 16
1378 ld1 {v14.4h}, [TMP2]
1379
1380 /* Load all COEF_BLOCK into NEON registers with the following allocation:
1381 * 0 1 2 3 | 4 5 6 7
1382 * ---------+--------
1383 * 0 | v4.4h | v5.4h
1384 * 1 | v6.4h | v7.4h
1385 * 2 | - | -
1386 * 3 | v10.4h | v11.4h
1387 * 4 | - | -
1388 * 5 | v12.4h | v13.4h
1389 * 6 | - | -
1390 * 7 | v16.4h | v17.4h
1391 */
1392 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1393 add COEF_BLOCK, COEF_BLOCK, #16
1394 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
1395 add COEF_BLOCK, COEF_BLOCK, #16
1396 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
1397 add COEF_BLOCK, COEF_BLOCK, #16
1398 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
1399 /* Dequantize */
1400 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1401 mul v4.4h, v4.4h, v18.4h
1402 mul v5.4h, v5.4h, v19.4h
1403 ins v4.2d[1], v5.2d[0]
1404 mul v6.4h, v6.4h, v20.4h
1405 mul v7.4h, v7.4h, v21.4h
1406 ins v6.2d[1], v7.2d[0]
1407 add DCT_TABLE, DCT_TABLE, #16
1408 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
1409 mul v10.4h, v10.4h, v24.4h
1410 mul v11.4h, v11.4h, v25.4h
1411 ins v10.2d[1], v11.2d[0]
1412 add DCT_TABLE, DCT_TABLE, #16
1413 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
1414 mul v12.4h, v12.4h, v26.4h
1415 mul v13.4h, v13.4h, v27.4h
1416 ins v12.2d[1], v13.2d[0]
1417 add DCT_TABLE, DCT_TABLE, #16
1418 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
1419 mul v16.4h, v16.4h, v30.4h
1420 mul v17.4h, v17.4h, v31.4h
1421 ins v16.2d[1], v17.2d[0]
1422
1423 /* Pass 1 */
1424 #if 0
1425 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
1426 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
1427 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
1428 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
1429 #else
1430 smull v26.4s, v6.4h, v14.4h[3]
1431 smlal v26.4s, v10.4h, v14.4h[2]
1432 smlal v26.4s, v12.4h, v14.4h[1]
1433 smlal v26.4s, v16.4h, v14.4h[0]
1434 smull v24.4s, v7.4h, v14.4h[3]
1435 smlal v24.4s, v11.4h, v14.4h[2]
1436 smlal v24.4s, v13.4h, v14.4h[1]
1437 smlal v24.4s, v17.4h, v14.4h[0]
1438 sshll v15.4s, v4.4h, #15
1439 sshll v30.4s, v5.4h, #15
1440 add v20.4s, v15.4s, v26.4s
1441 sub v15.4s, v15.4s, v26.4s
1442 rshrn v4.4h, v20.4s, #13
1443 rshrn v6.4h, v15.4s, #13
1444 add v20.4s, v30.4s, v24.4s
1445 sub v15.4s, v30.4s, v24.4s
1446 rshrn v5.4h, v20.4s, #13
1447 rshrn v7.4h, v15.4s, #13
1448 ins v4.2d[1], v5.2d[0]
1449 ins v6.2d[1], v7.2d[0]
1450 transpose v4, v6, v3, .16b, .8h
1451 transpose v6, v10, v3, .16b, .4s
1452 ins v11.2d[0], v10.2d[1]
1453 ins v7.2d[0], v6.2d[1]
1454 #endif
1455
1456 /* Pass 2 */
1457 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
1458
1459 /* Range limit */
1460 movi v30.8h, #0x80
1461 ins v26.2d[1], v27.2d[0]
1462 add v26.8h, v26.8h, v30.8h
1463 sqxtun v30.8b, v26.8h
1464 ins v26.2d[0], v30.2d[0]
1465 sqxtun v27.8b, v26.8h
1466
1467 /* Store results to the output buffer */
1468 ldp TMP1, TMP2, [OUTPUT_BUF]
1469 add TMP1, TMP1, OUTPUT_COL
1470 add TMP2, TMP2, OUTPUT_COL
1471
1472 st1 {v26.b}[0], [TMP1], 1
1473 st1 {v27.b}[4], [TMP1], 1
1474 st1 {v26.b}[1], [TMP2], 1
1475 st1 {v27.b}[5], [TMP2], 1
1476
1477 sub sp, sp, #208
1478 ldr x15, [sp], 16
1479 ld1 {v4.8b - v7.8b}, [sp], 32
1480 ld1 {v8.8b - v11.8b}, [sp], 32
1481 ld1 {v12.8b - v15.8b}, [sp], 32
1482 ld1 {v16.8b - v19.8b}, [sp], 32
1483 ld1 {v21.8b - v22.8b}, [sp], 16
1484 ld1 {v24.8b - v27.8b}, [sp], 32
1485 ld1 {v30.8b - v31.8b}, [sp], 16
1486 blr x30
1487
1488 .unreq DCT_TABLE
1489 .unreq COEF_BLOCK
1490 .unreq OUTPUT_BUF
1491 .unreq OUTPUT_COL
1492 .unreq TMP1
1493 .unreq TMP2
1494
1495 .purgem idct_helper
1496
1497
1498 /*****************************************************************************/
1499
1500 /*
1501 * jsimd_ycc_extrgb_convert_neon
1502 * jsimd_ycc_extbgr_convert_neon
1503 * jsimd_ycc_extrgbx_convert_neon
1504 * jsimd_ycc_extbgrx_convert_neon
1505 * jsimd_ycc_extxbgr_convert_neon
1506 * jsimd_ycc_extxrgb_convert_neon
1507 *
1508 * Colorspace conversion YCbCr -> RGB
1509 */
1510
1511
/*
 * do_load size: fetch \size pixels worth of Y/Cb/Cr samples into
 * v0 (Y), v4 (U/Cb), v5 (V/Cr), post-incrementing the row pointers.
 * The partial sizes 4/2/1 fill successive byte lanes (0-3, then 4-5,
 * then 6) so a trailing 4+2+1 sequence accumulates up to 7 leftover
 * pixels before a single conversion pass handles them all.
 */
.macro do_load size
.if \size == 8
    /* full vector: bulk 8-byte loads plus a prefetch of upcoming data */
    ld1             {v4.8b}, [U], 8
    ld1             {v5.8b}, [V], 8
    ld1             {v0.8b}, [Y], 8
    prfm            PLDL1KEEP, [U, #64]
    prfm            PLDL1KEEP, [V, #64]
    prfm            PLDL1KEEP, [Y, #64]
.elseif \size == 4
    ld1             {v4.b}[0], [U], 1
    ld1             {v4.b}[1], [U], 1
    ld1             {v4.b}[2], [U], 1
    ld1             {v4.b}[3], [U], 1
    ld1             {v5.b}[0], [V], 1
    ld1             {v5.b}[1], [V], 1
    ld1             {v5.b}[2], [V], 1
    ld1             {v5.b}[3], [V], 1
    ld1             {v0.b}[0], [Y], 1
    ld1             {v0.b}[1], [Y], 1
    ld1             {v0.b}[2], [Y], 1
    ld1             {v0.b}[3], [Y], 1
.elseif \size == 2
    ld1             {v4.b}[4], [U], 1
    ld1             {v4.b}[5], [U], 1
    ld1             {v5.b}[4], [V], 1
    ld1             {v5.b}[5], [V], 1
    ld1             {v0.b}[4], [Y], 1
    ld1             {v0.b}[5], [Y], 1
.elseif \size == 1
    ld1             {v4.b}[6], [U], 1
    ld1             {v5.b}[6], [V], 1
    ld1             {v0.b}[6], [Y], 1
.else
    .error unsupported macroblock size
.endif
.endm
1548
/*
 * do_store bpp, size: write \size converted pixels to [RGB], advancing
 * RGB by the number of bytes written.  For 24/32 bpp the channels are
 * interleaved from v10/v11/v12(/v13); for 16 bpp the pre-packed RGB565
 * halfwords are taken from v25.  The partial sizes 4/2/1 consume the
 * same lanes (0-3, 4-5, 6) that do_load filled.
 */
.macro do_store bpp, size
.if \bpp == 24
.if \size == 8
    st3             {v10.8b, v11.8b, v12.8b}, [RGB], 24
.elseif \size == 4
    st3             {v10.b, v11.b, v12.b}[0], [RGB], 3
    st3             {v10.b, v11.b, v12.b}[1], [RGB], 3
    st3             {v10.b, v11.b, v12.b}[2], [RGB], 3
    st3             {v10.b, v11.b, v12.b}[3], [RGB], 3
.elseif \size == 2
    st3             {v10.b, v11.b, v12.b}[4], [RGB], 3
    st3             {v10.b, v11.b, v12.b}[5], [RGB], 3
.elseif \size == 1
    st3             {v10.b, v11.b, v12.b}[6], [RGB], 3
.else
    .error unsupported macroblock size
.endif
.elseif \bpp == 32
.if \size == 8
    st4             {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
.elseif \size == 4
    st4             {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
    st4             {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
    st4             {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
    st4             {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
.elseif \size == 2
    st4             {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
    st4             {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
.elseif \size == 1
    st4             {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
.else
    .error unsupported macroblock size
.endif
.elseif \bpp == 16
.if \size == 8
    st1             {v25.8h}, [RGB], 16
.elseif \size == 4
    st1             {v25.4h}, [RGB], 8
.elseif \size == 2
    st1             {v25.h}[4], [RGB], 2
    st1             {v25.h}[5], [RGB], 2
.elseif \size == 1
    st1             {v25.h}[6], [RGB], 2
.else
    .error unsupported macroblock size
.endif
.else
    .error unsupported bpp
.endif
.endm
1599
1600 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
1601
1602 /*
1603 * 2-stage pipelined YCbCr->RGB conversion
1604 */
1605
1606 .macro do_yuv_to_rgb_stage1
1607 uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */
1608 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
1609 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
1610 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
1611 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
1612 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
1613 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
1614 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
1615 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
1616 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
1617 .endm
1618
1619 .macro do_yuv_to_rgb_stage2
1620 rshrn v20.4h, v20.4s, #15
1621 rshrn2 v20.8h, v22.4s, #15
1622 rshrn v24.4h, v24.4s, #14
1623 rshrn2 v24.8h, v26.4s, #14
1624 rshrn v28.4h, v28.4s, #14
1625 rshrn2 v28.8h, v30.4s, #14
1626 uaddw v20.8h, v20.8h, v0.8b
1627 uaddw v24.8h, v24.8h, v0.8b
1628 uaddw v28.8h, v28.8h, v0.8b
1629 .if \bpp != 16
1630 sqxtun v1\g_offs\defsize, v20.8h
1631 sqxtun v1\r_offs\defsize, v24.8h
1632 sqxtun v1\b_offs\defsize, v28.8h
1633 .else
1634 sqshlu v21.8h, v20.8h, #8
1635 sqshlu v25.8h, v24.8h, #8
1636 sqshlu v29.8h, v28.8h, #8
1637 sri v25.8h, v21.8h, #5
1638 sri v25.8h, v29.8h, #11
1639 .endif
1640
1641 .endm
1642
1643 .macro do_yuv_to_rgb_stage2_store_load_stage1
1644 rshrn v20.4h, v20.4s, #15
1645 rshrn v24.4h, v24.4s, #14
1646 rshrn v28.4h, v28.4s, #14
1647 ld1 {v4.8b}, [U], 8
1648 rshrn2 v20.8h, v22.4s, #15
1649 rshrn2 v24.8h, v26.4s, #14
1650 rshrn2 v28.8h, v30.4s, #14
1651 ld1 {v5.8b}, [V], 8
1652 uaddw v20.8h, v20.8h, v0.8b
1653 uaddw v24.8h, v24.8h, v0.8b
1654 uaddw v28.8h, v28.8h, v0.8b
1655 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/
1656 sqxtun v1\g_offs\defsize, v20.8h
1657 ld1 {v0.8b}, [Y], 8
1658 sqxtun v1\r_offs\defsize, v24.8h
1659 prfm PLDL1KEEP, [U, #64]
1660 prfm PLDL1KEEP, [V, #64]
1661 prfm PLDL1KEEP, [Y, #64]
1662 sqxtun v1\b_offs\defsize, v28.8h
1663 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
1664 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
1665 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
1666 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
1667 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
1668 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
1669 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
1670 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
1671 .else /**************************** rgb565 ***********************************/
1672 sqshlu v21.8h, v20.8h, #8
1673 sqshlu v25.8h, v24.8h, #8
1674 sqshlu v29.8h, v28.8h, #8
1675 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
1676 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
1677 ld1 {v0.8b}, [Y], 8
1678 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
1679 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
1680 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
1681 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
1682 sri v25.8h, v21.8h, #5
1683 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
1684 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
1685 prfm PLDL1KEEP, [U, #64]
1686 prfm PLDL1KEEP, [V, #64]
1687 prfm PLDL1KEEP, [Y, #64]
1688 sri v25.8h, v29.8h, #11
1689 .endif
1690 do_store \bpp, 8
1691 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
1692 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
1693 .endm
1694
1695 .macro do_yuv_to_rgb
1696 do_yuv_to_rgb_stage1
1697 do_yuv_to_rgb_stage2
1698 .endm
1699
1700 /* Apple gas crashes on adrl, work around that by using adr.
1701 * But this requires a copy of these constants for each function.
1702 */
1703
1704 .balign 16
1705 jsimd_ycc_\colorid\()_neon_consts:
1706 .short 0, 0, 0, 0
1707 .short 22971, -11277, -23401, 29033
1708 .short -128, -128, -128, -128
1709 .short -128, -128, -128, -128
1710
/*
 * jsimd_ycc_<colorid>_convert_neon: YCbCr -> RGB row conversion.
 *
 * In (AAPCS64): x0 = output_width (pixels per row)
 *               x1 = input_buf (pointers to the Y/Cb/Cr plane row arrays;
 *                               unpacked into INPUT_BUF0..2 below)
 *               x2 = input_row, x3 = output_buf, x4 = num_rows
 * NOTE(review): argument meanings inferred from the alias names and the
 * outer-loop structure — confirm against the jsimd_arm64.c caller.
 */
asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req x0
    INPUT_BUF       .req x1
    INPUT_ROW       .req x2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req x4

    /* input_buf[0..2] go to x5/x6 and back into x1 (freed by .unreq later) */
    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req INPUT_BUF

    RGB             .req x7
    Y               .req x8
    U               .req x9
    V               .req x10
    /* x15 = remaining pixel count for the inner loop */
    N               .req x15
1727
1728 sub sp, sp, 336
1729 str x15, [sp], 16
1730 /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
1731 adr x15, jsimd_ycc_\colorid\()_neon_consts
1732 /* Save NEON registers */
1733 st1 {v0.8b - v3.8b}, [sp], 32
1734 st1 {v4.8b - v7.8b}, [sp], 32
1735 st1 {v8.8b - v11.8b}, [sp], 32
1736 st1 {v12.8b - v15.8b}, [sp], 32
1737 st1 {v16.8b - v19.8b}, [sp], 32
1738 st1 {v20.8b - v23.8b}, [sp], 32
1739 st1 {v24.8b - v27.8b}, [sp], 32
1740 st1 {v28.8b - v31.8b}, [sp], 32
1741 ld1 {v0.4h, v1.4h}, [x15], 16
1742 ld1 {v2.8h}, [x15]
1743
1744 /* Save ARM registers and handle input arguments */
1745 /* push {x4, x5, x6, x7, x8, x9, x10, x30} */
1746 stp x4, x5, [sp], 16
1747 stp x6, x7, [sp], 16
1748 stp x8, x9, [sp], 16
1749 stp x10, x30, [sp], 16
1750 ldr INPUT_BUF0, [INPUT_BUF]
1751 ldr INPUT_BUF1, [INPUT_BUF, 8]
1752 ldr INPUT_BUF2, [INPUT_BUF, 16]
1753 .unreq INPUT_BUF
1754
1755 /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
1756 movi v10.16b, #255
1757 movi v13.16b, #255
1758
1759 /* Outer loop over scanlines */
1760 cmp NUM_ROWS, #1
1761 blt 9f
1762 0:
1763 lsl x16, INPUT_ROW, #3
1764 ldr Y, [INPUT_BUF0, x16]
1765 ldr U, [INPUT_BUF1, x16]
1766 mov N, OUTPUT_WIDTH
1767 ldr V, [INPUT_BUF2, x16]
1768 add INPUT_ROW, INPUT_ROW, #1
1769 ldr RGB, [OUTPUT_BUF], #8
1770
1771 /* Inner loop over pixels */
1772 subs N, N, #8
1773 blt 3f
1774 do_load 8
1775 do_yuv_to_rgb_stage1
1776 subs N, N, #8
1777 blt 2f
1778 1:
1779 do_yuv_to_rgb_stage2_store_load_stage1
1780 subs N, N, #8
1781 bge 1b
1782 2:
1783 do_yuv_to_rgb_stage2
1784 do_store \bpp, 8
1785 tst N, #7
1786 beq 8f
1787 3:
1788 tst N, #4
1789 beq 3f
1790 do_load 4
1791 3:
1792 tst N, #2
1793 beq 4f
1794 do_load 2
1795 4:
1796 tst N, #1
1797 beq 5f
1798 do_load 1
1799 5:
1800 do_yuv_to_rgb
1801 tst N, #4
1802 beq 6f
1803 do_store \bpp, 4
1804 6:
1805 tst N, #2
1806 beq 7f
1807 do_store \bpp, 2
1808 7:
1809 tst N, #1
1810 beq 8f
1811 do_store \bpp, 1
1812 8:
1813 subs NUM_ROWS, NUM_ROWS, #1
1814 bgt 0b
1815 9:
1816 /* Restore all registers and return */
1817 sub sp, sp, #336
1818 ldr x15, [sp], 16
1819 ld1 {v0.8b - v3.8b}, [sp], 32
1820 ld1 {v4.8b - v7.8b}, [sp], 32
1821 ld1 {v8.8b - v11.8b}, [sp], 32
1822 ld1 {v12.8b - v15.8b}, [sp], 32
1823 ld1 {v16.8b - v19.8b}, [sp], 32
1824 ld1 {v20.8b - v23.8b}, [sp], 32
1825 ld1 {v24.8b - v27.8b}, [sp], 32
1826 ld1 {v28.8b - v31.8b}, [sp], 32
1827 /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
1828 ldp x4, x5, [sp], 16
1829 ldp x6, x7, [sp], 16
1830 ldp x8, x9, [sp], 16
1831 ldp x10, x30, [sp], 16
1832 br x30
1833 .unreq OUTPUT_WIDTH
1834 .unreq INPUT_ROW
1835 .unreq OUTPUT_BUF
1836 .unreq NUM_ROWS
1837 .unreq INPUT_BUF0
1838 .unreq INPUT_BUF1
1839 .unreq INPUT_BUF2
1840 .unreq RGB
1841 .unreq Y
1842 .unreq U
1843 .unreq V
1844 .unreq N
1845
1846 .purgem do_yuv_to_rgb
1847 .purgem do_yuv_to_rgb_stage1
1848 .purgem do_yuv_to_rgb_stage2
1849 .purgem do_yuv_to_rgb_stage2_store_load_stage1
1850 .endm
1851
1852 /*--------------------------------- id ----- bpp R rsize G gsize B bsize d efsize */
1853 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, . 8b
1854 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, . 8b
1855 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, . 8b
1856 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, . 8b
1857 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, . 8b
1858 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, . 8b
1859 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, . 8b
1860 .purgem do_load
1861 .purgem do_store
OLDNEW
« no previous file with comments | « simd/jsimd_arm64.c ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698