Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(124)

Side by Side Diff: simd/jsimd_arm_neon_64.S

Issue 434123003: Add ARM64 SIMD support to libjpeg_turbo (Closed) Base URL: http://src.chromium.org/svn/trunk/deps/third_party/libjpeg_turbo
Patch Set: Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« libjpeg.gyp ('K') | « simd/jsimd_arm64.c ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 * ARMv8 NEON optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved.
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * Copyright (C) 2013, Linaro Limited
8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
9 *
10 * This software is provided 'as-is', without any express or implied
11 * warranty. In no event will the authors be held liable for any damages
12 * arising from the use of this software.
13 *
14 * Permission is granted to anyone to use this software for any purpose,
15 * including commercial applications, and to alter it and redistribute it
16 * freely, subject to the following restrictions:
17 *
18 * 1. The origin of this software must not be misrepresented; you must not
19 * claim that you wrote the original software. If you use this software
20 * in a product, an acknowledgment in the product documentation would be
21 * appreciated but is not required.
22 * 2. Altered source versions must be plainly marked as such, and must not be
23 * misrepresented as being the original software.
24 * 3. This notice may not be removed or altered from any source distribution.
25 */
26
27 #if defined(__linux__) && defined(__ELF__)
28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
29 #endif
30
31 .text
32 .arch armv8-a+fp+simd
33
34
35 #define RESPECT_STRICT_ALIGNMENT 1
36
37
38
39 /*****************************************************************************/
40
/* Supplementary macro for setting function attributes.
 * Emits the platform-appropriate symbol directives and entry label for a
 * global function named \fname:
 *   - Mach-O (__APPLE__): the symbol gets the leading underscore the ABI
 *     requires; only .globl is emitted (no ELF-specific directives).
 *   - Everything else: plain symbol name; on ELF the symbol is additionally
 *     marked .hidden (not exported from a shared object) and typed as
 *     %function so tools see it as code.
 * .func pairs with the .endfunc at the end of each routine (debug info).
 */
.macro asm_function fname
#ifdef __APPLE__
    .func _\fname
    .globl _\fname
_\fname:
#else
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
57
/* Transpose the two elements of a single 128-bit register pair:
 * copies lane 0 of \a into scratch \tmp, moves lane 1 of \a into \b,
 * then trn1/trn2 interleave so \a and \b hold the transposed pair.
 * \tmplen selects the element size for the lane moves, \literal the
 * arrangement for the trn instructions. */
.macro transpose_single a, b, tmp, tmplen, literal
    ins  \tmp\tmplen[0], \a\tmplen[0]
    ins  \b\tmplen[0], \a\tmplen[1]
    trn1 \a\literal, \a\literal, \b\literal
    trn2 \b\literal, \tmp\literal, \b\literal
.endm
65
/* Transpose corresponding elements of two different registers:
 * keeps a copy of \a in scratch \tmp, then interleaves \a/\b with
 * trn1/trn2 using the \literal arrangement. */
.macro transpose a, b, tmp, tmplen, literal
    mov  \tmp\tmplen, \a\tmplen
    trn1 \a\literal, \a\literal, \b\literal
    trn2 \b\literal, \tmp\literal, \b\literal
.endm
72
/* Transpose a block of 4x4 coefficients in four 64-bit registers,
 * at 32-bit element granularity: interleaves \x0 with \x2 and \x1 with
 * \x3, using \xi as scratch.
 * NOTE(review): the trn2 operands reuse \x0len/\x1len for the scratch
 * copy instead of \x2len/\x3len; harmless as long as every call site
 * passes the same length suffix for all four registers (all current
 * callers do) — confirm before adding callers with mixed suffixes. */
.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
    mov  \xi\xilen, \x0\xilen
    trn1 \x0\x0len, \x0\x0len, \x2\x2len
    trn2 \x2\x2len, \xi\x0len, \x2\x2len
    mov  \xi\xilen, \x1\xilen
    trn1 \x1\x1len, \x1\x1len, \x3\x3len
    trn2 \x3\x3len, \xi\x1len, \x3\x3len
.endm
82
/* Transpose a block of 4x4 coefficients in four 64-bit registers,
 * at 16-bit element granularity: interleaves \x0 with \x1 and \x2 with
 * \x3, using \xi as scratch.
 * NOTE(review): several operands mix the length suffixes of different
 * parameters (\x1\x2len, \xi\x0len, \xi\x1len); this is only correct
 * because all current callers pass an identical suffix for every
 * register — confirm before reusing with heterogeneous suffixes. */
.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen
    mov  \xi\xilen, \x0\xilen
    trn1 \x0\x0len, \x0\x0len, \x1\x1len
    trn2 \x1\x2len, \xi\x0len, \x1\x2len
    mov  \xi\xilen, \x2\xilen
    trn1 \x2\x2len, \x2\x2len, \x3\x3len
    trn2 \x3\x2len, \xi\x1len, \x3\x3len
.endm
91
/* Fully transpose a 4x4 block of 16-bit elements held in four 64-bit
 * registers: first interleave at 16-bit granularity, then at 32-bit
 * granularity.  \x5 is a scratch register and is clobbered. */
.macro transpose_4x4 x0, x1, x2, x3,x5
    transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
    transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
.endm
96
97
98 #define CENTERJSAMPLE 128
99
100 /*****************************************************************************/
101
102 /*
103 * Perform dequantization and inverse DCT on one block of coefficients.
104 *
105 * GLOBAL(void)
106 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
107 * JSAMPARRAY output_buf, JDIMENSION output_col)
108 */
109
/* Fixed-point multiplier constants for the slow-but-accurate IDCT,
 * scaled by 2^13 (CONST_BITS = 13): each value is round(x * 8192),
 * e.g. 0.298631336 * 8192 ~= 2446. */
#define FIX_0_298631336  (2446)
#define FIX_0_390180644  (3196)
#define FIX_0_541196100  (4433)
#define FIX_0_765366865  (6270)
#define FIX_0_899976223  (7373)
#define FIX_1_175875602  (9633)
#define FIX_1_501321110  (12299)
#define FIX_1_847759065  (15137)
#define FIX_1_961570560  (16069)
#define FIX_2_053119869  (16819)
#define FIX_2_562915447  (20995)
#define FIX_3_072711026  (25172)

/* Pre-combined differences/sums so each product needs only one
 * multiplier and fits a 16-bit NEON multiply operand. */
#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
131
/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'.
 *
 * Apparently never expanded in this file: it serves as documentation of
 * the exact computation the hand-scheduled NEON code below performs
 * (register q1..q7 roughly correspond to the v-register usage).  The
 * tmp0..tmp13 outputs are assumed to be declared by the surrounding
 * scope when the macro is used.
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
    INT32 q1, q2, q3, q4, q5, q6, q7; \
    INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
    \
    /* 1-D iDCT input data */ \
    row0 = xrow0; \
    row1 = xrow1; \
    row2 = xrow2; \
    row3 = xrow3; \
    row4 = xrow4; \
    row5 = xrow5; \
    row6 = xrow6; \
    row7 = xrow7; \
    \
    q5 = row7 + row3; \
    q4 = row5 + row1; \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
         MULTIPLY(q4, FIX_1_175875602); \
    q7 = MULTIPLY(q5, FIX_1_175875602) + \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
    q2 = MULTIPLY(row2, FIX_0_541196100) + \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
    q4 = q6; \
    q3 = ((INT32) row0 - (INT32) row4) << 13; \
    q6 += MULTIPLY(row5, -FIX_2_562915447) + \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
    /* now we can use q1 (reloadable constants have been used up) */ \
    q1 = q3 + q2; \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
          MULTIPLY(row1, -FIX_0_899976223); \
    q5 = q7; \
    q1 = q1 + q6; \
    q7 += MULTIPLY(row7, -FIX_0_899976223) + \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
    \
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
    tmp11_plus_tmp2 = q1; \
    row1 = 0; \
    \
    q1 = q1 - q6; \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
          MULTIPLY(row3, -FIX_2_562915447); \
    q1 = q1 - q6; \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
         MULTIPLY(row6, FIX_0_541196100); \
    q3 = q3 - q2; \
    \
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
    tmp11_minus_tmp2 = q1; \
    \
    q1 = ((INT32) row0 + (INT32) row4) << 13; \
    q2 = q1 + q6; \
    q1 = q1 - q6; \
    \
    /* pick up the results */ \
    tmp0  = q4; \
    tmp1  = q5; \
    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
    tmp3  = q7; \
    tmp10 = q2; \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
    tmp12 = q3; \
    tmp13 = q1; \
}
202
/* Lane aliases used as by-element multiply operands (smull/smlal/smlsl).
 * The lane order here MUST match the order of the .short entries in
 * jsimd_idct_islow_neon_consts below: v0/v1 hold the first eight
 * constants, v2 the four "reloadable" ones that are re-fetched with
 * ld1 {v2.4h}, [x15] at the start of each pass. */
#define XFIX_0_899976223                    v0.4h[0]
#define XFIX_0_541196100                    v0.4h[1]
#define XFIX_2_562915447                    v0.4h[2]
#define XFIX_0_298631336_MINUS_0_899976223  v0.4h[3]
#define XFIX_1_501321110_MINUS_0_899976223  v1.4h[0]
#define XFIX_2_053119869_MINUS_2_562915447  v1.4h[1]
#define XFIX_0_541196100_PLUS_0_765366865   v1.4h[2]
#define XFIX_1_175875602                    v1.4h[3]
#define XFIX_1_175875602_MINUS_0_390180644  v2.4h[0]
#define XFIX_0_541196100_MINUS_1_847759065  v2.4h[1]
#define XFIX_3_072711026_MINUS_2_562915447  v2.4h[2]
#define XFIX_1_175875602_MINUS_1_961570560  v2.4h[3]

/* Constant pool; loaded via adr, so it must stay within +/-1 MB of the
 * function that references it. */
.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
231
/*
 * Dequantize and perform the accurate (islow) inverse DCT on one 8x8
 * block of coefficients; see the C prototype in the comment above.
 *
 * AAPCS64 in:  x0 = dct_table, x1 = coef_block,
 *              x2 = output_buf, x3 = output_col
 * Scratch:     x0-x5 (x4/x5 saved and restored around pass 1),
 *              x15 = constant-pool pointer (saved/restored),
 *              v0-v31 (all saved on entry, restored on exit).
 * The block is processed as two 4x8 halves (ROWnL = left, ROWnR = right);
 * pass 1 accumulates an OR of the right-half coefficients in x0 so a
 * mostly-zero right half can take the sparse path at label 3/4.
 *
 * Review fixes applied relative to the posted patch:
 *  - pass 1, right half: the final butterfly wrote "sub v12, v2, v8",
 *    clobbering v12 (= v4 + v14, needed for ROW0R) and leaving stale
 *    data in v6 (read for ROW4R).  Every parallel sequence in this file
 *    writes v6 here; changed to "sub v6, v2, v8".
 *  - epilogue: "blr x30" returned via a branch-and-link, corrupting the
 *    return-address predictor; replaced with "ret".
 */
asm_function jsimd_idct_islow_neon

    DCT_TABLE   .req x0
    COEF_BLOCK  .req x1
    OUTPUT_BUF  .req x2
    OUTPUT_COL  .req x3
    TMP1        .req x0
    TMP2        .req x1
    TMP3        .req x2
    TMP4        .req x15

    ROW0L       .req v16
    ROW0R       .req v17
    ROW1L       .req v18
    ROW1R       .req v19
    ROW2L       .req v20
    ROW2R       .req v21
    ROW3L       .req v22
    ROW3R       .req v23
    ROW4L       .req v24
    ROW4R       .req v25
    ROW5L       .req v26
    ROW5R       .req v27
    ROW6L       .req v28
    ROW6R       .req v29
    ROW7L       .req v30
    ROW7R       .req v31

    /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
    sub             sp, sp, 272
    str             x15, [sp], 16
    adr             x15, jsimd_idct_islow_neon_consts
    st1             {v0.8b - v3.8b}, [sp], 32
    st1             {v4.8b - v7.8b}, [sp], 32
    st1             {v8.8b - v11.8b}, [sp], 32
    st1             {v12.8b - v15.8b}, [sp], 32
    st1             {v16.8b - v19.8b}, [sp], 32
    st1             {v20.8b - v23.8b}, [sp], 32
    st1             {v24.8b - v27.8b}, [sp], 32
    st1             {v28.8b - v31.8b}, [sp], 32
    /* Load coefficients and dequantize (multiply by the quant table),
     * packing each pair of 4h rows into one 128-bit register. */
    ld1             {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
    ld1             {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
    mul             v16.4h, v16.4h, v0.4h
    mul             v17.4h, v17.4h, v1.4h
    ins             v16.2d[1], v17.2d[0]  /* 128 bit q8 */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
    mul             v18.4h, v18.4h, v2.4h
    mul             v19.4h, v19.4h, v3.4h
    ins             v18.2d[1], v19.2d[0]  /* 128 bit q9 */
    ld1             {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
    mul             v20.4h, v20.4h, v4.4h
    mul             v21.4h, v21.4h, v5.4h
    ins             v20.2d[1], v21.2d[0]  /* 128 bit q10 */
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
    mul             v22.4h, v22.4h, v6.4h
    mul             v23.4h, v23.4h, v7.4h
    ins             v22.2d[1], v23.2d[0]  /* 128 bit q11 */
    ld1             {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
    mul             v24.4h, v24.4h, v0.4h
    mul             v25.4h, v25.4h, v1.4h
    ins             v24.2d[1], v25.2d[0]  /* 128 bit q12 */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
    mul             v28.4h, v28.4h, v4.4h
    mul             v29.4h, v29.4h, v5.4h
    ins             v28.2d[1], v29.2d[0]  /* 128 bit q14 */
    mul             v26.4h, v26.4h, v2.4h
    mul             v27.4h, v27.4h, v3.4h
    ins             v26.2d[1], v27.2d[0]  /* 128 bit q13 */
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x15]  /* load constants */
    add             x15, x15, #16
    mul             v30.4h, v30.4h, v6.4h
    mul             v31.4h, v31.4h, v7.4h
    ins             v30.2d[1], v31.2d[0]  /* 128 bit q15 */
    /* Go to the bottom of the stack */
    sub             sp, sp, 352
    stp             x4, x5, [sp], 16
    st1             {v8.4h - v11.4h}, [sp], 32  /* save NEON registers */
    st1             {v12.4h - v15.4h}, [sp], 32
    /* 1-D IDCT, pass 1, left 4x8 half */
    add             v4.4h, ROW7L.4h, ROW3L.4h
    add             v5.4h, ROW5L.4h, ROW1L.4h
    smull           v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
    smlal           v12.4s, v5.4h, XFIX_1_175875602
    smull           v14.4s, v4.4h, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half */
    smlal           v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
    ssubl           v6.4s, ROW0L.4h, ROW4L.4h
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    smull           v4.4s, ROW2L.4h, XFIX_0_541196100
    smlal           v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
    orr             x0, x4, x5
    mov             v8.16b, v12.16b
    smlsl           v12.4s, ROW5L.4h, XFIX_2_562915447
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    smlal           v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
    shl             v6.4s, v6.4s, #13
    orr             x0, x0, x4
    smlsl           v8.4s, ROW1L.4h, XFIX_0_899976223
    orr             x0, x0, x5
    add             v2.4s, v6.4s, v4.4s
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    mov             v10.16b, v14.16b
    add             v2.4s, v2.4s, v12.4s
    orr             x0, x0, x4
    smlsl           v14.4s, ROW7L.4h, XFIX_0_899976223
    orr             x0, x0, x5
    smlal           v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
    rshrn           ROW1L.4h, v2.4s, #11
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    sub             v2.4s, v2.4s, v12.4s
    smlal           v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
    orr             x0, x0, x4
    smlsl           v10.4s, ROW3L.4h, XFIX_2_562915447
    orr             x0, x0, x5
    sub             v2.4s, v2.4s, v12.4s
    smull           v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    smlal           v12.4s, ROW6L.4h, XFIX_0_541196100
    sub             v6.4s, v6.4s, v4.4s
    orr             x0, x0, x4
    rshrn           ROW6L.4h, v2.4s, #11
    orr             x0, x0, x5
    add             v2.4s, v6.4s, v10.4s
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    sub             v6.4s, v6.4s, v10.4s
    saddl           v10.4s, ROW0L.4h, ROW4L.4h
    orr             x0, x0, x4
    rshrn           ROW2L.4h, v2.4s, #11
    orr             x0, x0, x5
    rshrn           ROW5L.4h, v6.4s, #11
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    shl             v10.4s, v10.4s, #13
    smlal           v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
    orr             x0, x0, x4
    add             v4.4s, v10.4s, v12.4s
    orr             x0, x0, x5
    sub             v2.4s, v10.4s, v12.4s
    add             v12.4s, v4.4s, v14.4s
    ldp             w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    sub             v4.4s, v4.4s, v14.4s
    add             v10.4s, v2.4s, v8.4s
    orr             x0, x4, x5
    sub             v6.4s, v2.4s, v8.4s
    /* pop {x4, x5} */
    sub             sp, sp, 80
    ldp             x4, x5, [sp], 16
    rshrn           ROW7L.4h, v4.4s, #11
    rshrn           ROW3L.4h, v10.4s, #11
    rshrn           ROW0L.4h, v12.4s, #11
    rshrn           ROW4L.4h, v6.4s, #11
    cmp             x0, #0  /* orrs instruction removed */

    beq             3f  /* Go to do some special handling for the sparse right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    ld1             {v2.4h}, [x15]  /* reload constants */
    add             v10.4h, ROW7R.4h, ROW3R.4h
    add             v8.4h, ROW5R.4h, ROW1R.4h
    /* Transpose ROW6L <-> ROW7L (v3 available free register) */
    transpose       ROW6L, ROW7L, v3, .16b, .4h
    smull           v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560
    smlal           v12.4s, v8.4h, XFIX_1_175875602
    /* Transpose ROW2L <-> ROW3L (v3 available free register) */
    transpose       ROW2L, ROW3L, v3, .16b, .4h
    smull           v14.4s, v10.4h, XFIX_1_175875602
    smlal           v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644
    /* Transpose ROW0L <-> ROW1L (v3 available free register) */
    transpose       ROW0L, ROW1L, v3, .16b, .4h
    ssubl           v6.4s, ROW0R.4h, ROW4R.4h
    smull           v4.4s, ROW2R.4h, XFIX_0_541196100
    smlal           v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
    /* Transpose ROW4L <-> ROW5L (v3 available free register) */
    transpose       ROW4L, ROW5L, v3, .16b, .4h
    mov             v8.16b, v12.16b
    smlsl           v12.4s, ROW5R.4h, XFIX_2_562915447
    smlal           v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
    /* Transpose ROW1L <-> ROW3L (v3 available free register) */
    transpose       ROW1L, ROW3L, v3, .16b, .2s
    shl             v6.4s, v6.4s, #13
    smlsl           v8.4s, ROW1R.4h, XFIX_0_899976223
    /* Transpose ROW4L <-> ROW6L (v3 available free register) */
    transpose       ROW4L, ROW6L, v3, .16b, .2s
    add             v2.4s, v6.4s, v4.4s
    mov             v10.16b, v14.16b
    add             v2.4s, v2.4s, v12.4s
    /* Transpose ROW0L <-> ROW2L (v3 available free register) */
    transpose       ROW0L, ROW2L, v3, .16b, .2s
    smlsl           v14.4s, ROW7R.4h, XFIX_0_899976223
    smlal           v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
    rshrn           ROW1R.4h, v2.4s, #11
    /* Transpose ROW5L <-> ROW7L (v3 available free register) */
    transpose       ROW5L, ROW7L, v3, .16b, .2s
    sub             v2.4s, v2.4s, v12.4s
    smlal           v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
    smlsl           v10.4s, ROW3R.4h, XFIX_2_562915447
    sub             v2.4s, v2.4s, v12.4s
    smull           v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
    smlal           v12.4s, ROW6R.4h, XFIX_0_541196100
    sub             v6.4s, v6.4s, v4.4s
    rshrn           ROW6R.4h, v2.4s, #11
    add             v2.4s, v6.4s, v10.4s
    sub             v6.4s, v6.4s, v10.4s
    saddl           v10.4s, ROW0R.4h, ROW4R.4h
    rshrn           ROW2R.4h, v2.4s, #11
    rshrn           ROW5R.4h, v6.4s, #11
    shl             v10.4s, v10.4s, #13
    smlal           v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
    add             v4.4s, v10.4s, v12.4s
    sub             v2.4s, v10.4s, v12.4s
    add             v12.4s, v4.4s, v14.4s
    sub             v4.4s, v4.4s, v14.4s
    add             v10.4s, v2.4s, v8.4s
    /* FIX(review): was "sub v12.4s, ..." which clobbered v12 (needed for
     * ROW0R below) and left v6 stale for ROW4R; all parallel butterflies
     * in this function write v6 here. */
    sub             v6.4s, v2.4s, v8.4s
    rshrn           ROW7R.4h, v4.4s, #11
    rshrn           ROW3R.4h, v10.4s, #11
    rshrn           ROW0R.4h, v12.4s, #11
    rshrn           ROW4R.4h, v6.4s, #11
    /* Transpose right 4x8 half */
    transpose       ROW6R, ROW7R, v3, .16b, .4h
    transpose       ROW2R, ROW3R, v3, .16b, .4h
    transpose       ROW0R, ROW1R, v3, .16b, .4h
    transpose       ROW4R, ROW5R, v3, .16b, .4h
    transpose       ROW1R, ROW3R, v3, .16b, .2s
    transpose       ROW4R, ROW6R, v3, .16b, .2s
    transpose       ROW0R, ROW2R, v3, .16b, .2s
    transpose       ROW5R, ROW7R, v3, .16b, .2s

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    ld1             {v2.4h}, [x15]  /* reload constants */
    smull           v12.4s, ROW1R.4h, XFIX_1_175875602  /* ROW5L.4h <-> ROW1R.4h */
    smlal           v12.4s, ROW1L.4h, XFIX_1_175875602
    smlal           v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L.4h <-> ROW3R.4h */
    smlal           v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
    smull           v14.4s, ROW3R.4h, XFIX_1_175875602  /* ROW7L.4h <-> ROW3R.4h */
    smlal           v14.4s, ROW3L.4h, XFIX_1_175875602
    smlal           v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L.4h <-> ROW1R.4h */
    smlal           v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
    ssubl           v6.4s, ROW0L.4h, ROW0R.4h  /* ROW4L.4h <-> ROW0R.4h */
    smull           v4.4s, ROW2L.4h, XFIX_0_541196100
    smlal           v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L.4h <-> ROW2R.4h */
    mov             v8.16b, v12.16b
    smlsl           v12.4s, ROW1R.4h, XFIX_2_562915447  /* ROW5L.4h <-> ROW1R.4h */
    smlal           v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
    shl             v6.4s, v6.4s, #13
    smlsl           v8.4s, ROW1L.4h, XFIX_0_899976223
    add             v2.4s, v6.4s, v4.4s
    mov             v10.16b, v14.16b
    add             v2.4s, v2.4s, v12.4s
    smlsl           v14.4s, ROW3R.4h, XFIX_0_899976223  /* ROW7L.4h <-> ROW3R.4h */
    smlal           v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
    shrn            ROW1L.4h, v2.4s, #16
    sub             v2.4s, v2.4s, v12.4s
    smlal           v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L.4h <-> ROW1R.4h */
    smlsl           v10.4s, ROW3L.4h, XFIX_2_562915447
    sub             v2.4s, v2.4s, v12.4s
    smull           v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
    smlal           v12.4s, ROW2R.4h, XFIX_0_541196100  /* ROW6L.4h <-> ROW2R.4h */
    sub             v6.4s, v6.4s, v4.4s
    shrn            ROW2R.4h, v2.4s, #16  /* ROW6L.4h <-> ROW2R.4h */
    add             v2.4s, v6.4s, v10.4s
    sub             v6.4s, v6.4s, v10.4s
    saddl           v10.4s, ROW0L.4h, ROW0R.4h  /* ROW4L.4h <-> ROW0R.4h */
    shrn            ROW2L.4h, v2.4s, #16
    shrn            ROW1R.4h, v6.4s, #16  /* ROW5L.4h <-> ROW1R.4h */
    shl             v10.4s, v10.4s, #13
    smlal           v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L.4h <-> ROW3R.4h */
    add             v4.4s, v10.4s, v12.4s
    sub             v2.4s, v10.4s, v12.4s
    add             v12.4s, v4.4s, v14.4s
    sub             v4.4s, v4.4s, v14.4s
    add             v10.4s, v2.4s, v8.4s
    sub             v6.4s, v2.4s, v8.4s
    shrn            ROW3R.4h, v4.4s, #16  /* ROW7L.4h <-> ROW3R.4h */
    shrn            ROW3L.4h, v10.4s, #16
    shrn            ROW0L.4h, v12.4s, #16
    shrn            ROW0R.4h, v6.4s, #16  /* ROW4L.4h <-> ROW0R.4h */
    /* 1-D IDCT, pass 2, right 4x8 half */
    ld1             {v2.4h}, [x15]  /* reload constants */
    smull           v12.4s, ROW5R.4h, XFIX_1_175875602
    smlal           v12.4s, ROW5L.4h, XFIX_1_175875602  /* ROW5L.4h <-> ROW1R.4h */
    smlal           v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
    smlal           v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L.4h <-> ROW3R.4h */
    smull           v14.4s, ROW7R.4h, XFIX_1_175875602
    smlal           v14.4s, ROW7L.4h, XFIX_1_175875602  /* ROW7L.4h <-> ROW3R.4h */
    smlal           v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
    smlal           v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L.4h <-> ROW1R.4h */
    ssubl           v6.4s, ROW4L.4h, ROW4R.4h  /* ROW4L.4h <-> ROW0R.4h */
    smull           v4.4s, ROW6L.4h, XFIX_0_541196100  /* ROW6L.4h <-> ROW2R.4h */
    smlal           v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
    mov             v8.16b, v12.16b
    smlsl           v12.4s, ROW5R.4h, XFIX_2_562915447
    smlal           v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L.4h <-> ROW3R.4h */
    shl             v6.4s, v6.4s, #13
    smlsl           v8.4s, ROW5L.4h, XFIX_0_899976223  /* ROW5L.4h <-> ROW1R.4h */
    add             v2.4s, v6.4s, v4.4s
    mov             v10.16b, v14.16b
    add             v2.4s, v2.4s, v12.4s
    smlsl           v14.4s, ROW7R.4h, XFIX_0_899976223
    smlal           v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L.4h <-> ROW1R.4h */
    shrn            ROW5L.4h, v2.4s, #16  /* ROW5L.4h <-> ROW1R.4h */
    sub             v2.4s, v2.4s, v12.4s
    smlal           v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
    smlsl           v10.4s, ROW7L.4h, XFIX_2_562915447  /* ROW7L.4h <-> ROW3R.4h */
    sub             v2.4s, v2.4s, v12.4s
    smull           v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L.4h <-> ROW2R.4h */
    smlal           v12.4s, ROW6R.4h, XFIX_0_541196100
    sub             v6.4s, v6.4s, v4.4s
    shrn            ROW6R.4h, v2.4s, #16
    add             v2.4s, v6.4s, v10.4s
    sub             v6.4s, v6.4s, v10.4s
    saddl           v10.4s, ROW4L.4h, ROW4R.4h  /* ROW4L.4h <-> ROW0R.4h */
    shrn            ROW6L.4h, v2.4s, #16  /* ROW6L.4h <-> ROW2R.4h */
    shrn            ROW5R.4h, v6.4s, #16
    shl             v10.4s, v10.4s, #13
    smlal           v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
    add             v4.4s, v10.4s, v12.4s
    sub             v2.4s, v10.4s, v12.4s
    add             v12.4s, v4.4s, v14.4s
    sub             v4.4s, v4.4s, v14.4s
    add             v10.4s, v2.4s, v8.4s
    sub             v6.4s, v2.4s, v8.4s
    shrn            ROW7R.4h, v4.4s, #16
    shrn            ROW7L.4h, v10.4s, #16  /* ROW7L.4h <-> ROW3R.4h */
    shrn            ROW4L.4h, v12.4s, #16  /* ROW4L.4h <-> ROW0R.4h */
    shrn            ROW4R.4h, v6.4s, #16

2:  /* Descale to 8-bit and range limit */
    ins             v16.2d[1], v17.2d[0]
    ins             v18.2d[1], v19.2d[0]
    ins             v20.2d[1], v21.2d[0]
    ins             v22.2d[1], v23.2d[0]
    sqrshrn         v16.8b, v16.8h, #2
    sqrshrn2        v16.16b, v18.8h, #2
    sqrshrn         v18.8b, v20.8h, #2
    sqrshrn2        v18.16b, v22.8h, #2

    /* vpop {v8.4h - v15.4h} */  /* restore NEON registers */
    ld1             {v8.4h - v11.4h}, [sp], 32
    ld1             {v12.4h - v15.4h}, [sp], 32
    ins             v24.2d[1], v25.2d[0]

    sqrshrn         v20.8b, v24.8h, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    /* trn1 v16.8h, v16.8h, v18.8h */
    transpose       v16, v18, v3, .16b, .8h
    ins             v26.2d[1], v27.2d[0]
    ins             v28.2d[1], v29.2d[0]
    ins             v30.2d[1], v31.2d[0]
    sqrshrn2        v20.16b, v26.8h, #2
    sqrshrn         v22.8b, v28.8h, #2
    movi            v0.16b, #(CENTERJSAMPLE)
    sqrshrn2        v22.16b, v30.8h, #2
    transpose_single v16, v17, v3, .2d, .8b
    transpose_single v18, v19, v3, .2d, .8b
    add             v16.8b, v16.8b, v0.8b
    add             v17.8b, v17.8b, v0.8b
    add             v18.8b, v18.8b, v0.8b
    add             v19.8b, v19.8b, v0.8b
    transpose       v20, v22, v3, .16b, .8h
    /* Store results to the output buffer */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    st1             {v16.8b}, [TMP1]
    transpose_single v20, v21, v3, .2d, .8b
    st1             {v17.8b}, [TMP2]
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    st1             {v18.8b}, [TMP1]
    add             v20.8b, v20.8b, v0.8b
    add             v21.8b, v21.8b, v0.8b
    st1             {v19.8b}, [TMP2]
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    ldp             TMP3, TMP4, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    transpose_single v22, v23, v3, .2d, .8b
    st1             {v20.8b}, [TMP1]
    add             v22.8b, v22.8b, v0.8b
    add             v23.8b, v23.8b, v0.8b
    st1             {v21.8b}, [TMP2]
    st1             {v22.8b}, [TMP3]
    st1             {v23.8b}, [TMP4]
    /* Restore x15 and all NEON registers, then return. */
    ldr             x15, [sp], 16
    ld1             {v0.8b - v3.8b}, [sp], 32
    ld1             {v4.8b - v7.8b}, [sp], 32
    ld1             {v8.8b - v11.8b}, [sp], 32
    ld1             {v12.8b - v15.8b}, [sp], 32
    ld1             {v16.8b - v19.8b}, [sp], 32
    ld1             {v20.8b - v23.8b}, [sp], 32
    ld1             {v24.8b - v27.8b}, [sp], 32
    ld1             {v28.8b - v31.8b}, [sp], 32
    /* FIX(review): was "blr x30", which branch-and-links back to the
     * caller and desynchronizes the return-address predictor; use ret. */
    ret

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    transpose       ROW6L, ROW7L, v3, .16b, .4h
    transpose       ROW2L, ROW3L, v3, .16b, .4h
    transpose       ROW0L, ROW1L, v3, .16b, .4h
    transpose       ROW4L, ROW5L, v3, .16b, .4h
    shl             ROW0R.4h, ROW0R.4h, #2  /* PASS1_BITS */
    transpose       ROW1L, ROW3L, v3, .16b, .2s
    transpose       ROW4L, ROW6L, v3, .16b, .2s
    transpose       ROW0L, ROW2L, v3, .16b, .2s
    transpose       ROW5L, ROW7L, v3, .16b, .2s
    cmp             x0, #0
    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    dup             ROW1R.4h, ROW0R.4h[1]
    dup             ROW2R.4h, ROW0R.4h[2]
    dup             ROW3R.4h, ROW0R.4h[3]
    dup             ROW4R.4h, ROW0R.4h[0]
    dup             ROW5R.4h, ROW0R.4h[1]
    dup             ROW6R.4h, ROW0R.4h[2]
    dup             ROW7R.4h, ROW0R.4h[3]
    dup             ROW0R.4h, ROW0R.4h[0]
    b               1b  /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    ld1             {v2.4h}, [x15]  /* reload constants */
    smull           v12.4s, ROW1L.4h, XFIX_1_175875602
    smlal           v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
    smull           v14.4s, ROW3L.4h, XFIX_1_175875602
    smlal           v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
    smull           v4.4s, ROW2L.4h, XFIX_0_541196100
    sshll           v6.4s, ROW0L.4h, #13
    mov             v8.16b, v12.16b
    smlal           v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
    smlsl           v8.4s, ROW1L.4h, XFIX_0_899976223
    add             v2.4s, v6.4s, v4.4s
    mov             v10.16b, v14.16b
    smlal           v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
    add             v2.4s, v2.4s, v12.4s
    add             v12.4s, v12.4s, v12.4s
    smlsl           v10.4s, ROW3L.4h, XFIX_2_562915447
    shrn            ROW1L.4h, v2.4s, #16
    sub             v2.4s, v2.4s, v12.4s
    smull           v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
    sub             v6.4s, v6.4s, v4.4s
    shrn            ROW2R.4h, v2.4s, #16  /* ROW6L.4h <-> ROW2R.4h */
    add             v2.4s, v6.4s, v10.4s
    sub             v6.4s, v6.4s, v10.4s
    sshll           v10.4s, ROW0L.4h, #13
    shrn            ROW2L.4h, v2.4s, #16
    shrn            ROW1R.4h, v6.4s, #16  /* ROW5L.4h <-> ROW1R.4h */
    add             v4.4s, v10.4s, v12.4s
    sub             v2.4s, v10.4s, v12.4s
    add             v12.4s, v4.4s, v14.4s
    sub             v4.4s, v4.4s, v14.4s
    add             v10.4s, v2.4s, v8.4s
    sub             v6.4s, v2.4s, v8.4s
    shrn            ROW3R.4h, v4.4s, #16  /* ROW7L.4h <-> ROW3R.4h */
    shrn            ROW3L.4h, v10.4s, #16
    shrn            ROW0L.4h, v12.4s, #16
    shrn            ROW0R.4h, v6.4s, #16  /* ROW4L.4h <-> ROW0R.4h */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    ld1             {v2.4h}, [x15]  /* reload constants */
    smull           v12.4s, ROW5L.4h, XFIX_1_175875602
    smlal           v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
    smull           v14.4s, ROW7L.4h, XFIX_1_175875602
    smlal           v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
    smull           v4.4s, ROW6L.4h, XFIX_0_541196100
    sshll           v6.4s, ROW4L.4h, #13
    mov             v8.16b, v12.16b
    smlal           v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
    smlsl           v8.4s, ROW5L.4h, XFIX_0_899976223
    add             v2.4s, v6.4s, v4.4s
    mov             v10.16b, v14.16b
    smlal           v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
    add             v2.4s, v2.4s, v12.4s
    add             v12.4s, v12.4s, v12.4s
    smlsl           v10.4s, ROW7L.4h, XFIX_2_562915447
    shrn            ROW5L.4h, v2.4s, #16  /* ROW5L.4h <-> ROW1R.4h */
    sub             v2.4s, v2.4s, v12.4s
    smull           v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
    sub             v6.4s, v6.4s, v4.4s
    shrn            ROW6R.4h, v2.4s, #16
    add             v2.4s, v6.4s, v10.4s
    sub             v6.4s, v6.4s, v10.4s
    sshll           v10.4s, ROW4L.4h, #13
    shrn            ROW6L.4h, v2.4s, #16  /* ROW6L.4h <-> ROW2R.4h */
    shrn            ROW5R.4h, v6.4s, #16
    add             v4.4s, v10.4s, v12.4s
    sub             v2.4s, v10.4s, v12.4s
    add             v12.4s, v4.4s, v14.4s
    sub             v4.4s, v4.4s, v14.4s
    add             v10.4s, v2.4s, v8.4s
    sub             v6.4s, v2.4s, v8.4s
    shrn            ROW7R.4h, v4.4s, #16
    shrn            ROW7L.4h, v10.4s, #16  /* ROW7L.4h <-> ROW3R.4h */
    shrn            ROW4L.4h, v12.4s, #16  /* ROW4L.4h <-> ROW0R.4h */
    shrn            ROW4R.4h, v6.4s, #16
    b               2b  /* Go to epilogue */

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

    .unreq          ROW0L
    .unreq          ROW0R
    .unreq          ROW1L
    .unreq          ROW1R
    .unreq          ROW2L
    .unreq          ROW2R
    .unreq          ROW3L
    .unreq          ROW3R
    .unreq          ROW4L
    .unreq          ROW4R
    .unreq          ROW5L
    .unreq          ROW5R
    .unreq          ROW6L
    .unreq          ROW6R
    .unreq          ROW7L
    .unreq          ROW7R
.endfunc
757
758
759 /*****************************************************************************/
760
761 /*
762 * jsimd_idct_ifast_neon
763 *
764 * This function contains a fast, not so accurate integer implementation of
765 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
766 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
767 * function from jidctfst.c
768 *
769 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
770 * But in ARM NEON case some extra additions are required because VQDMULH
771 * instruction can't handle the constants larger than 1. So the expressions
772 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
773 * which introduces an extra addition. Overall, there are 6 extra additions
774 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
775 */
776
/* Lane aliases for the ifast IDCT constants; order must match the table
 * below.  Per the note above, only fractional parts are stored (the
 * integer part of each multiplier is re-added with plain additions),
 * apparently encoded as (x - 1) * 2^15 — or (x - 2) for 2.613125930 —
 * rounded to the (n*128) grid; for use with sqdmulh. TODO confirm. */
#define XFIX_1_082392200 v0.4h[0]
#define XFIX_1_414213562 v0.4h[1]
#define XFIX_1_847759065 v0.4h[2]
#define XFIX_2_613125930 v0.4h[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
788
/*
 * jsimd_idct_ifast_neon
 *
 * void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
 *                            JSAMPARRAY output_buf, JDIMENSION output_col);
 * Fast (scaled) 8x8 inverse DCT; bit-compatible with jidctfst.c.
 * In:  x0 = dct_table, x1 = coef_block, x2 = output_buf, x3 = output_col
 * Clobbers: v0-v19 (saved/restored below), x16, x22, x23, flags.
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    TMP4            .req x22
    TMP5            .req x23

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( v8.8h  )
     *   1  | d18    | d19    ( v9.8h  )
     *   2  | d20    | d21    ( v10.8h )
     *   3  | d22    | d23    ( v11.8h )
     *   4  | d24    | d25    ( v12.8h )
     *   5  | d26    | d27    ( v13.8h )
     *   6  | d28    | d29    ( v14.8h )
     *   7  | d30    | d31    ( v15.8h )
     */
    /* Save NEON registers used in fast IDCT.
     * NOTE(review): the post-indexed stores leave sp back at its entry
     * value, so the saved registers live BELOW sp for the whole function.
     * AArch64 Linux has no red zone, so a signal could clobber this area —
     * TODO confirm/restructure the save protocol. */
    sub sp, sp, #176
    stp x22, x23, [sp], 16
    adr x23, jsimd_idct_ifast_neon_consts
    st1 {v0.8b - v3.8b}, [sp], 32
    st1 {v4.8b - v7.8b}, [sp], 32
    st1 {v8.8b - v11.8b}, [sp], 32
    st1 {v12.8b - v15.8b}, [sp], 32
    st1 {v16.8b - v19.8b}, [sp], 32
    /* Interleave coefficient loads with quant-table loads and multiplies */
    ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
    ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
    mul v8.8h, v8.8h, v0.8h
    ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul v9.8h, v9.8h, v1.8h
    ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32
    mul v10.8h, v10.8h, v2.8h
    ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul v11.8h, v11.8h, v3.8h
    ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32
    mul v12.8h, v12.8h, v0.8h
    ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul v14.8h, v14.8h, v2.8h
    mul v13.8h, v13.8h, v1.8h
    ld1 {v0.4h}, [x23]      /* load constants (XFIX_* lane aliases) */
    mul v15.8h, v15.8h, v3.8h

    /* 1-D IDCT, pass 1 (columns) */
    sub v2.8h, v10.8h, v14.8h
    add v14.8h, v10.8h, v14.8h
    sub v1.8h, v11.8h, v13.8h
    add v13.8h, v11.8h, v13.8h
    sub v5.8h, v9.8h, v15.8h
    add v15.8h, v9.8h, v15.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add v3.8h, v1.8h, v1.8h
    sub v1.8h, v5.8h, v1.8h
    add v10.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub v2.8h, v15.8h, v13.8h
    add v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub v10.8h, v10.8h, v14.8h
    add v2.8h, v2.8h, v6.8h
    sub v6.8h, v8.8h, v12.8h
    add v12.8h, v8.8h, v12.8h
    add v9.8h, v5.8h, v4.8h
    add v5.8h, v6.8h, v10.8h
    sub v10.8h, v6.8h, v10.8h
    add v6.8h, v15.8h, v13.8h
    add v8.8h, v12.8h, v14.8h
    sub v3.8h, v6.8h, v3.8h
    sub v12.8h, v12.8h, v14.8h
    sub v3.8h, v3.8h, v1.8h
    sub v1.8h, v9.8h, v1.8h
    add v2.8h, v3.8h, v2.8h
    sub v15.8h, v8.8h, v6.8h
    add v1.8h, v1.8h, v2.8h
    add v8.8h, v8.8h, v6.8h
    add v14.8h, v5.8h, v3.8h
    sub v9.8h, v5.8h, v3.8h
    sub v13.8h, v10.8h, v2.8h
    add v10.8h, v10.8h, v2.8h
    /* 8x8 16-bit transpose via trn1/trn2 pairs (v18 = scratch copy) */
    /* Transpose q8-q9 */
    mov v18.16b, v8.16b
    trn1 v8.8h, v8.8h, v9.8h
    trn2 v9.8h, v18.8h, v9.8h
    sub v11.8h, v12.8h, v1.8h
    /* Transpose q14-q15 */
    mov v18.16b, v14.16b
    trn1 v14.8h, v14.8h, v15.8h
    trn2 v15.8h, v18.8h, v15.8h
    add v12.8h, v12.8h, v1.8h
    /* Transpose q10-q11 */
    mov v18.16b, v10.16b
    trn1 v10.8h, v10.8h, v11.8h
    trn2 v11.8h, v18.8h, v11.8h
    /* Transpose q12-q13 */
    mov v18.16b, v12.16b
    trn1 v12.8h, v12.8h, v13.8h
    trn2 v13.8h, v18.8h, v13.8h
    /* Transpose q9-q11 */
    mov v18.16b, v9.16b
    trn1 v9.4s, v9.4s, v11.4s
    trn2 v11.4s, v18.4s, v11.4s
    /* Transpose q12-q14 */
    mov v18.16b, v12.16b
    trn1 v12.4s, v12.4s, v14.4s
    trn2 v14.4s, v18.4s, v14.4s
    /* Transpose q8-q10 */
    mov v18.16b, v8.16b
    trn1 v8.4s, v8.4s, v10.4s
    trn2 v10.4s, v18.4s, v10.4s
    /* Transpose q13-q15 */
    mov v18.16b, v13.16b
    trn1 v13.4s, v13.4s, v15.4s
    trn2 v15.4s, v18.4s, v15.4s
    /* 64-bit half swaps emulating ARMv7 vswp (x22 = scratch) */
    /* vswp v14.4h, v10-MSB.4h */
    umov x22, v14.d[0]
    ins v14.2d[0], v10.2d[1]
    ins v10.2d[1], x22
    /* vswp v13.4h, v9MSB.4h */

    umov x22, v13.d[0]
    ins v13.2d[0], v9.2d[1]
    ins v9.2d[1], x22
    /* 1-D IDCT, pass 2 (rows) — interleaved with the remaining swaps */
    sub v2.8h, v10.8h, v14.8h
    /* vswp v15.4h, v11MSB.4h */
    umov x22, v15.d[0]
    ins v15.2d[0], v11.2d[1]
    ins v11.2d[1], x22
    add v14.8h, v10.8h, v14.8h
    /* vswp v12.4h, v8-MSB.4h */
    umov x22, v12.d[0]
    ins v12.2d[0], v8.2d[1]
    ins v8.2d[1], x22
    sub v1.8h, v11.8h, v13.8h
    add v13.8h, v11.8h, v13.8h
    sub v5.8h, v9.8h, v15.8h
    add v15.8h, v9.8h, v15.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add v3.8h, v1.8h, v1.8h
    sub v1.8h, v5.8h, v1.8h
    add v10.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub v2.8h, v15.8h, v13.8h
    add v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub v10.8h, v10.8h, v14.8h
    add v2.8h, v2.8h, v6.8h
    sub v6.8h, v8.8h, v12.8h
    add v12.8h, v8.8h, v12.8h
    add v9.8h, v5.8h, v4.8h
    add v5.8h, v6.8h, v10.8h
    sub v10.8h, v6.8h, v10.8h
    add v6.8h, v15.8h, v13.8h
    add v8.8h, v12.8h, v14.8h
    sub v3.8h, v6.8h, v3.8h
    sub v12.8h, v12.8h, v14.8h
    sub v3.8h, v3.8h, v1.8h
    sub v1.8h, v9.8h, v1.8h
    add v2.8h, v3.8h, v2.8h
    sub v15.8h, v8.8h, v6.8h
    add v1.8h, v1.8h, v2.8h
    add v8.8h, v8.8h, v6.8h
    add v14.8h, v5.8h, v3.8h
    sub v9.8h, v5.8h, v3.8h
    sub v13.8h, v10.8h, v2.8h
    add v10.8h, v10.8h, v2.8h
    sub v11.8h, v12.8h, v1.8h
    add v12.8h, v12.8h, v1.8h
    /* Descale to 8-bit and range limit (saturating >>5, then +128) */
    movi v0.16b, #0x80
    sqshrn v8.8b, v8.8h, #5
    sqshrn2 v8.16b, v9.8h, #5
    sqshrn v9.8b, v10.8h, #5
    sqshrn2 v9.16b, v11.8h, #5
    sqshrn v10.8b, v12.8h, #5
    sqshrn2 v10.16b, v13.8h, #5
    sqshrn v11.8b, v14.8h, #5
    sqshrn2 v11.16b, v15.8h, #5
    add v8.16b, v8.16b, v0.16b
    add v9.16b, v9.16b, v0.16b
    add v10.16b, v10.16b, v0.16b
    add v11.16b, v11.16b, v0.16b
    /* Transpose the final 8-bit samples */
    /* Transpose q8-q9 */
    mov v18.16b, v8.16b
    trn1 v8.8h, v8.8h, v9.8h
    trn2 v9.8h, v18.8h, v9.8h
    /* Transpose q10-q11 */
    mov v18.16b, v10.16b
    trn1 v10.8h, v10.8h, v11.8h
    trn2 v11.8h, v18.8h, v11.8h
    /* Transpose q8-q10 */
    mov v18.16b, v8.16b
    trn1 v8.4s, v8.4s, v10.4s
    trn2 v10.4s, v18.4s, v10.4s
    /* Transpose q9-q11 */
    mov v18.16b, v9.16b
    trn1 v9.4s, v9.4s, v11.4s
    trn2 v11.4s, v18.4s, v11.4s
    /* make copy */
    ins v17.2d[0], v8.2d[1]
    /* Transpose d16-d17-msb */
    mov v18.16b, v8.16b
    trn1 v8.8b, v8.8b, v17.8b
    trn2 v17.8b, v18.8b, v17.8b
    /* make copy */
    ins v19.2d[0], v9.2d[1]
    mov v18.16b, v9.16b
    trn1 v9.8b, v9.8b, v19.8b
    trn2 v19.8b, v18.8b, v19.8b
    /* Store results to the output buffer, one 8-pixel row per pointer */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    st1 {v8.8b}, [TMP1]
    st1 {v17.8b}, [TMP2]
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    st1 {v9.8b}, [TMP1]
    /* make copy */
    ins v7.2d[0], v10.2d[1]
    mov v18.16b, v10.16b
    trn1 v10.8b, v10.8b, v7.8b
    trn2 v7.8b, v18.8b, v7.8b
    st1 {v19.8b}, [TMP2]
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    ldp TMP4, TMP5, [OUTPUT_BUF], 16
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    add TMP5, TMP5, OUTPUT_COL
    st1 {v10.8b}, [TMP1]
    /* make copy */
    ins v16.2d[0], v11.2d[1]
    mov v18.16b, v11.16b
    trn1 v11.8b, v11.8b, v16.8b
    trn2 v16.8b, v18.8b, v16.8b
    st1 {v7.8b}, [TMP2]
    st1 {v11.8b}, [TMP4]
    st1 {v16.8b}, [TMP5]
    /* Restore saved registers (mirror of the prologue) */
    sub sp, sp, #176
    ldp x22, x23, [sp], 16
    ld1 {v0.8b - v3.8b}, [sp], 32
    ld1 {v4.8b - v7.8b}, [sp], 32
    ld1 {v8.8b - v11.8b}, [sp], 32
    ld1 {v12.8b - v15.8b}, [sp], 32
    ld1 {v16.8b - v19.8b}, [sp], 32
    ret                     /* was 'blr x30': blr corrupts x30 and the
                             * CPU return-address predictor; ret is the
                             * proper function return */

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .unreq TMP5             /* was missing: the x23 alias leaked past .endfunc */
    .endfunc
1063
1064
1065 /*****************************************************************************/
1066
1067 /*
1068 * jsimd_idct_4x4_neon
1069 *
1070 * This function contains inverse-DCT code for getting reduced-size
1071 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
1072 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
1073 * function from jpeg-6b (jidctred.c).
1074 *
1075 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
1076 * requires much less arithmetic operations and hence should be faster.
1077 * The primary purpose of this particular NEON optimized function is
1078 * bit exact compatibility with jpeg-6b.
1079 *
1080 * TODO: a bit better instructions scheduling can be achieved by expanding
1081 * idct_helper/transpose_4x4 macros and reordering instructions,
1082 * but readability will suffer somewhat.
1083 */
1084
/* Fixed-point multiplier constants for the reduced-size IDCTs below.
 * FIX(x) = round(x * 2^CONST_BITS); the values must match jidctred.c
 * from jpeg-6b so that output stays bit-exact. */
#define CONST_BITS 13

#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */

/* Constant table for jsimd_idct_4x4_neon. Loaded into v0.4h-v2.4h
 * (the function also loads a fourth d-register, v3.4h, which is padding
 * only — see the "v3.4h is just used for padding" note at the load). */
.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065 /* v0.4h[0] */
    .short -FIX_0_765366865 /* v0.4h[1] */
    .short -FIX_0_211164243 /* v0.4h[2] */
    .short FIX_1_451774981 /* v0.4h[3] */
    .short -FIX_2_172734803 /* d1[0] */
    .short FIX_1_061594337 /* d1[1] */
    .short -FIX_0_509795579 /* d1[2] */
    .short -FIX_0_601344887 /* d1[3] */
    .short FIX_0_899976223 /* v2.4h[0] */
    .short FIX_2_562915447 /* v2.4h[1] */
    .short 1 << (CONST_BITS+1) /* v2.4h[2] */
    .short 0 /* v2.4h[3] */
1116
/* One 4-point 1-D IDCT pass for the 4x4 reduced IDCT.
 * \x4..\x16  : the seven input rows/columns 0,1,2,3,5,6,7 (row 4 unused,
 *              exactly as in jpeg-6b's jpeg_idct_4x4)
 * \shift     : descale amount; \y26..\y29 : the four 16-bit outputs.
 * Uses the constants pre-loaded in v0.4h-v2.4h (see
 * jsimd_idct_4x4_neon_consts); v20, v24, v26, v28, v30 are scratch. */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    /* Even part: tmp0 = x4 << (CONST_BITS+1) via the 1<<(CONST_BITS+1)
     * multiplier lane, plus the two even rotator terms */
    smull v28.4s, \x4, v2.4h[2]
    smlal v28.4s, \x8, v0.4h[0]
    smlal v28.4s, \x14, v0.4h[1]

    /* Odd part, output columns 0/3 */
    smull v26.4s, \x16, v1.4h[2]
    smlal v26.4s, \x12, v1.4h[3]
    smlal v26.4s, \x10, v2.4h[0]
    smlal v26.4s, \x6, v2.4h[1]

    /* Even part with the rotator terms subtracted */
    smull v30.4s, \x4, v2.4h[2]
    smlsl v30.4s, \x8, v0.4h[0]
    smlsl v30.4s, \x14, v0.4h[1]

    /* Odd part, output columns 1/2 */
    smull v24.4s, \x16, v0.4h[2]
    smlal v24.4s, \x12, v0.4h[3]
    smlal v24.4s, \x10, v1.4h[0]
    smlal v24.4s, \x6, v1.4h[1]

    add v20.4s, v28.4s, v26.4s
    sub v28.4s, v28.4s, v26.4s

    /* rshrn encodes shifts of at most 16, so larger descales (pass 2
     * uses 19) go through srshr + xtn instead */
    .if \shift > 16
        srshr v20.4s, v20.4s, #\shift
        srshr v28.4s, v28.4s, #\shift
        xtn \y26, v20.4s
        xtn \y29, v28.4s
    .else
        rshrn \y26, v20.4s, #\shift
        rshrn \y29, v28.4s, #\shift
    .endif

    add v20.4s, v30.4s, v24.4s
    sub v30.4s, v30.4s, v24.4s

    .if \shift > 16
        srshr v20.4s, v20.4s, #\shift
        srshr v30.4s, v30.4s, #\shift
        xtn \y27, v20.4s
        xtn \y28, v30.4s
    .else
        rshrn \y27, v20.4s, #\shift
        rshrn \y28, v30.4s, #\shift
    .endif

.endm
1163
/*
 * jsimd_idct_4x4_neon
 *
 * void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
 *                          JSAMPARRAY output_buf, JDIMENSION output_col);
 * Reduced 8x8 -> 4x4 inverse DCT, bit-compatible with jpeg-6b jidctred.c.
 */
asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    TMP4            .req x15

    /* Save all used NEON registers */
    sub sp, sp, 272
    str x15, [sp], 16
    /* Load constants (v3.4h is just used for padding) */
    adr TMP4, jsimd_idct_4x4_neon_consts
    st1 {v0.8b - v3.8b}, [sp], 32
    st1 {v4.8b - v7.8b}, [sp], 32
    st1 {v8.8b - v11.8b}, [sp], 32
    st1 {v12.8b - v15.8b}, [sp], 32
    st1 {v16.8b - v19.8b}, [sp], 32
    st1 {v20.8b - v23.8b}, [sp], 32
    st1 {v24.8b - v27.8b}, [sp], 32
    st1 {v28.8b - v31.8b}, [sp], 32
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | v4.4h  | v5.4h
     *   1  | v6.4h  | v7.4h
     *   2  | v8.4h  | v9.4h
     *   3  | v10.4h | v11.4h
     *   4  | -      | -
     *   5  | v12.4h | v13.4h
     *   6  | v14.4h | v15.4h
     *   7  | v16.4h | v17.4h
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16     /* skip row 4 (unused) */
    ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* dequantize: coef row N *= quant row N, then fuse halves to 128 bit */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.4h, v4.4h, v18.4h
    mul v5.4h, v5.4h, v19.4h
    ins v4.2d[1], v5.2d[0]              /* 128 bit q4 */
    ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul v6.4h, v6.4h, v20.4h
    mul v7.4h, v7.4h, v21.4h
    ins v6.2d[1], v7.2d[0]              /* 128 bit q6 */
    mul v8.4h, v8.4h, v22.4h
    mul v9.4h, v9.4h, v23.4h
    ins v8.2d[1], v9.2d[0]              /* 128 bit q8 */
    add DCT_TABLE, DCT_TABLE, #16       /* skip quant row 4 */
    ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul v10.4h, v10.4h, v24.4h
    mul v11.4h, v11.4h, v25.4h
    ins v10.2d[1], v11.2d[0]            /* 128 bit q10 */
    mul v12.4h, v12.4h, v26.4h
    mul v13.4h, v13.4h, v27.4h
    ins v12.2d[1], v13.2d[0]            /* 128 bit q12 */
    /* FIX: was 'ld1 {v30.8h}', which packed all 16 bytes into v30 and
     * left v31 uninitialized although v31.4h is used just below */
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v14.4h, v14.4h, v28.4h
    mul v15.4h, v15.4h, v29.4h
    ins v14.2d[1], v15.2d[0]            /* 128 bit q14 */
    mul v16.4h, v16.4h, v30.4h
    mul v17.4h, v17.4h, v31.4h
    ins v16.2d[1], v17.2d[0]            /* 128 bit q16 */

    /* Pass 1: columns 0-3 then columns 4-7 */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4 v4, v6, v8, v10, v3
    ins v10.2d[1], v11.2d[0]
    idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4 v5, v7, v9, v11, v3
    ins v10.2d[1], v11.2d[0]
    /* Pass 2: rows */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4 v26, v27, v28, v29, v3

    /* Range limit: +128 (CENTERJSAMPLE), saturate to unsigned 8-bit */
    movi v30.8h, #0x80
    ins v26.2d[1], v27.2d[0]
    ins v28.2d[1], v29.2d[0]
    add v26.8h, v26.8h, v30.8h
    add v28.8h, v28.8h, v30.8h
    sqxtun v26.8b, v26.8h
    sqxtun v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    ldp TMP3, TMP4, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1 {v26.s}[0], [TMP1], 4
    st1 {v27.s}[0], [TMP3], 4
    st1 {v26.s}[1], [TMP2], 4
    st1 {v27.s}[1], [TMP4], 4
#else
    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[0], [TMP3], 1
    st1 {v26.b}[1], [TMP1], 1
    st1 {v27.b}[1], [TMP3], 1
    st1 {v26.b}[2], [TMP1], 1
    st1 {v27.b}[2], [TMP3], 1
    st1 {v26.b}[3], [TMP1], 1
    st1 {v27.b}[3], [TMP3], 1

    st1 {v26.b}[4], [TMP2], 1
    st1 {v27.b}[4], [TMP4], 1
    st1 {v26.b}[5], [TMP2], 1
    st1 {v27.b}[5], [TMP4], 1
    st1 {v26.b}[6], [TMP2], 1
    st1 {v27.b}[6], [TMP4], 1
    st1 {v26.b}[7], [TMP2], 1
    st1 {v27.b}[7], [TMP4], 1
#endif

    /* vpop {v8.4h - v15.4h} ;not available -- restore manually */
    sub sp, sp, #272
    ldr x15, [sp], 16
    ld1 {v0.8b - v3.8b}, [sp], 32
    ld1 {v4.8b - v7.8b}, [sp], 32
    ld1 {v8.8b - v11.8b}, [sp], 32
    ld1 {v12.8b - v15.8b}, [sp], 32
    ld1 {v16.8b - v19.8b}, [sp], 32
    ld1 {v20.8b - v23.8b}, [sp], 32
    ld1 {v24.8b - v27.8b}, [sp], 32
    ld1 {v28.8b - v31.8b}, [sp], 32
    ret                                 /* was 'blr x30' (clobbers x30) */

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .endfunc
1313
1314 .purgem idct_helper
1315
1316
1317 /*****************************************************************************/
1318
1319 /*
1320 * jsimd_idct_2x2_neon
1321 *
1322 * This function contains inverse-DCT code for getting reduced-size
1323 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
1324 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1325 * function from jpeg-6b (jidctred.c).
1326 *
1327 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1328 * requires much less arithmetic operations and hence should be faster.
1329 * The primary purpose of this particular NEON optimized function is
1330 * bit exact compatibility with jpeg-6b.
1331 */
1332
/* Constant table for jsimd_idct_2x2_neon; the function loads these four
 * multipliers into v14.4h. */
.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822 /* d0[0] */
    .short FIX_0_850430095 /* d0[1] */
    .short -FIX_1_272758580 /* d0[2] */
    .short FIX_3_624509785 /* d0[3] */
1339
/* One 2-point 1-D IDCT pass for the 2x2 reduced IDCT.
 * \x4..\x16 : input rows/columns 0,1,3,5,7; \shift : descale amount;
 * \y26/\y27 : the two 16-bit outputs.
 * FIX: the multipliers live in v14.4h — the enclosing function loads the
 * constant table with 'ld1 {v14.4h}' and its inline pass-1 code uses
 * v14.4h lanes; this macro referenced v0.4h, which is never initialized
 * in jsimd_idct_2x2_neon, producing garbage in pass 2. */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll v15.4s, \x4, #15
    smull v26.4s, \x6, v14.4h[3]
    smlal v26.4s, \x10, v14.4h[2]
    smlal v26.4s, \x12, v14.4h[1]
    smlal v26.4s, \x16, v14.4h[0]

    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s

    /* rshrn encodes shifts of at most 16; pass 2 descales by 20 */
    .if \shift > 16
        srshr v20.4s, v20.4s, #\shift
        srshr v15.4s, v15.4s, #\shift
        xtn \y26, v20.4s
        xtn \y27, v15.4s
    .else
        rshrn \y26, v20.4s, #\shift
        rshrn \y27, v15.4s, #\shift
    .endif

.endm
1361
/*
 * jsimd_idct_2x2_neon
 *
 * void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
 *                          JSAMPARRAY output_buf, JDIMENSION output_col);
 * Reduced 8x8 -> 2x2 inverse DCT, bit-compatible with jpeg-6b jidctred.c.
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x15

    /* vpush {v8.4h - v15.4h} ; not available -- save manually */
    sub sp, sp, 208
    str x15, [sp], 16

    /* Load constants */
    adr TMP2, jsimd_idct_2x2_neon_consts
    st1 {v4.8b - v7.8b}, [sp], 32
    st1 {v8.8b - v11.8b}, [sp], 32
    st1 {v12.8b - v15.8b}, [sp], 32
    st1 {v16.8b - v19.8b}, [sp], 32
    st1 {v21.8b - v22.8b}, [sp], 16
    st1 {v24.8b - v27.8b}, [sp], 32
    st1 {v30.8b - v31.8b}, [sp], 16
    ld1 {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | v4.4h  | v5.4h
     *   1  | v6.4h  | v7.4h
     *   2  | -      | -
     *   3  | v10.4h | v11.4h
     *   4  | -      | -
     *   5  | v12.4h | v13.4h
     *   6  | -      | -
     *   7  | v16.4h | v17.4h
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16     /* skip row 2 */
    ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16     /* skip row 4 */
    ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16     /* skip row 6 */
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* Dequantize (high halves are don't-care; the ins below overwrites them) */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.8h, v4.8h, v18.8h
    mul v5.8h, v5.8h, v19.8h            /* FIX: was v18 — row-0 cols 4-7
                                         * were scaled by the cols 0-3
                                         * quant values */
    ins v4.2d[1], v5.2d[0]
    mul v6.8h, v6.8h, v20.8h
    mul v7.8h, v7.8h, v21.8h
    ins v6.2d[1], v7.2d[0]
    add DCT_TABLE, DCT_TABLE, #16       /* skip quant row 2 */
    ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul v10.8h, v10.8h, v24.8h
    mul v11.8h, v11.8h, v25.8h
    ins v10.2d[1], v11.2d[0]
    add DCT_TABLE, DCT_TABLE, #16       /* skip quant row 4 */
    ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul v12.8h, v12.8h, v26.8h
    mul v13.8h, v13.8h, v27.8h
    ins v12.2d[1], v13.2d[0]
    add DCT_TABLE, DCT_TABLE, #16       /* skip quant row 6 */
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v16.8h, v16.8h, v30.8h
    mul v17.8h, v17.8h, v31.8h
    ins v16.2d[1], v17.2d[0]

    /* Pass 1 (columns), both 4-lane halves processed in parallel */
#if 0
    idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
    transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
    idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
    transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
#else
    smull v26.4s, v6.4h, v14.4h[3]
    smlal v26.4s, v10.4h, v14.4h[2]
    smlal v26.4s, v12.4h, v14.4h[1]
    smlal v26.4s, v16.4h, v14.4h[0]
    smull v24.4s, v7.4h, v14.4h[3]
    smlal v24.4s, v11.4h, v14.4h[2]
    smlal v24.4s, v13.4h, v14.4h[1]
    smlal v24.4s, v17.4h, v14.4h[0]
    sshll v15.4s, v4.4h, #15
    sshll v30.4s, v5.4h, #15
    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s
    rshrn v4.4h, v20.4s, #13
    rshrn v6.4h, v15.4s, #13
    add v20.4s, v30.4s, v24.4s
    sub v15.4s, v30.4s, v24.4s
    rshrn v5.4h, v20.4s, #13
    rshrn v7.4h, v15.4s, #13
    transpose v4, v6, v3, .16b, .8h
    transpose v6, v10, v3, .16b, .4s
#endif

    /* Pass 2 (rows) */
    idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit: +128, saturate to unsigned 8-bit */
    movi v30.8h, #0x80
    ins v26.2d[1], v27.2d[0]
    add v26.8h, v26.8h, v30.8h
    sqxtun v30.8b, v26.8h
    ins v26.2d[0], v30.2d[0]
    sqxtun v27.8b, v26.8h

    /* Store the 2x2 result pixels */
    ldp TMP1, TMP2, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL

    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[4], [TMP1], 1
    st1 {v26.b}[1], [TMP2], 1
    st1 {v27.b}[5], [TMP2], 1

    /* Restore saved registers (mirror of the prologue) */
    sub sp, sp, #208
    ldr x15, [sp], 16
    ld1 {v4.8b - v7.8b}, [sp], 32
    ld1 {v8.8b - v11.8b}, [sp], 32
    ld1 {v12.8b - v15.8b}, [sp], 32
    ld1 {v16.8b - v19.8b}, [sp], 32
    ld1 {v21.8b - v22.8b}, [sp], 16
    ld1 {v24.8b - v27.8b}, [sp], 32
    ld1 {v30.8b - v31.8b}, [sp], 16
    ret                                 /* was 'blr x30' (clobbers x30) */

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .endfunc
1497
1498 .purgem idct_helper
1499
1500
1501 /*****************************************************************************/
1502
1503 /*
1504 * jsimd_ycc_extrgb_convert_neon
1505 * jsimd_ycc_extbgr_convert_neon
1506 * jsimd_ycc_extrgbx_convert_neon
1507 * jsimd_ycc_extbgrx_convert_neon
1508 * jsimd_ycc_extxbgr_convert_neon
1509 * jsimd_ycc_extxrgb_convert_neon
1510 *
1511 * Colorspace conversion YCbCr -> RGB
1512 */
1513
1514
/* Load \size Y/Cb/Cr samples into v0/v4/v5 byte lanes.
 * size==8 fills lanes 0-7; the partial sizes stack up so that a trailing
 * 1..7 pixels occupy lanes [0..3], [4..5], [6] respectively before a
 * final do_yuv_to_rgb. */
.macro do_load size
    .if \size == 8
        ld1 {v4.8b}, [U], 8
        ld1 {v5.8b}, [V], 8
        ld1 {v0.8b}, [Y], 8
        prfm PLDL1KEEP, [U, #64]
        prfm PLDL1KEEP, [V, #64]
        prfm PLDL1KEEP, [Y, #64]
    .elseif \size == 4
        /* FIX: every byte load must post-increment its pointer; the
         * original left U (all four lanes) and V (lane 0) un-advanced,
         * re-reading the same byte and desynchronizing the pointers. */
        ld1 {v4.b}[0], [U], 1
        ld1 {v4.b}[1], [U], 1
        ld1 {v4.b}[2], [U], 1
        ld1 {v4.b}[3], [U], 1
        ld1 {v5.b}[0], [V], 1
        ld1 {v5.b}[1], [V], 1
        ld1 {v5.b}[2], [V], 1
        ld1 {v5.b}[3], [V], 1
        ld1 {v0.b}[0], [Y], 1
        ld1 {v0.b}[1], [Y], 1
        ld1 {v0.b}[2], [Y], 1
        ld1 {v0.b}[3], [Y], 1
    .elseif \size == 2
        ld1 {v4.b}[4], [U], 1
        ld1 {v4.b}[5], [U], 1
        ld1 {v5.b}[4], [V], 1
        ld1 {v5.b}[5], [V], 1
        ld1 {v0.b}[4], [Y], 1
        ld1 {v0.b}[5], [Y], 1
    .elseif \size == 1
        ld1 {v4.b}[6], [U], 1
        ld1 {v5.b}[6], [V], 1
        ld1 {v0.b}[6], [Y], 1
    .else
        .error unsupported macroblock size
    .endif
.endm
1551
/* Store \size interleaved RGB(A) pixels from v10-v13 byte lanes.
 * Lane indices mirror the do_load fill order: [0..3], [4..5], [6]. */
.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
        .elseif \size == 4
            st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
            st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
            st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
            st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
        .elseif \size == 2
            st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
            st3 {v10.b, v11.b, v12.b}[5], [RGB], 3  /* FIX: was lane [4]
                                                     * stored twice; second
                                                     * pixel is lane [5],
                                                     * as in the 32 bpp
                                                     * branch below */
        .elseif \size == 1
            st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
        .elseif \size == 4
            st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
            st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
            st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
            st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
        .elseif \size == 2
            st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
            st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
        .elseif \size == 1
            st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm
1589
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

/* Stage 1: widen U/V (bias by -128 via v2) and form the four chroma
 * products in 32-bit accumulators v20-v31. */
.macro do_yuv_to_rgb_stage1
    uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
    smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
    smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
    smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
    smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
.endm

/* Stage 2: descale, add Y, and narrow into the output byte registers.
 * "v1\g_offs\defsize" pastes into e.g. v11.8b, routing each channel to
 * its position in the v10-v13 store group. */
.macro do_yuv_to_rgb_stage2
    rshrn v20.4h, v20.4s, #15
    rshrn2 v20.8h, v22.4s, #15
    rshrn v24.4h, v24.4s, #14
    rshrn2 v24.8h, v26.4s, #14
    rshrn v28.4h, v28.4s, #14
    rshrn2 v28.8h, v30.4s, #14
    uaddw v20.8h, v20.8h, v0.8b
    uaddw v24.8h, v24.8h, v0.8b
    uaddw v28.8h, v28.8h, v0.8b
    sqxtun v1\g_offs\defsize, v20.8h
    sqxtun v1\r_offs\defsize, v24.8h
    sqxtun v1\b_offs\defsize, v28.8h

.endm

/* Software-pipelined steady-state iteration: finish stage 2 of pixel
 * group N, store it, and start stage 1 of group N+1, with the loads
 * interleaved to hide latency. */
.macro do_yuv_to_rgb_stage2_store_load_stage1
    ld1 {v4.8b}, [U], 8
    rshrn v20.4h, v20.4s, #15
    rshrn2 v20.8h, v22.4s, #15
    rshrn v24.4h, v24.4s, #14
    rshrn2 v24.8h, v26.4s, #14
    rshrn v28.4h, v28.4s, #14
    ld1 {v5.8b}, [V], 8
    rshrn2 v28.8h, v30.4s, #14
    uaddw v20.8h, v20.8h, v0.8b
    uaddw v24.8h, v24.8h, v0.8b
    uaddw v28.8h, v28.8h, v0.8b
    sqxtun v1\g_offs\defsize, v20.8h
    ld1 {v0.8b}, [Y], 8
    sqxtun v1\r_offs\defsize, v24.8h
    prfm PLDL1KEEP, [U, #64]
    prfm PLDL1KEEP, [V, #64]
    prfm PLDL1KEEP, [Y, #64]
    sqxtun v1\b_offs\defsize, v28.8h
    uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
    do_store \bpp, 8
    smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
    smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
    smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
    smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0
    .short 22971, -11277, -23401, 29033
    .short -128, -128, -128, -128
    .short -128, -128, -128, -128

asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req x0
    INPUT_BUF       .req x1
    INPUT_ROW       .req x2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req x4

    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req INPUT_BUF

    RGB             .req x7
    Y               .req x8
    U               .req x9
    V               .req x10
    N               .req x15

    sub sp, sp, 336
    str x15, [sp], 16
    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
    adr x15, jsimd_ycc_\colorid\()_neon_consts
    /* Save NEON registers */
    st1 {v0.8b - v3.8b}, [sp], 32
    st1 {v4.8b - v7.8b}, [sp], 32
    st1 {v8.8b - v11.8b}, [sp], 32
    st1 {v12.8b - v15.8b}, [sp], 32
    st1 {v16.8b - v19.8b}, [sp], 32
    st1 {v20.8b - v23.8b}, [sp], 32
    st1 {v24.8b - v27.8b}, [sp], 32
    st1 {v28.8b - v31.8b}, [sp], 32
    ld1 {v0.4h, v1.4h}, [x15], 16
    ld1 {v2.8h}, [x15]

    /* Save ARM registers and handle input arguments */
    /* push {x4, x5, x6, x7, x8, x9, x10, x30} */
    stp x4, x5, [sp], 16
    stp x6, x7, [sp], 16
    stp x8, x9, [sp], 16
    stp x10, x30, [sp], 16
    ldr INPUT_BUF0, [INPUT_BUF]
    ldr INPUT_BUF1, [INPUT_BUF, 8]
    ldr INPUT_BUF2, [INPUT_BUF, 16]
    .unreq INPUT_BUF

    /* Preset the alpha-channel bytes to 0xFF. Stage 2 only ever writes
     * the R/G/B members of the v10-v13 store group, so the remaining
     * register is the alpha: v10 for X-leading layouts (extxbgr/extxrgb)
     * and v13 for X-trailing layouts (extrgbx/extbgrx).
     * FIX: was v10 and v12 — v12 always holds a colour channel and is
     * overwritten, while v13 (alpha for ext*x) stayed uninitialized. */
    movi v10.16b, #255
    movi v13.16b, #255

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    blt 9f
0:
    lsl x16, INPUT_ROW, #3
    ldr Y, [INPUT_BUF0, x16]
    ldr U, [INPUT_BUF1, x16]
    mov N, OUTPUT_WIDTH
    ldr V, [INPUT_BUF2, x16]
    add INPUT_ROW, INPUT_ROW, #1
    ldr RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs N, N, #8
    blt 3f
    do_load 8
    do_yuv_to_rgb_stage1
    subs N, N, #8
    blt 2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs N, N, #8
    bge 1b
2:
    do_yuv_to_rgb_stage2
    do_store \bpp, 8
    tst N, #7
    beq 8f
3:
    tst N, #4
    beq 3f
    do_load 4
3:
    tst N, #2
    beq 4f
    do_load 2
4:
    tst N, #1
    beq 5f
    do_load 1
5:
    do_yuv_to_rgb
    tst N, #4
    beq 6f
    do_store \bpp, 4
6:
    tst N, #2
    beq 7f
    do_store \bpp, 2
7:
    tst N, #1
    beq 8f
    do_store \bpp, 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    bgt 0b
9:
    /* Restore all registers and return */
    sub sp, sp, #336
    ldr x15, [sp], 16
    ld1 {v0.8b - v3.8b}, [sp], 32
    ld1 {v4.8b - v7.8b}, [sp], 32
    ld1 {v8.8b - v11.8b}, [sp], 32
    ld1 {v12.8b - v15.8b}, [sp], 32
    ld1 {v16.8b - v19.8b}, [sp], 32
    ld1 {v20.8b - v23.8b}, [sp], 32
    ld1 {v24.8b - v27.8b}, [sp], 32
    ld1 {v28.8b - v31.8b}, [sp], 32
    /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
    ldp x4, x5, [sp], 16
    ldp x6, x7, [sp], 16
    ldp x8, x9, [sp], 16
    ldp x10, x30, [sp], 16
    br x30
    .unreq OUTPUT_WIDTH
    .unreq INPUT_ROW
    .unreq OUTPUT_BUF
    .unreq NUM_ROWS
    .unreq INPUT_BUF0
    .unreq INPUT_BUF1
    .unreq INPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N
    .endfunc

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1
.endm
1814
/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h, 1, .4h, 2, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h, 1, .4h, 0, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b

.purgem do_load
.purgem do_store
OLDNEW
« libjpeg.gyp ('K') | « simd/jsimd_arm64.c ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698