Chromium Code Reviews

Side by Side Diff: simd/jsimd_arm64_neon.S

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master
Patch Set: Created 4 years, 7 months ago
OLD | NEW
1 /* 1 /*
2 * ARMv8 NEON optimizations for libjpeg-turbo 2 * ARMv8 NEON optimizations for libjpeg-turbo
3 * 3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved. 5 * All rights reserved.
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * Copyright (C) 2013-2014, Linaro Limited 7 * Copyright (C) 2013-2014, Linaro Limited
8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> 8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
9 * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved.
10 * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved.
11 * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
9 * 12 *
10 * This software is provided 'as-is', without any express or implied 13 * This software is provided 'as-is', without any express or implied
11 * warranty. In no event will the authors be held liable for any damages 14 * warranty. In no event will the authors be held liable for any damages
12 * arising from the use of this software. 15 * arising from the use of this software.
13 * 16 *
14 * Permission is granted to anyone to use this software for any purpose, 17 * Permission is granted to anyone to use this software for any purpose,
15 * including commercial applications, and to alter it and redistribute it 18 * including commercial applications, and to alter it and redistribute it
16 * freely, subject to the following restrictions: 19 * freely, subject to the following restrictions:
17 * 20 *
18 * 1. The origin of this software must not be misrepresented; you must not 21 * 1. The origin of this software must not be misrepresented; you must not
19 * claim that you wrote the original software. If you use this software 22 * claim that you wrote the original software. If you use this software
20 * in a product, an acknowledgment in the product documentation would be 23 * in a product, an acknowledgment in the product documentation would be
21 * appreciated but is not required. 24 * appreciated but is not required.
22 * 2. Altered source versions must be plainly marked as such, and must not be 25 * 2. Altered source versions must be plainly marked as such, and must not be
23 * misrepresented as being the original software. 26 * misrepresented as being the original software.
24 * 3. This notice may not be removed or altered from any source distribution. 27 * 3. This notice may not be removed or altered from any source distribution.
25 */ 28 */
26 29
27 #if defined(__linux__) && defined(__ELF__) 30 #if defined(__linux__) && defined(__ELF__)
28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ 31 .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
29 #endif 32 #endif
30 33
31 .text 34 .text
32 .arch armv8-a+fp+simd
33 35
34 36
35 #define RESPECT_STRICT_ALIGNMENT 1 37 #define RESPECT_STRICT_ALIGNMENT 1
36 38
37 39
38 /*****************************************************************************/ 40 /*****************************************************************************/
39 41
40 /* Supplementary macro for setting function attributes */ 42 /* Supplementary macro for setting function attributes */
41 .macro asm_function fname 43 .macro asm_function fname
42 #ifdef __APPLE__ 44 #ifdef __APPLE__
43 .globl _\fname 45 .globl _\fname
44 _\fname: 46 _\fname:
45 #else 47 #else
46 .global \fname 48 .global \fname
47 #ifdef __ELF__ 49 #ifdef __ELF__
48 .hidden \fname 50 .hidden \fname
49 .type \fname, %function 51 .type \fname, %function
50 #endif 52 #endif
51 \fname: 53 \fname:
52 #endif 54 #endif
53 .endm 55 .endm
54 56
55 /* Transpose elements of single 128 bit registers */ 57 /* Transpose elements of single 128 bit registers */
56 .macro transpose_single x0,x1,xi,xilen,literal 58 .macro transpose_single x0, x1, xi, xilen, literal
57 ins \xi\xilen[0], \x0\xilen[0] 59 ins \xi\xilen[0], \x0\xilen[0]
58 ins \x1\xilen[0], \x0\xilen[1] 60 ins \x1\xilen[0], \x0\xilen[1]
59 trn1 \x0\literal, \x0\literal, \x1\literal 61 trn1 \x0\literal, \x0\literal, \x1\literal
60 trn2 \x1\literal, \xi\literal, \x1\literal 62 trn2 \x1\literal, \xi\literal, \x1\literal
61 .endm 63 .endm
62 64
63 /* Transpose elements of 2 different registers */ 65 /* Transpose elements of 2 different registers */
64 .macro transpose x0,x1,xi,xilen,literal 66 .macro transpose x0, x1, xi, xilen, literal
65 mov \xi\xilen, \x0\xilen 67 mov \xi\xilen, \x0\xilen
66 trn1 \x0\literal, \x0\literal, \x1\literal 68 trn1 \x0\literal, \x0\literal, \x1\literal
67 trn2 \x1\literal, \xi\literal, \x1\literal 69 trn2 \x1\literal, \xi\literal, \x1\literal
68 .endm 70 .endm
69 71
70 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ 72 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
71 .macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen 73 .macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
72 mov \xi\xilen, \x0\xilen 74 mov \xi\xilen, \x0\xilen
73 trn1 \x0\x0len, \x0\x0len, \x2\x2len 75 trn1 \x0\x0len, \x0\x0len, \x2\x2len
74 trn2 \x2\x2len, \xi\x0len, \x2\x2len 76 trn2 \x2\x2len, \xi\x0len, \x2\x2len
75 mov \xi\xilen, \x1\xilen 77 mov \xi\xilen, \x1\xilen
76 trn1 \x1\x1len, \x1\x1len, \x3\x3len 78 trn1 \x1\x1len, \x1\x1len, \x3\x3len
77 trn2 \x3\x3len, \xi\x1len, \x3\x3len 79 trn2 \x3\x3len, \xi\x1len, \x3\x3len
78 .endm 80 .endm
79 81
80 .macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen 82 .macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
81 mov \xi\xilen, \x0\xilen 83 mov \xi\xilen, \x0\xilen
82 trn1 \x0\x0len, \x0\x0len, \x1\x1len 84 trn1 \x0\x0len, \x0\x0len, \x1\x1len
83 trn2 \x1\x2len, \xi\x0len, \x1\x2len 85 trn2 \x1\x2len, \xi\x0len, \x1\x2len
84 mov \xi\xilen, \x2\xilen 86 mov \xi\xilen, \x2\xilen
85 trn1 \x2\x2len, \x2\x2len, \x3\x3len 87 trn1 \x2\x2len, \x2\x2len, \x3\x3len
86 trn2 \x3\x2len, \xi\x1len, \x3\x3len 88 trn2 \x3\x2len, \xi\x1len, \x3\x3len
87 .endm 89 .endm
88 90
89 .macro transpose_4x4 x0, x1, x2, x3,x5 91 .macro transpose_4x4 x0, x1, x2, x3, x5
90 transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b 92 transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
91 transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b 93 transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
94 .endm
95
96 .macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
97 trn1 \t0\().8h, \l0\().8h, \l1\().8h
98 trn1 \t1\().8h, \l2\().8h, \l3\().8h
99 trn1 \t2\().8h, \l4\().8h, \l5\().8h
100 trn1 \t3\().8h, \l6\().8h, \l7\().8h
101 trn2 \l1\().8h, \l0\().8h, \l1\().8h
102 trn2 \l3\().8h, \l2\().8h, \l3\().8h
103 trn2 \l5\().8h, \l4\().8h, \l5\().8h
104 trn2 \l7\().8h, \l6\().8h, \l7\().8h
105
106 trn1 \l4\().4s, \t2\().4s, \t3\().4s
107 trn2 \t3\().4s, \t2\().4s, \t3\().4s
108 trn1 \t2\().4s, \t0\().4s, \t1\().4s
109 trn2 \l2\().4s, \t0\().4s, \t1\().4s
110 trn1 \t0\().4s, \l1\().4s, \l3\().4s
111 trn2 \l3\().4s, \l1\().4s, \l3\().4s
112 trn2 \t1\().4s, \l5\().4s, \l7\().4s
113 trn1 \l5\().4s, \l5\().4s, \l7\().4s
114
115 trn2 \l6\().2d, \l2\().2d, \t3\().2d
116 trn1 \l0\().2d, \t2\().2d, \l4\().2d
117 trn1 \l1\().2d, \t0\().2d, \l5\().2d
118 trn2 \l7\().2d, \l3\().2d, \t1\().2d
119 trn1 \l2\().2d, \l2\().2d, \t3\().2d
120 trn2 \l4\().2d, \t2\().2d, \l4\().2d
121 trn1 \l3\().2d, \l3\().2d, \t1\().2d
122 trn2 \l5\().2d, \t0\().2d, \l5\().2d
92 .endm 123 .endm
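
The new transpose_8x8 macro performs a full 8x8 transpose of 16-bit elements as a trn1/trn2 cascade at 16-, 32-, and 64-bit granularity. A rough C-intrinsics sketch of the same idea follows (a generic transpose under that three-stage scheme, not the macro's exact register allocation; the function name is illustrative):

    #include <arm_neon.h>

    /* Transpose an 8x8 block of int16 held in r[0..7] (one row per vector). */
    static void transpose_8x8_s16(int16x8_t r[8])
    {
        /* Stage 1: interleave 16-bit lanes of adjacent rows. */
        int16x8_t a0 = vtrn1q_s16(r[0], r[1]), a1 = vtrn2q_s16(r[0], r[1]);
        int16x8_t a2 = vtrn1q_s16(r[2], r[3]), a3 = vtrn2q_s16(r[2], r[3]);
        int16x8_t a4 = vtrn1q_s16(r[4], r[5]), a5 = vtrn2q_s16(r[4], r[5]);
        int16x8_t a6 = vtrn1q_s16(r[6], r[7]), a7 = vtrn2q_s16(r[6], r[7]);

        /* Stage 2: interleave 32-bit pairs. */
        int32x4_t b0 = vtrn1q_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a2));
        int32x4_t b2 = vtrn2q_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a2));
        int32x4_t b1 = vtrn1q_s32(vreinterpretq_s32_s16(a1), vreinterpretq_s32_s16(a3));
        int32x4_t b3 = vtrn2q_s32(vreinterpretq_s32_s16(a1), vreinterpretq_s32_s16(a3));
        int32x4_t b4 = vtrn1q_s32(vreinterpretq_s32_s16(a4), vreinterpretq_s32_s16(a6));
        int32x4_t b6 = vtrn2q_s32(vreinterpretq_s32_s16(a4), vreinterpretq_s32_s16(a6));
        int32x4_t b5 = vtrn1q_s32(vreinterpretq_s32_s16(a5), vreinterpretq_s32_s16(a7));
        int32x4_t b7 = vtrn2q_s32(vreinterpretq_s32_s16(a5), vreinterpretq_s32_s16(a7));

        /* Stage 3: interleave 64-bit halves; row i now holds old column i. */
        r[0] = vreinterpretq_s16_s64(vtrn1q_s64(vreinterpretq_s64_s32(b0), vreinterpretq_s64_s32(b4)));
        r[4] = vreinterpretq_s16_s64(vtrn2q_s64(vreinterpretq_s64_s32(b0), vreinterpretq_s64_s32(b4)));
        r[1] = vreinterpretq_s16_s64(vtrn1q_s64(vreinterpretq_s64_s32(b1), vreinterpretq_s64_s32(b5)));
        r[5] = vreinterpretq_s16_s64(vtrn2q_s64(vreinterpretq_s64_s32(b1), vreinterpretq_s64_s32(b5)));
        r[2] = vreinterpretq_s16_s64(vtrn1q_s64(vreinterpretq_s64_s32(b2), vreinterpretq_s64_s32(b6)));
        r[6] = vreinterpretq_s16_s64(vtrn2q_s64(vreinterpretq_s64_s32(b2), vreinterpretq_s64_s32(b6)));
        r[3] = vreinterpretq_s16_s64(vtrn1q_s64(vreinterpretq_s64_s32(b3), vreinterpretq_s64_s32(b7)));
        r[7] = vreinterpretq_s16_s64(vtrn2q_s64(vreinterpretq_s64_s32(b3), vreinterpretq_s64_s32(b7)));
    }

    int main(void)
    {
        int16_t buf[64];
        for (int i = 0; i < 64; i++) buf[i] = (int16_t)i;
        int16x8_t r[8];
        for (int i = 0; i < 8; i++) r[i] = vld1q_s16(buf + 8 * i);
        transpose_8x8_s16(r);
        for (int i = 0; i < 8; i++) vst1q_s16(buf + 8 * i, r[i]);
        return buf[1] == 8 ? 0 : 1;  /* element (0,1) now holds old (1,0) */
    }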
93 124
94 125
95 #define CENTERJSAMPLE 128 126 #define CENTERJSAMPLE 128
96 127
97 /*****************************************************************************/ 128 /*****************************************************************************/
98 129
99 /* 130 /*
100 * Perform dequantization and inverse DCT on one block of coefficients. 131 * Perform dequantization and inverse DCT on one block of coefficients.
101 * 132 *
102 * GLOBAL(void) 133 * GLOBAL(void)
103 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, 134 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
104 * JSAMPARRAY output_buf, JDIMENSION output_col) 135 * JSAMPARRAY output_buf, JDIMENSION output_col)
105 */ 136 */
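
For reference, a hypothetical C-side caller of this routine might look as follows. The typedefs assume libjpeg's default 8-bit configuration; decode_block and its parameter names are illustrative, not part of the libjpeg-turbo API:

    typedef short JCOEF;
    typedef JCOEF *JCOEFPTR;
    typedef unsigned char JSAMPLE;
    typedef JSAMPLE *JSAMPROW;
    typedef JSAMPROW *JSAMPARRAY;
    typedef unsigned int JDIMENSION;

    void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
                               JSAMPARRAY output_buf, JDIMENSION output_col);

    static void decode_block(short dct_table[64], JCOEF coef_block[64],
                             JSAMPROW rows[8], JDIMENSION col)
    {
        /* dct_table holds the 64 dequantization multipliers; the
         * reconstructed 8x8 samples land at rows[0..7][col..col+7]. */
        jsimd_idct_islow_neon(dct_table, coef_block, rows, col);
    }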
106 137
107 #define FIX_0_298631336 (2446) 138 #define CONST_BITS 13
108 #define FIX_0_390180644 (3196) 139 #define PASS1_BITS 2
109 #define FIX_0_541196100 (4433) 140
110 #define FIX_0_765366865 (6270) 141 #define F_0_298 2446 /* FIX(0.298631336) */
111 #define FIX_0_899976223 (7373) 142 #define F_0_390 3196 /* FIX(0.390180644) */
112 #define FIX_1_175875602 (9633) 143 #define F_0_541 4433 /* FIX(0.541196100) */
113 #define FIX_1_501321110 (12299) 144 #define F_0_765 6270 /* FIX(0.765366865) */
114 #define FIX_1_847759065 (15137) 145 #define F_0_899 7373 /* FIX(0.899976223) */
115 #define FIX_1_961570560 (16069) 146 #define F_1_175 9633 /* FIX(1.175875602) */
116 #define FIX_2_053119869 (16819) 147 #define F_1_501 12299 /* FIX(1.501321110) */
117 #define FIX_2_562915447 (20995) 148 #define F_1_847 15137 /* FIX(1.847759065) */
118 #define FIX_3_072711026 (25172) 149 #define F_1_961 16069 /* FIX(1.961570560) */
119 150 #define F_2_053 16819 /* FIX(2.053119869) */
120 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) 151 #define F_2_562 20995 /* FIX(2.562915447) */
121 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) 152 #define F_3_072 25172 /* FIX(3.072711026) */
122 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
123 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
124 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
125 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
126 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
127 #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
128
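A minimal sketch of where these constants come from, matching the FIX() macro in libjpeg's jidctint.c (the CONST_BITS == 13 scaling is the same one used throughout this file):

    #include <stdio.h>

    #define CONST_BITS 13
    #define FIX(x) ((int)((x) * (1 << CONST_BITS) + 0.5))

    int main(void)
    {
        printf("%d\n", FIX(0.541196100)); /* prints 4433, i.e. F_0_541 */
        printf("%d\n", FIX(1.175875602)); /* prints 9633, i.e. F_1_175 */
        return 0;
    }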
129 /*
130 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
131 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
132 */
133 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
134 { \
135 DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
136 INT32 q1, q2, q3, q4, q5, q6, q7; \
137 INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
138 \
139 /* 1-D iDCT input data */ \
140 row0 = xrow0; \
141 row1 = xrow1; \
142 row2 = xrow2; \
143 row3 = xrow3; \
144 row4 = xrow4; \
145 row5 = xrow5; \
146 row6 = xrow6; \
147 row7 = xrow7; \
148 \
149 q5 = row7 + row3; \
150 q4 = row5 + row1; \
151 q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
152 MULTIPLY(q4, FIX_1_175875602); \
153 q7 = MULTIPLY(q5, FIX_1_175875602) + \
154 MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
155 q2 = MULTIPLY(row2, FIX_0_541196100) + \
156 MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
157 q4 = q6; \
158 q3 = ((INT32) row0 - (INT32) row4) << 13; \
159 q6 += MULTIPLY(row5, -FIX_2_562915447) + \
160 MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
161 /* now we can use q1 (reloadable constants have been used up) */ \
162 q1 = q3 + q2; \
163 q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
164 MULTIPLY(row1, -FIX_0_899976223); \
165 q5 = q7; \
166 q1 = q1 + q6; \
167 q7 += MULTIPLY(row7, -FIX_0_899976223) + \
168 MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
169 \
170 /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
171 tmp11_plus_tmp2 = q1; \
172 row1 = 0; \
173 \
174 q1 = q1 - q6; \
175 q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
176 MULTIPLY(row3, -FIX_2_562915447); \
177 q1 = q1 - q6; \
178 q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
179 MULTIPLY(row6, FIX_0_541196100); \
180 q3 = q3 - q2; \
181 \
182 /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
183 tmp11_minus_tmp2 = q1; \
184 \
185 q1 = ((INT32) row0 + (INT32) row4) << 13; \
186 q2 = q1 + q6; \
187 q1 = q1 - q6; \
188 \
189 /* pick up the results */ \
190 tmp0 = q4; \
191 tmp1 = q5; \
192 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
193 tmp3 = q7; \
194 tmp10 = q2; \
195 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
196 tmp12 = q3; \
197 tmp13 = q1; \
198 }
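One subtlety in REF_1D_IDCT above: tmp11 and tmp2 are never materialized directly; the macro only produces their sum and difference, and both values are recovered at the end by halving. A tiny stand-alone illustration of that recovery (names are illustrative):

    #include <assert.h>

    static void recover(int sum, int diff, int *tmp11, int *tmp2)
    {
        *tmp2  = (sum - diff) / 2;  /* (tmp11+tmp2) - (tmp11-tmp2) = 2*tmp2  */
        *tmp11 = (sum + diff) / 2;  /* (tmp11+tmp2) + (tmp11-tmp2) = 2*tmp11 */
    }

    int main(void)
    {
        int t11, t2;
        recover(30 + 12, 30 - 12, &t11, &t2);
        assert(t11 == 30 && t2 == 12);
        return 0;
    }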
199
200 #define XFIX_0_899976223 v0.4h[0]
201 #define XFIX_0_541196100 v0.4h[1]
202 #define XFIX_2_562915447 v0.4h[2]
203 #define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3]
204 #define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0]
205 #define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1]
206 #define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2]
207 #define XFIX_1_175875602 v1.4h[3]
208 #define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0]
209 #define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1]
210 #define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2]
211 #define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3]
212 153
213 .balign 16 154 .balign 16
214 jsimd_idct_islow_neon_consts: 155 Ljsimd_idct_islow_neon_consts:
215 .short FIX_0_899976223 /* d0[0] */ 156 .short F_0_298
216 .short FIX_0_541196100 /* d0[1] */ 157 .short -F_0_390
217 .short FIX_2_562915447 /* d0[2] */ 158 .short F_0_541
218 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ 159 .short F_0_765
219 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ 160 .short - F_0_899
220 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ 161 .short F_1_175
221 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ 162 .short F_1_501
222 .short FIX_1_175875602 /* d1[3] */ 163 .short - F_1_847
223 /* reloadable constants */ 164 .short - F_1_961
224 .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ 165 .short F_2_053
225 .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ 166 .short - F_2_562
226 .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ 167 .short F_3_072
227 .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ 168 .short 0 /* padding */
169 .short 0
170 .short 0
171 .short 0
172
173 #undef F_0_298
174 #undef F_0_390
175 #undef F_0_541
176 #undef F_0_765
177 #undef F_0_899
178 #undef F_1_175
179 #undef F_1_501
180 #undef F_1_847
181 #undef F_1_961
182 #undef F_2_053
183 #undef F_2_562
184 #undef F_3_072
185
186 #define XFIX_P_0_298 v0.h[0]
187 #define XFIX_N_0_390 v0.h[1]
188 #define XFIX_P_0_541 v0.h[2]
189 #define XFIX_P_0_765 v0.h[3]
190 #define XFIX_N_0_899 v0.h[4]
191 #define XFIX_P_1_175 v0.h[5]
192 #define XFIX_P_1_501 v0.h[6]
193 #define XFIX_N_1_847 v0.h[7]
194 #define XFIX_N_1_961 v1.h[0]
195 #define XFIX_P_2_053 v1.h[1]
196 #define XFIX_N_2_562 v1.h[2]
197 #define XFIX_P_3_072 v1.h[3]
228 198
229 asm_function jsimd_idct_islow_neon 199 asm_function jsimd_idct_islow_neon
230
231 DCT_TABLE .req x0 200 DCT_TABLE .req x0
232 COEF_BLOCK .req x1 201 COEF_BLOCK .req x1
233 OUTPUT_BUF .req x2 202 OUTPUT_BUF .req x2
234 OUTPUT_COL .req x3 203 OUTPUT_COL .req x3
235 TMP1 .req x0 204 TMP1 .req x0
236 TMP2 .req x1 205 TMP2 .req x1
237 TMP3 .req x2 206 TMP3 .req x9
238 TMP4 .req x15 207 TMP4 .req x10
239 208 TMP5 .req x11
240 ROW0L .req v16 209 TMP6 .req x12
241 ROW0R .req v17 210 TMP7 .req x13
242 ROW1L .req v18 211 TMP8 .req x14
243 ROW1R .req v19 212
244 ROW2L .req v20 213 sub sp, sp, #64
245 ROW2R .req v21 214 adr x15, Ljsimd_idct_islow_neon_consts
246 ROW3L .req v22 215 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
247 ROW3R .req v23 216 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
248 ROW4L .req v24 217 ld1 {v0.8h, v1.8h}, [x15]
249 ROW4R .req v25 218 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
250 ROW5L .req v26 219 ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
251 ROW5R .req v27 220 ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
252 ROW6L .req v28 221 ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
253 ROW6R .req v29 222
254 ROW7L .req v30 223 cmeq v16.8h, v3.8h, #0
255 ROW7R .req v31 224 cmeq v26.8h, v4.8h, #0
256 /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */ 225 cmeq v27.8h, v5.8h, #0
257 sub sp, sp, 272 226 cmeq v28.8h, v6.8h, #0
258 str x15, [sp], 16 227 cmeq v29.8h, v7.8h, #0
259 adr x15, jsimd_idct_islow_neon_consts 228 cmeq v30.8h, v8.8h, #0
260 st1 {v0.8b - v3.8b}, [sp], 32 229 cmeq v31.8h, v9.8h, #0
261 st1 {v4.8b - v7.8b}, [sp], 32 230
262 st1 {v8.8b - v11.8b}, [sp], 32 231 and v10.16b, v16.16b, v26.16b
263 st1 {v12.8b - v15.8b}, [sp], 32 232 and v11.16b, v27.16b, v28.16b
264 st1 {v16.8b - v19.8b}, [sp], 32 233 and v12.16b, v29.16b, v30.16b
265 st1 {v20.8b - v23.8b}, [sp], 32 234 and v13.16b, v31.16b, v10.16b
266 st1 {v24.8b - v27.8b}, [sp], 32 235 and v14.16b, v11.16b, v12.16b
267 st1 {v28.8b - v31.8b}, [sp], 32 236 mul v2.8h, v2.8h, v18.8h
268 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 237 and v15.16b, v13.16b, v14.16b
269 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 238 shl v10.8h, v2.8h, #(PASS1_BITS)
270 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 239 sqxtn v16.8b, v15.8h
271 mul v16.4h, v16.4h, v0.4h 240 mov TMP1, v16.d[0]
272 mul v17.4h, v17.4h, v1.4h 241 sub sp, sp, #64
273 ins v16.2d[1], v17.2d[0] /* 128 bit q8 */ 242 mvn TMP2, TMP1
274 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 243
275 mul v18.4h, v18.4h, v2.4h 244 cbnz TMP2, 2f
276 mul v19.4h, v19.4h, v3.4h 245 /* case all AC coeffs are zeros */
277 ins v18.2d[1], v19.2d[0] /* 128 bit q9 */ 246 dup v2.2d, v10.d[0]
278 ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 247 dup v6.2d, v10.d[1]
279 mul v20.4h, v20.4h, v4.4h 248 mov v3.16b, v2.16b
280 mul v21.4h, v21.4h, v5.4h 249 mov v7.16b, v6.16b
281 ins v20.2d[1], v21.2d[0] /* 128 bit q10 */ 250 mov v4.16b, v2.16b
282 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 251 mov v8.16b, v6.16b
283 mul v22.4h, v22.4h, v6.4h 252 mov v5.16b, v2.16b
284 mul v23.4h, v23.4h, v7.4h 253 mov v9.16b, v6.16b
285 ins v22.2d[1], v23.2d[0] /* 128 bit q11 */ 254 1:
286 ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] 255 /* for this transpose, we should organise data like this:
287 mul v24.4h, v24.4h, v0.4h 256 * 00, 01, 02, 03, 40, 41, 42, 43
288 mul v25.4h, v25.4h, v1.4h 257 * 10, 11, 12, 13, 50, 51, 52, 53
289 ins v24.2d[1], v25.2d[0] /* 128 bit q12 */ 258 * 20, 21, 22, 23, 60, 61, 62, 63
290 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 259 * 30, 31, 32, 33, 70, 71, 72, 73
291 mul v28.4h, v28.4h, v4.4h 260 * 04, 05, 06, 07, 44, 45, 46, 47
292 mul v29.4h, v29.4h, v5.4h 261 * 14, 15, 16, 17, 54, 55, 56, 57
293 ins v28.2d[1], v29.2d[0] /* 128 bit q14 */ 262 * 24, 25, 26, 27, 64, 65, 66, 67
294 mul v26.4h, v26.4h, v2.4h 263 * 34, 35, 36, 37, 74, 75, 76, 77
295 mul v27.4h, v27.4h, v3.4h 264 */
296 ins v26.2d[1], v27.2d[0] /* 128 bit q13 */ 265 trn1 v28.8h, v2.8h, v3.8h
297 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */ 266 trn1 v29.8h, v4.8h, v5.8h
298 add x15, x15, #16 267 trn1 v30.8h, v6.8h, v7.8h
299 mul v30.4h, v30.4h, v6.4h 268 trn1 v31.8h, v8.8h, v9.8h
300 mul v31.4h, v31.4h, v7.4h 269 trn2 v16.8h, v2.8h, v3.8h
301 ins v30.2d[1], v31.2d[0] /* 128 bit q15 */ 270 trn2 v17.8h, v4.8h, v5.8h
302 /* Go to the bottom of the stack */ 271 trn2 v18.8h, v6.8h, v7.8h
303 sub sp, sp, 352 272 trn2 v19.8h, v8.8h, v9.8h
304 stp x4, x5, [sp], 16 273 trn1 v2.4s, v28.4s, v29.4s
305 st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ 274 trn1 v6.4s, v30.4s, v31.4s
306 st1 {v12.4h - v15.4h}, [sp], 32 275 trn1 v3.4s, v16.4s, v17.4s
307 /* 1-D IDCT, pass 1, left 4x8 half */ 276 trn1 v7.4s, v18.4s, v19.4s
308 add v4.4h, ROW7L.4h, ROW3L.4h 277 trn2 v4.4s, v28.4s, v29.4s
309 add v5.4h, ROW5L.4h, ROW1L.4h 278 trn2 v8.4s, v30.4s, v31.4s
310 smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 279 trn2 v5.4s, v16.4s, v17.4s
311 smlal v12.4s, v5.4h, XFIX_1_175875602 280 trn2 v9.4s, v18.4s, v19.4s
312 smull v14.4s, v4.4h, XFIX_1_175875602 281 /* Even part: reverse the even part of the forward DCT. */
313 /* Check for the zero coefficients in the right 4x8 half */ 282 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
314 smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644 283 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
315 ssubl v6.4s, ROW0L.4h, ROW4L.4h 284 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
316 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] 285 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
317 smull v4.4s, ROW2L.4h, XFIX_0_541196100 286 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
318 smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065 287 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
319 orr x0, x4, x5 288 mov v21.16b, v19.16b /* tmp3 = z1 */
320 mov v8.16b, v12.16b 289 mov v20.16b, v18.16b /* tmp3 = z1 */
321 smlsl v12.4s, ROW5L.4h, XFIX_2_562915447 290 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
322 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] 291 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
323 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 292 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
324 shl v6.4s, v6.4s, #13 293 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
325 orr x0, x0, x4 294 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
326 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 295 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
327 orr x0, x0, x5 296 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
328 add v2.4s, v6.4s, v4.4s 297 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
329 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] 298 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
330 mov v10.16b, v14.16b 299 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
331 add v2.4s, v2.4s, v12.4s 300 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
332 orr x0, x0, x4 301 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
333 smlsl v14.4s, ROW7L.4h, XFIX_0_899976223 302 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
334 orr x0, x0, x5 303 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
335 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 304 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
336 rshrn ROW1L.4h, v2.4s, #11 305
337 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] 306 /* Odd part per figure 8; the matrix is unitary and hence its
338 sub v2.4s, v2.4s, v12.4s 307 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
339 smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447 308 */
340 orr x0, x0, x4 309
341 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 310 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
342 orr x0, x0, x5 311 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
343 sub v2.4s, v2.4s, v12.4s 312 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
344 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 313 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
345 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] 314 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
346 smlal v12.4s, ROW6L.4h, XFIX_0_541196100 315
347 sub v6.4s, v6.4s, v4.4s 316 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
348 orr x0, x0, x4 317 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
349 rshrn ROW6L.4h, v2.4s, #11 318 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
350 orr x0, x0, x5 319 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
351 add v2.4s, v6.4s, v10.4s 320 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
352 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] 321 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
353 sub v6.4s, v6.4s, v10.4s 322 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
354 saddl v10.4s, ROW0L.4h, ROW4L.4h 323 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
355 orr x0, x0, x4 324 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
356 rshrn ROW2L.4h, v2.4s, #11 325
357 orr x0, x0, x5 326 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
358 rshrn ROW5L.4h, v6.4s, #11 327 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
359 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] 328 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
360 shl v10.4s, v10.4s, #13 329 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
361 smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223 330 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
362 orr x0, x0, x4 331 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
363 add v4.4s, v10.4s, v12.4s 332 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
364 orr x0, x0, x5 333 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
365 cmp x0, #0 /* orrs instruction removed */ 334 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
366 sub v2.4s, v10.4s, v12.4s 335
367 add v12.4s, v4.4s, v14.4s 336 add v23.4s, v23.4s, v27.4s /* z3 += z5 */
368 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] 337 add v22.4s, v22.4s, v26.4s /* z3 += z5 */
369 sub v4.4s, v4.4s, v14.4s 338 add v25.4s, v25.4s, v27.4s /* z4 += z5 */
370 add v10.4s, v2.4s, v8.4s 339 add v24.4s, v24.4s, v26.4s /* z4 += z5 */
371 orr x0, x4, x5 340
372 sub v6.4s, v2.4s, v8.4s 341 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
373 /* pop {x4, x5} */ 342 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
374 sub sp, sp, 80 343 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
375 ldp x4, x5, [sp], 16 344 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
376 rshrn ROW7L.4h, v4.4s, #11 345 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
377 rshrn ROW3L.4h, v10.4s, #11 346 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
378 rshrn ROW0L.4h, v12.4s, #11 347 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
379 rshrn ROW4L.4h, v6.4s, #11 348 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
380 349
381 beq 3f /* Go to do some special handling for the sparse right 4x8 half */ 350 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
382 351 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
383 /* 1-D IDCT, pass 1, right 4x8 half */ 352 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
384 ld1 {v2.4h}, [x15] /* reload constants */ 353 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
385 add v10.4h, ROW7R.4h, ROW3R.4h 354 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
386 add v8.4h, ROW5R.4h, ROW1R.4h 355 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
387 /* Transpose ROW6L <-> ROW7L (v3 available free register) */ 356 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
388 transpose ROW6L, ROW7L, v3, .16b, .4h 357 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
389 smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560 358
390 smlal v12.4s, v8.4h, XFIX_1_175875602 359 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
391 /* Transpose ROW2L <-> ROW3L (v3 available free register) */ 360
392 transpose ROW2L, ROW3L, v3, .16b, .4h 361 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
393 smull v14.4s, v10.4h, XFIX_1_175875602 362 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
394 smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644 363 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
395 /* Transpose ROW0L <-> ROW1L (v3 available free register) */ 364 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
396 transpose ROW0L, ROW1L, v3, .16b, .4h 365 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
397 ssubl v6.4s, ROW0R.4h, ROW4R.4h 366 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
398 smull v4.4s, ROW2R.4h, XFIX_0_541196100 367 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
399 smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 368 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
400 /* Transpose ROW4L <-> ROW5L (v3 available free register) */ 369 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
401 transpose ROW4L, ROW5L, v3, .16b, .4h 370 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
402 mov v8.16b, v12.16b 371 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
403 smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 372 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
404 smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447 373 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
405 /* Transpose ROW1L <-> ROW3L (v3 available free register) */ 374 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
406 transpose ROW1L, ROW3L, v3, .16b, .2s 375 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
407 shl v6.4s, v6.4s, #13 376 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
408 smlsl v8.4s, ROW1R.4h, XFIX_0_899976223 377
409 /* Transpose ROW4L <-> ROW6L (v3 available free register) */ 378 shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
410 transpose ROW4L, ROW6L, v3, .16b, .2s 379 shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
411 add v2.4s, v6.4s, v4.4s 380 shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
412 mov v10.16b, v14.16b 381 shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
413 add v2.4s, v2.4s, v12.4s 382 shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
414 /* Transpose ROW0L <-> ROW2L (v3 available free register) */ 383 shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
415 transpose ROW0L, ROW2L, v3, .16b, .2s 384 shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
416 smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 385 shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
417 smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223 386 shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
418 rshrn ROW1R.4h, v2.4s, #11 387 shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
419 /* Transpose ROW5L <-> ROW7L (v3 available free register) */ 388 shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
420 transpose ROW5L, ROW7L, v3, .16b, .2s 389 shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
421 sub v2.4s, v2.4s, v12.4s 390 shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
422 smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 391 shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
423 smlsl v10.4s, ROW3R.4h, XFIX_2_562915447 392 shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
424 sub v2.4s, v2.4s, v12.4s 393 shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
425 smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865 394 movi v0.16b, #(CENTERJSAMPLE)
426 smlal v12.4s, ROW6R.4h, XFIX_0_541196100 395 /* Prepare pointers (dual-issue with NEON instructions) */
427 sub v6.4s, v6.4s, v4.4s 396 ldp TMP1, TMP2, [OUTPUT_BUF], 16
428 rshrn ROW6R.4h, v2.4s, #11 397 sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
429 add v2.4s, v6.4s, v10.4s 398 ldp TMP3, TMP4, [OUTPUT_BUF], 16
430 sub v6.4s, v6.4s, v10.4s 399 sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
431 saddl v10.4s, ROW0R.4h, ROW4R.4h 400 add TMP1, TMP1, OUTPUT_COL
432 rshrn ROW2R.4h, v2.4s, #11 401 sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
433 rshrn ROW5R.4h, v6.4s, #11 402 add TMP2, TMP2, OUTPUT_COL
434 shl v10.4s, v10.4s, #13 403 sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
435 smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 404 add TMP3, TMP3, OUTPUT_COL
436 add v4.4s, v10.4s, v12.4s 405 sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
437 sub v2.4s, v10.4s, v12.4s 406 add TMP4, TMP4, OUTPUT_COL
438 add v12.4s, v4.4s, v14.4s 407 sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
439 sub v4.4s, v4.4s, v14.4s 408 ldp TMP5, TMP6, [OUTPUT_BUF], 16
440 add v10.4s, v2.4s, v8.4s 409 sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
441 sub v6.4s, v2.4s, v8.4s 410 ldp TMP7, TMP8, [OUTPUT_BUF], 16
442 rshrn ROW7R.4h, v4.4s, #11 411 sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
443 rshrn ROW3R.4h, v10.4s, #11 412 add TMP5, TMP5, OUTPUT_COL
444 rshrn ROW0R.4h, v12.4s, #11 413 add v16.16b, v28.16b, v0.16b
445 rshrn ROW4R.4h, v6.4s, #11 414 add TMP6, TMP6, OUTPUT_COL
446 /* Transpose right 4x8 half */ 415 add v18.16b, v29.16b, v0.16b
447 transpose ROW6R, ROW7R, v3, .16b, .4h 416 add TMP7, TMP7, OUTPUT_COL
448 transpose ROW2R, ROW3R, v3, .16b, .4h 417 add v20.16b, v30.16b, v0.16b
449 transpose ROW0R, ROW1R, v3, .16b, .4h 418 add TMP8, TMP8, OUTPUT_COL
450 transpose ROW4R, ROW5R, v3, .16b, .4h 419 add v22.16b, v31.16b, v0.16b
451 transpose ROW1R, ROW3R, v3, .16b, .2s 420
452 transpose ROW4R, ROW6R, v3, .16b, .2s 421 /* Transpose the final 8-bit samples */
453 transpose ROW0R, ROW2R, v3, .16b, .2s 422 trn1 v28.16b, v16.16b, v18.16b
454 transpose ROW5R, ROW7R, v3, .16b, .2s 423 trn1 v30.16b, v20.16b, v22.16b
455 424 trn2 v29.16b, v16.16b, v18.16b
456 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ 425 trn2 v31.16b, v20.16b, v22.16b
457 ld1 {v2.4h}, [x15] /* reload constants */ 426
458 smull v12.4S, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ 427 trn1 v16.8h, v28.8h, v30.8h
459 smlal v12.4s, ROW1L.4h, XFIX_1_175875602 428 trn2 v18.8h, v28.8h, v30.8h
460 smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ 429 trn1 v20.8h, v29.8h, v31.8h
461 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 430 trn2 v22.8h, v29.8h, v31.8h
462 smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ 431
463 smlal v14.4s, ROW3L.4h, XFIX_1_175875602 432 uzp1 v28.4s, v16.4s, v18.4s
464 smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */ 433 uzp2 v30.4s, v16.4s, v18.4s
465 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 434 uzp1 v29.4s, v20.4s, v22.4s
466 ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ 435 uzp2 v31.4s, v20.4s, v22.4s
467 smull v4.4s, ROW2L.4h, XFIX_0_541196100 436
468 smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */
469 mov v8.16b, v12.16b
470 smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
471 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
472 shl v6.4s, v6.4s, #13
473 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
474 add v2.4s, v6.4s, v4.4s
475 mov v10.16b, v14.16b
476 add v2.4s, v2.4s, v12.4s
477 smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
478 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
479 shrn ROW1L.4h, v2.4s, #16
480 sub v2.4s, v2.4s, v12.4s
481 smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
482 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
483 sub v2.4s, v2.4s, v12.4s
484 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
485 smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
486 sub v6.4s, v6.4s, v4.4s
487 shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
488 add v2.4s, v6.4s, v10.4s
489 sub v6.4s, v6.4s, v10.4s
490 saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
491 shrn ROW2L.4h, v2.4s, #16
492 shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
493 shl v10.4s, v10.4s, #13
494 smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
495 add v4.4s, v10.4s, v12.4s
496 sub v2.4s, v10.4s, v12.4s
497 add v12.4s, v4.4s, v14.4s
498 sub v4.4s, v4.4s, v14.4s
499 add v10.4s, v2.4s, v8.4s
500 sub v6.4s, v2.4s, v8.4s
501 shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
502 shrn ROW3L.4h, v10.4s, #16
503 shrn ROW0L.4h, v12.4s, #16
504 shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
505 /* 1-D IDCT, pass 2, right 4x8 half */
506 ld1 {v2.4h}, [x15] /* reload constants */
507 smull v12.4s, ROW5R.4h, XFIX_1_175875602
508 smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
509 smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
510 smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
511 smull v14.4s, ROW7R.4h, XFIX_1_175875602
512 smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
513 smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
514 smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
515 ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
516 smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
517 smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
518 mov v8.16b, v12.16b
519 smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
520 smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
521 shl v6.4s, v6.4s, #13
522 smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
523 add v2.4s, v6.4s, v4.4s
524 mov v10.16b, v14.16b
525 add v2.4s, v2.4s, v12.4s
526 smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
527 smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
528 shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
529 sub v2.4s, v2.4s, v12.4s
530 smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
531 smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
532 sub v2.4s, v2.4s, v12.4s
533 smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */
534 smlal v12.4s, ROW6R.4h, XFIX_0_541196100
535 sub v6.4s, v6.4s, v4.4s
536 shrn ROW6R.4h, v2.4s, #16
537 add v2.4s, v6.4s, v10.4s
538 sub v6.4s, v6.4s, v10.4s
539 saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
540 shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
541 shrn ROW5R.4h, v6.4s, #16
542 shl v10.4s, v10.4s, #13
543 smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
544 add v4.4s, v10.4s, v12.4s
545 sub v2.4s, v10.4s, v12.4s
546 add v12.4s, v4.4s, v14.4s
547 sub v4.4s, v4.4s, v14.4s
548 add v10.4s, v2.4s, v8.4s
549 sub v6.4s, v2.4s, v8.4s
550 shrn ROW7R.4h, v4.4s, #16
551 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
552 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
553 shrn ROW4R.4h, v6.4s, #16
554
555 2: /* Descale to 8-bit and range limit */
556 ins v16.2d[1], v17.2d[0]
557 ins v18.2d[1], v19.2d[0]
558 ins v20.2d[1], v21.2d[0]
559 ins v22.2d[1], v23.2d[0]
560 sqrshrn v16.8b, v16.8h, #2
561 sqrshrn2 v16.16b, v18.8h, #2
562 sqrshrn v18.8b, v20.8h, #2
563 sqrshrn2 v18.16b, v22.8h, #2
564
565 /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */
566 ld1 {v8.4h - v11.4h}, [sp], 32
567 ld1 {v12.4h - v15.4h}, [sp], 32
568 ins v24.2d[1], v25.2d[0]
569
570 sqrshrn v20.8b, v24.8h, #2
571 /* Transpose the final 8-bit samples and do signed->unsigned conversion */
572 /* trn1 v16.8h, v16.8h, v18.8h */
573 transpose v16, v18, v3, .16b, .8h
574 ins v26.2d[1], v27.2d[0]
575 ins v28.2d[1], v29.2d[0]
576 ins v30.2d[1], v31.2d[0]
577 sqrshrn2 v20.16b, v26.8h, #2
578 sqrshrn v22.8b, v28.8h, #2
579 movi v0.16b, #(CENTERJSAMPLE)
580 sqrshrn2 v22.16b, v30.8h, #2
581 transpose_single v16, v17, v3, .2d, .8b
582 transpose_single v18, v19, v3, .2d, .8b
583 add v16.8b, v16.8b, v0.8b
584 add v17.8b, v17.8b, v0.8b
585 add v18.8b, v18.8b, v0.8b
586 add v19.8b, v19.8b, v0.8b
587 transpose v20, v22, v3, .16b, .8h
588 /* Store results to the output buffer */ 437 /* Store results to the output buffer */
589 ldp TMP1, TMP2, [OUTPUT_BUF], 16 438 st1 {v28.d}[0], [TMP1]
590 add TMP1, TMP1, OUTPUT_COL 439 st1 {v29.d}[0], [TMP2]
591 add TMP2, TMP2, OUTPUT_COL 440 st1 {v28.d}[1], [TMP3]
592 st1 {v16.8b}, [TMP1] 441 st1 {v29.d}[1], [TMP4]
593 transpose_single v20, v21, v3, .2d, .8b 442 st1 {v30.d}[0], [TMP5]
594 st1 {v17.8b}, [TMP2] 443 st1 {v31.d}[0], [TMP6]
595 ldp TMP1, TMP2, [OUTPUT_BUF], 16 444 st1 {v30.d}[1], [TMP7]
596 add TMP1, TMP1, OUTPUT_COL 445 st1 {v31.d}[1], [TMP8]
597 add TMP2, TMP2, OUTPUT_COL 446 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
598 st1 {v18.8b}, [TMP1] 447 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
599 add v20.8b, v20.8b, v0.8b
600 add v21.8b, v21.8b, v0.8b
601 st1 {v19.8b}, [TMP2]
602 ldp TMP1, TMP2, [OUTPUT_BUF], 16
603 ldp TMP3, TMP4, [OUTPUT_BUF]
604 add TMP1, TMP1, OUTPUT_COL
605 add TMP2, TMP2, OUTPUT_COL
606 add TMP3, TMP3, OUTPUT_COL
607 add TMP4, TMP4, OUTPUT_COL
608 transpose_single v22, v23, v3, .2d, .8b
609 st1 {v20.8b}, [TMP1]
610 add v22.8b, v22.8b, v0.8b
611 add v23.8b, v23.8b, v0.8b
612 st1 {v21.8b}, [TMP2]
613 st1 {v22.8b}, [TMP3]
614 st1 {v23.8b}, [TMP4]
615 ldr x15, [sp], 16
616 ld1 {v0.8b - v3.8b}, [sp], 32
617 ld1 {v4.8b - v7.8b}, [sp], 32
618 ld1 {v8.8b - v11.8b}, [sp], 32
619 ld1 {v12.8b - v15.8b}, [sp], 32
620 ld1 {v16.8b - v19.8b}, [sp], 32
621 ld1 {v20.8b - v23.8b}, [sp], 32
622 ld1 {v24.8b - v27.8b}, [sp], 32
623 ld1 {v28.8b - v31.8b}, [sp], 32
624 blr x30 448 blr x30
625 449
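The epilogue above compresses the final descale into two shifts. A scalar sketch of the equivalent computation, assuming the shrn #16 / sqrshrn #(CONST_BITS+PASS1_BITS+3-16) split used in pass 2 (18 = 13+2+3 total bits dropped), followed by the signed-to-unsigned re-centering (descale_to_sample is an illustrative name):

    #include <stdint.h>

    #define CONST_BITS 13
    #define PASS1_BITS 2
    #define CENTERJSAMPLE 128

    static uint8_t descale_to_sample(int32_t v)
    {
        int16_t t = (int16_t)(v >> 16);      /* shrn #16: drop 16 of the 18 bits */
        int32_t r = ((int32_t)t + 2) >> 2;   /* sqrshrn #2: round off the rest   */
        if (r < -128) r = -128;              /* ...with saturation to int8       */
        if (r > 127) r = 127;
        return (uint8_t)(r + CENTERJSAMPLE); /* signed -> unsigned sample        */
    }

    int main(void)
    {
        /* 1 << 18 descales to 1, which re-centers to 129. */
        return descale_to_sample(1 << 18) == 129 ? 0 : 1;
    }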
626 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ 450 .balign 16
627 451 2:
628 /* Transpose left 4x8 half */ 452 mul v3.8h, v3.8h, v19.8h
629 transpose ROW6L, ROW7L, v3, .16b, .4h 453 mul v4.8h, v4.8h, v20.8h
630 transpose ROW2L, ROW3L, v3, .16b, .4h 454 mul v5.8h, v5.8h, v21.8h
631 transpose ROW0L, ROW1L, v3, .16b, .4h 455 add TMP4, xzr, TMP2, LSL #32
632 transpose ROW4L, ROW5L, v3, .16b, .4h 456 mul v6.8h, v6.8h, v22.8h
633 shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ 457 mul v7.8h, v7.8h, v23.8h
634 transpose ROW1L, ROW3L, v3, .16b, .2s 458 adds TMP3, xzr, TMP2, LSR #32
635 transpose ROW4L, ROW6L, v3, .16b, .2s 459 mul v8.8h, v8.8h, v24.8h
636 transpose ROW0L, ROW2L, v3, .16b, .2s 460 mul v9.8h, v9.8h, v25.8h
637 transpose ROW5L, ROW7L, v3, .16b, .2s 461 b.ne 3f
638 cmp x0, #0 462 /* Right AC coef is zero */
639 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ 463 dup v15.2d, v10.d[1]
640 464 /* Even part: reverse the even part of the forward DCT. */
641 /* Only row 0 is non-zero for the right 4x8 half */ 465 add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
642 dup ROW1R.4h, ROW0R.4h[1] 466 add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
643 dup ROW2R.4h, ROW0R.4h[2] 467 sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
644 dup ROW3R.4h, ROW0R.4h[3] 468 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
645 dup ROW4R.4h, ROW0R.4h[0] 469 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
646 dup ROW5R.4h, ROW0R.4h[1] 470 mov v20.16b, v18.16b /* tmp3 = z1 */
647 dup ROW6R.4h, ROW0R.4h[2] 471 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
648 dup ROW7R.4h, ROW0R.4h[3] 472 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
649 dup ROW0R.4h, ROW0R.4h[0] 473 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
650 b 1b /* Go to 'normal' second pass */ 474 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
651 475 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
652 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ 476 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
653 ld1 {v2.4h}, [x15] /* reload constants */ 477 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
654 smull v12.4s, ROW1L.4h, XFIX_1_175875602 478
655 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 479 /* Odd part per figure 8; the matrix is unitary and hence its
656 smull v14.4s, ROW3L.4h, XFIX_1_175875602 480 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
657 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 481 */
658 smull v4.4s, ROW2L.4h, XFIX_0_541196100 482
659 sshll v6.4s, ROW0L.4h, #13 483 add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
660 mov v8.16b, v12.16b 484 add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
661 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 485 add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
662 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 486 add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
663 add v2.4s, v6.4s, v4.4s 487 add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */
664 mov v10.16b, v14.16b 488
665 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 489 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
666 add v2.4s, v2.4s, v12.4s 490 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
667 add v12.4s, v12.4s, v12.4s 491 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
668 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 492 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
669 shrn ROW1L.4h, v2.4s, #16 493 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
670 sub v2.4s, v2.4s, v12.4s 494 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
671 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 495 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
672 sub v6.4s, v6.4s, v4.4s 496 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
673 shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ 497 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
674 add v2.4s, v6.4s, v10.4s 498
675 sub v6.4s, v6.4s, v10.4s 499 add v22.4s, v22.4s, v26.4s /* z3 += z5 */
676 sshll v10.4s, ROW0L.4h, #13 500 add v24.4s, v24.4s, v26.4s /* z4 += z5 */
677 shrn ROW2L.4h, v2.4s, #16 501
678 shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ 502 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
679 add v4.4s, v10.4s, v12.4s 503 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
680 sub v2.4s, v10.4s, v12.4s 504 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
681 add v12.4s, v4.4s, v14.4s 505 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
682 sub v4.4s, v4.4s, v14.4s 506
683 add v10.4s, v2.4s, v8.4s 507 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
684 sub v6.4s, v2.4s, v8.4s 508 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
685 shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ 509 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
686 shrn ROW3L.4h, v10.4s, #16 510 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
687 shrn ROW0L.4h, v12.4s, #16 511
688 shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ 512 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
689 /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ 513
690 ld1 {v2.4h}, [x15] /* reload constants */ 514 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
691 smull v12.4s, ROW5L.4h, XFIX_1_175875602 515 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
692 smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 516 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
693 smull v14.4s, ROW7L.4h, XFIX_1_175875602 517 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
694 smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 518 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
695 smull v4.4s, ROW6L.4h, XFIX_0_541196100 519 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
696 sshll v6.4s, ROW4L.4h, #13 520 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
697 mov v8.16b, v12.16b 521 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
698 smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 522
699 smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 523 rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
700 add v2.4s, v6.4s, v4.4s 524 rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
701 mov v10.16b, v14.16b 525 rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
702 smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 526 rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
703 add v2.4s, v2.4s, v12.4s 527 rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
704 add v12.4s, v12.4s, v12.4s 528 rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
705 smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 529 rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
706 shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ 530 rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
707 sub v2.4s, v2.4s, v12.4s 531 mov v6.16b, v15.16b
708 smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */ 533 mov v8.16b, v15.16b
709 sub v6.4s, v6.4s, v4.4s 533 mov v8.16b, v15.16b
710 shrn ROW6R.4h, v2.4s, #16 534 mov v9.16b, v15.16b
711 add v2.4s, v6.4s, v10.4s 535 b 1b
712 sub v6.4s, v6.4s, v10.4s 536
713 sshll v10.4s, ROW4L.4h, #13 537 .balign 16
714 shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ 538 3:
715 shrn ROW5R.4h, v6.4s, #16 539 cbnz TMP4, 4f
716 add v4.4s, v10.4s, v12.4s 540 /* Left AC coef is zero */
717 sub v2.4s, v10.4s, v12.4s 541 dup v14.2d, v10.d[0]
718 add v12.4s, v4.4s, v14.4s 542 /* Even part: reverse the even part of the forward DCT. */
719 sub v4.4s, v4.4s, v14.4s 543 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr [DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZ E*6]) */
720 add v10.4s, v2.4s, v8.4s 544 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr [DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZ E*4]) */
721 sub v6.4s, v2.4s, v8.4s 545 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
722 shrn ROW7R.4h, v4.4s, #16 546 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr [DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZ E*4]) */
723 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ 547 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
724 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ 548 mov v21.16b, v19.16b /* tmp3 = z1 */
725 shrn ROW4R.4h, v6.4s, #16 549 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY (z3, - FIX_1_847759065); */
726 b 2b /* Go to epilogue */ 550 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
551 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY (z2, FIX_0_765366865); */
552 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3 ; */
553 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3 ; */
554 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2 ; */
555 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2 ; */
556
557 /* Odd part per figure 8; the matrix is unitary and hence its
558 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
559 */
560
561 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
562 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
563 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
564 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
565 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
566
567 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
568 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
569 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
570 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
571 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
572 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
573 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
574 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
575 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
576
577 add v23.4s, v23.4s, v27.4s /* z3 += z5 */
578 add v22.4s, v22.4s, v26.4s /* z3 += z5 */
579 add v25.4s, v25.4s, v27.4s /* z4 += z5 */
580 add v24.4s, v24.4s, v26.4s /* z4 += z5 */
581
582 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
583 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
584 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
585 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
586
587 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
588 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
589 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
590 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
591
592 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
593
594 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
595 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
596 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
597 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
598 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
599 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
600 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
601 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
602
603 mov v2.16b, v14.16b
604 mov v3.16b, v14.16b
605 mov v4.16b, v14.16b
606 mov v5.16b, v14.16b
607 rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
608 rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
609 rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
610 rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
611 rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
612 rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
613 rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
614 rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
615 b 1b
616
617 .balign 16
618 4:
619 /* "No" AC coef is zero */
620 /* Even part: reverse the even part of the forward DCT. */
621 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
622 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
623 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
624 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
625 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
626 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
627 mov v21.16b, v19.16b /* tmp3 = z1 */
628 mov v20.16b, v18.16b /* tmp3 = z1 */
629 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
630 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
631 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
632 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
633 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
634 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
635 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
636 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3 ; */
637 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3 ; */
638 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2 ; */
639 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2 ; */
640 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3 ; */
641 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3 ; */
642 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2 ; */
643 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2 ; */
644
645 /* Odd part per figure 8; the matrix is unitary and hence its
646 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
647 */
648
649 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
650 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
651 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
652 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
653 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
654
655 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
656 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
657 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
658 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
659 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
660 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
661 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
662 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
663 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
664
665 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
666 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
667 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
668 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
669 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
670 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
671 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
672 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
673 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
674
675 add v23.4s, v23.4s, v27.4s /* z3 += z5 */
676 add v22.4s, v22.4s, v26.4s /* z3 += z5 */
677 add v25.4s, v25.4s, v27.4s /* z4 += z5 */
678 add v24.4s, v24.4s, v26.4s /* z4 += z5 */
679
680 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
681 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
682 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
683 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
684 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
685 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
686 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
687 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
688
689 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
690 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
691 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
692 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
693 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
694 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
695 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
696 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
697
698 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
699
700 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
701 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
702 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
703 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
704 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
705 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
706 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
707 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
708 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
709 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
710 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
711 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
712 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
713 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
714 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
715 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
716
717 rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
718 rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
719 rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
720 rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
721 rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
722 rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
723 rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
724 rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
725 rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
726 rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
727 rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
728 rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
729 rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
730 rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
731 rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
732 rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
733 b 1b
727 734
728 .unreq DCT_TABLE 735 .unreq DCT_TABLE
729 .unreq COEF_BLOCK 736 .unreq COEF_BLOCK
730 .unreq OUTPUT_BUF 737 .unreq OUTPUT_BUF
731 .unreq OUTPUT_COL 738 .unreq OUTPUT_COL
732 .unreq TMP1 739 .unreq TMP1
733 .unreq TMP2 740 .unreq TMP2
734 .unreq TMP3 741 .unreq TMP3
735 .unreq TMP4 742 .unreq TMP4
743 .unreq TMP5
744 .unreq TMP6
745 .unreq TMP7
746 .unreq TMP8
736 747
737 .unreq ROW0L 748 #undef CENTERJSAMPLE
738 .unreq ROW0R 749 #undef CONST_BITS
739 .unreq ROW1L 750 #undef PASS1_BITS
740 .unreq ROW1R 751 #undef XFIX_P_0_298
741 .unreq ROW2L 752 #undef XFIX_N_0_390
742 .unreq ROW2R 753 #undef XFIX_P_0_541
743 .unreq ROW3L 754 #undef XFIX_P_0_765
744 .unreq ROW3R 755 #undef XFIX_N_0_899
745 .unreq ROW4L 756 #undef XFIX_P_1_175
746 .unreq ROW4R 757 #undef XFIX_P_1_501
747 .unreq ROW5L 758 #undef XFIX_N_1_847
748 .unreq ROW5R 759 #undef XFIX_N_1_961
749 .unreq ROW6L 760 #undef XFIX_P_2_053
750 .unreq ROW6R 761 #undef XFIX_N_2_562
751 .unreq ROW7L 762 #undef XFIX_P_3_072
752 .unreq ROW7R
753 763
754 764
755 /*****************************************************************************/ 765 /*****************************************************************************/
756 766
757 /* 767 /*
758 * jsimd_idct_ifast_neon 768 * jsimd_idct_ifast_neon
759 * 769 *
760 * This function contains a fast, not so accurate integer implementation of 770 * This function contains a fast, not so accurate integer implementation of
761 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations 771 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
762 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' 772 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
763 * function from jidctfst.c 773 * function from jidctfst.c
764 * 774 *
765 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. 775 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
766 * But in the ARM NEON case some extra additions are required because the 776 * But in the ARM NEON case some extra additions are required because the
767 * VQDMULH instruction can't handle constants larger than 1. So expressions 777 * VQDMULH instruction can't handle constants larger than 1. So expressions
768 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", 778 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
769 * which introduces an extra addition. Overall, there are 6 extra additions 779 * which introduces an extra addition. Overall, there are 6 extra additions
770 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. 780 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
771 */ 781 */
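A rough C model of that trick (illustrative only: sqdmulh_lane and
mul_1_414213562 are made-up names, and saturation is ignored). SQDMULH
returns the high half of a doubled product, i.e. a multiply by a Q15
constant, which is why each coefficient is stored minus its integer part:

    #include <stdint.h>

    /* One 16-bit lane of "sqdmulh vd, vn, c": (2 * x * c) >> 16. */
    static int16_t sqdmulh_lane(int16_t x, int16_t c)
    {
        return (int16_t)(((int32_t)x * c * 2) >> 16);
    }

    /* x * 1.414213562 computed as x * 0.414213562 + x.  The table entry
     * (362 * 128 - 256 * 128) = 13568 is 0.4140625 in Q15, the fractional
     * part of 362/256 ~= 1.414213562 rounded to 8 bits. */
    static int16_t mul_1_414213562(int16_t x)
    {
        return (int16_t)(sqdmulh_lane(x, 362 * 128 - 256 * 128) + x);
    }

XFIX_2_613125930 is stored analogously as (669 * 128 - 512 * 128), i.e. the
coefficient minus 2, which is why the code below adds "x + x" before that
product.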
772 782
773 #define XFIX_1_082392200 v0.4h[0] 783 #define XFIX_1_082392200 v0.h[0]
774 #define XFIX_1_414213562 v0.4h[1] 784 #define XFIX_1_414213562 v0.h[1]
775 #define XFIX_1_847759065 v0.4h[2] 785 #define XFIX_1_847759065 v0.h[2]
776 #define XFIX_2_613125930 v0.4h[3] 786 #define XFIX_2_613125930 v0.h[3]
777 787
778 .balign 16 788 .balign 16
779 jsimd_idct_ifast_neon_consts: 789 Ljsimd_idct_ifast_neon_consts:
780 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ 790 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
781 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ 791 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
782 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ 792 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
783 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ 793 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
784 794
785 asm_function jsimd_idct_ifast_neon 795 asm_function jsimd_idct_ifast_neon
786 796
787 DCT_TABLE .req x0 797 DCT_TABLE .req x0
788 COEF_BLOCK .req x1 798 COEF_BLOCK .req x1
789 OUTPUT_BUF .req x2 799 OUTPUT_BUF .req x2
790 OUTPUT_COL .req x3 800 OUTPUT_COL .req x3
791 TMP1 .req x0 801 TMP1 .req x0
792 TMP2 .req x1 802 TMP2 .req x1
793 TMP3 .req x2 803 TMP3 .req x9
794 TMP4 .req x22 804 TMP4 .req x10
795 TMP5 .req x23 805 TMP5 .req x11
806 TMP6 .req x12
807 TMP7 .req x13
808 TMP8 .req x14
796 809
797 /* Load and dequantize coefficients into NEON registers 810 /* Load and dequantize coefficients into NEON registers
798 * with the following allocation: 811 * with the following allocation:
799 * 0 1 2 3 | 4 5 6 7 812 * 0 1 2 3 | 4 5 6 7
800 * ---------+-------- 813 * ---------+--------
801 * 0 | d16 | d17 ( v8.8h ) 814 * 0 | d16 | d17 ( v16.8h )
802 * 1 | d18 | d19 ( v9.8h ) 815 * 1 | d18 | d19 ( v17.8h )
803 * 2 | d20 | d21 ( v10.8h ) 816 * 2 | d20 | d21 ( v18.8h )
804 * 3 | d22 | d23 ( v11.8h ) 817 * 3 | d22 | d23 ( v19.8h )
805 * 4 | d24 | d25 ( v12.8h ) 818 * 4 | d24 | d25 ( v20.8h )
806 * 5 | d26 | d27 ( v13.8h ) 819 * 5 | d26 | d27 ( v21.8h )
807 * 6 | d28 | d29 ( v14.8h ) 820 * 6 | d28 | d29 ( v22.8h )
808 * 7 | d30 | d31 ( v15.8h ) 821 * 7 | d30 | d31 ( v23.8h )
809 */ 822 */
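/* Dequantization below is elementwise: each 8x16-bit "mul" computes
 * workspace[i] = coef_block[i] * quant_table[i] for one row, i.e.
 * DEQUANTIZE() from jidctfst.c. */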
810 /* Save NEON registers used in fast IDCT */ 823 /* Save NEON registers used in fast IDCT */
811 sub sp, sp, #176 824 adr TMP5, Ljsimd_idct_ifast_neon_consts
812 stp x22, x23, [sp], 16 825 ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32
813 adr x23, jsimd_idct_ifast_neon_consts
814 st1 {v0.8b - v3.8b}, [sp], 32
815 st1 {v4.8b - v7.8b}, [sp], 32
816 st1 {v8.8b - v11.8b}, [sp], 32
817 st1 {v12.8b - v15.8b}, [sp], 32
818 st1 {v16.8b - v19.8b}, [sp], 32
819 ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
820 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 826 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
821 ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32 827 ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32
822 mul v8.8h, v8.8h, v0.8h 828 mul v16.8h, v16.8h, v0.8h
823 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 829 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
824 mul v9.8h, v9.8h, v1.8h 830 mul v17.8h, v17.8h, v1.8h
825 ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32 831 ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32
826 mul v10.8h, v10.8h, v2.8h 832 mul v18.8h, v18.8h, v2.8h
827 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 833 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
828 mul v11.8h, v11.8h, v3.8h 834 mul v19.8h, v19.8h, v3.8h
829 ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32 835 ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32
830 mul v12.8h, v12.8h, v0.8h 836 mul v20.8h, v20.8h, v0.8h
831 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 837 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
832 mul v14.8h, v14.8h, v2.8h 838 mul v22.8h, v22.8h, v2.8h
833 mul v13.8h, v13.8h, v1.8h 839 mul v21.8h, v21.8h, v1.8h
834 ld1 {v0.4h}, [x23] /* load constants */ 840 ld1 {v0.4h}, [TMP5] /* load constants */
835 mul v15.8h, v15.8h, v3.8h 841 mul v23.8h, v23.8h, v3.8h
836 842
837 /* 1-D IDCT, pass 1 */ 843 /* 1-D IDCT, pass 1 */
838 sub v2.8h, v10.8h, v14.8h 844 sub v2.8h, v18.8h, v22.8h
839 add v14.8h, v10.8h, v14.8h 845 add v22.8h, v18.8h, v22.8h
840 sub v1.8h, v11.8h, v13.8h 846 sub v1.8h, v19.8h, v21.8h
841 add v13.8h, v11.8h, v13.8h 847 add v21.8h, v19.8h, v21.8h
842 sub v5.8h, v9.8h, v15.8h 848 sub v5.8h, v17.8h, v23.8h
843 add v15.8h, v9.8h, v15.8h 849 add v23.8h, v17.8h, v23.8h
844 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 850 sqdmulh v4.8h, v2.8h, XFIX_1_414213562
845 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 851 sqdmulh v6.8h, v1.8h, XFIX_2_613125930
846 add v3.8h, v1.8h, v1.8h 852 add v3.8h, v1.8h, v1.8h
847 sub v1.8h, v5.8h, v1.8h 853 sub v1.8h, v5.8h, v1.8h
848 add v10.8h, v2.8h, v4.8h 854 add v18.8h, v2.8h, v4.8h
849 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 855 sqdmulh v4.8h, v1.8h, XFIX_1_847759065
850 sub v2.8h, v15.8h, v13.8h 856 sub v2.8h, v23.8h, v21.8h
851 add v3.8h, v3.8h, v6.8h 857 add v3.8h, v3.8h, v6.8h
852 sqdmulh v6.8h, v2.8h, XFIX_1_414213562 858 sqdmulh v6.8h, v2.8h, XFIX_1_414213562
853 add v1.8h, v1.8h, v4.8h 859 add v1.8h, v1.8h, v4.8h
854 sqdmulh v4.8h, v5.8h, XFIX_1_082392200 860 sqdmulh v4.8h, v5.8h, XFIX_1_082392200
855 sub v10.8h, v10.8h, v14.8h 861 sub v18.8h, v18.8h, v22.8h
856 add v2.8h, v2.8h, v6.8h 862 add v2.8h, v2.8h, v6.8h
857 sub v6.8h, v8.8h, v12.8h 863 sub v6.8h, v16.8h, v20.8h
858 add v12.8h, v8.8h, v12.8h 864 add v20.8h, v16.8h, v20.8h
859 add v9.8h, v5.8h, v4.8h 865 add v17.8h, v5.8h, v4.8h
860 add v5.8h, v6.8h, v10.8h 866 add v5.8h, v6.8h, v18.8h
861 sub v10.8h, v6.8h, v10.8h 867 sub v18.8h, v6.8h, v18.8h
862 add v6.8h, v15.8h, v13.8h 868 add v6.8h, v23.8h, v21.8h
863 add v8.8h, v12.8h, v14.8h 869 add v16.8h, v20.8h, v22.8h
864 sub v3.8h, v6.8h, v3.8h 870 sub v3.8h, v6.8h, v3.8h
865 sub v12.8h, v12.8h, v14.8h 871 sub v20.8h, v20.8h, v22.8h
866 sub v3.8h, v3.8h, v1.8h 872 sub v3.8h, v3.8h, v1.8h
867 sub v1.8h, v9.8h, v1.8h 873 sub v1.8h, v17.8h, v1.8h
868 add v2.8h, v3.8h, v2.8h 874 add v2.8h, v3.8h, v2.8h
869 sub v15.8h, v8.8h, v6.8h 875 sub v23.8h, v16.8h, v6.8h
870 add v1.8h, v1.8h, v2.8h 876 add v1.8h, v1.8h, v2.8h
871 add v8.8h, v8.8h, v6.8h 877 add v16.8h, v16.8h, v6.8h
872 add v14.8h, v5.8h, v3.8h 878 add v22.8h, v5.8h, v3.8h
873 sub v9.8h, v5.8h, v3.8h 879 sub v17.8h, v5.8h, v3.8h
874 sub v13.8h, v10.8h, v2.8h 880 sub v21.8h, v18.8h, v2.8h
875 add v10.8h, v10.8h, v2.8h 881 add v18.8h, v18.8h, v2.8h
876 /* Transpose q8-q9 */ 882 sub v19.8h, v20.8h, v1.8h
877 mov v18.16b, v8.16b 883 add v20.8h, v20.8h, v1.8h
878 trn1 v8.8h, v8.8h, v9.8h 884 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
879 trn2 v9.8h, v18.8h, v9.8h 885 /* 1-D IDCT, pass 2 */
880 sub v11.8h, v12.8h, v1.8h 886 sub v2.8h, v18.8h, v22.8h
881 /* Transpose q14-q15 */ 887 add v22.8h, v18.8h, v22.8h
882 mov v18.16b, v14.16b 888 sub v1.8h, v19.8h, v21.8h
883 trn1 v14.8h, v14.8h, v15.8h 889 add v21.8h, v19.8h, v21.8h
884 trn2 v15.8h, v18.8h, v15.8h 890 sub v5.8h, v17.8h, v23.8h
885 add v12.8h, v12.8h, v1.8h 891 add v23.8h, v17.8h, v23.8h
886 /* Transpose q10-q11 */ 892 sqdmulh v4.8h, v2.8h, XFIX_1_414213562
887 mov v18.16b, v10.16b 893 sqdmulh v6.8h, v1.8h, XFIX_2_613125930
888 trn1 v10.8h, v10.8h, v11.8h 894 add v3.8h, v1.8h, v1.8h
889 trn2 v11.8h, v18.8h, v11.8h 895 sub v1.8h, v5.8h, v1.8h
890 /* Transpose q12-q13 */ 896 add v18.8h, v2.8h, v4.8h
891 mov v18.16b, v12.16b 897 sqdmulh v4.8h, v1.8h, XFIX_1_847759065
892 trn1 v12.8h, v12.8h, v13.8h 898 sub v2.8h, v23.8h, v21.8h
893 trn2 v13.8h, v18.8h, v13.8h 899 add v3.8h, v3.8h, v6.8h
894 /* Transpose q9-q11 */ 900 sqdmulh v6.8h, v2.8h, XFIX_1_414213562
895 mov v18.16b, v9.16b 901 add v1.8h, v1.8h, v4.8h
896 trn1 v9.4s, v9.4s, v11.4s 902 sqdmulh v4.8h, v5.8h, XFIX_1_082392200
897 trn2 v11.4s, v18.4s, v11.4s 903 sub v18.8h, v18.8h, v22.8h
898 /* Transpose q12-q14 */ 904 add v2.8h, v2.8h, v6.8h
899 mov v18.16b, v12.16b 905 sub v6.8h, v16.8h, v20.8h
900 trn1 v12.4s, v12.4s, v14.4s 906 add v20.8h, v16.8h, v20.8h
901 trn2 v14.4s, v18.4s, v14.4s 907 add v17.8h, v5.8h, v4.8h
902 /* Transpose q8-q10 */ 908 add v5.8h, v6.8h, v18.8h
903 mov v18.16b, v8.16b 909 sub v18.8h, v6.8h, v18.8h
904 trn1 v8.4s, v8.4s, v10.4s 910 add v6.8h, v23.8h, v21.8h
905 trn2 v10.4s, v18.4s, v10.4s 911 add v16.8h, v20.8h, v22.8h
906 /* Transpose q13-q15 */ 912 sub v3.8h, v6.8h, v3.8h
907 mov v18.16b, v13.16b 913 sub v20.8h, v20.8h, v22.8h
908 trn1 v13.4s, v13.4s, v15.4s 914 sub v3.8h, v3.8h, v1.8h
909 trn2 v15.4s, v18.4s, v15.4s 915 sub v1.8h, v17.8h, v1.8h
910 /* vswp v14.4h, v10-MSB.4h */ 916 add v2.8h, v3.8h, v2.8h
911 umov x22, v14.d[0] 917 sub v23.8h, v16.8h, v6.8h
912 ins v14.2d[0], v10.2d[1] 918 add v1.8h, v1.8h, v2.8h
913 ins v10.2d[1], x22 919 add v16.8h, v16.8h, v6.8h
914 /* vswp v13.4h, v9MSB.4h */ 920 add v22.8h, v5.8h, v3.8h
921 sub v17.8h, v5.8h, v3.8h
922 sub v21.8h, v18.8h, v2.8h
923 add v18.8h, v18.8h, v2.8h
924 sub v19.8h, v20.8h, v1.8h
925 add v20.8h, v20.8h, v1.8h
926 /* Descale to 8-bit and range limit */
927 movi v0.16b, #0x80
928 /* Prepare pointers (dual-issue with NEON instructions) */
929 ldp TMP1, TMP2, [OUTPUT_BUF], 16
930 sqshrn v28.8b, v16.8h, #5
931 ldp TMP3, TMP4, [OUTPUT_BUF], 16
932 sqshrn v29.8b, v17.8h, #5
933 add TMP1, TMP1, OUTPUT_COL
934 sqshrn v30.8b, v18.8h, #5
935 add TMP2, TMP2, OUTPUT_COL
936 sqshrn v31.8b, v19.8h, #5
937 add TMP3, TMP3, OUTPUT_COL
938 sqshrn2 v28.16b, v20.8h, #5
939 add TMP4, TMP4, OUTPUT_COL
940 sqshrn2 v29.16b, v21.8h, #5
941 ldp TMP5, TMP6, [OUTPUT_BUF], 16
942 sqshrn2 v30.16b, v22.8h, #5
943 ldp TMP7, TMP8, [OUTPUT_BUF], 16
944 sqshrn2 v31.16b, v23.8h, #5
945 add TMP5, TMP5, OUTPUT_COL
946 add v16.16b, v28.16b, v0.16b
947 add TMP6, TMP6, OUTPUT_COL
948 add v18.16b, v29.16b, v0.16b
949 add TMP7, TMP7, OUTPUT_COL
950 add v20.16b, v30.16b, v0.16b
951 add TMP8, TMP8, OUTPUT_COL
952 add v22.16b, v31.16b, v0.16b
915 953
916 umov x22, v13.d[0]
917 ins v13.2d[0], v9.2d[1]
918 ins v9.2d[1], x22
919 /* 1-D IDCT, pass 2 */
920 sub v2.8h, v10.8h, v14.8h
921 /* vswp v15.4h, v11MSB.4h */
922 umov x22, v15.d[0]
923 ins v15.2d[0], v11.2d[1]
924 ins v11.2d[1], x22
925 add v14.8h, v10.8h, v14.8h
926 /* vswp v12.4h, v8-MSB.4h */
927 umov x22, v12.d[0]
928 ins v12.2d[0], v8.2d[1]
929 ins v8.2d[1], x22
930 sub v1.8h, v11.8h, v13.8h
931 add v13.8h, v11.8h, v13.8h
932 sub v5.8h, v9.8h, v15.8h
933 add v15.8h, v9.8h, v15.8h
934 sqdmulh v4.8h, v2.8h, XFIX_1_414213562
935 sqdmulh v6.8h, v1.8h, XFIX_2_613125930
936 add v3.8h, v1.8h, v1.8h
937 sub v1.8h, v5.8h, v1.8h
938 add v10.8h, v2.8h, v4.8h
939 sqdmulh v4.8h, v1.8h, XFIX_1_847759065
940 sub v2.8h, v15.8h, v13.8h
941 add v3.8h, v3.8h, v6.8h
942 sqdmulh v6.8h, v2.8h, XFIX_1_414213562
943 add v1.8h, v1.8h, v4.8h
944 sqdmulh v4.8h, v5.8h, XFIX_1_082392200
945 sub v10.8h, v10.8h, v14.8h
946 add v2.8h, v2.8h, v6.8h
947 sub v6.8h, v8.8h, v12.8h
948 add v12.8h, v8.8h, v12.8h
949 add v9.8h, v5.8h, v4.8h
950 add v5.8h, v6.8h, v10.8h
951 sub v10.8h, v6.8h, v10.8h
952 add v6.8h, v15.8h, v13.8h
953 add v8.8h, v12.8h, v14.8h
954 sub v3.8h, v6.8h, v3.8h
955 sub v12.8h, v12.8h, v14.8h
956 sub v3.8h, v3.8h, v1.8h
957 sub v1.8h, v9.8h, v1.8h
958 add v2.8h, v3.8h, v2.8h
959 sub v15.8h, v8.8h, v6.8h
960 add v1.8h, v1.8h, v2.8h
961 add v8.8h, v8.8h, v6.8h
962 add v14.8h, v5.8h, v3.8h
963 sub v9.8h, v5.8h, v3.8h
964 sub v13.8h, v10.8h, v2.8h
965 add v10.8h, v10.8h, v2.8h
966 sub v11.8h, v12.8h, v1.8h
967 add v12.8h, v12.8h, v1.8h
968 /* Descale to 8-bit and range limit */
969 movi v0.16b, #0x80
970 sqshrn v8.8b, v8.8h, #5
971 sqshrn2 v8.16b, v9.8h, #5
972 sqshrn v9.8b, v10.8h, #5
973 sqshrn2 v9.16b, v11.8h, #5
974 sqshrn v10.8b, v12.8h, #5
975 sqshrn2 v10.16b, v13.8h, #5
976 sqshrn v11.8b, v14.8h, #5
977 sqshrn2 v11.16b, v15.8h, #5
978 add v8.16b, v8.16b, v0.16b
979 add v9.16b, v9.16b, v0.16b
980 add v10.16b, v10.16b, v0.16b
981 add v11.16b, v11.16b, v0.16b
982 /* Transpose the final 8-bit samples */ 954 /* Transpose the final 8-bit samples */
983 /* Transpose q8-q9 */ 955 trn1 v28.16b, v16.16b, v18.16b
984 mov v18.16b, v8.16b 956 trn1 v30.16b, v20.16b, v22.16b
985 trn1 v8.8h, v8.8h, v9.8h 957 trn2 v29.16b, v16.16b, v18.16b
986 trn2 v9.8h, v18.8h, v9.8h 958 trn2 v31.16b, v20.16b, v22.16b
987 /* Transpose q10-q11 */ 959
988 mov v18.16b, v10.16b 960 trn1 v16.8h, v28.8h, v30.8h
989 trn1 v10.8h, v10.8h, v11.8h 961 trn2 v18.8h, v28.8h, v30.8h
990 trn2 v11.8h, v18.8h, v11.8h 962 trn1 v20.8h, v29.8h, v31.8h
991 /* Transpose q8-q10 */ 963 trn2 v22.8h, v29.8h, v31.8h
992 mov v18.16b, v8.16b 964
993 trn1 v8.4s, v8.4s, v10.4s 965 uzp1 v28.4s, v16.4s, v18.4s
994 trn2 v10.4s, v18.4s, v10.4s 966 uzp2 v30.4s, v16.4s, v18.4s
995 /* Transpose q9-q11 */ 967 uzp1 v29.4s, v20.4s, v22.4s
996 mov v18.16b, v9.16b 968 uzp2 v31.4s, v20.4s, v22.4s
997 trn1 v9.4s, v9.4s, v11.4s 969
998 trn2 v11.4s, v18.4s, v11.4s
999 /* make copy */
1000 ins v17.2d[0], v8.2d[1]
1001 /* Transpose d16-d17-msb */
1002 mov v18.16b, v8.16b
1003 trn1 v8.8b, v8.8b, v17.8b
1004 trn2 v17.8b, v18.8b, v17.8b
1005 /* make copy */
1006 ins v19.2d[0], v9.2d[1]
1007 mov v18.16b, v9.16b
1008 trn1 v9.8b, v9.8b, v19.8b
1009 trn2 v19.8b, v18.8b, v19.8b
1010 /* Store results to the output buffer */ 970 /* Store results to the output buffer */
1011 ldp TMP1, TMP2, [OUTPUT_BUF], 16 971 st1 {v28.d}[0], [TMP1]
1012 add TMP1, TMP1, OUTPUT_COL 972 st1 {v29.d}[0], [TMP2]
1013 add TMP2, TMP2, OUTPUT_COL 973 st1 {v28.d}[1], [TMP3]
1014 st1 {v8.8b}, [TMP1] 974 st1 {v29.d}[1], [TMP4]
1015 st1 {v17.8b}, [TMP2] 975 st1 {v30.d}[0], [TMP5]
1016 ldp TMP1, TMP2, [OUTPUT_BUF], 16 976 st1 {v31.d}[0], [TMP6]
1017 add TMP1, TMP1, OUTPUT_COL 977 st1 {v30.d}[1], [TMP7]
1018 add TMP2, TMP2, OUTPUT_COL 978 st1 {v31.d}[1], [TMP8]
1019 st1 {v9.8b}, [TMP1]
1020 /* make copy */
1021 ins v7.2d[0], v10.2d[1]
1022 mov v18.16b, v10.16b
1023 trn1 v10.8b, v10.8b, v7.8b
1024 trn2 v7.8b, v18.8b, v7.8b
1025 st1 {v19.8b}, [TMP2]
1026 ldp TMP1, TMP2, [OUTPUT_BUF], 16
1027 ldp TMP4, TMP5, [OUTPUT_BUF], 16
1028 add TMP1, TMP1, OUTPUT_COL
1029 add TMP2, TMP2, OUTPUT_COL
1030 add TMP4, TMP4, OUTPUT_COL
1031 add TMP5, TMP5, OUTPUT_COL
1032 st1 {v10.8b}, [TMP1]
1033 /* make copy */
1034 ins v16.2d[0], v11.2d[1]
1035 mov v18.16b, v11.16b
1036 trn1 v11.8b, v11.8b, v16.8b
1037 trn2 v16.8b, v18.8b, v16.8b
1038 st1 {v7.8b}, [TMP2]
1039 st1 {v11.8b}, [TMP4]
1040 st1 {v16.8b}, [TMP5]
1041 sub sp, sp, #176
1042 ldp x22, x23, [sp], 16
1043 ld1 {v0.8b - v3.8b}, [sp], 32
1044 ld1 {v4.8b - v7.8b}, [sp], 32
1045 ld1 {v8.8b - v11.8b}, [sp], 32
1046 ld1 {v12.8b - v15.8b}, [sp], 32
1047 ld1 {v16.8b - v19.8b}, [sp], 32
1048 blr x30 979 blr x30
1049 980
1050 .unreq DCT_TABLE 981 .unreq DCT_TABLE
1051 .unreq COEF_BLOCK 982 .unreq COEF_BLOCK
1052 .unreq OUTPUT_BUF 983 .unreq OUTPUT_BUF
1053 .unreq OUTPUT_COL 984 .unreq OUTPUT_COL
1054 .unreq TMP1 985 .unreq TMP1
1055 .unreq TMP2 986 .unreq TMP2
1056 .unreq TMP3 987 .unreq TMP3
1057 .unreq TMP4 988 .unreq TMP4
989 .unreq TMP5
990 .unreq TMP6
991 .unreq TMP7
992 .unreq TMP8
1058 993
1059 994
1060 /*****************************************************************************/ 995 /*****************************************************************************/
1061 996
1062 /* 997 /*
1063 * jsimd_idct_4x4_neon 998 * jsimd_idct_4x4_neon
1064 * 999 *
1065 * This function contains inverse-DCT code for getting reduced-size 1000 * This function contains inverse-DCT code for getting reduced-size
1066 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations 1001 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
1067 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' 1002 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
1068 * function from jpeg-6b (jidctred.c). 1003 * function from jpeg-6b (jidctred.c).
1069 * 1004 *
1070 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which 1005 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
1071 * requires far fewer arithmetic operations and hence should be faster. 1006 * requires far fewer arithmetic operations and hence should be faster.
1072 * The primary purpose of this particular NEON optimized function is 1007 * The primary purpose of this particular NEON optimized function is
1073 * bit exact compatibility with jpeg-6b. 1008 * bit exact compatibility with jpeg-6b.
1074 * 1009 *
1075 * TODO: slightly better instruction scheduling can be achieved by expanding 1010 * TODO: slightly better instruction scheduling can be achieved by expanding
1076 * idct_helper/transpose_4x4 macros and reordering instructions, 1011 * idct_helper/transpose_4x4 macros and reordering instructions,
1077 * but readability will suffer somewhat. 1012 * but readability will suffer somewhat.
1078 */ 1013 */
1079 1014
1080 #define CONST_BITS 13 1015 #define CONST_BITS 13
1081 1016
1082 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */ 1017 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */
1083 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */ 1018 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */
1084 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */ 1019 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */
1085 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */ 1020 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */
1086 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */ 1021 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */
1087 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */ 1022 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */
1088 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */ 1023 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */
1089 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ 1024 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */
1090 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ 1025 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */
1091 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ 1026 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */
1092 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ 1027 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */
1093 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ 1028 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */
1094 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ 1029 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */
1095 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ 1030 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */
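These values are the usual libjpeg fixed-point encoding with CONST_BITS = 13
fractional bits; a minimal sketch of the generator macro (as in jidctred.c):

    /* FIX(x) = round(x * 2^13); e.g. FIX(0.211164243) = round(1729.86) = 1730
     * and FIX(3.624509785) = round(29691.98) = 29692, matching the table. */
    #define FIX(x)  ((int)((x) * (1 << CONST_BITS) + 0.5))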
1096 1031
1097 .balign 16 1032 .balign 16
1098 jsimd_idct_4x4_neon_consts: 1033 Ljsimd_idct_4x4_neon_consts:
1099 .short FIX_1_847759065 /* v0.4h[0] */ 1034 .short FIX_1_847759065 /* v0.h[0] */
1100 .short -FIX_0_765366865 /* v0.4h[1] */ 1035 .short -FIX_0_765366865 /* v0.h[1] */
1101 .short -FIX_0_211164243 /* v0.4h[2] */ 1036 .short -FIX_0_211164243 /* v0.h[2] */
1102 .short FIX_1_451774981 /* v0.4h[3] */ 1037 .short FIX_1_451774981 /* v0.h[3] */
1103 .short -FIX_2_172734803 /* d1[0] */ 1038 .short -FIX_2_172734803 /* d1[0] */
1104 .short FIX_1_061594337 /* d1[1] */ 1039 .short FIX_1_061594337 /* d1[1] */
1105 .short -FIX_0_509795579 /* d1[2] */ 1040 .short -FIX_0_509795579 /* d1[2] */
1106 .short -FIX_0_601344887 /* d1[3] */ 1041 .short -FIX_0_601344887 /* d1[3] */
1107 .short FIX_0_899976223 /* v2.4h[0] */ 1042 .short FIX_0_899976223 /* v2.h[0] */
1108 .short FIX_2_562915447 /* v2.4h[1] */ 1043 .short FIX_2_562915447 /* v2.h[1] */
1109 .short 1 << (CONST_BITS+1) /* v2.4h[2] */ 1044 .short 1 << (CONST_BITS+1) /* v2.h[2] */
1110 .short 0 /* v2.4h[3] */ 1045 .short 0 /* v2.h[3] */
1111 1046
1112 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 1047 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
1113 smull v28.4s, \x4, v2.4h[2] 1048 smull v28.4s, \x4, v2.h[2]
1114 smlal v28.4s, \x8, v0.4h[0] 1049 smlal v28.4s, \x8, v0.h[0]
1115 smlal v28.4s, \x14, v0.4h[1] 1050 smlal v28.4s, \x14, v0.h[1]
1116 1051
1117 smull v26.4s, \x16, v1.4h[2] 1052 smull v26.4s, \x16, v1.h[2]
1118 smlal v26.4s, \x12, v1.4h[3] 1053 smlal v26.4s, \x12, v1.h[3]
1119 smlal v26.4s, \x10, v2.4h[0] 1054 smlal v26.4s, \x10, v2.h[0]
1120 smlal v26.4s, \x6, v2.4h[1] 1055 smlal v26.4s, \x6, v2.h[1]
1121 1056
1122 smull v30.4s, \x4, v2.4h[2] 1057 smull v30.4s, \x4, v2.h[2]
1123 smlsl v30.4s, \x8, v0.4h[0] 1058 smlsl v30.4s, \x8, v0.h[0]
1124 smlsl v30.4s, \x14, v0.4h[1] 1059 smlsl v30.4s, \x14, v0.h[1]
1125 1060
1126 smull v24.4s, \x16, v0.4h[2] 1061 smull v24.4s, \x16, v0.h[2]
1127 smlal v24.4s, \x12, v0.4h[3] 1062 smlal v24.4s, \x12, v0.h[3]
1128 smlal v24.4s, \x10, v1.4h[0] 1063 smlal v24.4s, \x10, v1.h[0]
1129 smlal v24.4s, \x6, v1.4h[1] 1064 smlal v24.4s, \x6, v1.h[1]
1130 1065
1131 add v20.4s, v28.4s, v26.4s 1066 add v20.4s, v28.4s, v26.4s
1132 sub v28.4s, v28.4s, v26.4s 1067 sub v28.4s, v28.4s, v26.4s
1133 1068
1134 .if \shift > 16 1069 .if \shift > 16
1135 srshr v20.4s, v20.4s, #\shift 1070 srshr v20.4s, v20.4s, #\shift
1136 srshr v28.4s, v28.4s, #\shift 1071 srshr v28.4s, v28.4s, #\shift
1137 xtn \y26, v20.4s 1072 xtn \y26, v20.4s
1138 xtn \y29, v28.4s 1073 xtn \y29, v28.4s
1139 .else 1074 .else
1140 rshrn \y26, v20.4s, #\shift 1075 rshrn \y26, v20.4s, #\shift
1141 rshrn \y29, v28.4s, #\shift 1076 rshrn \y29, v28.4s, #\shift
1142 .endif 1077 .endif
1143 1078
1144 add v20.4s, v30.4s, v24.4s 1079 add v20.4s, v30.4s, v24.4s
1145 sub v30.4s, v30.4s, v24.4s 1080 sub v30.4s, v30.4s, v24.4s
1146 1081
1147 .if \shift > 16 1082 .if \shift > 16
1148 srshr v20.4s, v20.4s, #\shift 1083 srshr v20.4s, v20.4s, #\shift
1149 srshr v30.4s, v30.4s, #\shift 1084 srshr v30.4s, v30.4s, #\shift
1150 xtn \y27, v20.4s 1085 xtn \y27, v20.4s
1151 xtn \y28, v30.4s 1086 xtn \y28, v30.4s
1152 .else 1087 .else
1153 rshrn \y27, v20.4s, #\shift 1088 rshrn \y27, v20.4s, #\shift
1154 rshrn \y28, v30.4s, #\shift 1089 rshrn \y28, v30.4s, #\shift
1155 .endif 1090 .endif
1156
1157 .endm 1091 .endm
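/* Note: the ".if \shift > 16" split in idct_helper exists because rshrn
 * narrowing into 16-bit lanes only accepts immediate shifts of 1..16, so
 * the larger pass-2 descale shifts (19 here, 20 in the 2x2 code below)
 * must use srshr followed by xtn instead. */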
1158 1092
1159 asm_function jsimd_idct_4x4_neon 1093 asm_function jsimd_idct_4x4_neon
1160 1094
1161 DCT_TABLE .req x0 1095 DCT_TABLE .req x0
1162 COEF_BLOCK .req x1 1096 COEF_BLOCK .req x1
1163 OUTPUT_BUF .req x2 1097 OUTPUT_BUF .req x2
1164 OUTPUT_COL .req x3 1098 OUTPUT_COL .req x3
1165 TMP1 .req x0 1099 TMP1 .req x0
1166 TMP2 .req x1 1100 TMP2 .req x1
1167 TMP3 .req x2 1101 TMP3 .req x2
1168 TMP4 .req x15 1102 TMP4 .req x15
1169 1103
1170 /* Save all used NEON registers */ 1104 /* Save all used NEON registers */
1171 sub sp, sp, 272 1105 sub sp, sp, 272
1172 str x15, [sp], 16 1106 str x15, [sp], 16
1173 /* Load constants (v3.4h is just used for padding) */ 1107 /* Load constants (v3.4h is just used for padding) */
1174 adr TMP4, jsimd_idct_4x4_neon_consts 1108 adr TMP4, Ljsimd_idct_4x4_neon_consts
1175 st1 {v0.8b - v3.8b}, [sp], 32 1109 st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
1176 st1 {v4.8b - v7.8b}, [sp], 32 1110 st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
1177 st1 {v8.8b - v11.8b}, [sp], 32 1111 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1178 st1 {v12.8b - v15.8b}, [sp], 32 1112 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1179 st1 {v16.8b - v19.8b}, [sp], 32 1113 st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
1180 st1 {v20.8b - v23.8b}, [sp], 32 1114 st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
1181 st1 {v24.8b - v27.8b}, [sp], 32 1115 st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
1182 st1 {v28.8b - v31.8b}, [sp], 32 1116 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
1183 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] 1117 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
1184 1118
1185 /* Load all COEF_BLOCK into NEON registers with the following allocation: 1119 /* Load all COEF_BLOCK into NEON registers with the following allocation:
1186 * 0 1 2 3 | 4 5 6 7 1120 * 0 1 2 3 | 4 5 6 7
1187 * ---------+-------- 1121 * ---------+--------
1188 * 0 | v4.4h | v5.4h 1122 * 0 | v4.4h | v5.4h
1189 * 1 | v6.4h | v7.4h 1123 * 1 | v6.4h | v7.4h
1190 * 2 | v8.4h | v9.4h 1124 * 2 | v8.4h | v9.4h
1191 * 3 | v10.4h | v11.4h 1125 * 3 | v10.4h | v11.4h
1192 * 4 | - | - 1126 * 4 | - | -
1193 * 5 | v12.4h | v13.4h 1127 * 5 | v12.4h | v13.4h
1194 * 6 | v14.4h | v15.4h 1128 * 6 | v14.4h | v15.4h
1195 * 7 | v16.4h | v17.4h 1129 * 7 | v16.4h | v17.4h
1196 */ 1130 */
1197 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 1131 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1198 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 1132 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
1199 add COEF_BLOCK, COEF_BLOCK, #16 1133 add COEF_BLOCK, COEF_BLOCK, #16
1200 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 1134 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
1201 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 1135 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
1202 /* dequantize */ 1136 /* dequantize */
1203 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 1137 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1204 mul v4.4h, v4.4h, v18.4h 1138 mul v4.4h, v4.4h, v18.4h
1205 mul v5.4h, v5.4h, v19.4h 1139 mul v5.4h, v5.4h, v19.4h
1206 ins v4.2d[1], v5.2d[0] /* 128 bit q4 */ 1140 ins v4.d[1], v5.d[0] /* 128 bit q4 */
1207 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 1141 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
1208 mul v6.4h, v6.4h, v20.4h 1142 mul v6.4h, v6.4h, v20.4h
1209 mul v7.4h, v7.4h, v21.4h 1143 mul v7.4h, v7.4h, v21.4h
1210 ins v6.2d[1], v7.2d[0] /* 128 bit q6 */ 1144 ins v6.d[1], v7.d[0] /* 128 bit q6 */
1211 mul v8.4h, v8.4h, v22.4h 1145 mul v8.4h, v8.4h, v22.4h
1212 mul v9.4h, v9.4h, v23.4h 1146 mul v9.4h, v9.4h, v23.4h
1213 ins v8.2d[1], v9.2d[0] /* 128 bit q8 */ 1147 ins v8.d[1], v9.d[0] /* 128 bit q8 */
1214 add DCT_TABLE, DCT_TABLE, #16 1148 add DCT_TABLE, DCT_TABLE, #16
1215 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 1149 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
1216 mul v10.4h, v10.4h, v24.4h 1150 mul v10.4h, v10.4h, v24.4h
1217 mul v11.4h, v11.4h, v25.4h 1151 mul v11.4h, v11.4h, v25.4h
1218 ins v10.2d[1], v11.2d[0] /* 128 bit q10 */ 1152 ins v10.d[1], v11.d[0] /* 128 bit q10 */
1219 mul v12.4h, v12.4h, v26.4h 1153 mul v12.4h, v12.4h, v26.4h
1220 mul v13.4h, v13.4h, v27.4h 1154 mul v13.4h, v13.4h, v27.4h
1221 ins v12.2d[1], v13.2d[0] /* 128 bit q12 */ 1155 ins v12.d[1], v13.d[0] /* 128 bit q12 */
1222 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 1156 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
1223 mul v14.4h, v14.4h, v28.4h 1157 mul v14.4h, v14.4h, v28.4h
1224 mul v15.4h, v15.4h, v29.4h 1158 mul v15.4h, v15.4h, v29.4h
1225 ins v14.2d[1], v15.2d[0] /* 128 bit q14 */ 1159 ins v14.d[1], v15.d[0] /* 128 bit q14 */
1226 mul v16.4h, v16.4h, v30.4h 1160 mul v16.4h, v16.4h, v30.4h
1227 mul v17.4h, v17.4h, v31.4h 1161 mul v17.4h, v17.4h, v31.4h
1228 ins v16.2d[1], v17.2d[0] /* 128 bit q16 */ 1162 ins v16.d[1], v17.d[0] /* 128 bit q16 */
1229 1163
1230 /* Pass 1 */ 1164 /* Pass 1 */
1231 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h 1165 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
1166 v4.4h, v6.4h, v8.4h, v10.4h
1232 transpose_4x4 v4, v6, v8, v10, v3 1167 transpose_4x4 v4, v6, v8, v10, v3
1233 ins v10.2d[1], v11.2d[0] 1168 ins v10.d[1], v11.d[0]
1234 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h 1169 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
1170 v5.4h, v7.4h, v9.4h, v11.4h
1235 transpose_4x4 v5, v7, v9, v11, v3 1171 transpose_4x4 v5, v7, v9, v11, v3
1236 ins v10.2d[1], v11.2d[0] 1172 ins v10.d[1], v11.d[0]
1173
1237 /* Pass 2 */ 1174 /* Pass 2 */
1238 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h 1175 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
1176 v26.4h, v27.4h, v28.4h, v29.4h
1239 transpose_4x4 v26, v27, v28, v29, v3 1177 transpose_4x4 v26, v27, v28, v29, v3
1240 1178
1241 /* Range limit */ 1179 /* Range limit */
1242 movi v30.8h, #0x80 1180 movi v30.8h, #0x80
1243 ins v26.2d[1], v27.2d[0] 1181 ins v26.d[1], v27.d[0]
1244 ins v28.2d[1], v29.2d[0] 1182 ins v28.d[1], v29.d[0]
1245 add v26.8h, v26.8h, v30.8h 1183 add v26.8h, v26.8h, v30.8h
1246 add v28.8h, v28.8h, v30.8h 1184 add v28.8h, v28.8h, v30.8h
1247 sqxtun v26.8b, v26.8h 1185 sqxtun v26.8b, v26.8h
1248 sqxtun v27.8b, v28.8h 1186 sqxtun v27.8b, v28.8h
1249 1187
1250 /* Store results to the output buffer */ 1188 /* Store results to the output buffer */
1251 ldp TMP1, TMP2, [OUTPUT_BUF], 16 1189 ldp TMP1, TMP2, [OUTPUT_BUF], 16
1252 ldp TMP3, TMP4, [OUTPUT_BUF] 1190 ldp TMP3, TMP4, [OUTPUT_BUF]
1253 add TMP1, TMP1, OUTPUT_COL 1191 add TMP1, TMP1, OUTPUT_COL
1254 add TMP2, TMP2, OUTPUT_COL 1192 add TMP2, TMP2, OUTPUT_COL
(...skipping 24 matching lines...)
1279 st1 {v27.b}[5], [TMP4], 1 1217 st1 {v27.b}[5], [TMP4], 1
1280 st1 {v26.b}[6], [TMP2], 1 1218 st1 {v26.b}[6], [TMP2], 1
1281 st1 {v27.b}[6], [TMP4], 1 1219 st1 {v27.b}[6], [TMP4], 1
1282 st1 {v26.b}[7], [TMP2], 1 1220 st1 {v26.b}[7], [TMP2], 1
1283 st1 {v27.b}[7], [TMP4], 1 1221 st1 {v27.b}[7], [TMP4], 1
1284 #endif 1222 #endif
1285 1223
1286 /* vpop {v8.4h - v15.4h} ;not available */ 1224 /* vpop {v8.4h - v15.4h} ;not available */
1287 sub sp, sp, #272 1225 sub sp, sp, #272
1288 ldr x15, [sp], 16 1226 ldr x15, [sp], 16
1289 ld1 {v0.8b - v3.8b}, [sp], 32 1227 ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
1290 ld1 {v4.8b - v7.8b}, [sp], 32 1228 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
1291 ld1 {v8.8b - v11.8b}, [sp], 32 1229 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1292 ld1 {v12.8b - v15.8b}, [sp], 32 1230 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1293 ld1 {v16.8b - v19.8b}, [sp], 32 1231 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
1294 ld1 {v20.8b - v23.8b}, [sp], 32 1232 ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
1295 ld1 {v24.8b - v27.8b}, [sp], 32 1233 ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
1296 ld1 {v28.8b - v31.8b}, [sp], 32 1234 ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
1297 blr x30 1235 blr x30
1298 1236
1299 .unreq DCT_TABLE 1237 .unreq DCT_TABLE
1300 .unreq COEF_BLOCK 1238 .unreq COEF_BLOCK
1301 .unreq OUTPUT_BUF 1239 .unreq OUTPUT_BUF
1302 .unreq OUTPUT_COL 1240 .unreq OUTPUT_COL
1303 .unreq TMP1 1241 .unreq TMP1
1304 .unreq TMP2 1242 .unreq TMP2
1305 .unreq TMP3 1243 .unreq TMP3
1306 .unreq TMP4 1244 .unreq TMP4
(...skipping 11 matching lines...)
1318 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' 1256 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1319 * function from jpeg-6b (jidctred.c). 1257 * function from jpeg-6b (jidctred.c).
1320 * 1258 *
1321 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which 1259 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1322 * requires far fewer arithmetic operations and hence should be faster. 1260 * requires far fewer arithmetic operations and hence should be faster.
1323 * The primary purpose of this particular NEON optimized function is 1261 * The primary purpose of this particular NEON optimized function is
1324 * bit exact compatibility with jpeg-6b. 1262 * bit exact compatibility with jpeg-6b.
1325 */ 1263 */
1326 1264
1327 .balign 8 1265 .balign 8
1328 jsimd_idct_2x2_neon_consts: 1266 Ljsimd_idct_2x2_neon_consts:
1329 .short -FIX_0_720959822 /* v14[0] */ 1267 .short -FIX_0_720959822 /* v14[0] */
1330 .short FIX_0_850430095 /* v14[1] */ 1268 .short FIX_0_850430095 /* v14[1] */
1331 .short -FIX_1_272758580 /* v14[2] */ 1269 .short -FIX_1_272758580 /* v14[2] */
1332 .short FIX_3_624509785 /* v14[3] */ 1270 .short FIX_3_624509785 /* v14[3] */
1333 1271
1334 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 1272 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
1335 sshll v15.4s, \x4, #15 1273 sshll v15.4s, \x4, #15
1336 smull v26.4s, \x6, v14.4h[3] 1274 smull v26.4s, \x6, v14.h[3]
1337 smlal v26.4s, \x10, v14.4h[2] 1275 smlal v26.4s, \x10, v14.h[2]
1338 smlal v26.4s, \x12, v14.4h[1] 1276 smlal v26.4s, \x12, v14.h[1]
1339 smlal v26.4s, \x16, v14.4h[0] 1277 smlal v26.4s, \x16, v14.h[0]
1340 1278
1341 add v20.4s, v15.4s, v26.4s 1279 add v20.4s, v15.4s, v26.4s
1342 sub v15.4s, v15.4s, v26.4s 1280 sub v15.4s, v15.4s, v26.4s
1343 1281
1344 .if \shift > 16 1282 .if \shift > 16
1345 srshr v20.4s, v20.4s, #\shift 1283 srshr v20.4s, v20.4s, #\shift
1346 srshr v15.4s, v15.4s, #\shift 1284 srshr v15.4s, v15.4s, #\shift
1347 xtn \y26, v20.4s 1285 xtn \y26, v20.4s
1348 xtn \y27, v15.4s 1286 xtn \y27, v15.4s
1349 .else 1287 .else
1350 rshrn \y26, v20.4s, #\shift 1288 rshrn \y26, v20.4s, #\shift
1351 rshrn \y27, v15.4s, #\shift 1289 rshrn \y27, v15.4s, #\shift
1352 .endif 1290 .endif
1353
1354 .endm 1291 .endm
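Per column, the idct_helper above computes the same dot product as
jpeg_idct_2x2() in jidctred.c. A minimal C sketch (names are illustrative;
the constants are the FIX_* values defined earlier):

    #include <stdint.h>

    #define CONST_BITS  13
    #define DESCALE(x, n)  (((x) + ((int32_t)1 << ((n) - 1))) >> (n))

    /* One column of the 2x2 kernel; coefficient rows 2, 4 and 6 are unused. */
    static void idct_2x2_column(const int16_t row[8], int shift,
                                int32_t *out0, int32_t *out1)
    {
        int32_t even = (int32_t)row[0] << (CONST_BITS + 2);
        int32_t odd  = 29692 * row[1] - 10426 * row[3]  /* FIX_3_624509785, FIX_1_272758580 */
                     +  6967 * row[5] -  5906 * row[7]; /* FIX_0_850430095, FIX_0_720959822 */
        *out0 = DESCALE(even + odd, shift);  /* shift = 13 in pass 1, 20 in pass 2 */
        *out1 = DESCALE(even - odd, shift);
    }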
1355 1292
1356 asm_function jsimd_idct_2x2_neon 1293 asm_function jsimd_idct_2x2_neon
1357 1294
1358 DCT_TABLE .req x0 1295 DCT_TABLE .req x0
1359 COEF_BLOCK .req x1 1296 COEF_BLOCK .req x1
1360 OUTPUT_BUF .req x2 1297 OUTPUT_BUF .req x2
1361 OUTPUT_COL .req x3 1298 OUTPUT_COL .req x3
1362 TMP1 .req x0 1299 TMP1 .req x0
1363 TMP2 .req x15 1300 TMP2 .req x15
1364 1301
1365 /* vpush {v8.4h - v15.4h} ; not available */ 1302 /* vpush {v8.4h - v15.4h} ; not available */
1366 sub sp, sp, 208 1303 sub sp, sp, 208
1367 str x15, [sp], 16 1304 str x15, [sp], 16
1368 1305
1369 /* Load constants */ 1306 /* Load constants */
1370 adr TMP2, jsimd_idct_2x2_neon_consts 1307 adr TMP2, Ljsimd_idct_2x2_neon_consts
1371 st1 {v4.8b - v7.8b}, [sp], 32 1308 st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
1372 st1 {v8.8b - v11.8b}, [sp], 32 1309 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1373 st1 {v12.8b - v15.8b}, [sp], 32 1310 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1374 st1 {v16.8b - v19.8b}, [sp], 32 1311 st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
1375 st1 {v21.8b - v22.8b}, [sp], 16 1312 st1 {v21.8b, v22.8b}, [sp], 16
1376 st1 {v24.8b - v27.8b}, [sp], 32 1313 st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
1377 st1 {v30.8b - v31.8b}, [sp], 16 1314 st1 {v30.8b, v31.8b}, [sp], 16
1378 ld1 {v14.4h}, [TMP2] 1315 ld1 {v14.4h}, [TMP2]
1379 1316
1380 /* Load all COEF_BLOCK into NEON registers with the following allocation: 1317 /* Load all COEF_BLOCK into NEON registers with the following allocation:
1381 * 0 1 2 3 | 4 5 6 7 1318 * 0 1 2 3 | 4 5 6 7
1382 * ---------+-------- 1319 * ---------+--------
1383 * 0 | v4.4h | v5.4h 1320 * 0 | v4.4h | v5.4h
1384 * 1 | v6.4h | v7.4h 1321 * 1 | v6.4h | v7.4h
1385 * 2 | - | - 1322 * 2 | - | -
1386 * 3 | v10.4h | v11.4h 1323 * 3 | v10.4h | v11.4h
1387 * 4 | - | - 1324 * 4 | - | -
1388 * 5 | v12.4h | v13.4h 1325 * 5 | v12.4h | v13.4h
1389 * 6 | - | - 1326 * 6 | - | -
1390 * 7 | v16.4h | v17.4h 1327 * 7 | v16.4h | v17.4h
1391 */ 1328 */
1392 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 1329 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1393 add COEF_BLOCK, COEF_BLOCK, #16 1330 add COEF_BLOCK, COEF_BLOCK, #16
1394 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 1331 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
1395 add COEF_BLOCK, COEF_BLOCK, #16 1332 add COEF_BLOCK, COEF_BLOCK, #16
1396 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 1333 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
1397 add COEF_BLOCK, COEF_BLOCK, #16 1334 add COEF_BLOCK, COEF_BLOCK, #16
1398 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 1335 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
1399 /* Dequantize */ 1336 /* Dequantize */
1400 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 1337 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1401 mul v4.4h, v4.4h, v18.4h 1338 mul v4.4h, v4.4h, v18.4h
1402 mul v5.4h, v5.4h, v19.4h 1339 mul v5.4h, v5.4h, v19.4h
1403 ins v4.2d[1], v5.2d[0] 1340 ins v4.d[1], v5.d[0]
1404 mul v6.4h, v6.4h, v20.4h 1341 mul v6.4h, v6.4h, v20.4h
1405 mul v7.4h, v7.4h, v21.4h 1342 mul v7.4h, v7.4h, v21.4h
1406 ins v6.2d[1], v7.2d[0] 1343 ins v6.d[1], v7.d[0]
1407 add DCT_TABLE, DCT_TABLE, #16 1344 add DCT_TABLE, DCT_TABLE, #16
1408 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 1345 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
1409 mul v10.4h, v10.4h, v24.4h 1346 mul v10.4h, v10.4h, v24.4h
1410 mul v11.4h, v11.4h, v25.4h 1347 mul v11.4h, v11.4h, v25.4h
1411 ins v10.2d[1], v11.2d[0] 1348 ins v10.d[1], v11.d[0]
1412 add DCT_TABLE, DCT_TABLE, #16 1349 add DCT_TABLE, DCT_TABLE, #16
1413 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 1350 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
1414 mul v12.4h, v12.4h, v26.4h 1351 mul v12.4h, v12.4h, v26.4h
1415 mul v13.4h, v13.4h, v27.4h 1352 mul v13.4h, v13.4h, v27.4h
1416 ins v12.2d[1], v13.2d[0] 1353 ins v12.d[1], v13.d[0]
1417 add DCT_TABLE, DCT_TABLE, #16 1354 add DCT_TABLE, DCT_TABLE, #16
1418 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 1355 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
1419 mul v16.4h, v16.4h, v30.4h 1356 mul v16.4h, v16.4h, v30.4h
1420 mul v17.4h, v17.4h, v31.4h 1357 mul v17.4h, v17.4h, v31.4h
1421 ins v16.2d[1], v17.2d[0] 1358 ins v16.d[1], v17.d[0]
1422 1359
1423 /* Pass 1 */ 1360 /* Pass 1 */
1424 #if 0 1361 #if 0
1425 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h 1362 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
1426 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h 1363 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
1427 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h 1364 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
1428 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h 1365 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
1429 #else 1366 #else
1430 smull v26.4s, v6.4h, v14.4h[3] 1367 smull v26.4s, v6.4h, v14.h[3]
1431 smlal v26.4s, v10.4h, v14.4h[2] 1368 smlal v26.4s, v10.4h, v14.h[2]
1432 smlal v26.4s, v12.4h, v14.4h[1] 1369 smlal v26.4s, v12.4h, v14.h[1]
1433 smlal v26.4s, v16.4h, v14.4h[0] 1370 smlal v26.4s, v16.4h, v14.h[0]
1434 smull v24.4s, v7.4h, v14.4h[3] 1371 smull v24.4s, v7.4h, v14.h[3]
1435 smlal v24.4s, v11.4h, v14.4h[2] 1372 smlal v24.4s, v11.4h, v14.h[2]
1436 smlal v24.4s, v13.4h, v14.4h[1] 1373 smlal v24.4s, v13.4h, v14.h[1]
1437 smlal v24.4s, v17.4h, v14.4h[0] 1374 smlal v24.4s, v17.4h, v14.h[0]
1438 sshll v15.4s, v4.4h, #15 1375 sshll v15.4s, v4.4h, #15
1439 sshll v30.4s, v5.4h, #15 1376 sshll v30.4s, v5.4h, #15
1440 add v20.4s, v15.4s, v26.4s 1377 add v20.4s, v15.4s, v26.4s
1441 sub v15.4s, v15.4s, v26.4s 1378 sub v15.4s, v15.4s, v26.4s
1442 rshrn v4.4h, v20.4s, #13 1379 rshrn v4.4h, v20.4s, #13
1443 rshrn v6.4h, v15.4s, #13 1380 rshrn v6.4h, v15.4s, #13
1444 add v20.4s, v30.4s, v24.4s 1381 add v20.4s, v30.4s, v24.4s
1445 sub v15.4s, v30.4s, v24.4s 1382 sub v15.4s, v30.4s, v24.4s
1446 rshrn v5.4h, v20.4s, #13 1383 rshrn v5.4h, v20.4s, #13
1447 rshrn v7.4h, v15.4s, #13 1384 rshrn v7.4h, v15.4s, #13
1448 ins v4.2d[1], v5.2d[0] 1385 ins v4.d[1], v5.d[0]
1449 ins v6.2d[1], v7.2d[0] 1386 ins v6.d[1], v7.d[0]
1450 transpose v4, v6, v3, .16b, .8h 1387 transpose v4, v6, v3, .16b, .8h
1451 transpose v6, v10, v3, .16b, .4s 1388 transpose v6, v10, v3, .16b, .4s
1452 ins v11.2d[0], v10.2d[1] 1389 ins v11.d[0], v10.d[1]
1453 ins v7.2d[0], v6.2d[1] 1390 ins v7.d[0], v6.d[1]
1454 #endif 1391 #endif
1455 1392
1456 /* Pass 2 */ 1393 /* Pass 2 */
1457 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h 1394 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
1458 1395
1459 /* Range limit */ 1396 /* Range limit */
1460 movi v30.8h, #0x80 1397 movi v30.8h, #0x80
1461 ins v26.2d[1], v27.2d[0] 1398 ins v26.d[1], v27.d[0]
1462 add v26.8h, v26.8h, v30.8h 1399 add v26.8h, v26.8h, v30.8h
1463 sqxtun v30.8b, v26.8h 1400 sqxtun v30.8b, v26.8h
1464 ins v26.2d[0], v30.2d[0] 1401 ins v26.d[0], v30.d[0]
1465 sqxtun v27.8b, v26.8h 1402 sqxtun v27.8b, v26.8h
1466 1403
1467 /* Store results to the output buffer */ 1404 /* Store results to the output buffer */
1468 ldp TMP1, TMP2, [OUTPUT_BUF] 1405 ldp TMP1, TMP2, [OUTPUT_BUF]
1469 add TMP1, TMP1, OUTPUT_COL 1406 add TMP1, TMP1, OUTPUT_COL
1470 add TMP2, TMP2, OUTPUT_COL 1407 add TMP2, TMP2, OUTPUT_COL
1471 1408
1472 st1 {v26.b}[0], [TMP1], 1 1409 st1 {v26.b}[0], [TMP1], 1
1473 st1 {v27.b}[4], [TMP1], 1 1410 st1 {v27.b}[4], [TMP1], 1
1474 st1 {v26.b}[1], [TMP2], 1 1411 st1 {v26.b}[1], [TMP2], 1
1475 st1 {v27.b}[5], [TMP2], 1 1412 st1 {v27.b}[5], [TMP2], 1
1476 1413
1477 sub sp, sp, #208 1414 sub sp, sp, #208
1478 ldr x15, [sp], 16 1415 ldr x15, [sp], 16
1479 ld1 {v4.8b - v7.8b}, [sp], 32 1416 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
1480 ld1 {v8.8b - v11.8b}, [sp], 32 1417 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1481 ld1 {v12.8b - v15.8b}, [sp], 32 1418 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1482 ld1 {v16.8b - v19.8b}, [sp], 32 1419 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
1483 ld1 {v21.8b - v22.8b}, [sp], 16 1420 ld1 {v21.8b, v22.8b}, [sp], 16
1484 ld1 {v24.8b - v27.8b}, [sp], 32 1421 ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
1485 ld1 {v30.8b - v31.8b}, [sp], 16 1422 ld1 {v30.8b, v31.8b}, [sp], 16
1486 blr x30 1423 blr x30
1487 1424
1488 .unreq DCT_TABLE 1425 .unreq DCT_TABLE
1489 .unreq COEF_BLOCK 1426 .unreq COEF_BLOCK
1490 .unreq OUTPUT_BUF 1427 .unreq OUTPUT_BUF
1491 .unreq OUTPUT_COL 1428 .unreq OUTPUT_COL
1492 .unreq TMP1 1429 .unreq TMP1
1493 .unreq TMP2 1430 .unreq TMP2
1494 1431
1495 .purgem idct_helper 1432 .purgem idct_helper
1496 1433
1497 1434
1498 /*****************************************************************************/ 1435 /*****************************************************************************/
1499 1436
1500 /* 1437 /*
1501 * jsimd_ycc_extrgb_convert_neon 1438 * jsimd_ycc_extrgb_convert_neon
1502 * jsimd_ycc_extbgr_convert_neon 1439 * jsimd_ycc_extbgr_convert_neon
1503 * jsimd_ycc_extrgbx_convert_neon 1440 * jsimd_ycc_extrgbx_convert_neon
1504 * jsimd_ycc_extbgrx_convert_neon 1441 * jsimd_ycc_extbgrx_convert_neon
1505 * jsimd_ycc_extxbgr_convert_neon 1442 * jsimd_ycc_extxbgr_convert_neon
1506 * jsimd_ycc_extxrgb_convert_neon 1443 * jsimd_ycc_extxrgb_convert_neon
1507 * 1444 *
1508 * Colorspace conversion YCbCr -> RGB 1445 * Colorspace conversion YCbCr -> RGB
1509 */ 1446 */
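All six variants implement the standard JPEG (jdcolor.c) YCbCr-to-RGB
equations in fixed point; a scalar sketch of the per-pixel math (the helper
names are illustrative, rounding is omitted, and the Q14/Q15 scalings are
inferred from the "multiply by" comments below):

    #include <stdint.h>

    static uint8_t clamp255(int x)
    {
        if (x < 0) return 0;
        if (x > 255) return 255;
        return (uint8_t)x;
    }

    /* R = Y + 1.40200 * (Cr - 128)
     * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
     * B = Y + 1.77200 * (Cb - 128)
     * e.g. 22971 ~= 1.40200 * 2^14, 11277 ~= 0.34414 * 2^15,
     *      23401 ~= 0.71414 * 2^15. */
    static void ycc_to_rgb(uint8_t y, uint8_t cb, uint8_t cr,
                           uint8_t *r, uint8_t *g, uint8_t *b)
    {
        double u = cb - 128.0, v = cr - 128.0;
        *r = clamp255((int)(y + 1.40200 * v));
        *g = clamp255((int)(y - 0.34414 * u - 0.71414 * v));
        *b = clamp255((int)(y + 1.77200 * u));
    }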
1510 1447
1511
1512 .macro do_load size 1448 .macro do_load size
1449 .if \size == 8
1450 ld1 {v4.8b}, [U], 8
1451 ld1 {v5.8b}, [V], 8
1452 ld1 {v0.8b}, [Y], 8
1453 prfm pldl1keep, [U, #64]
1454 prfm pldl1keep, [V, #64]
1455 prfm pldl1keep, [Y, #64]
1456 .elseif \size == 4
1457 ld1 {v4.b}[0], [U], 1
1458 ld1 {v4.b}[1], [U], 1
1459 ld1 {v4.b}[2], [U], 1
1460 ld1 {v4.b}[3], [U], 1
1461 ld1 {v5.b}[0], [V], 1
1462 ld1 {v5.b}[1], [V], 1
1463 ld1 {v5.b}[2], [V], 1
1464 ld1 {v5.b}[3], [V], 1
1465 ld1 {v0.b}[0], [Y], 1
1466 ld1 {v0.b}[1], [Y], 1
1467 ld1 {v0.b}[2], [Y], 1
1468 ld1 {v0.b}[3], [Y], 1
1469 .elseif \size == 2
1470 ld1 {v4.b}[4], [U], 1
1471 ld1 {v4.b}[5], [U], 1
1472 ld1 {v5.b}[4], [V], 1
1473 ld1 {v5.b}[5], [V], 1
1474 ld1 {v0.b}[4], [Y], 1
1475 ld1 {v0.b}[5], [Y], 1
1476 .elseif \size == 1
1477 ld1 {v4.b}[6], [U], 1
1478 ld1 {v5.b}[6], [V], 1
1479 ld1 {v0.b}[6], [Y], 1
1480 .else
1481 .error unsupported macroblock size
1482 .endif
1483 .endm
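/* do_load gathers one 8-pixel group of Y/Cb/Cr samples; the 4/2/1 variants
 * load into successive lanes (0..3, 4..5, 6) so that a row tail of fewer
 * than 8 pixels is packed into the same registers before one conversion
 * pass. */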
1484
1485 .macro do_store bpp, size, fast_st3
1486 .if \bpp == 24
1513 .if \size == 8 1487 .if \size == 8
1514 ld1 {v4.8b}, [U], 8 1488 .if \fast_st3 == 1
1515 ld1 {v5.8b}, [V], 8 1489 st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
1516 ld1 {v0.8b}, [Y], 8 1490 .else
1517 prfm PLDL1KEEP, [U, #64] 1491 st1 {v10.b}[0], [RGB], #1
1518 prfm PLDL1KEEP, [V, #64] 1492 st1 {v11.b}[0], [RGB], #1
1519 prfm PLDL1KEEP, [Y, #64] 1493 st1 {v12.b}[0], [RGB], #1
1494
1495 st1 {v10.b}[1], [RGB], #1
1496 st1 {v11.b}[1], [RGB], #1
1497 st1 {v12.b}[1], [RGB], #1
1498
1499 st1 {v10.b}[2], [RGB], #1
1500 st1 {v11.b}[2], [RGB], #1
1501 st1 {v12.b}[2], [RGB], #1
1502
1503 st1 {v10.b}[3], [RGB], #1
1504 st1 {v11.b}[3], [RGB], #1
1505 st1 {v12.b}[3], [RGB], #1
1506
1507 st1 {v10.b}[4], [RGB], #1
1508 st1 {v11.b}[4], [RGB], #1
1509 st1 {v12.b}[4], [RGB], #1
1510
1511 st1 {v10.b}[5], [RGB], #1
1512 st1 {v11.b}[5], [RGB], #1
1513 st1 {v12.b}[5], [RGB], #1
1514
1515 st1 {v10.b}[6], [RGB], #1
1516 st1 {v11.b}[6], [RGB], #1
1517 st1 {v12.b}[6], [RGB], #1
1518
1519 st1 {v10.b}[7], [RGB], #1
1520 st1 {v11.b}[7], [RGB], #1
1521 st1 {v12.b}[7], [RGB], #1
1522 .endif
1520 .elseif \size == 4 1523 .elseif \size == 4
1521 ld1 {v4.b}[0], [U], 1 1524 st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
1522 ld1 {v4.b}[1], [U], 1 1525 st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
1523 ld1 {v4.b}[2], [U], 1 1526 st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
1524 ld1 {v4.b}[3], [U], 1 1527 st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
1525 ld1 {v5.b}[0], [V], 1
1526 ld1 {v5.b}[1], [V], 1
1527 ld1 {v5.b}[2], [V], 1
1528 ld1 {v5.b}[3], [V], 1
1529 ld1 {v0.b}[0], [Y], 1
1530 ld1 {v0.b}[1], [Y], 1
1531 ld1 {v0.b}[2], [Y], 1
1532 ld1 {v0.b}[3], [Y], 1
1533 .elseif \size == 2 1528 .elseif \size == 2
1534 ld1 {v4.b}[4], [U], 1 1529 st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
1535 ld1 {v4.b}[5], [U], 1 1530 st3 {v10.b, v11.b, v12.b}[5], [RGB], 3
1536 ld1 {v5.b}[4], [V], 1
1537 ld1 {v5.b}[5], [V], 1
1538 ld1 {v0.b}[4], [Y], 1
1539 ld1 {v0.b}[5], [Y], 1
1540 .elseif \size == 1 1531 .elseif \size == 1
1541 ld1 {v4.b}[6], [U], 1 1532 st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
1542 ld1 {v5.b}[6], [V], 1
1543 ld1 {v0.b}[6], [Y], 1
1544 .else 1533 .else
1545 .error unsupported macroblock size 1534 .error unsupported macroblock size
1546 .endif 1535 .endif
1547 .endm 1536 .elseif \bpp == 32
1548 1537 .if \size == 8
1549 .macro do_store bpp, size 1538 st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
1550 .if \bpp == 24 1539 .elseif \size == 4
1551 .if \size == 8 1540 st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
1552 st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 1541 st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
1553 .elseif \size == 4 1542 st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
1554 st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 1543 st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
1555 st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 1544 .elseif \size == 2
1556 st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 1545 st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
1557 st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 1546 st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
1558 .elseif \size == 2 1547 .elseif \size == 1
1559 st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 1548 st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
1560 st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 1549 .else
1561 .elseif \size == 1 1550 .error unsupported macroblock size
1562 st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
1563 .else
1564 .error unsupported macroblock size
1565 .endif
1566 .elseif \bpp == 32
1567 .if \size == 8
1568 st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
1569 .elseif \size == 4
1570 st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
1571 st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
1572 st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
1573 st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
1574 .elseif \size == 2
1575 st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
1576 st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
1577 .elseif \size == 1
1578 st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
1579 .else
1580 .error unsupported macroblock size
1581 .endif
1582 .elseif \bpp==16
1583 .if \size == 8
1584 st1 {v25.8h}, [RGB],16
1585 .elseif \size == 4
1586 st1 {v25.4h}, [RGB],8
1587 .elseif \size == 2
1588 st1 {v25.h}[4], [RGB],2
1589 st1 {v25.h}[5], [RGB],2
1590 .elseif \size == 1
1591 st1 {v25.h}[6], [RGB],2
1592 .else
1593 .error unsupported macroblock size
1594 .endif
1595 .else
1596 .error unsupported bpp
1597 .endif 1551 .endif
1598 .endm 1552 .elseif \bpp==16
1599 1553 .if \size == 8
1600 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize 1554 st1 {v25.8h}, [RGB], 16
1555 .elseif \size == 4
1556 st1 {v25.4h}, [RGB], 8
1557 .elseif \size == 2
1558 st1 {v25.h}[4], [RGB], 2
1559 st1 {v25.h}[5], [RGB], 2
1560 .elseif \size == 1
1561 st1 {v25.h}[6], [RGB], 2
1562 .else
1563 .error unsupported macroblock size
1564 .endif
1565 .else
1566 .error unsupported bpp
1567 .endif
1568 .endm
1569
1570 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
1571 g_offs, gsize, b_offs, bsize, \
1572 defsize, fast_st3
1601 1573
1602 /* 1574 /*
1603 * 2-stage pipelined YCbCr->RGB conversion 1575 * 2-stage pipelined YCbCr->RGB conversion
1604 */ 1576 */
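
The loop built from these macros is software-pipelined: stage 2 of one group of 8 pixels is interleaved with the loads and stage 1 of the next group, hiding load latency. Roughly (illustrative control flow, not literal code):

    /* load(i)/stage1(i)/stage2(i)/store(i) name the macros below */
    load(0); stage1(0);
    for (i = 0; i + 1 < n_groups; i++)
      stage2_store_load_stage1(i);   /* = stage2(i); store(i); load(i+1); stage1(i+1) */
    stage2(n_groups - 1); store(n_groups - 1);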
1605 1577
1606 .macro do_yuv_to_rgb_stage1 1578 .macro do_yuv_to_rgb_stage1
1607 uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ 1579 uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */
1608 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ 1580 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
1609 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ 1581 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
1610 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ 1582 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
1611 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ 1583 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
1612 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ 1584 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
1613 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ 1585 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
1614 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ 1586 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
1615 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ 1587 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
1616 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ 1588 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
1617 .endm 1589 .endm
1618 1590
1619 .macro do_yuv_to_rgb_stage2 1591 .macro do_yuv_to_rgb_stage2
1620 rshrn v20.4h, v20.4s, #15 1592 rshrn v20.4h, v20.4s, #15
1621 rshrn2 v20.8h, v22.4s, #15 1593 rshrn2 v20.8h, v22.4s, #15
1622 rshrn v24.4h, v24.4s, #14 1594 rshrn v24.4h, v24.4s, #14
1623 rshrn2 v24.8h, v26.4s, #14 1595 rshrn2 v24.8h, v26.4s, #14
1624 rshrn v28.4h, v28.4s, #14 1596 rshrn v28.4h, v28.4s, #14
1625 rshrn2 v28.8h, v30.4s, #14 1597 rshrn2 v28.8h, v30.4s, #14
1626 uaddw v20.8h, v20.8h, v0.8b 1598 uaddw v20.8h, v20.8h, v0.8b
1627 uaddw v24.8h, v24.8h, v0.8b 1599 uaddw v24.8h, v24.8h, v0.8b
1628 uaddw v28.8h, v28.8h, v0.8b 1600 uaddw v28.8h, v28.8h, v0.8b
1629 .if \bpp != 16 1601 .if \bpp != 16
1630 sqxtun v1\g_offs\defsize, v20.8h 1602 sqxtun v1\g_offs\defsize, v20.8h
1631 sqxtun v1\r_offs\defsize, v24.8h 1603 sqxtun v1\r_offs\defsize, v24.8h
1632 sqxtun v1\b_offs\defsize, v28.8h 1604 sqxtun v1\b_offs\defsize, v28.8h
1633 .else 1605 .else
1634 sqshlu v21.8h, v20.8h, #8 1606 sqshlu v21.8h, v20.8h, #8
1635 sqshlu v25.8h, v24.8h, #8 1607 sqshlu v25.8h, v24.8h, #8
1636 sqshlu v29.8h, v28.8h, #8 1608 sqshlu v29.8h, v28.8h, #8
1637 sri v25.8h, v21.8h, #5 1609 sri v25.8h, v21.8h, #5
1638 sri v25.8h, v29.8h, #11 1610 sri v25.8h, v29.8h, #11
1639 .endif 1611 .endif
1640 1612 .endm
1641 .endm 1613
1642 1614 .macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
1643 .macro do_yuv_to_rgb_stage2_store_load_stage1 1615 rshrn v20.4h, v20.4s, #15
1644 rshrn v20.4h, v20.4s, #15 1616 rshrn v24.4h, v24.4s, #14
1645 rshrn v24.4h, v24.4s, #14 1617 rshrn v28.4h, v28.4s, #14
1646 rshrn v28.4h, v28.4s, #14 1618 ld1 {v4.8b}, [U], 8
1647 ld1 {v4.8b}, [U], 8 1619 rshrn2 v20.8h, v22.4s, #15
1648 rshrn2 v20.8h, v22.4s, #15 1620 rshrn2 v24.8h, v26.4s, #14
1649 rshrn2 v24.8h, v26.4s, #14 1621 rshrn2 v28.8h, v30.4s, #14
1650 rshrn2 v28.8h, v30.4s, #14 1622 ld1 {v5.8b}, [V], 8
1651 ld1 {v5.8b}, [V], 8 1623 uaddw v20.8h, v20.8h, v0.8b
1652 uaddw v20.8h, v20.8h, v0.8b 1624 uaddw v24.8h, v24.8h, v0.8b
1653 uaddw v24.8h, v24.8h, v0.8b 1625 uaddw v28.8h, v28.8h, v0.8b
1654 uaddw v28.8h, v28.8h, v0.8b 1626 .if \bpp != 16 /**************** rgb24/rgb32 ******************************/
1655 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/ 1627 sqxtun v1\g_offs\defsize, v20.8h
1656 sqxtun v1\g_offs\defsize, v20.8h 1628 ld1 {v0.8b}, [Y], 8
1657 ld1 {v0.8b}, [Y], 8 1629 sqxtun v1\r_offs\defsize, v24.8h
1658 sqxtun v1\r_offs\defsize, v24.8h 1630 prfm pldl1keep, [U, #64]
1659 prfm PLDL1KEEP, [U, #64] 1631 prfm pldl1keep, [V, #64]
1660 prfm PLDL1KEEP, [V, #64] 1632 prfm pldl1keep, [Y, #64]
1661 prfm PLDL1KEEP, [Y, #64] 1633 sqxtun v1\b_offs\defsize, v28.8h
1662 sqxtun v1\b_offs\defsize, v28.8h 1634 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
1663 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ 1635 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
1664 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ 1636 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
1665 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ 1637 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
1666 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ 1638 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
1667 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ 1639 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
1668 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ 1640 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
1669 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ 1641 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
1670 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ 1642 .else /**************************** rgb565 ********************************/
1671 .else /**************************** rgb565 ***********************************/ 1643 sqshlu v21.8h, v20.8h, #8
1672 sqshlu v21.8h, v20.8h, #8 1644 sqshlu v25.8h, v24.8h, #8
1673 sqshlu v25.8h, v24.8h, #8 1645 sqshlu v29.8h, v28.8h, #8
1674 sqshlu v29.8h, v28.8h, #8 1646 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
1675 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ 1647 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
1676 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ 1648 ld1 {v0.8b}, [Y], 8
1677 ld1 {v0.8b}, [Y], 8 1649 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
1678 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ 1650 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
1679 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ 1651 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
1680 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ 1652 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
1681 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ 1653 sri v25.8h, v21.8h, #5
1682 sri v25.8h, v21.8h, #5 1654 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
1683 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ 1655 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
1684 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ 1656 prfm pldl1keep, [U, #64]
1685 prfm PLDL1KEEP, [U, #64] 1657 prfm pldl1keep, [V, #64]
1686 prfm PLDL1KEEP, [V, #64] 1658 prfm pldl1keep, [Y, #64]
1687 prfm PLDL1KEEP, [Y, #64] 1659 sri v25.8h, v29.8h, #11
1688 sri v25.8h, v29.8h, #11 1660 .endif
1689 .endif 1661 do_store \bpp, 8, \fast_st3
1690 do_store \bpp, 8 1662 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
1691 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ 1663 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
1692 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
1693 .endm 1664 .endm
1694 1665
1695 .macro do_yuv_to_rgb 1666 .macro do_yuv_to_rgb
1696 do_yuv_to_rgb_stage1 1667 do_yuv_to_rgb_stage1
1697 do_yuv_to_rgb_stage2 1668 do_yuv_to_rgb_stage2
1698 .endm 1669 .endm
1699 1670
1700 /* Apple gas crashes on adrl, work around that by using adr. 1671 /* Apple gas crashes on adrl, work around that by using adr.
1701 * But this requires a copy of these constants for each function. 1672 * But this requires a copy of these constants for each function.
1702 */ 1673 */
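(AArch64 adr reaches only labels within +/-1 MiB of the instruction, which is why each generated function needs its own nearby copy; adrl would typically expand to an adrp/add pair with a far larger range.)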
1703 1674
1704 .balign 16 1675 .balign 16
1705 jsimd_ycc_\colorid\()_neon_consts: 1676 .if \fast_st3 == 1
1706 .short 0, 0, 0, 0 1677 Ljsimd_ycc_\colorid\()_neon_consts:
1707 .short 22971, -11277, -23401, 29033 1678 .else
1708 .short -128, -128, -128, -128 1679 Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
1709 .short -128, -128, -128, -128 1680 .endif
1681 .short 0, 0, 0, 0
1682 .short 22971, -11277, -23401, 29033
1683 .short -128, -128, -128, -128
1684 .short -128, -128, -128, -128
1710 1685
1686 .if \fast_st3 == 1
1711 asm_function jsimd_ycc_\colorid\()_convert_neon 1687 asm_function jsimd_ycc_\colorid\()_convert_neon
1688 .else
1689 asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
1690 .endif
1712 OUTPUT_WIDTH .req x0 1691 OUTPUT_WIDTH .req x0
1713 INPUT_BUF .req x1 1692 INPUT_BUF .req x1
1714 INPUT_ROW .req x2 1693 INPUT_ROW .req x2
1715 OUTPUT_BUF .req x3 1694 OUTPUT_BUF .req x3
1716 NUM_ROWS .req x4 1695 NUM_ROWS .req x4
1717 1696
1718 INPUT_BUF0 .req x5 1697 INPUT_BUF0 .req x5
1719 INPUT_BUF1 .req x6 1698 INPUT_BUF1 .req x6
1720 INPUT_BUF2 .req INPUT_BUF 1699 INPUT_BUF2 .req x1
1721 1700
1722 RGB .req x7 1701 RGB .req x7
1723 Y .req x8 1702 Y .req x8
1724 U .req x9 1703 U .req x9
1725 V .req x10 1704 V .req x10
1726 N .req x15 1705 N .req x15
1727 1706
1728 sub sp, sp, 336 1707 sub sp, sp, 336
1729 str x15, [sp], 16 1708 str x15, [sp], 16
1709
1730 /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ 1710 /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
1731 adr x15, jsimd_ycc_\colorid\()_neon_consts 1711 .if \fast_st3 == 1
1712 adr x15, Ljsimd_ycc_\colorid\()_neon_consts
1713 .else
1714 adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
1715 .endif
1716
1732 /* Save NEON registers */ 1717 /* Save NEON registers */
1733 st1 {v0.8b - v3.8b}, [sp], 32 1718 st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
1734 st1 {v4.8b - v7.8b}, [sp], 32 1719 st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
1735 st1 {v8.8b - v11.8b}, [sp], 32 1720 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1736 st1 {v12.8b - v15.8b}, [sp], 32 1721 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1737 st1 {v16.8b - v19.8b}, [sp], 32 1722 st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
1738 st1 {v20.8b - v23.8b}, [sp], 32 1723 st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
1739 st1 {v24.8b - v27.8b}, [sp], 32 1724 st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
1740 st1 {v28.8b - v31.8b}, [sp], 32 1725 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
1741 ld1 {v0.4h, v1.4h}, [x15], 16 1726 ld1 {v0.4h, v1.4h}, [x15], 16
1742 ld1 {v2.8h}, [x15] 1727 ld1 {v2.8h}, [x15]
1743 1728
1744 /* Save ARM registers and handle input arguments */ 1729 /* Save ARM registers and handle input arguments */
1745 /* push {x4, x5, x6, x7, x8, x9, x10, x30} */ 1730 /* push {x4, x5, x6, x7, x8, x9, x10, x30} */
1746 stp x4, x5, [sp], 16 1731 stp x4, x5, [sp], 16
1747 stp x6, x7, [sp], 16 1732 stp x6, x7, [sp], 16
1748 stp x8, x9, [sp], 16 1733 stp x8, x9, [sp], 16
1749 stp x10, x30, [sp], 16 1734 stp x10, x30, [sp], 16
1750 ldr INPUT_BUF0, [INPUT_BUF] 1735 ldr INPUT_BUF0, [INPUT_BUF]
1751 ldr INPUT_BUF1, [INPUT_BUF, 8] 1736 ldr INPUT_BUF1, [INPUT_BUF, #8]
1752 ldr INPUT_BUF2, [INPUT_BUF, 16] 1737 ldr INPUT_BUF2, [INPUT_BUF, #16]
1753 .unreq INPUT_BUF 1738 .unreq INPUT_BUF
1754 1739
1755 /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */ 1740 /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
1756 movi v10.16b, #255 1741 movi v10.16b, #255
1757 movi v13.16b, #255 1742 movi v13.16b, #255
1758 1743
1759 /* Outer loop over scanlines */ 1744 /* Outer loop over scanlines */
1760 cmp NUM_ROWS, #1 1745 cmp NUM_ROWS, #1
1761 blt 9f 1746 b.lt 9f
1762 0: 1747 0:
1763 lsl x16, INPUT_ROW, #3 1748 lsl x16, INPUT_ROW, #3
1764 ldr Y, [INPUT_BUF0, x16] 1749 ldr Y, [INPUT_BUF0, x16]
1765 ldr U, [INPUT_BUF1, x16] 1750 ldr U, [INPUT_BUF1, x16]
1766 mov N, OUTPUT_WIDTH 1751 mov N, OUTPUT_WIDTH
1767 ldr V, [INPUT_BUF2, x16] 1752 ldr V, [INPUT_BUF2, x16]
1768 add INPUT_ROW, INPUT_ROW, #1 1753 add INPUT_ROW, INPUT_ROW, #1
1769 ldr RGB, [OUTPUT_BUF], #8 1754 ldr RGB, [OUTPUT_BUF], #8
1770 1755
1771 /* Inner loop over pixels */ 1756 /* Inner loop over pixels */
1772 subs N, N, #8 1757 subs N, N, #8
1773 blt 3f 1758 b.lt 3f
1774 do_load 8 1759 do_load 8
1775 do_yuv_to_rgb_stage1 1760 do_yuv_to_rgb_stage1
1776 subs N, N, #8 1761 subs N, N, #8
1777 blt 2f 1762 b.lt 2f
1778 1: 1763 1:
1779 do_yuv_to_rgb_stage2_store_load_stage1 1764 do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
1780 subs N, N, #8 1765 subs N, N, #8
1781 bge 1b 1766 b.ge 1b
1782 2: 1767 2:
1783 do_yuv_to_rgb_stage2 1768 do_yuv_to_rgb_stage2
1784 do_store \bpp, 8 1769 do_store \bpp, 8, \fast_st3
1785 tst N, #7 1770 tst N, #7
1786 beq 8f 1771 b.eq 8f
1787 3: 1772 3:
1788 tst N, #4 1773 tst N, #4
1789 beq 3f 1774 b.eq 3f
1790 do_load 4 1775 do_load 4
1791 3: 1776 3:
1792 tst N, #2 1777 tst N, #2
1793 beq 4f 1778 b.eq 4f
1794 do_load 2 1779 do_load 2
1795 4: 1780 4:
1796 tst N, #1 1781 tst N, #1
1797 beq 5f 1782 b.eq 5f
1798 do_load 1 1783 do_load 1
1799 5: 1784 5:
1800 do_yuv_to_rgb 1785 do_yuv_to_rgb
1801 tst N, #4 1786 tst N, #4
1802 beq 6f 1787 b.eq 6f
1803 do_store \bpp, 4 1788 do_store \bpp, 4, \fast_st3
1804 6: 1789 6:
1805 tst N, #2 1790 tst N, #2
1806 beq 7f 1791 b.eq 7f
1807 do_store \bpp, 2 1792 do_store \bpp, 2, \fast_st3
1808 7: 1793 7:
1809 tst N, #1 1794 tst N, #1
1810 beq 8f 1795 b.eq 8f
1811 do_store \bpp, 1 1796 do_store \bpp, 1, \fast_st3
1812 8: 1797 8:
1813 subs NUM_ROWS, NUM_ROWS, #1 1798 subs NUM_ROWS, NUM_ROWS, #1
1814 bgt 0b 1799 b.gt 0b
1815 9: 1800 9:
1816 /* Restore all registers and return */ 1801 /* Restore all registers and return */
1817 sub sp, sp, #336 1802 sub sp, sp, #336
1818 ldr x15, [sp], 16 1803 ldr x15, [sp], 16
1819 ld1 {v0.8b - v3.8b}, [sp], 32 1804 ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
1820 ld1 {v4.8b - v7.8b}, [sp], 32 1805 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
1821 ld1 {v8.8b - v11.8b}, [sp], 32 1806 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1822 ld1 {v12.8b - v15.8b}, [sp], 32 1807 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1823 ld1 {v16.8b - v19.8b}, [sp], 32 1808 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
1824 ld1 {v20.8b - v23.8b}, [sp], 32 1809 ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
1825 ld1 {v24.8b - v27.8b}, [sp], 32 1810 ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
1826 ld1 {v28.8b - v31.8b}, [sp], 32 1811 ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
1827 /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */ 1812 /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
1828 ldp x4, x5, [sp], 16 1813 ldp x4, x5, [sp], 16
1829 ldp x6, x7, [sp], 16 1814 ldp x6, x7, [sp], 16
1830 ldp x8, x9, [sp], 16 1815 ldp x8, x9, [sp], 16
1831 ldp x10, x30, [sp], 16 1816 ldp x10, x30, [sp], 16
1832 br x30 1817 br x30
1833 .unreq OUTPUT_WIDTH 1818 .unreq OUTPUT_WIDTH
1834 .unreq INPUT_ROW 1819 .unreq INPUT_ROW
1835 .unreq OUTPUT_BUF 1820 .unreq OUTPUT_BUF
1836 .unreq NUM_ROWS 1821 .unreq NUM_ROWS
1837 .unreq INPUT_BUF0 1822 .unreq INPUT_BUF0
1838 .unreq INPUT_BUF1 1823 .unreq INPUT_BUF1
1839 .unreq INPUT_BUF2 1824 .unreq INPUT_BUF2
1840 .unreq RGB 1825 .unreq RGB
1841 .unreq Y 1826 .unreq Y
1842 .unreq U 1827 .unreq U
1843 .unreq V 1828 .unreq V
1844 .unreq N 1829 .unreq N
1845 1830
1846 .purgem do_yuv_to_rgb 1831 .purgem do_yuv_to_rgb
1847 .purgem do_yuv_to_rgb_stage1 1832 .purgem do_yuv_to_rgb_stage1
1848 .purgem do_yuv_to_rgb_stage2 1833 .purgem do_yuv_to_rgb_stage2
1849 .purgem do_yuv_to_rgb_stage2_store_load_stage1 1834 .purgem do_yuv_to_rgb_stage2_store_load_stage1
1835
1850 .endm 1836 .endm
1851 1837
1852 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */ 1838 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3 */
1853 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b 1839 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1
1854 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b 1840 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1
1855 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b 1841 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1
1856 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b 1842 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1
1857 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b 1843 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1
1858 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b 1844 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1
1859 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b 1845 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1
1846
1847 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0
1848 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0
1849
1860 .purgem do_load 1850 .purgem do_load
1861 .purgem do_store 1851 .purgem do_store
1852
1853
1854 /*****************************************************************************/
1855
1856 /*
1857 * jsimd_extrgb_ycc_convert_neon
1858 * jsimd_extbgr_ycc_convert_neon
1859 * jsimd_extrgbx_ycc_convert_neon
1860 * jsimd_extbgrx_ycc_convert_neon
1861 * jsimd_extxbgr_ycc_convert_neon
1862 * jsimd_extxrgb_ycc_convert_neon
1863 *
1864 * Colorspace conversion RGB -> YCbCr
1865 */
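
As a scalar reference for the Q16 arithmetic the macros below implement (illustrative C, not part of the patch; the constants correspond to the .short tables emitted further down, and the extra 32767 in the chroma bias compensates for the truncating shrn #16 shift):

    #include <stdint.h>

    static void rgb_to_ycc_pixel(uint8_t r, uint8_t g, uint8_t b,
                                 uint8_t *y, uint8_t *cb, uint8_t *cr)
    {
      uint32_t bias = (128u << 16) + 32767u;  /* 128 offset + rounding slack */
      *y  = (uint8_t)((19595u * r + 38470u * g + 7471u * b + 32768u) >> 16);
      *cb = (uint8_t)((bias - 11059u * r - 21709u * g + 32768u * b) >> 16);
      *cr = (uint8_t)((bias + 32768u * r - 27439u * g - 5329u * b) >> 16);
    }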
1866
1867 .macro do_store size
1868 .if \size == 8
1869 st1 {v20.8b}, [Y], #8
1870 st1 {v21.8b}, [U], #8
1871 st1 {v22.8b}, [V], #8
1872 .elseif \size == 4
1873 st1 {v20.b}[0], [Y], #1
1874 st1 {v20.b}[1], [Y], #1
1875 st1 {v20.b}[2], [Y], #1
1876 st1 {v20.b}[3], [Y], #1
1877 st1 {v21.b}[0], [U], #1
1878 st1 {v21.b}[1], [U], #1
1879 st1 {v21.b}[2], [U], #1
1880 st1 {v21.b}[3], [U], #1
1881 st1 {v22.b}[0], [V], #1
1882 st1 {v22.b}[1], [V], #1
1883 st1 {v22.b}[2], [V], #1
1884 st1 {v22.b}[3], [V], #1
1885 .elseif \size == 2
1886 st1 {v20.b}[4], [Y], #1
1887 st1 {v20.b}[5], [Y], #1
1888 st1 {v21.b}[4], [U], #1
1889 st1 {v21.b}[5], [U], #1
1890 st1 {v22.b}[4], [V], #1
1891 st1 {v22.b}[5], [V], #1
1892 .elseif \size == 1
1893 st1 {v20.b}[6], [Y], #1
1894 st1 {v21.b}[6], [U], #1
1895 st1 {v22.b}[6], [V], #1
1896 .else
1897 .error unsupported macroblock size
1898 .endif
1899 .endm
1900
1901 .macro do_load bpp, size, fast_ld3
1902 .if \bpp == 24
1903 .if \size == 8
1904 .if \fast_ld3 == 1
1905 ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24
1906 .else
1907 ld1 {v10.b}[0], [RGB], #1
1908 ld1 {v11.b}[0], [RGB], #1
1909 ld1 {v12.b}[0], [RGB], #1
1910
1911 ld1 {v10.b}[1], [RGB], #1
1912 ld1 {v11.b}[1], [RGB], #1
1913 ld1 {v12.b}[1], [RGB], #1
1914
1915 ld1 {v10.b}[2], [RGB], #1
1916 ld1 {v11.b}[2], [RGB], #1
1917 ld1 {v12.b}[2], [RGB], #1
1918
1919 ld1 {v10.b}[3], [RGB], #1
1920 ld1 {v11.b}[3], [RGB], #1
1921 ld1 {v12.b}[3], [RGB], #1
1922
1923 ld1 {v10.b}[4], [RGB], #1
1924 ld1 {v11.b}[4], [RGB], #1
1925 ld1 {v12.b}[4], [RGB], #1
1926
1927 ld1 {v10.b}[5], [RGB], #1
1928 ld1 {v11.b}[5], [RGB], #1
1929 ld1 {v12.b}[5], [RGB], #1
1930
1931 ld1 {v10.b}[6], [RGB], #1
1932 ld1 {v11.b}[6], [RGB], #1
1933 ld1 {v12.b}[6], [RGB], #1
1934
1935 ld1 {v10.b}[7], [RGB], #1
1936 ld1 {v11.b}[7], [RGB], #1
1937 ld1 {v12.b}[7], [RGB], #1
1938 .endif
1939 prfm pldl1keep, [RGB, #128]
1940 .elseif \size == 4
1941 ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3
1942 ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3
1943 ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3
1944 ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3
1945 .elseif \size == 2
1946 ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3
1947 ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3
1948 .elseif \size == 1
1949 ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3
1950 .else
1951 .error unsupported macroblock size
1952 .endif
1953 .elseif \bpp == 32
1954 .if \size == 8
1955 ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
1956 prfm pldl1keep, [RGB, #128]
1957 .elseif \size == 4
1958 ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
1959 ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
1960 ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
1961 ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
1962 .elseif \size == 2
1963 ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
1964 ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
1965 .elseif \size == 1
1966 ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
1967 .else
1968 .error unsupported macroblock size
1969 .endif
1970 .else
1971 .error unsupported bpp
1972 .endif
1973 .endm
1974
1975 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
1976 b_offs, fast_ld3
1977
1978 /*
1979 * 2-stage pipelined RGB->YCbCr conversion
1980 */
1981
1982 .macro do_rgb_to_yuv_stage1
1983 ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */
1984 ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */
1985 ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */
1986 rev64 v18.4s, v1.4s
1987 rev64 v26.4s, v1.4s
1988 rev64 v28.4s, v1.4s
1989 rev64 v30.4s, v1.4s
1990 umull v14.4s, v4.4h, v0.h[0]
1991 umull2 v16.4s, v4.8h, v0.h[0]
1992 umlsl v18.4s, v4.4h, v0.h[3]
1993 umlsl2 v26.4s, v4.8h, v0.h[3]
1994 umlal v28.4s, v4.4h, v0.h[5]
1995 umlal2 v30.4s, v4.8h, v0.h[5]
1996 umlal v14.4s, v6.4h, v0.h[1]
1997 umlal2 v16.4s, v6.8h, v0.h[1]
1998 umlsl v18.4s, v6.4h, v0.h[4]
1999 umlsl2 v26.4s, v6.8h, v0.h[4]
2000 umlsl v28.4s, v6.4h, v0.h[6]
2001 umlsl2 v30.4s, v6.8h, v0.h[6]
2002 umlal v14.4s, v8.4h, v0.h[2]
2003 umlal2 v16.4s, v8.8h, v0.h[2]
2004 umlal v18.4s, v8.4h, v0.h[5]
2005 umlal2 v26.4s, v8.8h, v0.h[5]
2006 umlsl v28.4s, v8.4h, v0.h[7]
2007 umlsl2 v30.4s, v8.8h, v0.h[7]
2008 .endm
2009
2010 .macro do_rgb_to_yuv_stage2
2011 rshrn v20.4h, v14.4s, #16
2012 shrn v22.4h, v18.4s, #16
2013 shrn v24.4h, v28.4s, #16
2014 rshrn2 v20.8h, v16.4s, #16
2015 shrn2 v22.8h, v26.4s, #16
2016 shrn2 v24.8h, v30.4s, #16
2017 xtn v20.8b, v20.8h /* v20 = y */
2018 xtn v21.8b, v22.8h /* v21 = u */
2019 xtn v22.8b, v24.8h /* v22 = v */
2020 .endm
2021
2022 .macro do_rgb_to_yuv
2023 do_rgb_to_yuv_stage1
2024 do_rgb_to_yuv_stage2
2025 .endm
2026
2027 /* TODO: expand macros and interleave instructions if some in-order
2028 * ARM64 processor actually can dual-issue LOAD/STORE with ALU */
2029 .macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
2030 do_rgb_to_yuv_stage2
2031 do_load \bpp, 8, \fast_ld3
2032 st1 {v20.8b}, [Y], #8
2033 st1 {v21.8b}, [U], #8
2034 st1 {v22.8b}, [V], #8
2035 do_rgb_to_yuv_stage1
2036 .endm
2037
2038 .balign 16
2039 .if \fast_ld3 == 1
2040 Ljsimd_\colorid\()_ycc_neon_consts:
2041 .else
2042 Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
2043 .endif
2044 .short 19595, 38470, 7471, 11059
2045 .short 21709, 32768, 27439, 5329
2046 .short 32767, 128, 32767, 128
2047 .short 32767, 128, 32767, 128
2048
2049 .if \fast_ld3 == 1
2050 asm_function jsimd_\colorid\()_ycc_convert_neon
2051 .else
2052 asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
2053 .endif
2054 OUTPUT_WIDTH .req w0
2055 INPUT_BUF .req x1
2056 OUTPUT_BUF .req x2
2057 OUTPUT_ROW .req x3
2058 NUM_ROWS .req x4
2059
2060 OUTPUT_BUF0 .req x5
2061 OUTPUT_BUF1 .req x6
2062 OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */
2063
2064 RGB .req x7
2065 Y .req x9
2066 U .req x10
2067 V .req x11
2068 N .req w12
2069
2070 /* Load constants to d0, d1, d2, d3 */
2071 .if \fast_ld3 == 1
2072 adr x13, Ljsimd_\colorid\()_ycc_neon_consts
2073 .else
2074 adr x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
2075 .endif
2076 ld1 {v0.8h, v1.8h}, [x13]
2077
2078 ldr OUTPUT_BUF0, [OUTPUT_BUF]
2079 ldr OUTPUT_BUF1, [OUTPUT_BUF, #8]
2080 ldr OUTPUT_BUF2, [OUTPUT_BUF, #16]
2081 .unreq OUTPUT_BUF
2082
2083 /* Save NEON registers */
2084 sub sp, sp, #64
2085 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
2086 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
2087
2088 /* Outer loop over scanlines */
2089 cmp NUM_ROWS, #1
2090 b.lt 9f
2091 0:
2092 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3]
2093 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3]
2094 mov N, OUTPUT_WIDTH
2095 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3]
2096 add OUTPUT_ROW, OUTPUT_ROW, #1
2097 ldr RGB, [INPUT_BUF], #8
2098
2099 /* Inner loop over pixels */
2100 subs N, N, #8
2101 b.lt 3f
2102 do_load \bpp, 8, \fast_ld3
2103 do_rgb_to_yuv_stage1
2104 subs N, N, #8
2105 b.lt 2f
2106 1:
2107 do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
2108 subs N, N, #8
2109 b.ge 1b
2110 2:
2111 do_rgb_to_yuv_stage2
2112 do_store 8
2113 tst N, #7
2114 b.eq 8f
2115 3:
2116 tbz N, #2, 3f
2117 do_load \bpp, 4, \fast_ld3
2118 3:
2119 tbz N, #1, 4f
2120 do_load \bpp, 2, \fast_ld3
2121 4:
2122 tbz N, #0, 5f
2123 do_load \bpp, 1, \fast_ld3
2124 5:
2125 do_rgb_to_yuv
2126 tbz N, #2, 6f
2127 do_store 4
2128 6:
2129 tbz N, #1, 7f
2130 do_store 2
2131 7:
2132 tbz N, #0, 8f
2133 do_store 1
2134 8:
2135 subs NUM_ROWS, NUM_ROWS, #1
2136 b.gt 0b
2137 9:
2138 /* Restore all registers and return */
2139 sub sp, sp, #64
2140 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
2141 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
2142 br x30
2143
2144 .unreq OUTPUT_WIDTH
2145 .unreq OUTPUT_ROW
2146 .unreq INPUT_BUF
2147 .unreq NUM_ROWS
2148 .unreq OUTPUT_BUF0
2149 .unreq OUTPUT_BUF1
2150 .unreq OUTPUT_BUF2
2151 .unreq RGB
2152 .unreq Y
2153 .unreq U
2154 .unreq V
2155 .unreq N
2156
2157 .purgem do_rgb_to_yuv
2158 .purgem do_rgb_to_yuv_stage1
2159 .purgem do_rgb_to_yuv_stage2
2160 .purgem do_rgb_to_yuv_stage2_store_load_stage1
2161
2162 .endm
2163
2164 /*--------------------------------- id ----- bpp R G B Fast LD3 */
2165 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1
2166 generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1
2167 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
2168 generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
2169 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
2170 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
2171
2172 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0
2173 generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0
2174
2175 .purgem do_load
2176 .purgem do_store
2177
2178
2179 /*****************************************************************************/
2180
2181 /*
2182 * Load data into workspace, applying unsigned->signed conversion
2183 *
2184 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
2185 * rid of VST1.16 instructions
2186 */
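
In scalar terms the routine below computes, for an 8x8 block (illustrative C, not part of the patch):

    #include <stdint.h>

    static void convsamp_ref(const uint8_t *rows[8], int start_col,
                             int16_t workspace[64])
    {
      for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)   /* usubl against the 128 splat in v0 */
          workspace[i * 8 + j] = (int16_t)rows[i][start_col + j] - 128;
    }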
2187
2188 asm_function jsimd_convsamp_neon
2189 SAMPLE_DATA .req x0
2190 START_COL .req x1
2191 WORKSPACE .req x2
2192 TMP1 .req x9
2193 TMP2 .req x10
2194 TMP3 .req x11
2195 TMP4 .req x12
2196 TMP5 .req x13
2197 TMP6 .req x14
2198 TMP7 .req x15
2199 TMP8 .req x4
2200 TMPDUP .req w3
2201
2202 mov TMPDUP, #128
2203 ldp TMP1, TMP2, [SAMPLE_DATA], 16
2204 ldp TMP3, TMP4, [SAMPLE_DATA], 16
2205 dup v0.8b, TMPDUP
2206 add TMP1, TMP1, START_COL
2207 add TMP2, TMP2, START_COL
2208 ldp TMP5, TMP6, [SAMPLE_DATA], 16
2209 add TMP3, TMP3, START_COL
2210 add TMP4, TMP4, START_COL
2211 ldp TMP7, TMP8, [SAMPLE_DATA], 16
2212 add TMP5, TMP5, START_COL
2213 add TMP6, TMP6, START_COL
2214 ld1 {v16.8b}, [TMP1]
2215 add TMP7, TMP7, START_COL
2216 add TMP8, TMP8, START_COL
2217 ld1 {v17.8b}, [TMP2]
2218 usubl v16.8h, v16.8b, v0.8b
2219 ld1 {v18.8b}, [TMP3]
2220 usubl v17.8h, v17.8b, v0.8b
2221 ld1 {v19.8b}, [TMP4]
2222 usubl v18.8h, v18.8b, v0.8b
2223 ld1 {v20.8b}, [TMP5]
2224 usubl v19.8h, v19.8b, v0.8b
2225 ld1 {v21.8b}, [TMP6]
2226 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
2227 usubl v20.8h, v20.8b, v0.8b
2228 ld1 {v22.8b}, [TMP7]
2229 usubl v21.8h, v21.8b, v0.8b
2230 ld1 {v23.8b}, [TMP8]
2231 usubl v22.8h, v22.8b, v0.8b
2232 usubl v23.8h, v23.8b, v0.8b
2233 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
2234
2235 br x30
2236
2237 .unreq SAMPLE_DATA
2238 .unreq START_COL
2239 .unreq WORKSPACE
2240 .unreq TMP1
2241 .unreq TMP2
2242 .unreq TMP3
2243 .unreq TMP4
2244 .unreq TMP5
2245 .unreq TMP6
2246 .unreq TMP7
2247 .unreq TMP8
2248 .unreq TMPDUP
2249
2250 /*****************************************************************************/
2251
2252 /*
2253 * jsimd_fdct_islow_neon
2254 *
2255 * This function contains a slow-but-accurate integer implementation of the
2256 * forward DCT (Discrete Cosine Transform). The following code is based
2257 * directly on the IJG's original jfdctint.c; see jfdctint.c for
2258 * more details.
2259 *
2260 * TODO: can be combined with 'jsimd_convsamp_neon' to get
2261 * rid of a bunch of VLD1.16 instructions
2262 */
2263
2264 #define CONST_BITS 13
2265 #define PASS1_BITS 2
2266
2267 #define DESCALE_P1 (CONST_BITS-PASS1_BITS)
2268 #define DESCALE_P2 (CONST_BITS+PASS1_BITS)
2269
2270 #define F_0_298 2446 /* FIX(0.298631336) */
2271 #define F_0_390 3196 /* FIX(0.390180644) */
2272 #define F_0_541 4433 /* FIX(0.541196100) */
2273 #define F_0_765 6270 /* FIX(0.765366865) */
2274 #define F_0_899 7373 /* FIX(0.899976223) */
2275 #define F_1_175 9633 /* FIX(1.175875602) */
2276 #define F_1_501 12299 /* FIX(1.501321110) */
2277 #define F_1_847 15137 /* FIX(1.847759065) */
2278 #define F_1_961 16069 /* FIX(1.961570560) */
2279 #define F_2_053 16819 /* FIX(2.053119869) */
2280 #define F_2_562 20995 /* FIX(2.562915447) */
2281 #define F_3_072 25172 /* FIX(3.072711026) */
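
These mirror jfdctint.c's FIX(x) = round(x * 2^CONST_BITS) with CONST_BITS = 13: for example 0.541196100 * 8192 = 4433.48, which rounds to the 4433 above, and 3.072711026 * 8192 = 25171.65 rounds to 25172.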
2282
2283 .balign 16
2284 Ljsimd_fdct_islow_neon_consts:
2285 .short F_0_298
2286 .short -F_0_390
2287 .short F_0_541
2288 .short F_0_765
2289 .short -F_0_899
2290 .short F_1_175
2291 .short F_1_501
2292 .short -F_1_847
2293 .short -F_1_961
2294 .short F_2_053
2295 .short -F_2_562
2296 .short F_3_072
2297 .short 0 /* padding */
2298 .short 0
2299 .short 0
2300 .short 0
2301
2302 #undef F_0_298
2303 #undef F_0_390
2304 #undef F_0_541
2305 #undef F_0_765
2306 #undef F_0_899
2307 #undef F_1_175
2308 #undef F_1_501
2309 #undef F_1_847
2310 #undef F_1_961
2311 #undef F_2_053
2312 #undef F_2_562
2313 #undef F_3_072
2314 #define XFIX_P_0_298 v0.h[0]
2315 #define XFIX_N_0_390 v0.h[1]
2316 #define XFIX_P_0_541 v0.h[2]
2317 #define XFIX_P_0_765 v0.h[3]
2318 #define XFIX_N_0_899 v0.h[4]
2319 #define XFIX_P_1_175 v0.h[5]
2320 #define XFIX_P_1_501 v0.h[6]
2321 #define XFIX_N_1_847 v0.h[7]
2322 #define XFIX_N_1_961 v1.h[0]
2323 #define XFIX_P_2_053 v1.h[1]
2324 #define XFIX_N_2_562 v1.h[2]
2325 #define XFIX_P_3_072 v1.h[3]
2326
2327 asm_function jsimd_fdct_islow_neon
2328
2329 DATA .req x0
2330 TMP .req x9
2331
2332 /* Load constants */
2333 adr TMP, Ljsimd_fdct_islow_neon_consts
2334 ld1 {v0.8h, v1.8h}, [TMP]
2335
2336 /* Save NEON registers */
2337 sub sp, sp, #64
2338 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
2339 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
2340
2341 /* Load all DATA into NEON registers with the following allocation:
2342 * 0 1 2 3 | 4 5 6 7
2343 * ---------+--------
2344 * 0 | d16 | d17 | v16.8h
2345 * 1 | d18 | d19 | v17.8h
2346 * 2 | d20 | d21 | v18.8h
2347 * 3 | d22 | d23 | v19.8h
2348 * 4 | d24 | d25 | v20.8h
2349 * 5 | d26 | d27 | v21.8h
2350 * 6 | d28 | d29 | v22.8h
2351 * 7 | d30 | d31 | v23.8h
2352 */
2353
2354 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2355 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2356 sub DATA, DATA, #64
2357
2358 /* Transpose */
2359 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
2360 /* 1-D FDCT */
2361 add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */
2362 sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */
2363 add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */
2364 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */
2365 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */
2366 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */
2367 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */
2368 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */
2369
2370 /* even part */
2371
2372 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */
2373 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */
2374 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */
2375 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */
2376
2377 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */
2378 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */
2379
2380 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
2381
2382 shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
2383 shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
2384
2385 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2386 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2387 mov v22.16b, v18.16b
2388 mov v25.16b, v24.16b
2389
2390 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2391 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2392 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2393 smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2394
2395 rshrn v18.4h, v18.4s, #DESCALE_P1
2396 rshrn v22.4h, v22.4s, #DESCALE_P1
2397 rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
2398 rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
2399
2400 /* Odd part */
2401
2402 add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
2403 add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
2404 add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
2405 add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
2406 smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
2407 smull2 v5.4s, v10.8h, XFIX_P_1_175
2408 smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
2409 smlal2 v5.4s, v11.8h, XFIX_P_1_175
2410
2411 smull2 v24.4s, v28.8h, XFIX_P_0_298
2412 smull2 v25.4s, v29.8h, XFIX_P_2_053
2413 smull2 v26.4s, v30.8h, XFIX_P_3_072
2414 smull2 v27.4s, v31.8h, XFIX_P_1_501
2415 smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
2416 smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
2417 smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
2418 smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
2419
2420 smull2 v12.4s, v8.8h, XFIX_N_0_899
2421 smull2 v13.4s, v9.8h, XFIX_N_2_562
2422 smull2 v14.4s, v10.8h, XFIX_N_1_961
2423 smull2 v15.4s, v11.8h, XFIX_N_0_390
2424 smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
2425 smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
2426 smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
2427 smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
2428
2429 add v10.4s, v10.4s, v4.4s /* z3 += z5 */
2430 add v14.4s, v14.4s, v5.4s
2431 add v11.4s, v11.4s, v4.4s /* z4 += z5 */
2432 add v15.4s, v15.4s, v5.4s
2433
2434 add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */
2435 add v24.4s, v24.4s, v12.4s
2436 add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */
2437 add v25.4s, v25.4s, v13.4s
2438 add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */
2439 add v26.4s, v26.4s, v14.4s
2440 add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */
2441 add v27.4s, v27.4s, v15.4s
2442
2443 add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */
2444 add v24.4s, v24.4s, v14.4s
2445 add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */
2446 add v25.4s, v25.4s, v15.4s
2447 add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */
2448 add v26.4s, v26.4s, v13.4s
2449 add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */
2450 add v27.4s, v27.4s, v12.4s
2451
2452 rshrn v23.4h, v28.4s, #DESCALE_P1
2453 rshrn v21.4h, v29.4s, #DESCALE_P1
2454 rshrn v19.4h, v30.4s, #DESCALE_P1
2455 rshrn v17.4h, v31.4s, #DESCALE_P1
2456 rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
2457 rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
2458 rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
2459 rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
2460
2461 /* Transpose */
2462 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
2463
2464 /* 1-D FDCT */
2465 add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */
2466 sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */
2467 add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */
2468 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */
2469 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */
2470 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */
2471 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */
2472 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */
2473
2474 /* even part */
2475 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */
2476 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */
2477 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */
2478 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */
2479
2480 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */
2481 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */
2482
2483 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
2484
2485 srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
2486 srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
2487
2488 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2489 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2490 mov v22.16b, v18.16b
2491 mov v25.16b, v24.16b
2492
2493 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2494 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2495 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2496 smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2497
2498 rshrn v18.4h, v18.4s, #DESCALE_P2
2499 rshrn v22.4h, v22.4s, #DESCALE_P2
2500 rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
2501 rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */
2502
2503 /* Odd part */
2504 add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
2505 add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
2506 add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
2507 add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
2508
2509 smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
2510 smull2 v5.4s, v10.8h, XFIX_P_1_175
2511 smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
2512 smlal2 v5.4s, v11.8h, XFIX_P_1_175
2513
2514 smull2 v24.4s, v28.8h, XFIX_P_0_298
2515 smull2 v25.4s, v29.8h, XFIX_P_2_053
2516 smull2 v26.4s, v30.8h, XFIX_P_3_072
2517 smull2 v27.4s, v31.8h, XFIX_P_1_501
2518 smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
2519 smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
2520 smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
2521 smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
2522
2523 smull2 v12.4s, v8.8h, XFIX_N_0_899
2524 smull2 v13.4s, v9.8h, XFIX_N_2_562
2525 smull2 v14.4s, v10.8h, XFIX_N_1_961
2526 smull2 v15.4s, v11.8h, XFIX_N_0_390
2527 smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
2528 smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
2529 smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
2530 smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
2531
2532 add v10.4s, v10.4s, v4.4s
2533 add v14.4s, v14.4s, v5.4s
2534 add v11.4s, v11.4s, v4.4s
2535 add v15.4s, v15.4s, v5.4s
2536
2537 add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */
2538 add v24.4s, v24.4s, v12.4s
2539 add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */
2540 add v25.4s, v25.4s, v13.4s
2541 add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */
2542 add v26.4s, v26.4s, v14.4s
2543 add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */
2544 add v27.4s, v27.4s, v15.4s
2545
2546 add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */
2547 add v24.4s, v24.4s, v14.4s
2548 add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */
2549 add v25.4s, v25.4s, v15.4s
2550 add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */
2551 add v26.4s, v26.4s, v13.4s
2552 add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */
2553 add v27.4s, v27.4s, v12.4s
2554
2555 rshrn v23.4h, v28.4s, #DESCALE_P2
2556 rshrn v21.4h, v29.4s, #DESCALE_P2
2557 rshrn v19.4h, v30.4s, #DESCALE_P2
2558 rshrn v17.4h, v31.4s, #DESCALE_P2
2559 rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
2560 rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
2561 rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
2562 rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */
2563
2564 /* store results */
2565 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2566 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2567
2568 /* Restore NEON registers */
2569 sub sp, sp, #64
2570 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
2571 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
2572
2573 br x30
2574
2575 .unreq DATA
2576 .unreq TMP
2577
2578 #undef XFIX_P_0_298
2579 #undef XFIX_N_0_390
2580 #undef XFIX_P_0_541
2581 #undef XFIX_P_0_765
2582 #undef XFIX_N_0_899
2583 #undef XFIX_P_1_175
2584 #undef XFIX_P_1_501
2585 #undef XFIX_N_1_847
2586 #undef XFIX_N_1_961
2587 #undef XFIX_P_2_053
2588 #undef XFIX_N_2_562
2589 #undef XFIX_P_3_072
2590
2591
2592 /*****************************************************************************/
2593
2594 /*
2595 * jsimd_fdct_ifast_neon
2596 *
2597 * This function contains a fast but less accurate integer implementation of
2598 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
2599 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
2600 * function from jfdctfst.c
2601 *
2602 * TODO: can be combined with 'jsimd_convsamp_neon' to get
2603 * rid of a bunch of VLD1.16 instructions
2604 */
2605
2606 #undef XFIX_0_541196100
2607 #define XFIX_0_382683433 v0.h[0]
2608 #define XFIX_0_541196100 v0.h[1]
2609 #define XFIX_0_707106781 v0.h[2]
2610 #define XFIX_1_306562965 v0.h[3]
2611
2612 .balign 16
2613 Ljsimd_fdct_ifast_neon_consts:
2614 .short (98 * 128) /* XFIX_0_382683433 */
2615 .short (139 * 128) /* XFIX_0_541196100 */
2616 .short (181 * 128) /* XFIX_0_707106781 */
2617 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
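
A note on the scaling: sqdmulh returns the high half of the doubled product, i.e. (2 * a * b) >> 16, so a constant c multiplies by c / 32768. Thus 181 * 128 = 23168 scales by 181/256 ~= 0.70711, the same 8-bit approximations jfdctfst.c uses, and XFIX_1_306562965 stores only the fractional excess (334 - 256) * 128 because the integral 1.0 part is applied with a separate add in the loop below.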
2618
2619 asm_function jsimd_fdct_ifast_neon
2620
2621 DATA .req x0
2622 TMP .req x9
2623
2624 /* Load constants */
2625 adr TMP, Ljsimd_fdct_ifast_neon_consts
2626 ld1 {v0.4h}, [TMP]
2627
2628 /* Load all DATA into NEON registers with the following allocation:
2629 * 0 1 2 3 | 4 5 6 7
2630 * ---------+--------
2631 * 0 | d16 | d17 | v16.8h
2632 * 1 | d18 | d19 | v17.8h
2633 * 2 | d20 | d21 | v18.8h
2634 * 3 | d22 | d23 | v19.8h
2635 * 4 | d24 | d25 | v20.8h
2636 * 5 | d26 | d27 | v21.8h
2637 * 6 | d28 | d29 | v22.8h
2638 * 7 | d30 | d31 | v23.8h
2639 */
2640
2641 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2642 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2643 mov TMP, #2
2644 sub DATA, DATA, #64
2645 1:
2646 /* Transpose */
2647 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
2648 subs TMP, TMP, #1
2649 /* 1-D FDCT */
2650 add v4.8h, v19.8h, v20.8h
2651 sub v20.8h, v19.8h, v20.8h
2652 sub v28.8h, v18.8h, v21.8h
2653 add v18.8h, v18.8h, v21.8h
2654 sub v29.8h, v17.8h, v22.8h
2655 add v17.8h, v17.8h, v22.8h
2656 sub v21.8h, v16.8h, v23.8h
2657 add v16.8h, v16.8h, v23.8h
2658 sub v6.8h, v17.8h, v18.8h
2659 sub v7.8h, v16.8h, v4.8h
2660 add v5.8h, v17.8h, v18.8h
2661 add v6.8h, v6.8h, v7.8h
2662 add v4.8h, v16.8h, v4.8h
2663 sqdmulh v6.8h, v6.8h, XFIX_0_707106781
2664 add v19.8h, v20.8h, v28.8h
2665 add v16.8h, v4.8h, v5.8h
2666 sub v20.8h, v4.8h, v5.8h
2667 add v5.8h, v28.8h, v29.8h
2668 add v29.8h, v29.8h, v21.8h
2669 sqdmulh v5.8h, v5.8h, XFIX_0_707106781
2670 sub v28.8h, v19.8h, v29.8h
2671 add v18.8h, v7.8h, v6.8h
2672 sqdmulh v28.8h, v28.8h, XFIX_0_382683433
2673 sub v22.8h, v7.8h, v6.8h
2674 sqdmulh v19.8h, v19.8h, XFIX_0_541196100
2675 sqdmulh v7.8h, v29.8h, XFIX_1_306562965
2676 add v6.8h, v21.8h, v5.8h
2677 sub v5.8h, v21.8h, v5.8h
2678 add v29.8h, v29.8h, v28.8h
2679 add v19.8h, v19.8h, v28.8h
2680 add v29.8h, v29.8h, v7.8h
2681 add v21.8h, v5.8h, v19.8h
2682 sub v19.8h, v5.8h, v19.8h
2683 add v17.8h, v6.8h, v29.8h
2684 sub v23.8h, v6.8h, v29.8h
2685
2686 b.ne 1b
2687
2688 /* store results */
2689 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2690 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2691
2692 br x30
2693
2694 .unreq DATA
2695 .unreq TMP
2696 #undef XFIX_0_382683433
2697 #undef XFIX_0_541196100
2698 #undef XFIX_0_707106781
2699 #undef XFIX_1_306562965
2700
2701
2702 /*****************************************************************************/
2703
2704 /*
2705 * GLOBAL(void)
2706 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
2707 * DCTELEM *workspace);
2708 *
2709 */
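
Per coefficient this implements the usual libjpeg-turbo SIMD division-by-multiplication; a scalar model (illustrative C, not part of the patch; the table offsets mirror the CORRECTION = DIVISORS + 64*2 and SHIFT = DIVISORS + 64*6 byte offsets set up below):

    #include <stdint.h>

    static int16_t quantize_one(int16_t ws, uint16_t recip, uint16_t corr,
                                uint16_t shift)
    {
      int16_t sign = ws >> 15;                        /* sshr #15: 0 or -1 */
      uint16_t mag = (uint16_t)((ws ^ sign) - sign);  /* abs */
      uint16_t q = (uint16_t)((((uint32_t)(mag + corr) * recip) >> 16) >> shift);
      return (int16_t)((q ^ sign) - sign);            /* eor + sub restores sign */
    }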
2710 asm_function jsimd_quantize_neon
2711
2712 COEF_BLOCK .req x0
2713 DIVISORS .req x1
2714 WORKSPACE .req x2
2715
2716 RECIPROCAL .req DIVISORS
2717 CORRECTION .req x9
2718 SHIFT .req x10
2719 LOOP_COUNT .req x11
2720
2721 mov LOOP_COUNT, #2
2722 add CORRECTION, DIVISORS, #(64 * 2)
2723 add SHIFT, DIVISORS, #(64 * 6)
2724 1:
2725 subs LOOP_COUNT, LOOP_COUNT, #1
2726 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
2727 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
2728 abs v20.8h, v0.8h
2729 abs v21.8h, v1.8h
2730 abs v22.8h, v2.8h
2731 abs v23.8h, v3.8h
2732 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
2733 add v20.8h, v20.8h, v4.8h /* add correction */
2734 add v21.8h, v21.8h, v5.8h
2735 add v22.8h, v22.8h, v6.8h
2736 add v23.8h, v23.8h, v7.8h
2737 umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */
2738 umull2 v16.4s, v20.8h, v28.8h
2739 umull v5.4s, v21.4h, v29.4h
2740 umull2 v17.4s, v21.8h, v29.8h
2741 umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */
2742 umull2 v18.4s, v22.8h, v30.8h
2743 umull v7.4s, v23.4h, v31.4h
2744 umull2 v19.4s, v23.8h, v31.8h
2745 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
2746 shrn v4.4h, v4.4s, #16
2747 shrn v5.4h, v5.4s, #16
2748 shrn v6.4h, v6.4s, #16
2749 shrn v7.4h, v7.4s, #16
2750 shrn2 v4.8h, v16.4s, #16
2751 shrn2 v5.8h, v17.4s, #16
2752 shrn2 v6.8h, v18.4s, #16
2753 shrn2 v7.8h, v19.4s, #16
2754 neg v24.8h, v24.8h
2755 neg v25.8h, v25.8h
2756 neg v26.8h, v26.8h
2757 neg v27.8h, v27.8h
2758 sshr v0.8h, v0.8h, #15 /* extract sign */
2759 sshr v1.8h, v1.8h, #15
2760 sshr v2.8h, v2.8h, #15
2761 sshr v3.8h, v3.8h, #15
2762 ushl v4.8h, v4.8h, v24.8h /* shift */
2763 ushl v5.8h, v5.8h, v25.8h
2764 ushl v6.8h, v6.8h, v26.8h
2765 ushl v7.8h, v7.8h, v27.8h
2766
2767 eor v4.16b, v4.16b, v0.16b /* restore sign */
2768 eor v5.16b, v5.16b, v1.16b
2769 eor v6.16b, v6.16b, v2.16b
2770 eor v7.16b, v7.16b, v3.16b
2771 sub v4.8h, v4.8h, v0.8h
2772 sub v5.8h, v5.8h, v1.8h
2773 sub v6.8h, v6.8h, v2.8h
2774 sub v7.8h, v7.8h, v3.8h
2775 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
2776
2777 b.ne 1b
2778
2779 br x30 /* return */
2780
2781 .unreq COEF_BLOCK
2782 .unreq DIVISORS
2783 .unreq WORKSPACE
2784 .unreq RECIPROCAL
2785 .unreq CORRECTION
2786 .unreq SHIFT
2787 .unreq LOOP_COUNT
2788
2789
2790 /*****************************************************************************/
2791
2792 /*
2793 * Downsample pixel values of a single component.
2794 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
2795 * without smoothing.
2796 *
2797 * GLOBAL(void)
2798 * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
2799 * JDIMENSION v_samp_factor,
2800 * JDIMENSION width_blocks, JSAMPARRAY input_data,
2801 * JSAMPARRAY output_data);
2802 */
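
In scalar terms (illustrative C, not part of the patch) each output byte averages one horizontal pair with libjpeg's alternating 0/1 bias -- the 0x10000 splat below -- while the tbl lookup against the table above replicates the last valid column to pad the final 16-byte group:

    #include <stdint.h>

    static void h2v1_ref(const uint8_t *in, uint8_t *out, int out_cols)
    {
      for (int j = 0; j < out_cols; j++)   /* out_cols = 8 * width_blocks */
        out[j] = (uint8_t)((in[2 * j] + in[2 * j + 1] + (j & 1)) >> 1);
    }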
2803
2804 .balign 16
2805 Ljsimd_h2_downsample_neon_consts:
2806 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2807 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */
2808 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2809 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */
2810 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2811 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */
2812 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2813 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */
2814 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2815 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */
2816 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2817 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */
2818 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2819 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */
2820 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2821 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */
2822 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2823 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */
2824 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
2825 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */
2826 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
2827 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */
2828 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
2829 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */
2830 .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
2831 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */
2832 .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
2833 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */
2834 .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
2835 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */
2836 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
2837 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */
2838
2839 asm_function jsimd_h2v1_downsample_neon
2840 IMAGE_WIDTH .req x0
2841 MAX_V_SAMP .req x1
2842 V_SAMP .req x2
2843 BLOCK_WIDTH .req x3
2844 INPUT_DATA .req x4
2845 OUTPUT_DATA .req x5
2846 OUTPTR .req x9
2847 INPTR .req x10
2848 TMP1 .req x11
2849 TMP2 .req x12
2850 TMP3 .req x13
2851 TMPDUP .req w15
2852
2853 mov TMPDUP, #0x10000
2854 lsl TMP2, BLOCK_WIDTH, #4
2855 sub TMP2, TMP2, IMAGE_WIDTH
2856 adr TMP3, Ljsimd_h2_downsample_neon_consts
2857 add TMP3, TMP3, TMP2, lsl #4
2858 dup v16.4s, TMPDUP
2859 ld1 {v18.16b}, [TMP3]
2860
2861 1: /* row loop */
2862 ldr INPTR, [INPUT_DATA], #8
2863 ldr OUTPTR, [OUTPUT_DATA], #8
2864 subs TMP1, BLOCK_WIDTH, #1
2865 b.eq 3f
2866 2: /* columns */
2867 ld1 {v0.16b}, [INPTR], #16
2868 mov v4.16b, v16.16b
2869 subs TMP1, TMP1, #1
2870 uadalp v4.8h, v0.16b
2871 shrn v6.8b, v4.8h, #1
2872 st1 {v6.8b}, [OUTPTR], #8
2873 b.ne 2b
2874 3: /* last columns */
2875 ld1 {v0.16b}, [INPTR]
2876 mov v4.16b, v16.16b
2877 subs V_SAMP, V_SAMP, #1
2878 /* expand right */
2879 tbl v2.16b, {v0.16b}, v18.16b
2880 uadalp v4.8h, v2.16b
2881 shrn v6.8b, v4.8h, #1
2882 st1 {v6.8b}, [OUTPTR], #8
2883 b.ne 1b
2884
2885 br x30
2886
2887 .unreq IMAGE_WIDTH
2888 .unreq MAX_V_SAMP
2889 .unreq V_SAMP
2890 .unreq BLOCK_WIDTH
2891 .unreq INPUT_DATA
2892 .unreq OUTPUT_DATA
2893 .unreq OUTPTR
2894 .unreq INPTR
2895 .unreq TMP1
2896 .unreq TMP2
2897 .unreq TMP3
2898 .unreq TMPDUP
2899
2900
2901 /*****************************************************************************/
2902
2903 /*
2904 * Downsample pixel values of a single component.
2905 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
2906 * without smoothing.
2907 *
2908 * GLOBAL(void)
2909 * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
2910 * JDIMENSION v_samp_factor, JDIMENSION width_blocks,
2911 * JSAMPARRAY input_data, JSAMPARRAY output_data);
2912 */
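
The 2x2 case averages four samples with the alternating 1/2 bias encoded by the (1 << 17) | 1 constant below (illustrative C, not part of the patch):

    #include <stdint.h>

    static void h2v2_ref(const uint8_t *in0, const uint8_t *in1,
                         uint8_t *out, int out_cols)
    {
      for (int j = 0; j < out_cols; j++)
        out[j] = (uint8_t)((in0[2 * j] + in0[2 * j + 1] +
                            in1[2 * j] + in1[2 * j + 1] + 1 + (j & 1)) >> 2);
    }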
2913
2914 .balign 16
2915 asm_function jsimd_h2v2_downsample_neon
2916 IMAGE_WIDTH .req x0
2917 MAX_V_SAMP .req x1
2918 V_SAMP .req x2
2919 BLOCK_WIDTH .req x3
2920 INPUT_DATA .req x4
2921 OUTPUT_DATA .req x5
2922 OUTPTR .req x9
2923 INPTR0 .req x10
2924 INPTR1 .req x14
2925 TMP1 .req x11
2926 TMP2 .req x12
2927 TMP3 .req x13
2928 TMPDUP .req w15
2929
2930 mov TMPDUP, #1
2931 lsl TMP2, BLOCK_WIDTH, #4
2932 lsl TMPDUP, TMPDUP, #17
2933 sub TMP2, TMP2, IMAGE_WIDTH
2934 adr TMP3, Ljsimd_h2_downsample_neon_consts
2935 orr TMPDUP, TMPDUP, #1
2936 add TMP3, TMP3, TMP2, lsl #4
2937 dup v16.4s, TMPDUP
2938 ld1 {v18.16b}, [TMP3]
2939
2940 1: /* row loop */
2941 ldr INPTR0, [INPUT_DATA], #8
2942 ldr OUTPTR, [OUTPUT_DATA], #8
2943 ldr INPTR1, [INPUT_DATA], #8
2944 subs TMP1, BLOCK_WIDTH, #1
2945 b.eq 3f
2946 2: /* columns */
2947 ld1 {v0.16b}, [INPTR0], #16
2948 ld1 {v1.16b}, [INPTR1], #16
2949 mov v4.16b, v16.16b
2950 subs TMP1, TMP1, #1
2951 uadalp v4.8h, v0.16b
2952 uadalp v4.8h, v1.16b
2953 shrn v6.8b, v4.8h, #2
2954 st1 {v6.8b}, [OUTPTR], #8
2955 b.ne 2b
2956 3: /* last columns */
2957 ld1 {v0.16b}, [INPTR0], #16
2958 ld1 {v1.16b}, [INPTR1], #16
2959 mov v4.16b, v16.16b
2960 subs V_SAMP, V_SAMP, #1
2961 /* expand right */
2962 tbl v2.16b, {v0.16b}, v18.16b
2963 tbl v3.16b, {v1.16b}, v18.16b
2964 uadalp v4.8h, v2.16b
2965 uadalp v4.8h, v3.16b
2966 shrn v6.8b, v4.8h, #2
2967 st1 {v6.8b}, [OUTPTR], #8
2968 b.ne 1b
2969
2970 br x30
2971
2972 .unreq IMAGE_WIDTH
2973 .unreq MAX_V_SAMP
2974 .unreq V_SAMP
2975 .unreq BLOCK_WIDTH
2976 .unreq INPUT_DATA
2977 .unreq OUTPUT_DATA
2978 .unreq OUTPTR
2979 .unreq INPTR0
2980 .unreq INPTR1
2981 .unreq TMP1
2982 .unreq TMP2
2983 .unreq TMP3
2984 .unreq TMPDUP
2985
2986
2987 /*****************************************************************************/
2988
2989 /*
2990 * GLOBAL(JOCTET*)
2991 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
2992 * JCOEFPTR block, int last_dc_val,
2993 * c_derived_tbl *dctbl, c_derived_tbl *actbl)
2994 *
2995 */
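
The macros that follow keep Huffman codes in a 64-bit accumulator and flush it a byte at a time with JPEG byte stuffing (a 0x00 emitted after every 0xFF). A simplified scalar model (illustrative names, not part of the patch; the real code flushes four or six bytes at once via checkbuf31/checkbuf47):

    #include <stdint.h>

    typedef struct {
      uint64_t put_buffer;  /* bit accumulator (PUT_BUFFER) */
      int put_bits;         /* valid bits held (PUT_BITS) */
      uint8_t *next;        /* output pointer (BUFFER) */
    } bitbuf;

    static void put_bits_ref(bitbuf *b, uint32_t code, int size)
    {
      b->put_buffer = (b->put_buffer << size) | code;
      b->put_bits += size;
      while (b->put_bits >= 32) {    /* simplified flush threshold */
        b->put_bits -= 8;
        uint8_t byte = (uint8_t)(b->put_buffer >> b->put_bits);
        *b->next++ = byte;
        if (byte == 0xFF)
          *b->next++ = 0x00;         /* JPEG byte stuffing */
      }
    }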
2996
2997 BUFFER .req x1
2998 PUT_BUFFER .req x6
2999 PUT_BITS .req x7
3000 PUT_BITSw .req w7
3001
3002 .macro emit_byte
3003 sub PUT_BITS, PUT_BITS, #0x8
3004 lsr x19, PUT_BUFFER, PUT_BITS
3005 uxtb w19, w19
3006 strb w19, [BUFFER, #1]!
3007 cmp w19, #0xff
3008 b.ne 14f
3009 strb wzr, [BUFFER, #1]!
3010 14:
3011 .endm
3012 .macro put_bits CODE, SIZE
3013 lsl PUT_BUFFER, PUT_BUFFER, \SIZE
3014 add PUT_BITS, PUT_BITS, \SIZE
3015 orr PUT_BUFFER, PUT_BUFFER, \CODE
3016 .endm
3017 .macro checkbuf31
3018 cmp PUT_BITS, #0x20
3019 b.lt 31f
3020 emit_byte
3021 emit_byte
3022 emit_byte
3023 emit_byte
3024 31:
3025 .endm
3026 .macro checkbuf47
3027 cmp PUT_BITS, #0x30
3028 b.lt 47f
3029 emit_byte
3030 emit_byte
3031 emit_byte
3032 emit_byte
3033 emit_byte
3034 emit_byte
3035 47:
3036 .endm
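/*
 * Together these macros implement the usual JPEG entropy-output bit buffer:
 * put_bits accumulates Huffman codes MSB-first in PUT_BUFFER, and
 * checkbuf31/checkbuf47 flush 4 or 6 whole bytes once 32 or 48 bits are
 * pending, stuffing a 0x00 after every 0xFF so the byte stream cannot be
 * mistaken for a marker. An illustrative C sketch of the same logic (the
 * names are made up, not this file's API):
 *
 *   put_buffer = (put_buffer << size) | code;
 *   put_bits += size;
 *   if (put_bits >= 32) {                     // 48 for checkbuf47
 *     for (i = 0; i < 4; i++) {               // 6 bytes for checkbuf47
 *       put_bits -= 8;
 *       c = (put_buffer >> put_bits) & 0xFF;  // next byte, MSB first
 *       *++buffer = c;
 *       if (c == 0xFF)
 *         *++buffer = 0x00;                   // JPEG byte stuffing
 *     }
 *   }
 */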
3037
3038 .macro generate_jsimd_huff_encode_one_block fast_tbl
3039
3040 .balign 16
3041 .if \fast_tbl == 1
3042 Ljsimd_huff_encode_one_block_neon_consts:
3043 .else
3044 Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
3045 .endif
3046 .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
3047 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
3048 .if \fast_tbl == 1
3049 .byte 0, 1, 2, 3, 16, 17, 32, 33, \
3050 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
3051 .byte 34, 35, 48, 49, 255, 255, 50, 51, \
3052 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
3053 .byte 8, 9, 22, 23, 36, 37, 50, 51, \
3054 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
3055 .byte 54, 55, 40, 41, 26, 27, 12, 13, \
3056 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
3057 .byte 6, 7, 20, 21, 34, 35, 48, 49, \
3058 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
3059 .byte 42, 43, 28, 29, 14, 15, 30, 31, \
3060 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
3061 .byte 255, 255, 255, 255, 56, 57, 42, 43, \
3062 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
3063 .byte 26, 27, 40, 41, 42, 43, 28, 29, \
3064 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
3065 .byte 255, 255, 255, 255, 0, 1, 255, 255, \
3066 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 line OK */
3067 .byte 255, 255, 255, 255, 255, 255, 255, 255, \
3068 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
3069 .byte 255, 255, 255, 255, 255, 255, 255, 255, \
3070 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
3071 .byte 4, 5, 6, 7, 255, 255, 255, 255, \
3072 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
3073 .endif
3074
3075 .if \fast_tbl == 1
3076 asm_function jsimd_huff_encode_one_block_neon
3077 .else
3078 asm_function jsimd_huff_encode_one_block_neon_slowtbl
3079 .endif
3080 sub sp, sp, 272
3081 sub BUFFER, BUFFER, #0x1 /* BUFFER = buffer - 1; emit_byte stores with pre-increment */
3082 /* Save ARM registers */
3083 stp x19, x20, [sp], 16
3084 .if \fast_tbl == 1
3085 adr x15, Ljsimd_huff_encode_one_block_neon_consts
3086 .else
3087 adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
3088 .endif
3089 ldr PUT_BUFFER, [x0, #0x10]
3090 ldr PUT_BITSw, [x0, #0x18]
3091 ldrsh w12, [x2] /* load DC coeff in w12 */
3092 /* prepare data */
3093 .if \fast_tbl == 1
3094 ld1 {v23.16b}, [x15], #16
3095 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
3096 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
3097 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
3098 ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
3099 ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
3100 sub w12, w12, w3 /* w12 = DC - last_dc_val; w3 not needed afterwards */
3101 /* ZigZag 8x8 */
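 /* Each tbl gathers 8 coefficients in zigzag order from a window of up
    to four source registers; index 255 yields 0, and the tbx fix-ups
    below repair the lanes whose sources fell outside that window. */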
3102 tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
3103 tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
3104 tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
3105 tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
3106 tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
3107 tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
3108 tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
3109 tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
3110 ins v0.h[0], w12
3111 tbx v1.16b, {v28.16b}, v16.16b
3112 tbx v2.16b, {v29.16b, v30.16b}, v17.16b
3113 tbx v5.16b, {v29.16b, v30.16b}, v18.16b
3114 tbx v6.16b, {v31.16b}, v19.16b
3115 .else
3116 add x13, x2, #0x22
3117 sub w12, w12, w3 /* w12 = DC - last_dc_val; w3 not needed afterwards */
3118 ld1 {v23.16b}, [x15]
3119 add x14, x2, #0x18
3120 add x3, x2, #0x36
3121 ins v0.h[0], w12
3122 add x9, x2, #0x2
3123 ld1 {v1.h}[0], [x13]
3124 add x15, x2, #0x30
3125 ld1 {v2.h}[0], [x14]
3126 add x19, x2, #0x26
3127 ld1 {v3.h}[0], [x3]
3128 add x20, x2, #0x28
3129 ld1 {v0.h}[1], [x9]
3130 add x12, x2, #0x10
3131 ld1 {v1.h}[1], [x15]
3132 add x13, x2, #0x40
3133 ld1 {v2.h}[1], [x19]
3134 add x14, x2, #0x34
3135 ld1 {v3.h}[1], [x20]
3136 add x3, x2, #0x1a
3137 ld1 {v0.h}[2], [x12]
3138 add x9, x2, #0x20
3139 ld1 {v1.h}[2], [x13]
3140 add x15, x2, #0x32
3141 ld1 {v2.h}[2], [x14]
3142 add x19, x2, #0x42
3143 ld1 {v3.h}[2], [x3]
3144 add x20, x2, #0xc
3145 ld1 {v0.h}[3], [x9]
3146 add x12, x2, #0x12
3147 ld1 {v1.h}[3], [x15]
3148 add x13, x2, #0x24
3149 ld1 {v2.h}[3], [x19]
3150 add x14, x2, #0x50
3151 ld1 {v3.h}[3], [x20]
3152 add x3, x2, #0xe
3153 ld1 {v0.h}[4], [x12]
3154 add x9, x2, #0x4
3155 ld1 {v1.h}[4], [x13]
3156 add x15, x2, #0x16
3157 ld1 {v2.h}[4], [x14]
3158 add x19, x2, #0x60
3159 ld1 {v3.h}[4], [x3]
3160 add x20, x2, #0x1c
3161 ld1 {v0.h}[5], [x9]
3162 add x12, x2, #0x6
3163 ld1 {v1.h}[5], [x15]
3164 add x13, x2, #0x8
3165 ld1 {v2.h}[5], [x19]
3166 add x14, x2, #0x52
3167 ld1 {v3.h}[5], [x20]
3168 add x3, x2, #0x2a
3169 ld1 {v0.h}[6], [x12]
3170 add x9, x2, #0x14
3171 ld1 {v1.h}[6], [x13]
3172 add x15, x2, #0xa
3173 ld1 {v2.h}[6], [x14]
3174 add x19, x2, #0x44
3175 ld1 {v3.h}[6], [x3]
3176 add x20, x2, #0x38
3177 ld1 {v0.h}[7], [x9]
3178 add x12, x2, #0x46
3179 ld1 {v1.h}[7], [x15]
3180 add x13, x2, #0x3a
3181 ld1 {v2.h}[7], [x19]
3182 add x14, x2, #0x74
3183 ld1 {v3.h}[7], [x20]
3184 add x3, x2, #0x6a
3185 ld1 {v4.h}[0], [x12]
3186 add x9, x2, #0x54
3187 ld1 {v5.h}[0], [x13]
3188 add x15, x2, #0x2c
3189 ld1 {v6.h}[0], [x14]
3190 add x19, x2, #0x76
3191 ld1 {v7.h}[0], [x3]
3192 add x20, x2, #0x78
3193 ld1 {v4.h}[1], [x9]
3194 add x12, x2, #0x62
3195 ld1 {v5.h}[1], [x15]
3196 add x13, x2, #0x1e
3197 ld1 {v6.h}[1], [x19]
3198 add x14, x2, #0x68
3199 ld1 {v7.h}[1], [x20]
3200 add x3, x2, #0x7a
3201 ld1 {v4.h}[2], [x12]
3202 add x9, x2, #0x70
3203 ld1 {v5.h}[2], [x13]
3204 add x15, x2, #0x2e
3205 ld1 {v6.h}[2], [x14]
3206 add x19, x2, #0x5a
3207 ld1 {v7.h}[2], [x3]
3208 add x20, x2, #0x6c
3209 ld1 {v4.h}[3], [x9]
3210 add x12, x2, #0x72
3211 ld1 {v5.h}[3], [x15]
3212 add x13, x2, #0x3c
3213 ld1 {v6.h}[3], [x19]
3214 add x14, x2, #0x4c
3215 ld1 {v7.h}[3], [x20]
3216 add x3, x2, #0x5e
3217 ld1 {v4.h}[4], [x12]
3218 add x9, x2, #0x64
3219 ld1 {v5.h}[4], [x13]
3220 add x15, x2, #0x4a
3221 ld1 {v6.h}[4], [x14]
3222 add x19, x2, #0x3e
3223 ld1 {v7.h}[4], [x3]
3224 add x20, x2, #0x6e
3225 ld1 {v4.h}[5], [x9]
3226 add x12, x2, #0x56
3227 ld1 {v5.h}[5], [x15]
3228 add x13, x2, #0x58
3229 ld1 {v6.h}[5], [x19]
3230 add x14, x2, #0x4e
3231 ld1 {v7.h}[5], [x20]
3232 add x3, x2, #0x7c
3233 ld1 {v4.h}[6], [x12]
3234 add x9, x2, #0x48
3235 ld1 {v5.h}[6], [x13]
3236 add x15, x2, #0x66
3237 ld1 {v6.h}[6], [x14]
3238 add x19, x2, #0x5c
3239 ld1 {v7.h}[6], [x3]
3240 add x20, x2, #0x7e
3241 ld1 {v4.h}[7], [x9]
3242 ld1 {v5.h}[7], [x15]
3243 ld1 {v6.h}[7], [x19]
3244 ld1 {v7.h}[7], [x20]
3245 .endif
3246 cmlt v24.8h, v0.8h, #0
3247 cmlt v25.8h, v1.8h, #0
3248 cmlt v26.8h, v2.8h, #0
3249 cmlt v27.8h, v3.8h, #0
3250 cmlt v28.8h, v4.8h, #0
3251 cmlt v29.8h, v5.8h, #0
3252 cmlt v30.8h, v6.8h, #0
3253 cmlt v31.8h, v7.8h, #0
3254 abs v0.8h, v0.8h
3255 abs v1.8h, v1.8h
3256 abs v2.8h, v2.8h
3257 abs v3.8h, v3.8h
3258 abs v4.8h, v4.8h
3259 abs v5.8h, v5.8h
3260 abs v6.8h, v6.8h
3261 abs v7.8h, v7.8h
3262 eor v24.16b, v24.16b, v0.16b
3263 eor v25.16b, v25.16b, v1.16b
3264 eor v26.16b, v26.16b, v2.16b
3265 eor v27.16b, v27.16b, v3.16b
3266 eor v28.16b, v28.16b, v4.16b
3267 eor v29.16b, v29.16b, v5.16b
3268 eor v30.16b, v30.16b, v6.16b
3269 eor v31.16b, v31.16b, v7.16b
3270 cmeq v16.8h, v0.8h, #0
3271 cmeq v17.8h, v1.8h, #0
3272 cmeq v18.8h, v2.8h, #0
3273 cmeq v19.8h, v3.8h, #0
3274 cmeq v20.8h, v4.8h, #0
3275 cmeq v21.8h, v5.8h, #0
3276 cmeq v22.8h, v6.8h, #0
3277 xtn v16.8b, v16.8h
3278 xtn v18.8b, v18.8h
3279 xtn v20.8b, v20.8h
3280 xtn v22.8b, v22.8h
3281 umov w14, v0.h[0]
3282 xtn2 v16.16b, v17.8h
3283 umov w13, v24.h[0]
3284 xtn2 v18.16b, v19.8h
3285 clz w14, w14
3286 xtn2 v20.16b, v21.8h
3287 lsl w13, w13, w14
3288 cmeq v17.8h, v7.8h, #0
3289 sub w12, w14, #32
3290 xtn2 v22.16b, v17.8h
3291 lsr w13, w13, w14
3292 and v16.16b, v16.16b, v23.16b
3293 neg w12, w12
3294 and v18.16b, v18.16b, v23.16b
3295 add x3, x4, #0x400 /* x3 = dctbl->ehufsi */
3296 and v20.16b, v20.16b, v23.16b
3297 add x15, sp, #0x80 /* x15 = t2 */
3298 and v22.16b, v22.16b, v23.16b
3299 ldr w10, [x4, x12, lsl #2]
3300 addp v16.16b, v16.16b, v18.16b
3301 ldrb w11, [x3, x12]
3302 addp v20.16b, v20.16b, v22.16b
3303 checkbuf47
3304 addp v16.16b, v16.16b, v20.16b
3305 put_bits x10, x11
3306 addp v16.16b, v16.16b, v18.16b
3307 checkbuf47
3308 umov x9, v16.d[0]
3309 put_bits x13, x12
3310 cnt v17.8b, v16.8b
3311 mvn x9, x9
3312 addv b18, v17.8b
3313 add x4, x5, #0x400 /* x4 = actbl->ehufsi */
3314 umov w12, v18.b[0]
3315 lsr x9, x9, #0x1 /* drop the DC bit; keep AC coeff flags */
3316 ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */
3317 rbit x9, x9 /* x9 = index0 */
3318 ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
3319 cmp w12, #(64-8)
3320 mov x11, sp
3321 b.lt 4f
3322 cbz x9, 6f
3323 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3324 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3325 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3326 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
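 /* Sparse path: walk the nonzero-AC bitmap in x9. clz yields each run
    of zero coefficients; every full run of 16 emits the ZRL code
    (actbl entry 0xf0, cached in x13/x14). */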
3327 1:
3328 clz x2, x9
3329 add x15, x15, x2, lsl #1
3330 lsl x9, x9, x2
3331 ldrh w20, [x15, #-126]
3332 2:
3333 cmp x2, #0x10
3334 b.lt 3f
3335 sub x2, x2, #0x10
3336 checkbuf47
3337 put_bits x13, x14
3338 b 2b
3339 3:
3340 clz w20, w20
3341 ldrh w3, [x15, #2]!
3342 sub w11, w20, #32
3343 lsl w3, w3, w20
3344 neg w11, w11
3345 lsr w3, w3, w20
3346 add x2, x11, x2, lsl #4
3347 lsl x9, x9, #0x1
3348 ldr w12, [x5, x2, lsl #2]
3349 ldrb w10, [x4, x2]
3350 checkbuf31
3351 put_bits x12, x10
3352 put_bits x3, x11
3353 cbnz x9, 1b
3354 b 6f
3355 4:
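 /* Dense path: precompute everything with NEON instead. After this
    block, v0..v7 hold nbits = 16 - clz(abs value) per lane and
    v24..v31 hold the diff value masked to its low nbits. */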
3356 movi v21.8h, #0x0010
3357 clz v0.8h, v0.8h
3358 clz v1.8h, v1.8h
3359 clz v2.8h, v2.8h
3360 clz v3.8h, v3.8h
3361 clz v4.8h, v4.8h
3362 clz v5.8h, v5.8h
3363 clz v6.8h, v6.8h
3364 clz v7.8h, v7.8h
3365 ushl v24.8h, v24.8h, v0.8h
3366 ushl v25.8h, v25.8h, v1.8h
3367 ushl v26.8h, v26.8h, v2.8h
3368 ushl v27.8h, v27.8h, v3.8h
3369 ushl v28.8h, v28.8h, v4.8h
3370 ushl v29.8h, v29.8h, v5.8h
3371 ushl v30.8h, v30.8h, v6.8h
3372 ushl v31.8h, v31.8h, v7.8h
3373 neg v0.8h, v0.8h
3374 neg v1.8h, v1.8h
3375 neg v2.8h, v2.8h
3376 neg v3.8h, v3.8h
3377 neg v4.8h, v4.8h
3378 neg v5.8h, v5.8h
3379 neg v6.8h, v6.8h
3380 neg v7.8h, v7.8h
3381 ushl v24.8h, v24.8h, v0.8h
3382 ushl v25.8h, v25.8h, v1.8h
3383 ushl v26.8h, v26.8h, v2.8h
3384 ushl v27.8h, v27.8h, v3.8h
3385 ushl v28.8h, v28.8h, v4.8h
3386 ushl v29.8h, v29.8h, v5.8h
3387 ushl v30.8h, v30.8h, v6.8h
3388 ushl v31.8h, v31.8h, v7.8h
3389 add v0.8h, v21.8h, v0.8h
3390 add v1.8h, v21.8h, v1.8h
3391 add v2.8h, v21.8h, v2.8h
3392 add v3.8h, v21.8h, v3.8h
3393 add v4.8h, v21.8h, v4.8h
3394 add v5.8h, v21.8h, v5.8h
3395 add v6.8h, v21.8h, v6.8h
3396 add v7.8h, v21.8h, v7.8h
3397 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3398 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3399 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3400 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
3401 1:
3402 clz x2, x9
3403 add x15, x15, x2, lsl #1
3404 lsl x9, x9, x2
3405 ldrh w11, [x15, #-126]
3406 2:
3407 cmp x2, #0x10
3408 b.lt 3f
3409 sub x2, x2, #0x10
3410 checkbuf47
3411 put_bits x13, x14
3412 b 2b
3413 3:
3414 ldrh w3, [x15, #2]!
3415 add x2, x11, x2, lsl #4
3416 lsl x9, x9, #0x1
3417 ldr w12, [x5, x2, lsl #2]
3418 ldrb w10, [x4, x2]
3419 checkbuf31
3420 put_bits x12, x10
3421 put_bits x3, x11
3422 cbnz x9, 1b
3423 6:
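 /* x15 stopped at the last nonzero coefficient; unless that was #63
    (t2 + 126 = sp + 0xfe), emit the EOB code (actbl entry 0). */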
3424 add x13, sp, #0xfe
3425 cmp x15, x13
3426 b.hs 1f
3427 ldr w12, [x5]
3428 ldrb w14, [x4]
3429 checkbuf47
3430 put_bits x12, x14
3431 1:
3432 sub sp, sp, 16
3433 str PUT_BUFFER, [x0, #0x10]
3434 str PUT_BITSw, [x0, #0x18]
3435 ldp x19, x20, [sp], 16
3436 add x0, BUFFER, #0x1
3437 add sp, sp, 256
3438 br x30
3439
3440 .endm
3441
3442 generate_jsimd_huff_encode_one_block 1
3443 generate_jsimd_huff_encode_one_block 0
3444
3445 .unreq BUFFER
3446 .unreq PUT_BUFFER
3447 .unreq PUT_BITS
3448 .unreq PUT_BITSw
3449
3450 .purgem emit_byte
3451 .purgem put_bits
3452 .purgem checkbuf31
3453 .purgem checkbuf47