OLD | NEW |
1 /* | 1 /* |
2 * ARMv8 NEON optimizations for libjpeg-turbo | 2 * ARMv8 NEON optimizations for libjpeg-turbo |
3 * | 3 * |
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
5 * All rights reserved. | 5 * All rights reserved. |
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
7 * Copyright (C) 2013-2014, Linaro Limited | 7 * Copyright (C) 2013-2014, Linaro Limited |
8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> | 8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> |
| 9 * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved. |
| 10 * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved. |
| 11 * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved. |
9 * | 12 * |
10 * This software is provided 'as-is', without any express or implied | 13 * This software is provided 'as-is', without any express or implied |
11 * warranty. In no event will the authors be held liable for any damages | 14 * warranty. In no event will the authors be held liable for any damages |
12 * arising from the use of this software. | 15 * arising from the use of this software. |
13 * | 16 * |
14 * Permission is granted to anyone to use this software for any purpose, | 17 * Permission is granted to anyone to use this software for any purpose, |
15 * including commercial applications, and to alter it and redistribute it | 18 * including commercial applications, and to alter it and redistribute it |
16 * freely, subject to the following restrictions: | 19 * freely, subject to the following restrictions: |
17 * | 20 * |
18 * 1. The origin of this software must not be misrepresented; you must not | 21 * 1. The origin of this software must not be misrepresented; you must not |
19 * claim that you wrote the original software. If you use this software | 22 * claim that you wrote the original software. If you use this software |
20 * in a product, an acknowledgment in the product documentation would be | 23 * in a product, an acknowledgment in the product documentation would be |
21 * appreciated but is not required. | 24 * appreciated but is not required. |
22 * 2. Altered source versions must be plainly marked as such, and must not be | 25 * 2. Altered source versions must be plainly marked as such, and must not be |
23 * misrepresented as being the original software. | 26 * misrepresented as being the original software. |
24 * 3. This notice may not be removed or altered from any source distribution. | 27 * 3. This notice may not be removed or altered from any source distribution. |
25 */ | 28 */ |
26 | 29 |
27 #if defined(__linux__) && defined(__ELF__) | 30 #if defined(__linux__) && defined(__ELF__) |
28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ | 31 .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ |
29 #endif | 32 #endif |
30 | 33 |
31 .text | 34 .text |
32 .arch armv8-a+fp+simd | |
33 | 35 |
34 | 36 |
35 #define RESPECT_STRICT_ALIGNMENT 1 | 37 #define RESPECT_STRICT_ALIGNMENT 1 |
36 | 38 |
37 | 39 |
38 /*****************************************************************************/ | 40 /*****************************************************************************/ |
39 | 41 |
40 /* Supplementary macro for setting function attributes */ | 42 /* Supplementary macro for setting function attributes */ |
41 .macro asm_function fname | 43 .macro asm_function fname |
42 #ifdef __APPLE__ | 44 #ifdef __APPLE__ |
43 .globl _\fname | 45 .globl _\fname |
44 _\fname: | 46 _\fname: |
45 #else | 47 #else |
46 .global \fname | 48 .global \fname |
47 #ifdef __ELF__ | 49 #ifdef __ELF__ |
48 .hidden \fname | 50 .hidden \fname |
49 .type \fname, %function | 51 .type \fname, %function |
50 #endif | 52 #endif |
51 \fname: | 53 \fname: |
52 #endif | 54 #endif |
53 .endm | 55 .endm |
54 | 56 |
55 /* Transpose elements of a single 128-bit register */ | 57 /* Transpose elements of a single 128-bit register */ |
56 .macro transpose_single x0,x1,xi,xilen,literal | 58 .macro transpose_single x0, x1, xi, xilen, literal |
57 ins \xi\xilen[0], \x0\xilen[0] | 59 ins \xi\xilen[0], \x0\xilen[0] |
58 ins \x1\xilen[0], \x0\xilen[1] | 60 ins \x1\xilen[0], \x0\xilen[1] |
59 trn1 \x0\literal, \x0\literal, \x1\literal | 61 trn1 \x0\literal, \x0\literal, \x1\literal |
60 trn2 \x1\literal, \xi\literal, \x1\literal | 62 trn2 \x1\literal, \xi\literal, \x1\literal |
61 .endm | 63 .endm |
62 | 64 |
63 /* Transpose elements of 2 different registers */ | 65 /* Transpose elements of 2 different registers */ |
64 .macro transpose x0,x1,xi,xilen,literal | 66 .macro transpose x0, x1, xi, xilen, literal |
65 mov \xi\xilen, \x0\xilen | 67 mov \xi\xilen, \x0\xilen |
66 trn1 \x0\literal, \x0\literal, \x1\literal | 68 trn1 \x0\literal, \x0\literal, \x1\literal |
67 trn2 \x1\literal, \xi\literal, \x1\literal | 69 trn2 \x1\literal, \xi\literal, \x1\literal |
68 .endm | 70 .endm |
69 | 71 |
70 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ | 72 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ |
71 .macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen | 73 .macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen |
72 mov \xi\xilen, \x0\xilen | 74 mov \xi\xilen, \x0\xilen |
73 trn1 \x0\x0len, \x0\x0len, \x2\x2len | 75 trn1 \x0\x0len, \x0\x0len, \x2\x2len |
74 trn2 \x2\x2len, \xi\x0len, \x2\x2len | 76 trn2 \x2\x2len, \xi\x0len, \x2\x2len |
75 mov \xi\xilen, \x1\xilen | 77 mov \xi\xilen, \x1\xilen |
76 trn1 \x1\x1len, \x1\x1len, \x3\x3len | 78 trn1 \x1\x1len, \x1\x1len, \x3\x3len |
77 trn2 \x3\x3len, \xi\x1len, \x3\x3len | 79 trn2 \x3\x3len, \xi\x1len, \x3\x3len |
78 .endm | 80 .endm |
79 | 81 |
80 .macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen | 82 .macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen |
81 mov \xi\xilen, \x0\xilen | 83 mov \xi\xilen, \x0\xilen |
82 trn1 \x0\x0len, \x0\x0len, \x1\x1len | 84 trn1 \x0\x0len, \x0\x0len, \x1\x1len |
83 trn2 \x1\x2len, \xi\x0len, \x1\x2len | 85 trn2 \x1\x2len, \xi\x0len, \x1\x2len |
84 mov \xi\xilen, \x2\xilen | 86 mov \xi\xilen, \x2\xilen |
85 trn1 \x2\x2len, \x2\x2len, \x3\x3len | 87 trn1 \x2\x2len, \x2\x2len, \x3\x3len |
86 trn2 \x3\x2len, \xi\x1len, \x3\x3len | 88 trn2 \x3\x2len, \xi\x1len, \x3\x3len |
87 .endm | 89 .endm |
88 | 90 |
89 .macro transpose_4x4 x0, x1, x2, x3,x5 | 91 .macro transpose_4x4 x0, x1, x2, x3, x5 |
90 transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b | 92 transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b |
91 transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b | 93 transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b |
| 94 .endm |
| 95 |
| 96 .macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3 |
| 97 trn1 \t0\().8h, \l0\().8h, \l1\().8h |
| 98 trn1 \t1\().8h, \l2\().8h, \l3\().8h |
| 99 trn1 \t2\().8h, \l4\().8h, \l5\().8h |
| 100 trn1 \t3\().8h, \l6\().8h, \l7\().8h |
| 101 trn2 \l1\().8h, \l0\().8h, \l1\().8h |
| 102 trn2 \l3\().8h, \l2\().8h, \l3\().8h |
| 103 trn2 \l5\().8h, \l4\().8h, \l5\().8h |
| 104 trn2 \l7\().8h, \l6\().8h, \l7\().8h |
| 105 |
| 106 trn1 \l4\().4s, \t2\().4s, \t3\().4s |
| 107 trn2 \t3\().4s, \t2\().4s, \t3\().4s |
| 108 trn1 \t2\().4s, \t0\().4s, \t1\().4s |
| 109 trn2 \l2\().4s, \t0\().4s, \t1\().4s |
| 110 trn1 \t0\().4s, \l1\().4s, \l3\().4s |
| 111 trn2 \l3\().4s, \l1\().4s, \l3\().4s |
| 112 trn2 \t1\().4s, \l5\().4s, \l7\().4s |
| 113 trn1 \l5\().4s, \l5\().4s, \l7\().4s |
| 114 |
| 115 trn2 \l6\().2d, \l2\().2d, \t3\().2d |
| 116 trn1 \l0\().2d, \t2\().2d, \l4\().2d |
| 117 trn1 \l1\().2d, \t0\().2d, \l5\().2d |
| 118 trn2 \l7\().2d, \l3\().2d, \t1\().2d |
| 119 trn1 \l2\().2d, \l2\().2d, \t3\().2d |
| 120 trn2 \l4\().2d, \t2\().2d, \l4\().2d |
| 121 trn1 \l3\().2d, \l3\().2d, \t1\().2d |
| 122 trn2 \l5\().2d, \t0\().2d, \l5\().2d |
92 .endm | 123 .endm |
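The trn1/trn2 ladders above are easier to follow with a scalar model in mind: trn1 interleaves the even-numbered lanes of its two inputs, trn2 the odd-numbered ones, and applying the pair at 16-, then 32-, then 64-bit granularity (as transpose_8x8 does) composes into a full 8x8 transpose. A minimal C sketch of one 16-bit-lane step (illustrative only; trn_8h is a made-up helper, not part of either revision):

#include <stdint.h>

/* NEON "trn1 t1.8h, a.8h, b.8h" / "trn2 t2.8h, a.8h, b.8h" on 16-bit lanes:
   t1 takes the even lanes of a and b, t2 the odd lanes. */
static void trn_8h(const int16_t a[8], const int16_t b[8],
                   int16_t t1[8], int16_t t2[8])
{
  for (int i = 0; i < 8; i += 2) {
    t1[i] = a[i];      t1[i + 1] = b[i];      /* trn1: even lanes */
    t2[i] = a[i + 1];  t2[i + 1] = b[i + 1];  /* trn2: odd lanes */
  }
}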
93 | 124 |
94 | 125 |
95 #define CENTERJSAMPLE 128 | 126 #define CENTERJSAMPLE 128 |
96 | 127 |
97 /*****************************************************************************/ | 128 /*****************************************************************************/ |
98 | 129 |
99 /* | 130 /* |
100 * Perform dequantization and inverse DCT on one block of coefficients. | 131 * Perform dequantization and inverse DCT on one block of coefficients. |
101 * | 132 * |
102 * GLOBAL(void) | 133 * GLOBAL(void) |
103 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, | 134 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block, |
104 * JSAMPARRAY output_buf, JDIMENSION output_col) | 135 * JSAMPARRAY output_buf, JDIMENSION output_col) |
105 */ | 136 */ |
106 | 137 |
107 #define FIX_0_298631336 (2446) | 138 #define CONST_BITS 13 |
108 #define FIX_0_390180644 (3196) | 139 #define PASS1_BITS 2 |
109 #define FIX_0_541196100 (4433) | 140 |
110 #define FIX_0_765366865 (6270) | 141 #define F_0_298 2446 /* FIX(0.298631336) */ |
111 #define FIX_0_899976223 (7373) | 142 #define F_0_390 3196 /* FIX(0.390180644) */ |
112 #define FIX_1_175875602 (9633) | 143 #define F_0_541 4433 /* FIX(0.541196100) */ |
113 #define FIX_1_501321110 (12299) | 144 #define F_0_765 6270 /* FIX(0.765366865) */ |
114 #define FIX_1_847759065 (15137) | 145 #define F_0_899 7373 /* FIX(0.899976223) */ |
115 #define FIX_1_961570560 (16069) | 146 #define F_1_175 9633 /* FIX(1.175875602) */ |
116 #define FIX_2_053119869 (16819) | 147 #define F_1_501 12299 /* FIX(1.501321110) */ |
117 #define FIX_2_562915447 (20995) | 148 #define F_1_847 15137 /* FIX(1.847759065) */ |
118 #define FIX_3_072711026 (25172) | 149 #define F_1_961 16069 /* FIX(1.961570560) */ |
119 | 150 #define F_2_053 16819 /* FIX(2.053119869) */ |
120 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) | 151 #define F_2_562 20995 /* FIX(2.562915447) */ |
121 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) | 152 #define F_3_072 25172 /* FIX(3.072711026) */ |
122 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) | |
123 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) | |
124 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) | |
125 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) | |
126 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) | |
127 #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) | |
128 | |
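Both constant tables above encode the same values: libjpeg's 13-bit fixed-point representation, i.e. FIX(x) scaled by 2^CONST_BITS. For orientation, this is essentially the jidctint.c definition, with the first new constant checked by hand:

#define CONST_BITS 13
#define FIX(x) ((INT32)((x) * (1L << CONST_BITS) + 0.5))
/* FIX(0.298631336) = (INT32)(0.298631336 * 8192 + 0.5) = 2446, i.e. F_0_298 */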
129 /* | |
130 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. | |
131 * Uses some ideas from the comments in 'simd/jiss2int-64.asm' | |
132 */ | |
133 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ | |
134 { \ | |
135 DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ | |
136 INT32 q1, q2, q3, q4, q5, q6, q7; \ | |
137 INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ | |
138 \ | |
139 /* 1-D iDCT input data */ \ | |
140 row0 = xrow0; \ | |
141 row1 = xrow1; \ | |
142 row2 = xrow2; \ | |
143 row3 = xrow3; \ | |
144 row4 = xrow4; \ | |
145 row5 = xrow5; \ | |
146 row6 = xrow6; \ | |
147 row7 = xrow7; \ | |
148 \ | |
149 q5 = row7 + row3; \ | |
150 q4 = row5 + row1; \ | |
151 q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ | |
152 MULTIPLY(q4, FIX_1_175875602); \ | |
153 q7 = MULTIPLY(q5, FIX_1_175875602) + \ | |
154 MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ | |
155 q2 = MULTIPLY(row2, FIX_0_541196100) + \ | |
156 MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ | |
157 q4 = q6; \ | |
158 q3 = ((INT32) row0 - (INT32) row4) << 13; \ | |
159 q6 += MULTIPLY(row5, -FIX_2_562915447) + \ | |
160 MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ | |
161 /* now we can use q1 (reloadable constants have been used up) */ \ | |
162 q1 = q3 + q2; \ | |
163 q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ | |
164 MULTIPLY(row1, -FIX_0_899976223); \ | |
165 q5 = q7; \ | |
166 q1 = q1 + q6; \ | |
167 q7 += MULTIPLY(row7, -FIX_0_899976223) + \ | |
168 MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ | |
169 \ | |
170 /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ | |
171 tmp11_plus_tmp2 = q1; \ | |
172 row1 = 0; \ | |
173 \ | |
174 q1 = q1 - q6; \ | |
175 q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ | |
176 MULTIPLY(row3, -FIX_2_562915447); \ | |
177 q1 = q1 - q6; \ | |
178 q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ | |
179 MULTIPLY(row6, FIX_0_541196100); \ | |
180 q3 = q3 - q2; \ | |
181 \ | |
182 /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ | |
183 tmp11_minus_tmp2 = q1; \ | |
184 \ | |
185 q1 = ((INT32) row0 + (INT32) row4) << 13; \ | |
186 q2 = q1 + q6; \ | |
187 q1 = q1 - q6; \ | |
188 \ | |
189 /* pick up the results */ \ | |
190 tmp0 = q4; \ | |
191 tmp1 = q5; \ | |
192 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ | |
193 tmp3 = q7; \ | |
194 tmp10 = q2; \ | |
195 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ | |
196 tmp12 = q3; \ | |
197 tmp13 = q1; \ | |
198 } | |
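Each 1-D pass of this reference code leaves its outputs scaled up by 2^CONST_BITS; the assembly removes that scaling with rounding right shifts. For orientation, this is essentially libjpeg's DESCALE definition, which the rshrn #11 instructions in pass 1 implement for n = CONST_BITS - PASS1_BITS = 13 - 2 = 11:

#define DESCALE(x, n) (((x) + (1L << ((n) - 1))) >> (n))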
199 | |
200 #define XFIX_0_899976223 v0.4h[0] | |
201 #define XFIX_0_541196100 v0.4h[1] | |
202 #define XFIX_2_562915447 v0.4h[2] | |
203 #define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3] | |
204 #define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0] | |
205 #define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1] | |
206 #define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2] | |
207 #define XFIX_1_175875602 v1.4h[3] | |
208 #define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0] | |
209 #define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1] | |
210 #define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2] | |
211 #define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3] | |
212 | 153 |
213 .balign 16 | 154 .balign 16 |
214 jsimd_idct_islow_neon_consts: | 155 Ljsimd_idct_islow_neon_consts: |
215 .short FIX_0_899976223 /* d0[0] */ | 156 .short F_0_298 |
216 .short FIX_0_541196100 /* d0[1] */ | 157 .short -F_0_390 |
217 .short FIX_2_562915447 /* d0[2] */ | 158 .short F_0_541 |
218 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ | 159 .short F_0_765 |
219 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ | 160 .short -F_0_899 |
220 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ | 161 .short F_1_175 |
221 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ | 162 .short F_1_501 |
222 .short FIX_1_175875602 /* d1[3] */ | 163 .short -F_1_847 |
223 /* reloadable constants */ | 164 .short -F_1_961 |
224 .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ | 165 .short F_2_053 |
225 .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ | 166 .short -F_2_562 |
226 .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ | 167 .short F_3_072 |
227 .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ | 168 .short 0 /* padding */ |
| 169 .short 0 |
| 170 .short 0 |
| 171 .short 0 |
| 172 |
| 173 #undef F_0_298 |
| 174 #undef F_0_390 |
| 175 #undef F_0_541 |
| 176 #undef F_0_765 |
| 177 #undef F_0_899 |
| 178 #undef F_1_175 |
| 179 #undef F_1_501 |
| 180 #undef F_1_847 |
| 181 #undef F_1_961 |
| 182 #undef F_2_053 |
| 183 #undef F_2_562 |
| 184 #undef F_3_072 |
| 185 |
| 186 #define XFIX_P_0_298 v0.h[0] |
| 187 #define XFIX_N_0_390 v0.h[1] |
| 188 #define XFIX_P_0_541 v0.h[2] |
| 189 #define XFIX_P_0_765 v0.h[3] |
| 190 #define XFIX_N_0_899 v0.h[4] |
| 191 #define XFIX_P_1_175 v0.h[5] |
| 192 #define XFIX_P_1_501 v0.h[6] |
| 193 #define XFIX_N_1_847 v0.h[7] |
| 194 #define XFIX_N_1_961 v1.h[0] |
| 195 #define XFIX_P_2_053 v1.h[1] |
| 196 #define XFIX_N_2_562 v1.h[2] |
| 197 #define XFIX_P_3_072 v1.h[3] |
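These lane references let smull/smlal/smlsl use one constant per multiply without loading it into a separate register. A plain-C model of one such widening multiply (illustrative; smull_4h_model is a made-up name, not from either revision):

#include <stdint.h>

/* Model of "smull vD.4s, vN.4h, XFIX_*": widen each signed 16-bit lane and
   multiply by the 13-bit fixed-point constant, yielding 32-bit products --
   the MULTIPLY(coef, FIX(...)) step of jidctint.c. */
static void smull_4h_model(int32_t d[4], const int16_t n[4], int16_t xfix)
{
  for (int i = 0; i < 4; i++)
    d[i] = (int32_t)n[i] * (int32_t)xfix;
}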
228 | 198 |
229 asm_function jsimd_idct_islow_neon | 199 asm_function jsimd_idct_islow_neon |
230 | |
231 DCT_TABLE .req x0 | 200 DCT_TABLE .req x0 |
232 COEF_BLOCK .req x1 | 201 COEF_BLOCK .req x1 |
233 OUTPUT_BUF .req x2 | 202 OUTPUT_BUF .req x2 |
234 OUTPUT_COL .req x3 | 203 OUTPUT_COL .req x3 |
235 TMP1 .req x0 | 204 TMP1 .req x0 |
236 TMP2 .req x1 | 205 TMP2 .req x1 |
237 TMP3 .req x2 | 206 TMP3 .req x9 |
238 TMP4 .req x15 | 207 TMP4 .req x10 |
239 | 208 TMP5 .req x11 |
240 ROW0L .req v16 | 209 TMP6 .req x12 |
241 ROW0R .req v17 | 210 TMP7 .req x13 |
242 ROW1L .req v18 | 211 TMP8 .req x14 |
243 ROW1R .req v19 | 212 |
244 ROW2L .req v20 | 213 sub sp, sp, #64 |
245 ROW2R .req v21 | 214 adr x15, Ljsimd_idct_islow_neon_consts |
246 ROW3L .req v22 | 215 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 |
247 ROW3R .req v23 | 216 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 |
248 ROW4L .req v24 | 217 ld1 {v0.8h, v1.8h}, [x15] |
249 ROW4R .req v25 | 218 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64 |
250 ROW5L .req v26 | 219 ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64 |
251 ROW5R .req v27 | 220 ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64 |
252 ROW6L .req v28 | 221 ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64 |
253 ROW6R .req v29 | 222 |
254 ROW7L .req v30 | 223 cmeq v16.8h, v3.8h, #0 |
255 ROW7R .req v31 | 224 cmeq v26.8h, v4.8h, #0 |
256 /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */ | 225 cmeq v27.8h, v5.8h, #0 |
257 sub sp, sp, 272 | 226 cmeq v28.8h, v6.8h, #0 |
258 str x15, [sp], 16 | 227 cmeq v29.8h, v7.8h, #0 |
259 adr x15, jsimd_idct_islow_neon_consts | 228 cmeq v30.8h, v8.8h, #0 |
260 st1 {v0.8b - v3.8b}, [sp], 32 | 229 cmeq v31.8h, v9.8h, #0 |
261 st1 {v4.8b - v7.8b}, [sp], 32 | 230 |
262 st1 {v8.8b - v11.8b}, [sp], 32 | 231 and v10.16b, v16.16b, v26.16b |
263 st1 {v12.8b - v15.8b}, [sp], 32 | 232 and v11.16b, v27.16b, v28.16b |
264 st1 {v16.8b - v19.8b}, [sp], 32 | 233 and v12.16b, v29.16b, v30.16b |
265 st1 {v20.8b - v23.8b}, [sp], 32 | 234 and v13.16b, v31.16b, v10.16b |
266 st1 {v24.8b - v27.8b}, [sp], 32 | 235 and v14.16b, v11.16b, v12.16b |
267 st1 {v28.8b - v31.8b}, [sp], 32 | 236 mul v2.8h, v2.8h, v18.8h |
268 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 | 237 and v15.16b, v13.16b, v14.16b |
269 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 | 238 shl v10.8h, v2.8h, #(PASS1_BITS) |
270 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 | 239 sqxtn v16.8b, v15.8h |
271 mul v16.4h, v16.4h, v0.4h | 240 mov TMP1, v16.d[0] |
272 mul v17.4h, v17.4h, v1.4h | 241 sub sp, sp, #64 |
273 ins v16.2d[1], v17.2d[0] /* 128 bit q8 */ | 242 mvn TMP2, TMP1 |
274 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 | 243 |
275 mul v18.4h, v18.4h, v2.4h | 244 cbnz TMP2, 2f |
276 mul v19.4h, v19.4h, v3.4h | 245 /* case all AC coeffs are zeros */ |
277 ins v18.2d[1], v19.2d[0] /* 128 bit q9 */ | 246 dup v2.2d, v10.d[0] |
278 ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 | 247 dup v6.2d, v10.d[1] |
279 mul v20.4h, v20.4h, v4.4h | 248 mov v3.16b, v2.16b |
280 mul v21.4h, v21.4h, v5.4h | 249 mov v7.16b, v6.16b |
281 ins v20.2d[1], v21.2d[0] /* 128 bit q10 */ | 250 mov v4.16b, v2.16b |
282 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 | 251 mov v8.16b, v6.16b |
283 mul v22.4h, v22.4h, v6.4h | 252 mov v5.16b, v2.16b |
284 mul v23.4h, v23.4h, v7.4h | 253 mov v9.16b, v6.16b |
285 ins v22.2d[1], v23.2d[0] /* 128 bit q11 */ | 254 1: |
286 ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] | 255 /* for this transpose, we should organise data like this: |
287 mul v24.4h, v24.4h, v0.4h | 256 * 00, 01, 02, 03, 40, 41, 42, 43 |
288 mul v25.4h, v25.4h, v1.4h | 257 * 10, 11, 12, 13, 50, 51, 52, 53 |
289 ins v24.2d[1], v25.2d[0] /* 128 bit q12 */ | 258 * 20, 21, 22, 23, 60, 61, 62, 63 |
290 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 | 259 * 30, 31, 32, 33, 70, 71, 72, 73 |
291 mul v28.4h, v28.4h, v4.4h | 260 * 04, 05, 06, 07, 44, 45, 46, 47 |
292 mul v29.4h, v29.4h, v5.4h | 261 * 14, 15, 16, 17, 54, 55, 56, 57 |
293 ins v28.2d[1], v29.2d[0] /* 128 bit q14 */ | 262 * 24, 25, 26, 27, 64, 65, 66, 67 |
294 mul v26.4h, v26.4h, v2.4h | 263 * 34, 35, 36, 37, 74, 75, 76, 77 |
295 mul v27.4h, v27.4h, v3.4h | 264 */ |
296 ins v26.2d[1], v27.2d[0] /* 128 bit q13 */ | 265 trn1 v28.8h, v2.8h, v3.8h |
297 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */ | 266 trn1 v29.8h, v4.8h, v5.8h |
298 add x15, x15, #16 | 267 trn1 v30.8h, v6.8h, v7.8h |
299 mul v30.4h, v30.4h, v6.4h | 268 trn1 v31.8h, v8.8h, v9.8h |
300 mul v31.4h, v31.4h, v7.4h | 269 trn2 v16.8h, v2.8h, v3.8h |
301 ins v30.2d[1], v31.2d[0] /* 128 bit q15 */ | 270 trn2 v17.8h, v4.8h, v5.8h |
302 /* Go to the bottom of the stack */ | 271 trn2 v18.8h, v6.8h, v7.8h |
303 sub sp, sp, 352 | 272 trn2 v19.8h, v8.8h, v9.8h |
304 stp x4, x5, [sp], 16 | 273 trn1 v2.4s, v28.4s, v29.4s |
305 st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ | 274 trn1 v6.4s, v30.4s, v31.4s |
306 st1 {v12.4h - v15.4h}, [sp], 32 | 275 trn1 v3.4s, v16.4s, v17.4s |
307 /* 1-D IDCT, pass 1, left 4x8 half */ | 276 trn1 v7.4s, v18.4s, v19.4s |
308 add v4.4h, ROW7L.4h, ROW3L.4h | 277 trn2 v4.4s, v28.4s, v29.4s |
309 add v5.4h, ROW5L.4h, ROW1L.4h | 278 trn2 v8.4s, v30.4s, v31.4s |
310 smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 | 279 trn2 v5.4s, v16.4s, v17.4s |
311 smlal v12.4s, v5.4h, XFIX_1_175875602 | 280 trn2 v9.4s, v18.4s, v19.4s |
312 smull v14.4s, v4.4h, XFIX_1_175875602 | 281 /* Even part: reverse the even part of the forward DCT. */ |
313 /* Check for the zero coefficients in the right 4x8 half */ | 282 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ |
314 smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644 | 283 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
315 ssubl v6.4s, ROW0L.4h, ROW4L.4h | 284 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ |
316 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] | 285 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
317 smull v4.4s, ROW2L.4h, XFIX_0_541196100 | 286 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ |
318 smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065 | 287 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ |
319 orr x0, x4, x5 | 288 mov v21.16b, v19.16b /* tmp3 = z1 */ |
320 mov v8.16b, v12.16b | 289 mov v20.16b, v18.16b /* tmp3 = z1 */ |
321 smlsl v12.4s, ROW5L.4h, XFIX_2_562915447 | 290 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ |
322 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] | 291 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ |
323 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 | 292 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ |
324 shl v6.4s, v6.4s, #13 | 293 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ |
325 orr x0, x0, x4 | 294 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ |
326 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 | 295 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ |
327 orr x0, x0 , x5 | 296 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ |
328 add v2.4s, v6.4s, v4.4s | 297 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ |
329 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] | 298 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ |
330 mov v10.16b, v14.16b | 299 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ |
331 add v2.4s, v2.4s, v12.4s | 300 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ |
332 orr x0, x0, x4 | 301 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ |
333 smlsl v14.4s, ROW7L.4h, XFIX_0_899976223 | 302 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ |
334 orr x0, x0, x5 | 303 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ |
335 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 | 304 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ |
336 rshrn ROW1L.4h, v2.4s, #11 | 305 |
337 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] | 306 /* Odd part per figure 8; the matrix is unitary and hence its |
338 sub v2.4s, v2.4s, v12.4s | 307 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. |
339 smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447 | 308 */ |
340 orr x0, x0, x4 | 309 |
341 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 | 310 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
342 orr x0, x0, x5 | 311 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
343 sub v2.4s, v2.4s, v12.4s | 312 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
344 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 | 313 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
345 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] | 314 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ |
346 smlal v12.4s, ROW6L.4h, XFIX_0_541196100 | 315 |
347 sub v6.4s, v6.4s, v4.4s | 316 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ |
348 orr x0, x0, x4 | 317 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ |
349 rshrn ROW6L.4h, v2.4s, #11 | 318 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ |
350 orr x0, x0, x5 | 319 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ |
351 add v2.4s, v6.4s, v10.4s | 320 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ |
352 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] | 321 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ |
353 sub v6.4s, v6.4s, v10.4s | 322 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ |
354 saddl v10.4s, ROW0L.4h, ROW4L.4h | 323 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ |
355 orr x0, x0, x4 | 324 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ |
356 rshrn ROW2L.4h, v2.4s, #11 | 325 |
357 orr x0, x0, x5 | 326 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ |
358 rshrn ROW5L.4h, v6.4s, #11 | 327 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ |
359 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] | 328 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ |
360 shl v10.4s, v10.4s, #13 | 329 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ |
361 smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223 | 330 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ |
362 orr x0, x0, x4 | 331 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ |
363 add v4.4s, v10.4s, v12.4s | 332 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ |
364 orr x0, x0, x5 | 333 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ |
365 cmp x0, #0 /* orrs instruction removed */ | 334 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ |
366 sub v2.4s, v10.4s, v12.4s | 335 |
367 add v12.4s, v4.4s, v14.4s | 336 add v23.4s, v23.4s, v27.4s /* z3 += z5 */ |
368 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] | 337 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ |
369 sub v4.4s, v4.4s, v14.4s | 338 add v25.4s, v25.4s, v27.4s /* z4 += z5 */ |
370 add v10.4s, v2.4s, v8.4s | 339 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ |
371 orr x0, x4, x5 | 340 |
372 sub v6.4s, v2.4s, v8.4s | 341 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ |
373 /* pop {x4, x5} */ | 342 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ |
374 sub sp, sp, 80 | 343 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ |
375 ldp x4, x5, [sp], 16 | 344 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ |
376 rshrn ROW7L.4h, v4.4s, #11 | 345 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ |
377 rshrn ROW3L.4h, v10.4s, #11 | 346 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ |
378 rshrn ROW0L.4h, v12.4s, #11 | 347 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ |
379 rshrn ROW4L.4h, v6.4s, #11 | 348 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ |
380 | 349 |
381 beq 3f /* Go to do some special handling for the sparse right 4x8 half */ | 350 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ |
382 | 351 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ |
383 /* 1-D IDCT, pass 1, right 4x8 half */ | 352 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ |
384 ld1 {v2.4h}, [x15] /* reload constants */ | 353 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ |
385 add v10.4h, ROW7R.4h, ROW3R.4h | 354 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ |
386 add v8.4h, ROW5R.4h, ROW1R.4h | 355 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ |
387 /* Transpose ROW6L <-> ROW7L (v3 available free register) */ | 356 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ |
388 transpose ROW6L, ROW7L, v3, .16b, .4h | 357 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ |
389 smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560 | 358 |
390 smlal v12.4s, v8.4h, XFIX_1_175875602 | 359 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
391 /* Transpose ROW2L <-> ROW3L (v3 available free register) */ | 360 |
392 transpose ROW2L, ROW3L, v3, .16b, .4h | 361 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ |
393 smull v14.4s, v10.4h, XFIX_1_175875602 | 362 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ |
394 smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644 | 363 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ |
395 /* Transpose ROW0L <-> ROW1L (v3 available free register) */ | 364 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ |
396 transpose ROW0L, ROW1L, v3, .16b, .4h | 365 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ |
397 ssubl v6.4s, ROW0R.4h, ROW4R.4h | 366 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ |
398 smull v4.4s, ROW2R.4h, XFIX_0_541196100 | 367 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ |
399 smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 | 368 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ |
400 /* Transpose ROW4L <-> ROW5L (v3 available free register) */ | 369 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ |
401 transpose ROW4L, ROW5L, v3, .16b, .4h | 370 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ |
402 mov v8.16b, v12.16b | 371 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ |
403 smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 | 372 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ |
404 smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447 | 373 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ |
405 /* Transpose ROW1L <-> ROW3L (v3 available free register) */ | 374 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ |
406 transpose ROW1L, ROW3L, v3, .16b, .2s | 375 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ |
407 shl v6.4s, v6.4s, #13 | 376 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ |
408 smlsl v8.4s, ROW1R.4h, XFIX_0_899976223 | 377 |
409 /* Transpose ROW4L <-> ROW6L (v3 available free register) */ | 378 shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ |
410 transpose ROW4L, ROW6L, v3, .16b, .2s | 379 shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ |
411 add v2.4s, v6.4s, v4.4s | 380 shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ |
412 mov v10.16b, v14.16b | 381 shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ |
413 add v2.4s, v2.4s, v12.4s | 382 shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ |
414 /* Transpose ROW0L <-> ROW2L (v3 available free register) */ | 383 shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ |
415 transpose ROW0L, ROW2L, v3, .16b, .2s | 384 shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ |
416 smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 | 385 shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ |
417 smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223 | 386 shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ |
418 rshrn ROW1R.4h, v2.4s, #11 | 387 shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ |
419 /* Transpose ROW5L <-> ROW7L (v3 available free register) */ | 388 shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ |
420 transpose ROW5L, ROW7L, v3, .16b, .2s | 389 shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ |
421 sub v2.4s, v2.4s, v12.4s | 390 shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ |
422 smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 | 391 shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ |
423 smlsl v10.4s, ROW3R.4h, XFIX_2_562915447 | 392 shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ |
424 sub v2.4s, v2.4s, v12.4s | 393 shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ |
425 smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865 | 394 movi v0.16b, #(CENTERJSAMPLE) |
426 smlal v12.4s, ROW6R.4h, XFIX_0_541196100 | 395 /* Prepare pointers (dual-issue with NEON instructions) */ |
427 sub v6.4s, v6.4s, v4.4s | 396 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
428 rshrn ROW6R.4h, v2.4s, #11 | 397 sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16) |
429 add v2.4s, v6.4s, v10.4s | 398 ldp TMP3, TMP4, [OUTPUT_BUF], 16 |
430 sub v6.4s, v6.4s, v10.4s | 399 sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16) |
431 saddl v10.4s, ROW0R.4h, ROW4R.4h | 400 add TMP1, TMP1, OUTPUT_COL |
432 rshrn ROW2R.4h, v2.4s, #11 | 401 sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16) |
433 rshrn ROW5R.4h, v6.4s, #11 | 402 add TMP2, TMP2, OUTPUT_COL |
434 shl v10.4s, v10.4s, #13 | 403 sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16) |
435 smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 | 404 add TMP3, TMP3, OUTPUT_COL |
436 add v4.4s, v10.4s, v12.4s | 405 sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16) |
437 sub v2.4s, v10.4s, v12.4s | 406 add TMP4, TMP4, OUTPUT_COL |
438 add v12.4s, v4.4s, v14.4s | 407 sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16) |
439 sub v4.4s, v4.4s, v14.4s | 408 ldp TMP5, TMP6, [OUTPUT_BUF], 16 |
440 add v10.4s, v2.4s, v8.4s | 409 sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16) |
441 sub v6.4s, v2.4s, v8.4s | 410 ldp TMP7, TMP8, [OUTPUT_BUF], 16 |
442 rshrn ROW7R.4h, v4.4s, #11 | 411 sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16) |
443 rshrn ROW3R.4h, v10.4s, #11 | 412 add TMP5, TMP5, OUTPUT_COL |
444 rshrn ROW0R.4h, v12.4s, #11 | 413 add v16.16b, v28.16b, v0.16b |
445 rshrn ROW4R.4h, v6.4s, #11 | 414 add TMP6, TMP6, OUTPUT_COL |
446 /* Transpose right 4x8 half */ | 415 add v18.16b, v29.16b, v0.16b |
447 transpose ROW6R, ROW7R, v3, .16b, .4h | 416 add TMP7, TMP7, OUTPUT_COL |
448 transpose ROW2R, ROW3R, v3, .16b, .4h | 417 add v20.16b, v30.16b, v0.16b |
449 transpose ROW0R, ROW1R, v3, .16b, .4h | 418 add TMP8, TMP8, OUTPUT_COL |
450 transpose ROW4R, ROW5R, v3, .16b, .4h | 419 add v22.16b, v31.16b, v0.16b |
451 transpose ROW1R, ROW3R, v3, .16b, .2s | 420 |
452 transpose ROW4R, ROW6R, v3, .16b, .2s | 421 /* Transpose the final 8-bit samples */ |
453 transpose ROW0R, ROW2R, v3, .16b, .2s | 422 trn1 v28.16b, v16.16b, v18.16b |
454 transpose ROW5R, ROW7R, v3, .16b, .2s | 423 trn1 v30.16b, v20.16b, v22.16b |
455 | 424 trn2 v29.16b, v16.16b, v18.16b |
456 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ | 425 trn2 v31.16b, v20.16b, v22.16b |
457 ld1 {v2.4h}, [x15] /* reload constants */ | 426 |
458 smull v12.4S, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ | 427 trn1 v16.8h, v28.8h, v30.8h |
459 smlal v12.4s, ROW1L.4h, XFIX_1_175875602 | 428 trn2 v18.8h, v28.8h, v30.8h |
460 smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ | 429 trn1 v20.8h, v29.8h, v31.8h |
461 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 | 430 trn2 v22.8h, v29.8h, v31.8h |
462 smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ | 431 |
463 smlal v14.4s, ROW3L.4h, XFIX_1_175875602 | 432 uzp1 v28.4s, v16.4s, v18.4s |
464 smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */ | 433 uzp2 v30.4s, v16.4s, v18.4s |
465 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 | 434 uzp1 v29.4s, v20.4s, v22.4s |
466 ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ | 435 uzp2 v31.4s, v20.4s, v22.4s |
467 smull v4.4s, ROW2L.4h, XFIX_0_541196100 | 436 |
468 smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */ |
469 mov v8.16b, v12.16b | |
470 smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */ |
471 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 | |
472 shl v6.4s, v6.4s, #13 | |
473 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 | |
474 add v2.4s, v6.4s, v4.4s | |
475 mov v10.16b, v14.16b | |
476 add v2.4s, v2.4s, v12.4s | |
477 smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */ |
478 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 | |
479 shrn ROW1L.4h, v2.4s, #16 | |
480 sub v2.4s, v2.4s, v12.4s | |
481 smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */ |
482 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 | |
483 sub v2.4s, v2.4s, v12.4s | |
484 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 | |
485 smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */ |
486 sub v6.4s, v6.4s, v4.4s | |
487 shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | |
488 add v2.4s, v6.4s, v10.4s | |
489 sub v6.4s, v6.4s, v10.4s | |
490 saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
491 shrn ROW2L.4h, v2.4s, #16 | |
492 shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | |
493 shl v10.4s, v10.4s, #13 | |
494 smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */ |
495 add v4.4s, v10.4s, v12.4s | |
496 sub v2.4s, v10.4s, v12.4s | |
497 add v12.4s, v4.4s, v14.4s | |
498 sub v4.4s, v4.4s, v14.4s | |
499 add v10.4s, v2.4s, v8.4s | |
500 sub v6.4s, v2.4s, v8.4s | |
501 shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | |
502 shrn ROW3L.4h, v10.4s, #16 | |
503 shrn ROW0L.4h, v12.4s, #16 | |
504 shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | |
505 /* 1-D IDCT, pass 2, right 4x8 half */ | |
506 ld1 {v2.4h}, [x15] /* reload constants */ | |
507 smull v12.4s, ROW5R.4h, XFIX_1_175875602 | |
508 smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ |
509 smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560 | |
510 smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ |
511 smull v14.4s, ROW7R.4h, XFIX_1_175875602 | |
512 smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ |
513 smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644 | |
514 smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */ |
515 ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
516 smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */ |
517 smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 | |
518 mov v8.16b, v12.16b | |
519 smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 | |
520 smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */ |
521 shl v6.4s, v6.4s, #13 | |
522 smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */ |
523 add v2.4s, v6.4s, v4.4s | |
524 mov v10.16b, v14.16b | |
525 add v2.4s, v2.4s, v12.4s | |
526 smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 | |
527 smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */ |
528 shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | |
529 sub v2.4s, v2.4s, v12.4s | |
530 smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 | |
531 smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */ |
532 sub v2.4s, v2.4s, v12.4s | |
533 smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */ |
534 smlal v12.4s, ROW6R.4h, XFIX_0_541196100 | |
535 sub v6.4s, v6.4s, v4.4s | |
536 shrn ROW6R.4h, v2.4s, #16 | |
537 add v2.4s, v6.4s, v10.4s | |
538 sub v6.4s, v6.4s, v10.4s | |
539 saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
540 shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | |
541 shrn ROW5R.4h, v6.4s, #16 | |
542 shl v10.4s, v10.4s, #13 | |
543 smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 | |
544 add v4.4s, v10.4s, v12.4s | |
545 sub v2.4s, v10.4s, v12.4s | |
546 add v12.4s, v4.4s, v14.4s | |
547 sub v4.4s, v4.4s, v14.4s | |
548 add v10.4s, v2.4s, v8.4s | |
549 sub v6.4s, v2.4s, v8.4s | |
550 shrn ROW7R.4h, v4.4s, #16 | |
551 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | |
552 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | |
553 shrn ROW4R.4h, v6.4s, #16 | |
554 | |
555 2: /* Descale to 8-bit and range limit */ | |
556 ins v16.2d[1], v17.2d[0] | |
557 ins v18.2d[1], v19.2d[0] | |
558 ins v20.2d[1], v21.2d[0] | |
559 ins v22.2d[1], v23.2d[0] | |
560 sqrshrn v16.8b, v16.8h, #2 | |
561 sqrshrn2 v16.16b, v18.8h, #2 | |
562 sqrshrn v18.8b, v20.8h, #2 | |
563 sqrshrn2 v18.16b, v22.8h, #2 | |
564 | |
565 /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */ | |
566 ld1 {v8.4h - v11.4h}, [sp], 32 | |
567 ld1 {v12.4h - v15.4h}, [sp], 32 | |
568 ins v24.2d[1], v25.2d[0] | |
569 | |
570 sqrshrn v20.8b, v24.8h, #2 | |
571 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ | |
572 /* trn1 v16.8h, v16.8h, v18.8h */ | |
573 transpose v16, v18, v3, .16b, .8h | |
574 ins v26.2d[1], v27.2d[0] | |
575 ins v28.2d[1], v29.2d[0] | |
576 ins v30.2d[1], v31.2d[0] | |
577 sqrshrn2 v20.16b, v26.8h, #2 | |
578 sqrshrn v22.8b, v28.8h, #2 | |
579 movi v0.16b, #(CENTERJSAMPLE) | |
580 sqrshrn2 v22.16b, v30.8h, #2 | |
581 transpose_single v16, v17, v3, .2d, .8b | |
582 transpose_single v18, v19, v3, .2d, .8b | |
583 add v16.8b, v16.8b, v0.8b | |
584 add v17.8b, v17.8b, v0.8b | |
585 add v18.8b, v18.8b, v0.8b | |
586 add v19.8b, v19.8b, v0.8b | |
587 transpose v20, v22, v3, .16b, .8h | |
588 /* Store results to the output buffer */ | 437 /* Store results to the output buffer */ |
589 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 438 st1 {v28.d}[0], [TMP1] |
590 add TMP1, TMP1, OUTPUT_COL | 439 st1 {v29.d}[0], [TMP2] |
591 add TMP2, TMP2, OUTPUT_COL | 440 st1 {v28.d}[1], [TMP3] |
592 st1 {v16.8b}, [TMP1] | 441 st1 {v29.d}[1], [TMP4] |
593 transpose_single v20, v21, v3, .2d, .8b | 442 st1 {v30.d}[0], [TMP5] |
594 st1 {v17.8b}, [TMP2] | 443 st1 {v31.d}[0], [TMP6] |
595 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 444 st1 {v30.d}[1], [TMP7] |
596 add TMP1, TMP1, OUTPUT_COL | 445 st1 {v31.d}[1], [TMP8] |
597 add TMP2, TMP2, OUTPUT_COL | 446 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 |
598 st1 {v18.8b}, [TMP1] | 447 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 |
599 add v20.8b, v20.8b, v0.8b | |
600 add v21.8b, v21.8b, v0.8b | |
601 st1 {v19.8b}, [TMP2] | |
602 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
603 ldp TMP3, TMP4, [OUTPUT_BUF] | |
604 add TMP1, TMP1, OUTPUT_COL | |
605 add TMP2, TMP2, OUTPUT_COL | |
606 add TMP3, TMP3, OUTPUT_COL | |
607 add TMP4, TMP4, OUTPUT_COL | |
608 transpose_single v22, v23, v3, .2d, .8b | |
609 st1 {v20.8b}, [TMP1] | |
610 add v22.8b, v22.8b, v0.8b | |
611 add v23.8b, v23.8b, v0.8b | |
612 st1 {v21.8b}, [TMP2] | |
613 st1 {v22.8b}, [TMP3] | |
614 st1 {v23.8b}, [TMP4] | |
615 ldr x15, [sp], 16 | |
616 ld1 {v0.8b - v3.8b}, [sp], 32 | |
617 ld1 {v4.8b - v7.8b}, [sp], 32 | |
618 ld1 {v8.8b - v11.8b}, [sp], 32 | |
619 ld1 {v12.8b - v15.8b}, [sp], 32 | |
620 ld1 {v16.8b - v19.8b}, [sp], 32 | |
621 ld1 {v20.8b - v23.8b}, [sp], 32 | |
622 ld1 {v24.8b - v27.8b}, [sp], 32 | |
623 ld1 {v28.8b - v31.8b}, [sp], 32 | |
624 blr x30 | 448 blr x30 |
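The much shorter epilogue on the new side is deliberate: AAPCS64 requires a callee to preserve only the low 64 bits of v8-v15, so saving and restoring {v8.8b-v15.8b} (64 bytes) is sufficient, where the old code conservatively spilled all 32 NEON registers plus x15 (272 bytes).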
625 | 449 |
626 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ | 450 .balign 16 |
627 | 451 2: |
628 /* Transpose left 4x8 half */ | 452 mul v3.8h, v3.8h, v19.8h |
629 transpose ROW6L, ROW7L, v3, .16b, .4h | 453 mul v4.8h, v4.8h, v20.8h |
630 transpose ROW2L, ROW3L, v3, .16b, .4h | 454 mul v5.8h, v5.8h, v21.8h |
631 transpose ROW0L, ROW1L, v3, .16b, .4h | 455 add TMP4, xzr, TMP2, LSL #32 |
632 transpose ROW4L, ROW5L, v3, .16b, .4h | 456 mul v6.8h, v6.8h, v22.8h |
633 shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ | 457 mul v7.8h, v7.8h, v23.8h |
634 transpose ROW1L, ROW3L, v3, .16b, .2s | 458 adds TMP3, xzr, TMP2, LSR #32 |
635 transpose ROW4L, ROW6L, v3, .16b, .2s | 459 mul v8.8h, v8.8h, v24.8h |
636 transpose ROW0L, ROW2L, v3, .16b, .2s | 460 mul v9.8h, v9.8h, v25.8h |
637 transpose ROW5L, ROW7L, v3, .16b, .2s | 461 b.ne 3f |
638 cmp x0, #0 | 462 /* Right AC coef is zero */ |
639 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ | 463 dup v15.2d, v10.d[1] |
640 | 464 /* Even part: reverse the even part of the forward DCT. */ |
641 /* Only row 0 is non-zero for the right 4x8 half */ | 465 add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ |
642 dup ROW1R.4h, ROW0R.4h[1] | 466 add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
643 dup ROW2R.4h, ROW0R.4h[2] | 467 sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
644 dup ROW3R.4h, ROW0R.4h[3] | 468 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ |
645 dup ROW4R.4h, ROW0R.4h[0] | 469 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ |
646 dup ROW5R.4h, ROW0R.4h[1] | 470 mov v20.16b, v18.16b /* tmp3 = z1 */ |
647 dup ROW6R.4h, ROW0R.4h[2] | 471 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ |
648 dup ROW7R.4h, ROW0R.4h[3] | 472 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ |
649 dup ROW0R.4h, ROW0R.4h[0] | 473 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ |
650 b 1b /* Go to 'normal' second pass */ | 474 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ |
651 | 475 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ |
652 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ | 476 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ |
653 ld1 {v2.4h}, [x15] /* reload constants */ | 477 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ |
654 smull v12.4s, ROW1L.4h, XFIX_1_175875602 | 478 |
655 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 | 479 /* Odd part per figure 8; the matrix is unitary and hence its |
656 smull v14.4s, ROW3L.4h, XFIX_1_175875602 | 480 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. |
657 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 | 481 */ |
658 smull v4.4s, ROW2L.4h, XFIX_0_541196100 | 482 |
659 sshll v6.4s, ROW0L.4h, #13 | 483 add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
660 mov v8.16b, v12.16b | 484 add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
661 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 | 485 add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
662 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 | 486 add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
663 add v2.4s, v6.4s, v4.4s | 487 add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */ |
664 mov v10.16b, v14.16b | 488 |
665 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 | 489 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ |
666 add v2.4s, v2.4s, v12.4s | 490 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ |
667 add v12.4s, v12.4s, v12.4s | 491 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ |
668 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 | 492 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ |
669 shrn ROW1L.4h, v2.4s, #16 | 493 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ |
670 sub v2.4s, v2.4s, v12.4s | 494 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ |
671 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 | 495 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ |
672 sub v6.4s, v6.4s, v4.4s | 496 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ |
673 shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | 497 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ |
674 add v2.4s, v6.4s, v10.4s | 498 |
675 sub v6.4s, v6.4s, v10.4s | 499 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ |
676 sshll v10.4s, ROW0L.4h, #13 | 500 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ |
677 shrn ROW2L.4h, v2.4s, #16 | 501 |
678 shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | 502 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ |
679 add v4.4s, v10.4s, v12.4s | 503 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ |
680 sub v2.4s, v10.4s, v12.4s | 504 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ |
681 add v12.4s, v4.4s, v14.4s | 505 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ |
682 sub v4.4s, v4.4s, v14.4s | 506 |
683 add v10.4s, v2.4s, v8.4s | 507 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ |
684 sub v6.4s, v2.4s, v8.4s | 508 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ |
685 shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | 509 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ |
686 shrn ROW3L.4h, v10.4s, #16 | 510 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ |
687 shrn ROW0L.4h, v12.4s, #16 | 511 |
688 shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | 512 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
689 /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ | 513 |
690 ld1 {v2.4h}, [x15] /* reload constants */ | 514 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ |
691 smull v12.4s, ROW5L.4h, XFIX_1_175875602 | 515 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ |
692 smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 | 516 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ |
693 smull v14.4s, ROW7L.4h, XFIX_1_175875602 | 517 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ |
694 smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 | 518 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ |
695 smull v4.4s, ROW6L.4h, XFIX_0_541196100 | 519 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ |
696 sshll v6.4s, ROW4L.4h, #13 | 520 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ |
697 mov v8.16b, v12.16b | 521 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ |
698 smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 | 522 |
699 smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 | 523 rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ |
700 add v2.4s, v6.4s, v4.4s | 524 rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ |
701 mov v10.16b, v14.16b | 525 rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ |
702 smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 | 526 rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ |
703 add v2.4s, v2.4s, v12.4s | 527 rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ |
704 add v12.4s, v12.4s, v12.4s | 528 rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ |
705 smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 | 529 rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ |
706 shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | 530 rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ |
707 sub v2.4s, v2.4s, v12.4s | 531 mov v6.16b, v15.16b |
708 smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 | 532 mov v7.16b, v15.16b |
709 sub v6.4s, v6.4s, v4.4s | 533 mov v8.16b, v15.16b |
710 shrn ROW6R.4h, v2.4s, #16 | 534 mov v9.16b, v15.16b |
711 add v2.4s, v6.4s, v10.4s | 535 b 1b |
712 sub v6.4s, v6.4s, v10.4s | 536 |
713 sshll v10.4s, ROW4L.4h, #13 | 537 .balign 16 |
714 shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | 538 3: |
715 shrn ROW5R.4h, v6.4s, #16 | 539 cbnz TMP4, 4f |
716 add v4.4s, v10.4s, v12.4s | 540 /* Left AC coef is zero */ |
717 sub v2.4s, v10.4s, v12.4s | 541 dup v14.2d, v10.d[0] |
718 add v12.4s, v4.4s, v14.4s | 542 /* Even part: reverse the even part of the forward DCT. */ |
719 sub v4.4s, v4.4s, v14.4s | 543 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ |
720 add v10.4s, v2.4s, v8.4s | 544 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
721 sub v6.4s, v2.4s, v8.4s | 545 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ |
722 shrn ROW7R.4h, v4.4s, #16 | 546 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
723 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | 547 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ |
724 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | 548 mov v21.16b, v19.16b /* tmp3 = z1 */ |
725 shrn ROW4R.4h, v6.4s, #16 | 549 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ |
726 b 2b /* Go to epilogue */ | 550 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ |
| 551 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ |
| 552 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ |
| 553 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ |
| 554 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ |
| 555 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ |
| 556 |
| 557 /* Odd part per figure 8; the matrix is unitary and hence its |
| 558 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. |
| 559 */ |
| 560 |
| 561 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
| 562 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
| 563 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
| 564 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
| 565 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ |
| 566 |
| 567 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0
_298631336) */ |
| 568 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2
_053119869) */ |
| 569 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3
_072711026) */ |
| 570 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1
_501321110) */ |
| 571 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4,
FIX_1_175875602) */ |
| 572 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_9
61570560) */ |
| 573 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_3
90180644) */ |
| 574 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_8
99976223) */ |
| 575 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_5
62915447) */ |
| 576 |
| 577 add v23.4s, v23.4s, v27.4s /* z3 += z5 */ |
| 578 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ |
| 579 add v25.4s, v25.4s, v27.4s /* z4 += z5 */ |
| 580 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ |
| 581 |
| 582 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ |
| 583 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ |
| 584 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ |
| 585 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ |
| 586 |
| 587 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ |
| 588 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ |
| 589 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ |
| 590 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ |
| 591 |
| 592 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
| 593 |
| 594 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ |
| 595 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ |
| 596 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ |
| 597 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ |
| 598 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ |
| 599 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ |
| 600 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ |
| 601 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ |
| 602 |
| 603 mov v2.16b, v14.16b |
| 604 mov v3.16b, v14.16b |
| 605 mov v4.16b, v14.16b |
| 606 mov v5.16b, v14.16b |
| 607 rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0]
= (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ |
| 608 rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1]
= (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ |
| 609 rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2]
= (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ |
| 610 rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3]
= (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ |
| 611 rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4]
= (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ |
| 612 rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5]
= (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ |
| 613 rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6]
= (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ |
| 614 rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7]
= (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ |
| 615 b 1b |
| 616 |
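In rough scalar form (a sketch with illustrative names only; the actual flag is carried in TMP4), the dispatch between the shortcut path above and the full path below is:

    /* Sketch of the zero-AC shortcut dispatch (names hypothetical). */
    if (left_half_ac_all_zero) {      /* label 3 taken with TMP4 == 0      */
      /* The left four columns reduce to their replicated DC-only value,   */
      /* so only the upper four lanes are computed (smull2/smlal2 forms).  */
      fill_left_outputs_with_dc();    /* dup v14.2d, v10.d[0] + mov v2..v5 */
      idct_columns_4_to_7();
    } else {
      idct_all_columns();             /* label 4: full even/odd parts      */
    }
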
| 617 .balign 16 |
| 618 4: |
| 619 /* "No" AC coef is zero, i.e. neither half of the block is all-zero */ |
| 620 /* Even part: reverse the even part of the forward DCT. */ |
| 621 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ |
| 622 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
| 623 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ |
| 624 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
| 625 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ |
| 626 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ |
| 627 mov v21.16b, v19.16b /* tmp3 = z1 */ |
| 628 mov v20.16b, v18.16b /* tmp3 = z1 */ |
| 629 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ |
| 630 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ |
| 631 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ |
| 632 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ |
| 633 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ |
| 634 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ |
| 635 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ |
| 636 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ |
| 637 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ |
| 638 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ |
| 639 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ |
| 640 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ |
| 641 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ |
| 642 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ |
| 643 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ |
| 644 |
| 645 /* Odd part per figure 8; the matrix is unitary and hence its |
| 646 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. |
| 647 */ |
| 648 |
| 649 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
| 650 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
| 651 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
| 652 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
| 653 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ |
| 654 |
| 655 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ |
| 656 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ |
| 657 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ |
| 658 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ |
| 659 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ |
| 660 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ |
| 661 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ |
| 662 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ |
| 663 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ |
| 664 |
| 665 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ |
| 666 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ |
| 667 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ |
| 668 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ |
| 669 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ |
| 670 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ |
| 671 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ |
| 672 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ |
| 673 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ |
| 674 |
| 675 add v23.4s, v23.4s, v27.4s /* z3 += z5 */ |
| 676 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ |
| 677 add v25.4s, v25.4s, v27.4s /* z4 += z5 */ |
| 678 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ |
| 679 |
| 680 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ |
| 681 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ |
| 682 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ |
| 683 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ |
| 684 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ |
| 685 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ |
| 686 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ |
| 687 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ |
| 688 |
| 689 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ |
| 690 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ |
| 691 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ |
| 692 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ |
| 693 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ |
| 694 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ |
| 695 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ |
| 696 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ |
| 697 |
| 698 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
| 699 |
| 700 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ |
| 701 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ |
| 702 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ |
| 703 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ |
| 704 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ |
| 705 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ |
| 706 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ |
| 707 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ |
| 708 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ |
| 709 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ |
| 710 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ |
| 711 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ |
| 712 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ |
| 713 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ |
| 714 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ |
| 715 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ |
| 716 |
| 717 rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ |
| 718 rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ |
| 719 rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ |
| 720 rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ |
| 721 rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ |
| 722 rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ |
| 723 rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ |
| 724 rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ |
| 725 rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ |
| 726 rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ |
| 727 rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ |
| 728 rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ |
| 729 rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ |
| 730 rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ |
| 731 rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ |
| 732 rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ |
| 733 b 1b |
727 | 734 |
728 .unreq DCT_TABLE | 735 .unreq DCT_TABLE |
729 .unreq COEF_BLOCK | 736 .unreq COEF_BLOCK |
730 .unreq OUTPUT_BUF | 737 .unreq OUTPUT_BUF |
731 .unreq OUTPUT_COL | 738 .unreq OUTPUT_COL |
732 .unreq TMP1 | 739 .unreq TMP1 |
733 .unreq TMP2 | 740 .unreq TMP2 |
734 .unreq TMP3 | 741 .unreq TMP3 |
735 .unreq TMP4 | 742 .unreq TMP4 |
| 743 .unreq TMP5 |
| 744 .unreq TMP6 |
| 745 .unreq TMP7 |
| 746 .unreq TMP8 |
736 | 747 |
737 .unreq ROW0L | 748 #undef CENTERJSAMPLE |
738 .unreq ROW0R | 749 #undef CONST_BITS |
739 .unreq ROW1L | 750 #undef PASS1_BITS |
740 .unreq ROW1R | 751 #undef XFIX_P_0_298 |
741 .unreq ROW2L | 752 #undef XFIX_N_0_390 |
742 .unreq ROW2R | 753 #undef XFIX_P_0_541 |
743 .unreq ROW3L | 754 #undef XFIX_P_0_765 |
744 .unreq ROW3R | 755 #undef XFIX_N_0_899 |
745 .unreq ROW4L | 756 #undef XFIX_P_1_175 |
746 .unreq ROW4R | 757 #undef XFIX_P_1_501 |
747 .unreq ROW5L | 758 #undef XFIX_N_1_847 |
748 .unreq ROW5R | 759 #undef XFIX_N_1_961 |
749 .unreq ROW6L | 760 #undef XFIX_P_2_053 |
750 .unreq ROW6R | 761 #undef XFIX_N_2_562 |
751 .unreq ROW7L | 762 #undef XFIX_P_3_072 |
752 .unreq ROW7R | |
753 | 763 |
754 | 764 |
755 /*****************************************************************************/ | 765 /*****************************************************************************/ |
756 | 766 |
757 /* | 767 /* |
758 * jsimd_idct_ifast_neon | 768 * jsimd_idct_ifast_neon |
759 * | 769 * |
760 * This function contains a fast, less accurate integer implementation of | 770 * This function contains a fast, less accurate integer implementation of |
761 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations | 771 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations |
762 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' | 772 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' |
763 * function from jidctfst.c. | 773 * function from jidctfst.c. |
764 * | 774 * |
765 * Normally, a 1-D AAN DCT needs 5 multiplications and 29 additions. | 775 * Normally, a 1-D AAN DCT needs 5 multiplications and 29 additions. |
766 * In the ARM NEON case, however, some extra additions are required because | 776 * In the ARM NEON case, however, some extra additions are required because |
767 * the VQDMULH instruction can't handle constants larger than 1. Expressions | 777 * the VQDMULH instruction can't handle constants larger than 1. Expressions |
768 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", | 778 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", |
769 * which introduces an extra addition. Overall, there are 6 extra additions | 779 * which introduces an extra addition. Overall, there are 6 extra additions |
770 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions. | 780 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions. |
771 */ | 781 */ |
772 | 782 |
773 #define XFIX_1_082392200 v0.4h[0] | 783 #define XFIX_1_082392200 v0.h[0] |
774 #define XFIX_1_414213562 v0.4h[1] | 784 #define XFIX_1_414213562 v0.h[1] |
775 #define XFIX_1_847759065 v0.4h[2] | 785 #define XFIX_1_847759065 v0.h[2] |
776 #define XFIX_2_613125930 v0.4h[3] | 786 #define XFIX_2_613125930 v0.h[3] |
777 | 787 |
778 .balign 16 | 788 .balign 16 |
779 jsimd_idct_ifast_neon_consts: | 789 Ljsimd_idct_ifast_neon_consts: |
780 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ | 790 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ |
781 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ | 791 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ |
782 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ | 792 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ |
783 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ | 793 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ |
784 | 794 |
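These encodings follow from the VQDMULH constraint described above: on signed 16-bit lanes, SQDMULH computes (2*a*b) >> 16, i.e. a Q15 multiply by b/32768, so only the fractional part of each constant is stored and the integer part is restored with plain additions. For instance, (277 * 128 - 256 * 128) == 2688, and 2688/32768 == 0.08203125, an approximation of 0.082392200. A scalar C sketch (illustration only; saturation omitted):

    #include <stdint.h>

    /* What sqdmulh does per signed 16-bit lane, minus the saturation. */
    static int16_t sqdmulh16(int16_t a, int16_t b)
    {
      return (int16_t)(((int32_t)a * b * 2) >> 16);  /* == (a * b) >> 15 */
    }

    /* "x * 1.082392200" via the XFIX_1_082392200 entry above. */
    static int16_t mul_1_082392200(int16_t x)
    {
      return (int16_t)(x + sqdmulh16(x, 277 * 128 - 256 * 128));
    }
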
785 asm_function jsimd_idct_ifast_neon | 795 asm_function jsimd_idct_ifast_neon |
786 | 796 |
787 DCT_TABLE .req x0 | 797 DCT_TABLE .req x0 |
788 COEF_BLOCK .req x1 | 798 COEF_BLOCK .req x1 |
789 OUTPUT_BUF .req x2 | 799 OUTPUT_BUF .req x2 |
790 OUTPUT_COL .req x3 | 800 OUTPUT_COL .req x3 |
791 TMP1 .req x0 | 801 TMP1 .req x0 |
792 TMP2 .req x1 | 802 TMP2 .req x1 |
793 TMP3 .req x2 | 803 TMP3 .req x9 |
794 TMP4 .req x22 | 804 TMP4 .req x10 |
795 TMP5 .req x23 | 805 TMP5 .req x11 |
| 806 TMP6 .req x12 |
| 807 TMP7 .req x13 |
| 808 TMP8 .req x14 |
796 | 809 |
797 /* Load and dequantize coefficients into NEON registers | 810 /* Load and dequantize coefficients into NEON registers |
798 * with the following allocation: | 811 * with the following allocation: |
799 * 0 1 2 3 | 4 5 6 7 | 812 * 0 1 2 3 | 4 5 6 7 |
800 * ---------+-------- | 813 * ---------+-------- |
801 * 0 | d16 | d17 ( v8.8h ) | 814 * 0 | d16 | d17 ( v16.8h ) |
802 * 1 | d18 | d19 ( v9.8h ) | 815 * 1 | d18 | d19 ( v17.8h ) |
803 * 2 | d20 | d21 ( v10.8h ) | 816 * 2 | d20 | d21 ( v18.8h ) |
804 * 3 | d22 | d23 ( v11.8h ) | 817 * 3 | d22 | d23 ( v19.8h ) |
805 * 4 | d24 | d25 ( v12.8h ) | 818 * 4 | d24 | d25 ( v20.8h ) |
806 * 5 | d26 | d27 ( v13.8h ) | 819 * 5 | d26 | d27 ( v21.8h ) |
807 * 6 | d28 | d29 ( v14.8h ) | 820 * 6 | d28 | d29 ( v22.8h ) |
808 * 7 | d30 | d31 ( v15.8h ) | 821 * 7 | d30 | d31 ( v23.8h ) |
809 */ | 822 */ |
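(This allocation is what lets the rewritten version on the right drop the old stack save/restore entirely: under AAPCS64 only the low 64 bits of v8-v15 are callee-saved, and the new code keeps everything in v0-v7 and v16-v31.)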
810 /* Save NEON registers used in fast IDCT */ | 823 /* Save NEON registers used in fast IDCT */ |
811 sub sp, sp, #176 | 824 adr TMP5, Ljsimd_idct_ifast_neon_consts |
812 stp x22, x23, [sp], 16 | 825 ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32 |
813 adr x23, jsimd_idct_ifast_neon_consts | |
814 st1 {v0.8b - v3.8b}, [sp], 32 | |
815 st1 {v4.8b - v7.8b}, [sp], 32 | |
816 st1 {v8.8b - v11.8b}, [sp], 32 | |
817 st1 {v12.8b - v15.8b}, [sp], 32 | |
818 st1 {v16.8b - v19.8b}, [sp], 32 | |
819 ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32 | |
820 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 | 826 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 |
821 ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32 | 827 ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32 |
822 mul v8.8h, v8.8h, v0.8h | 828 mul v16.8h, v16.8h, v0.8h |
823 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 | 829 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 |
824 mul v9.8h, v9.8h, v1.8h | 830 mul v17.8h, v17.8h, v1.8h |
825 ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32 | 831 ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32 |
826 mul v10.8h, v10.8h, v2.8h | 832 mul v18.8h, v18.8h, v2.8h |
827 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 | 833 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 |
828 mul v11.8h, v11.8h, v3.8h | 834 mul v19.8h, v19.8h, v3.8h |
829 ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32 | 835 ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32 |
830 mul v12.8h, v12.8h, v0.8h | 836 mul v20.8h, v20.8h, v0.8h |
831 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 | 837 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 |
832 mul v14.8h, v14.8h, v2.8h | 838 mul v22.8h, v22.8h, v2.8h |
833 mul v13.8h, v13.8h, v1.8h | 839 mul v21.8h, v21.8h, v1.8h |
834 ld1 {v0.4h}, [x23] /* load constants */ | 840 ld1 {v0.4h}, [TMP5] /* load constants */ |
835 mul v15.8h, v15.8h, v3.8h | 841 mul v23.8h, v23.8h, v3.8h |
836 | 842 |
837 /* 1-D IDCT, pass 1 */ | 843 /* 1-D IDCT, pass 1 */ |
838 sub v2.8h, v10.8h, v14.8h | 844 sub v2.8h, v18.8h, v22.8h |
839 add v14.8h, v10.8h, v14.8h | 845 add v22.8h, v18.8h, v22.8h |
840 sub v1.8h, v11.8h, v13.8h | 846 sub v1.8h, v19.8h, v21.8h |
841 add v13.8h, v11.8h, v13.8h | 847 add v21.8h, v19.8h, v21.8h |
842 sub v5.8h, v9.8h, v15.8h | 848 sub v5.8h, v17.8h, v23.8h |
843 add v15.8h, v9.8h, v15.8h | 849 add v23.8h, v17.8h, v23.8h |
844 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 | 850 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 |
845 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 | 851 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 |
846 add v3.8h, v1.8h, v1.8h | 852 add v3.8h, v1.8h, v1.8h |
847 sub v1.8h, v5.8h, v1.8h | 853 sub v1.8h, v5.8h, v1.8h |
848 add v10.8h, v2.8h, v4.8h | 854 add v18.8h, v2.8h, v4.8h |
849 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 | 855 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 |
850 sub v2.8h, v15.8h, v13.8h | 856 sub v2.8h, v23.8h, v21.8h |
851 add v3.8h, v3.8h, v6.8h | 857 add v3.8h, v3.8h, v6.8h |
852 sqdmulh v6.8h, v2.8h, XFIX_1_414213562 | 858 sqdmulh v6.8h, v2.8h, XFIX_1_414213562 |
853 add v1.8h, v1.8h, v4.8h | 859 add v1.8h, v1.8h, v4.8h |
854 sqdmulh v4.8h, v5.8h, XFIX_1_082392200 | 860 sqdmulh v4.8h, v5.8h, XFIX_1_082392200 |
855 sub v10.8h, v10.8h, v14.8h | 861 sub v18.8h, v18.8h, v22.8h |
856 add v2.8h, v2.8h, v6.8h | 862 add v2.8h, v2.8h, v6.8h |
857 sub v6.8h, v8.8h, v12.8h | 863 sub v6.8h, v16.8h, v20.8h |
858 add v12.8h, v8.8h, v12.8h | 864 add v20.8h, v16.8h, v20.8h |
859 add v9.8h, v5.8h, v4.8h | 865 add v17.8h, v5.8h, v4.8h |
860 add v5.8h, v6.8h, v10.8h | 866 add v5.8h, v6.8h, v18.8h |
861 sub v10.8h, v6.8h, v10.8h | 867 sub v18.8h, v6.8h, v18.8h |
862 add v6.8h, v15.8h, v13.8h | 868 add v6.8h, v23.8h, v21.8h |
863 add v8.8h, v12.8h, v14.8h | 869 add v16.8h, v20.8h, v22.8h |
864 sub v3.8h, v6.8h, v3.8h | 870 sub v3.8h, v6.8h, v3.8h |
865 sub v12.8h, v12.8h, v14.8h | 871 sub v20.8h, v20.8h, v22.8h |
866 sub v3.8h, v3.8h, v1.8h | 872 sub v3.8h, v3.8h, v1.8h |
867 sub v1.8h, v9.8h, v1.8h | 873 sub v1.8h, v17.8h, v1.8h |
868 add v2.8h, v3.8h, v2.8h | 874 add v2.8h, v3.8h, v2.8h |
869 sub v15.8h, v8.8h, v6.8h | 875 sub v23.8h, v16.8h, v6.8h |
870 add v1.8h, v1.8h, v2.8h | 876 add v1.8h, v1.8h, v2.8h |
871 add v8.8h, v8.8h, v6.8h | 877 add v16.8h, v16.8h, v6.8h |
872 add v14.8h, v5.8h, v3.8h | 878 add v22.8h, v5.8h, v3.8h |
873 sub v9.8h, v5.8h, v3.8h | 879 sub v17.8h, v5.8h, v3.8h |
874 sub v13.8h, v10.8h, v2.8h | 880 sub v21.8h, v18.8h, v2.8h |
875 add v10.8h, v10.8h, v2.8h | 881 add v18.8h, v18.8h, v2.8h |
876 /* Transpose q8-q9 */ | 882 sub v19.8h, v20.8h, v1.8h |
877 mov v18.16b, v8.16b | 883 add v20.8h, v20.8h, v1.8h |
878 trn1 v8.8h, v8.8h, v9.8h | 884 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31 |
879 trn2 v9.8h, v18.8h, v9.8h | 885 /* 1-D IDCT, pass 2 */ |
880 sub v11.8h, v12.8h, v1.8h | 886 sub v2.8h, v18.8h, v22.8h |
881 /* Transpose q14-q15 */ | 887 add v22.8h, v18.8h, v22.8h |
882 mov v18.16b, v14.16b | 888 sub v1.8h, v19.8h, v21.8h |
883 trn1 v14.8h, v14.8h, v15.8h | 889 add v21.8h, v19.8h, v21.8h |
884 trn2 v15.8h, v18.8h, v15.8h | 890 sub v5.8h, v17.8h, v23.8h |
885 add v12.8h, v12.8h, v1.8h | 891 add v23.8h, v17.8h, v23.8h |
886 /* Transpose q10-q11 */ | 892 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 |
887 mov v18.16b, v10.16b | 893 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 |
888 trn1 v10.8h, v10.8h, v11.8h | 894 add v3.8h, v1.8h, v1.8h |
889 trn2 v11.8h, v18.8h, v11.8h | 895 sub v1.8h, v5.8h, v1.8h |
890 /* Transpose q12-q13 */ | 896 add v18.8h, v2.8h, v4.8h |
891 mov v18.16b, v12.16b | 897 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 |
892 trn1 v12.8h, v12.8h, v13.8h | 898 sub v2.8h, v23.8h, v21.8h |
893 trn2 v13.8h, v18.8h, v13.8h | 899 add v3.8h, v3.8h, v6.8h |
894 /* Transpose q9-q11 */ | 900 sqdmulh v6.8h, v2.8h, XFIX_1_414213562 |
895 mov v18.16b, v9.16b | 901 add v1.8h, v1.8h, v4.8h |
896 trn1 v9.4s, v9.4s, v11.4s | 902 sqdmulh v4.8h, v5.8h, XFIX_1_082392200 |
897 trn2 v11.4s, v18.4s, v11.4s | 903 sub v18.8h, v18.8h, v22.8h |
898 /* Transpose q12-q14 */ | 904 add v2.8h, v2.8h, v6.8h |
899 mov v18.16b, v12.16b | 905 sub v6.8h, v16.8h, v20.8h |
900 trn1 v12.4s, v12.4s, v14.4s | 906 add v20.8h, v16.8h, v20.8h |
901 trn2 v14.4s, v18.4s, v14.4s | 907 add v17.8h, v5.8h, v4.8h |
902 /* Transpose q8-q10 */ | 908 add v5.8h, v6.8h, v18.8h |
903 mov v18.16b, v8.16b | 909 sub v18.8h, v6.8h, v18.8h |
904 trn1 v8.4s, v8.4s, v10.4s | 910 add v6.8h, v23.8h, v21.8h |
905 trn2 v10.4s, v18.4s, v10.4s | 911 add v16.8h, v20.8h, v22.8h |
906 /* Transpose q13-q15 */ | 912 sub v3.8h, v6.8h, v3.8h |
907 mov v18.16b, v13.16b | 913 sub v20.8h, v20.8h, v22.8h |
908 trn1 v13.4s, v13.4s, v15.4s | 914 sub v3.8h, v3.8h, v1.8h |
909 trn2 v15.4s, v18.4s, v15.4s | 915 sub v1.8h, v17.8h, v1.8h |
910 /* vswp v14.4h, v10-MSB.4h */ | 916 add v2.8h, v3.8h, v2.8h |
911 umov x22, v14.d[0] | 917 sub v23.8h, v16.8h, v6.8h |
912 ins v14.2d[0], v10.2d[1] | 918 add v1.8h, v1.8h, v2.8h |
913 ins v10.2d[1], x22 | 919 add v16.8h, v16.8h, v6.8h |
914 /* vswp v13.4h, v9MSB.4h */ | 920 add v22.8h, v5.8h, v3.8h |
| 921 sub v17.8h, v5.8h, v3.8h |
| 922 sub v21.8h, v18.8h, v2.8h |
| 923 add v18.8h, v18.8h, v2.8h |
| 924 sub v19.8h, v20.8h, v1.8h |
| 925 add v20.8h, v20.8h, v1.8h |
| 926 /* Descale to 8-bit and range limit */ |
| 927 movi v0.16b, #0x80 |
| 928 /* Prepare pointers (dual-issue with NEON instructions) */ |
| 929 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 930 sqshrn v28.8b, v16.8h, #5 |
| 931 ldp TMP3, TMP4, [OUTPUT_BUF], 16 |
| 932 sqshrn v29.8b, v17.8h, #5 |
| 933 add TMP1, TMP1, OUTPUT_COL |
| 934 sqshrn v30.8b, v18.8h, #5 |
| 935 add TMP2, TMP2, OUTPUT_COL |
| 936 sqshrn v31.8b, v19.8h, #5 |
| 937 add TMP3, TMP3, OUTPUT_COL |
| 938 sqshrn2 v28.16b, v20.8h, #5 |
| 939 add TMP4, TMP4, OUTPUT_COL |
| 940 sqshrn2 v29.16b, v21.8h, #5 |
| 941 ldp TMP5, TMP6, [OUTPUT_BUF], 16 |
| 942 sqshrn2 v30.16b, v22.8h, #5 |
| 943 ldp TMP7, TMP8, [OUTPUT_BUF], 16 |
| 944 sqshrn2 v31.16b, v23.8h, #5 |
| 945 add TMP5, TMP5, OUTPUT_COL |
| 946 add v16.16b, v28.16b, v0.16b |
| 947 add TMP6, TMP6, OUTPUT_COL |
| 948 add v18.16b, v29.16b, v0.16b |
| 949 add TMP7, TMP7, OUTPUT_COL |
| 950 add v20.16b, v30.16b, v0.16b |
| 951 add TMP8, TMP8, OUTPUT_COL |
| 952 add v22.16b, v31.16b, v0.16b |
915 | 953 |
916 umov x22, v13.d[0] | |
917 ins v13.2d[0], v9.2d[1] | |
918 ins v9.2d[1], x22 | |
919 /* 1-D IDCT, pass 2 */ | |
920 sub v2.8h, v10.8h, v14.8h | |
921 /* vswp v15.4h, v11MSB.4h */ | |
922 umov x22, v15.d[0] | |
923 ins v15.2d[0], v11.2d[1] | |
924 ins v11.2d[1], x22 | |
925 add v14.8h, v10.8h, v14.8h | |
926 /* vswp v12.4h, v8-MSB.4h */ | |
927 umov x22, v12.d[0] | |
928 ins v12.2d[0], v8.2d[1] | |
929 ins v8.2d[1], x22 | |
930 sub v1.8h, v11.8h, v13.8h | |
931 add v13.8h, v11.8h, v13.8h | |
932 sub v5.8h, v9.8h, v15.8h | |
933 add v15.8h, v9.8h, v15.8h | |
934 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 | |
935 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 | |
936 add v3.8h, v1.8h, v1.8h | |
937 sub v1.8h, v5.8h, v1.8h | |
938 add v10.8h, v2.8h, v4.8h | |
939 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 | |
940 sub v2.8h, v15.8h, v13.8h | |
941 add v3.8h, v3.8h, v6.8h | |
942 sqdmulh v6.8h, v2.8h, XFIX_1_414213562 | |
943 add v1.8h, v1.8h, v4.8h | |
944 sqdmulh v4.8h, v5.8h, XFIX_1_082392200 | |
945 sub v10.8h, v10.8h, v14.8h | |
946 add v2.8h, v2.8h, v6.8h | |
947 sub v6.8h, v8.8h, v12.8h | |
948 add v12.8h, v8.8h, v12.8h | |
949 add v9.8h, v5.8h, v4.8h | |
950 add v5.8h, v6.8h, v10.8h | |
951 sub v10.8h, v6.8h, v10.8h | |
952 add v6.8h, v15.8h, v13.8h | |
953 add v8.8h, v12.8h, v14.8h | |
954 sub v3.8h, v6.8h, v3.8h | |
955 sub v12.8h, v12.8h, v14.8h | |
956 sub v3.8h, v3.8h, v1.8h | |
957 sub v1.8h, v9.8h, v1.8h | |
958 add v2.8h, v3.8h, v2.8h | |
959 sub v15.8h, v8.8h, v6.8h | |
960 add v1.8h, v1.8h, v2.8h | |
961 add v8.8h, v8.8h, v6.8h | |
962 add v14.8h, v5.8h, v3.8h | |
963 sub v9.8h, v5.8h, v3.8h | |
964 sub v13.8h, v10.8h, v2.8h | |
965 add v10.8h, v10.8h, v2.8h | |
966 sub v11.8h, v12.8h, v1.8h | |
967 add v12.8h, v12.8h, v1.8h | |
968 /* Descale to 8-bit and range limit */ | |
969 movi v0.16b, #0x80 | |
970 sqshrn v8.8b, v8.8h, #5 | |
971 sqshrn2 v8.16b, v9.8h, #5 | |
972 sqshrn v9.8b, v10.8h, #5 | |
973 sqshrn2 v9.16b, v11.8h, #5 | |
974 sqshrn v10.8b, v12.8h, #5 | |
975 sqshrn2 v10.16b, v13.8h, #5 | |
976 sqshrn v11.8b, v14.8h, #5 | |
977 sqshrn2 v11.16b, v15.8h, #5 | |
978 add v8.16b, v8.16b, v0.16b | |
979 add v9.16b, v9.16b, v0.16b | |
980 add v10.16b, v10.16b, v0.16b | |
981 add v11.16b, v11.16b, v0.16b | |
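Per sample, the descale-and-range-limit step above (present in both columns) amounts to the following scalar sketch, assuming the shift of 5 corresponds to jidctfst.c's PASS1_BITS + 3 with PASS1_BITS == 2:

    #include <stdint.h>

    static uint8_t descale_and_range_limit(int16_t x)
    {
      int32_t v = x >> 5;         /* sqshrn #5: arithmetic shift right...  */
      if (v < -128) v = -128;     /* ...with signed saturation to 8 bits   */
      if (v >  127) v =  127;
      return (uint8_t)(v + 128);  /* add #0x80: recenter into 0..255       */
    }
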
982 /* Transpose the final 8-bit samples */ | 954 /* Transpose the final 8-bit samples */ |
983 /* Transpose q8-q9 */ | 955 trn1 v28.16b, v16.16b, v18.16b |
984 mov v18.16b, v8.16b | 956 trn1 v30.16b, v20.16b, v22.16b |
985 trn1 v8.8h, v8.8h, v9.8h | 957 trn2 v29.16b, v16.16b, v18.16b |
986 trn2 v9.8h, v18.8h, v9.8h | 958 trn2 v31.16b, v20.16b, v22.16b |
987 /* Transpose q10-q11 */ | 959 |
988 mov v18.16b, v10.16b | 960 trn1 v16.8h, v28.8h, v30.8h |
989 trn1 v10.8h, v10.8h, v11.8h | 961 trn2 v18.8h, v28.8h, v30.8h |
990 trn2 v11.8h, v18.8h, v11.8h | 962 trn1 v20.8h, v29.8h, v31.8h |
991 /* Transpose q8-q10 */ | 963 trn2 v22.8h, v29.8h, v31.8h |
992 mov v18.16b, v8.16b | 964 |
993 trn1 v8.4s, v8.4s, v10.4s | 965 uzp1 v28.4s, v16.4s, v18.4s |
994 trn2 v10.4s, v18.4s, v10.4s | 966 uzp2 v30.4s, v16.4s, v18.4s |
995 /* Transpose q9-q11 */ | 967 uzp1 v29.4s, v20.4s, v22.4s |
996 mov v18.16b, v9.16b | 968 uzp2 v31.4s, v20.4s, v22.4s |
997 trn1 v9.4s, v9.4s, v11.4s | 969 |
998 trn2 v11.4s, v18.4s, v11.4s | |
999 /* make copy */ | |
1000 ins v17.2d[0], v8.2d[1] | |
1001 /* Transpose d16-d17-msb */ | |
1002 mov v18.16b, v8.16b | |
1003 trn1 v8.8b, v8.8b, v17.8b | |
1004 trn2 v17.8b, v18.8b, v17.8b | |
1005 /* make copy */ | |
1006 ins v19.2d[0], v9.2d[1] | |
1007 mov v18.16b, v9.16b | |
1008 trn1 v9.8b, v9.8b, v19.8b | |
1009 trn2 v19.8b, v18.8b, v19.8b | |
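The right-hand column replaces the old pairwise trn/ins shuffling with a three-stage trn1/trn2 (bytes), trn1/trn2 (halfwords), uzp1/uzp2 (words) network; with two 8-sample rows per vector, its net effect is an ordinary 8x8 byte-matrix transpose (reference sketch only):

    #include <stdint.h>

    static void transpose_8x8_ref(const uint8_t in[8][8], uint8_t out[8][8])
    {
      for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++)
          out[r][c] = in[c][r];   /* columns of pass-2 results become the  */
    }                             /* pixel rows stored through TMP1..TMP8  */
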
1010 /* Store results to the output buffer */ | 970 /* Store results to the output buffer */ |
1011 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 971 st1 {v28.d}[0], [TMP1] |
1012 add TMP1, TMP1, OUTPUT_COL | 972 st1 {v29.d}[0], [TMP2] |
1013 add TMP2, TMP2, OUTPUT_COL | 973 st1 {v28.d}[1], [TMP3] |
1014 st1 {v8.8b}, [TMP1] | 974 st1 {v29.d}[1], [TMP4] |
1015 st1 {v17.8b}, [TMP2] | 975 st1 {v30.d}[0], [TMP5] |
1016 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 976 st1 {v31.d}[0], [TMP6] |
1017 add TMP1, TMP1, OUTPUT_COL | 977 st1 {v30.d}[1], [TMP7] |
1018 add TMP2, TMP2, OUTPUT_COL | 978 st1 {v31.d}[1], [TMP8] |
1019 st1 {v9.8b}, [TMP1] | |
1020 /* make copy */ | |
1021 ins v7.2d[0], v10.2d[1] | |
1022 mov v18.16b, v10.16b | |
1023 trn1 v10.8b, v10.8b, v7.8b | |
1024 trn2 v7.8b, v18.8b, v7.8b | |
1025 st1 {v19.8b}, [TMP2] | |
1026 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
1027 ldp TMP4, TMP5, [OUTPUT_BUF], 16 | |
1028 add TMP1, TMP1, OUTPUT_COL | |
1029 add TMP2, TMP2, OUTPUT_COL | |
1030 add TMP4, TMP4, OUTPUT_COL | |
1031 add TMP5, TMP5, OUTPUT_COL | |
1032 st1 {v10.8b}, [TMP1] | |
1033 /* make copy */ | |
1034 ins v16.2d[0], v11.2d[1] | |
1035 mov v18.16b, v11.16b | |
1036 trn1 v11.8b, v11.8b, v16.8b | |
1037 trn2 v16.8b, v18.8b, v16.8b | |
1038 st1 {v7.8b}, [TMP2] | |
1039 st1 {v11.8b}, [TMP4] | |
1040 st1 {v16.8b}, [TMP5] | |
1041 sub sp, sp, #176 | |
1042 ldp x22, x23, [sp], 16 | |
1043 ld1 {v0.8b - v3.8b}, [sp], 32 | |
1044 ld1 {v4.8b - v7.8b}, [sp], 32 | |
1045 ld1 {v8.8b - v11.8b}, [sp], 32 | |
1046 ld1 {v12.8b - v15.8b}, [sp], 32 | |
1047 ld1 {v16.8b - v19.8b}, [sp], 32 | |
1048 blr x30 | 979 blr x30 |
1049 | 980 |
1050 .unreq DCT_TABLE | 981 .unreq DCT_TABLE |
1051 .unreq COEF_BLOCK | 982 .unreq COEF_BLOCK |
1052 .unreq OUTPUT_BUF | 983 .unreq OUTPUT_BUF |
1053 .unreq OUTPUT_COL | 984 .unreq OUTPUT_COL |
1054 .unreq TMP1 | 985 .unreq TMP1 |
1055 .unreq TMP2 | 986 .unreq TMP2 |
1056 .unreq TMP3 | 987 .unreq TMP3 |
1057 .unreq TMP4 | 988 .unreq TMP4 |
| 989 .unreq TMP5 |
| 990 .unreq TMP6 |
| 991 .unreq TMP7 |
| 992 .unreq TMP8 |
1058 | 993 |
1059 | 994 |
1060 /*****************************************************************************/ | 995 /*****************************************************************************/ |
1061 | 996 |
1062 /* | 997 /* |
1063 * jsimd_idct_4x4_neon | 998 * jsimd_idct_4x4_neon |
1064 * | 999 * |
1065 * This function contains inverse-DCT code for getting reduced-size | 1000 * This function contains inverse-DCT code for getting reduced-size |
1066 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations | 1001 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations |
1067 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' | 1002 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' |
1068 * function from jpeg-6b (jidctred.c). | 1003 * function from jpeg-6b (jidctred.c). |
1069 * | 1004 * |
1070 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which | 1005 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which |
1071 * requires far fewer arithmetic operations and hence should be faster. | 1006 * requires far fewer arithmetic operations and hence should be faster. |
1072 * The primary purpose of this particular NEON optimized function is | 1007 * The primary purpose of this particular NEON optimized function is |
1073 * bit exact compatibility with jpeg-6b. | 1008 * bit exact compatibility with jpeg-6b. |
1074 * | 1009 * |
1075 * TODO: slightly better instruction scheduling can be achieved by expanding | 1010 * TODO: slightly better instruction scheduling can be achieved by expanding |
1076 * idct_helper/transpose_4x4 macros and reordering instructions, | 1011 * idct_helper/transpose_4x4 macros and reordering instructions, |
1077 * but readability will suffer somewhat. | 1012 * but readability will suffer somewhat. |
1078 */ | 1013 */ |
1079 | 1014 |
1080 #define CONST_BITS 13 | 1015 #define CONST_BITS 13 |
1081 | 1016 |
1082 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */ | 1017 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */ |
1083 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */ | 1018 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */ |
1084 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */ | 1019 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */ |
1085 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */ | 1020 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */ |
1086 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */ | 1021 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */ |
1087 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */ | 1022 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */ |
1088 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */ | 1023 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */ |
1089 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ | 1024 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ |
1090 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ | 1025 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ |
1091 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ | 1026 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ |
1092 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ | 1027 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ |
1093 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ | 1028 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ |
1094 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ | 1029 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ |
1095 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ | 1030 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ |
1096 | 1031 |
1097 .balign 16 | 1032 .balign 16 |
1098 jsimd_idct_4x4_neon_consts: | 1033 Ljsimd_idct_4x4_neon_consts: |
1099 .short FIX_1_847759065 /* v0.4h[0] */ | 1034 .short FIX_1_847759065 /* v0.h[0] */ |
1100 .short -FIX_0_765366865 /* v0.4h[1] */ | 1035 .short -FIX_0_765366865 /* v0.h[1] */ |
1101 .short -FIX_0_211164243 /* v0.4h[2] */ | 1036 .short -FIX_0_211164243 /* v0.h[2] */ |
1102 .short FIX_1_451774981 /* v0.4h[3] */ | 1037 .short FIX_1_451774981 /* v0.h[3] */ |
1103 .short -FIX_2_172734803 /* d1[0] */ | 1038 .short -FIX_2_172734803 /* d1[0] */ |
1104 .short FIX_1_061594337 /* d1[1] */ | 1039 .short FIX_1_061594337 /* d1[1] */ |
1105 .short -FIX_0_509795579 /* d1[2] */ | 1040 .short -FIX_0_509795579 /* d1[2] */ |
1106 .short -FIX_0_601344887 /* d1[3] */ | 1041 .short -FIX_0_601344887 /* d1[3] */ |
1107 .short FIX_0_899976223 /* v2.4h[0] */ | 1042 .short FIX_0_899976223 /* v2.h[0] */ |
1108 .short FIX_2_562915447 /* v2.4h[1] */ | 1043 .short FIX_2_562915447 /* v2.h[1] */ |
1109 .short 1 << (CONST_BITS+1) /* v2.4h[2] */ | 1044 .short 1 << (CONST_BITS+1) /* v2.h[2] */ |
1110 .short 0 /* v2.4h[3] */ | 1045 .short 0 /* v2.h[3] */ |
1111 | 1046 |
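As elsewhere in libjpeg, FIX(x) denotes round(x * 2^CONST_BITS), with CONST_BITS == 13 here; for example:

    FIX(0.211164243) = round(0.211164243 * 8192) = round(1729.86)  = 1730
    FIX(2.562915447) = round(2.562915447 * 8192) = round(20995.40) = 20995
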
1112 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 | 1047 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 |
1113 smull v28.4s, \x4, v2.4h[2] | 1048 smull v28.4s, \x4, v2.h[2] |
1114 smlal v28.4s, \x8, v0.4h[0] | 1049 smlal v28.4s, \x8, v0.h[0] |
1115 smlal v28.4s, \x14, v0.4h[1] | 1050 smlal v28.4s, \x14, v0.h[1] |
1116 | 1051 |
1117 smull v26.4s, \x16, v1.4h[2] | 1052 smull v26.4s, \x16, v1.h[2] |
1118 smlal v26.4s, \x12, v1.4h[3] | 1053 smlal v26.4s, \x12, v1.h[3] |
1119 smlal v26.4s, \x10, v2.4h[0] | 1054 smlal v26.4s, \x10, v2.h[0] |
1120 smlal v26.4s, \x6, v2.4h[1] | 1055 smlal v26.4s, \x6, v2.h[1] |
1121 | 1056 |
1122 smull v30.4s, \x4, v2.4h[2] | 1057 smull v30.4s, \x4, v2.h[2] |
1123 smlsl v30.4s, \x8, v0.4h[0] | 1058 smlsl v30.4s, \x8, v0.h[0] |
1124 smlsl v30.4s, \x14, v0.4h[1] | 1059 smlsl v30.4s, \x14, v0.h[1] |
1125 | 1060 |
1126 smull v24.4s, \x16, v0.4h[2] | 1061 smull v24.4s, \x16, v0.h[2] |
1127 smlal v24.4s, \x12, v0.4h[3] | 1062 smlal v24.4s, \x12, v0.h[3] |
1128 smlal v24.4s, \x10, v1.4h[0] | 1063 smlal v24.4s, \x10, v1.h[0] |
1129 smlal v24.4s, \x6, v1.4h[1] | 1064 smlal v24.4s, \x6, v1.h[1] |
1130 | 1065 |
1131 add v20.4s, v28.4s, v26.4s | 1066 add v20.4s, v28.4s, v26.4s |
1132 sub v28.4s, v28.4s, v26.4s | 1067 sub v28.4s, v28.4s, v26.4s |
1133 | 1068 |
1134 .if \shift > 16 | 1069 .if \shift > 16 |
1135 srshr v20.4s, v20.4s, #\shift | 1070 srshr v20.4s, v20.4s, #\shift |
1136 srshr v28.4s, v28.4s, #\shift | 1071 srshr v28.4s, v28.4s, #\shift |
1137 xtn \y26, v20.4s | 1072 xtn \y26, v20.4s |
1138 xtn \y29, v28.4s | 1073 xtn \y29, v28.4s |
1139 .else | 1074 .else |
1140 rshrn \y26, v20.4s, #\shift | 1075 rshrn \y26, v20.4s, #\shift |
1141 rshrn \y29, v28.4s, #\shift | 1076 rshrn \y29, v28.4s, #\shift |
1142 .endif | 1077 .endif |
1143 | 1078 |
1144 add v20.4s, v30.4s, v24.4s | 1079 add v20.4s, v30.4s, v24.4s |
1145 sub v30.4s, v30.4s, v24.4s | 1080 sub v30.4s, v30.4s, v24.4s |
1146 | 1081 |
1147 .if \shift > 16 | 1082 .if \shift > 16 |
1148 srshr v20.4s, v20.4s, #\shift | 1083 srshr v20.4s, v20.4s, #\shift |
1149 srshr v30.4s, v30.4s, #\shift | 1084 srshr v30.4s, v30.4s, #\shift |
1150 xtn \y27, v20.4s | 1085 xtn \y27, v20.4s |
1151 xtn \y28, v30.4s | 1086 xtn \y28, v30.4s |
1152 .else | 1087 .else |
1153 rshrn \y27, v20.4s, #\shift | 1088 rshrn \y27, v20.4s, #\shift |
1154 rshrn \y28, v30.4s, #\shift | 1089 rshrn \y28, v30.4s, #\shift |
1155 .endif | 1090 .endif |
1156 | |
1157 .endm | 1091 .endm |
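The \shift > 16 split above exists only because a narrowing rounding shift such as rshrn accepts immediates in the range 1..16 when narrowing 32-bit lanes to 16-bit ones: pass 1 of this function descales by 12, which fits, but pass 2 descales by 19 and must therefore use a full-width rounding shift (srshr) followed by a separate narrow (xtn). Both branches compute libjpeg's DESCALE, i.e. in C:

    #include <stdint.h>

    /* DESCALE(x, n) from jpegint.h: right shift with round-to-nearest */
    static int16_t descale(int32_t x, int n)
    {
      return (int16_t)((x + ((int32_t)1 << (n - 1))) >> n);
    }
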
1158 | 1092 |
1159 asm_function jsimd_idct_4x4_neon | 1093 asm_function jsimd_idct_4x4_neon |
1160 | 1094 |
1161 DCT_TABLE .req x0 | 1095 DCT_TABLE .req x0 |
1162 COEF_BLOCK .req x1 | 1096 COEF_BLOCK .req x1 |
1163 OUTPUT_BUF .req x2 | 1097 OUTPUT_BUF .req x2 |
1164 OUTPUT_COL .req x3 | 1098 OUTPUT_COL .req x3 |
1165 TMP1 .req x0 | 1099 TMP1 .req x0 |
1166 TMP2 .req x1 | 1100 TMP2 .req x1 |
1167 TMP3 .req x2 | 1101 TMP3 .req x2 |
1168 TMP4 .req x15 | 1102 TMP4 .req x15 |
1169 | 1103 |
1170 /* Save all used NEON registers */ | 1104 /* Save all used NEON registers */ |
1171 sub sp, sp, 272 | 1105 sub sp, sp, 272 |
1172 str x15, [sp], 16 | 1106 str x15, [sp], 16 |
1173 /* Load constants (v3.4h is just used for padding) */ | 1107 /* Load constants (v3.4h is just used for padding) */ |
1174 adr TMP4, jsimd_idct_4x4_neon_consts | 1108 adr TMP4, Ljsimd_idct_4x4_neon_consts |
1175 st1 {v0.8b - v3.8b}, [sp], 32 | 1109 st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 |
1176 st1 {v4.8b - v7.8b}, [sp], 32 | 1110 st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 |
1177 st1 {v8.8b - v11.8b}, [sp], 32 | 1111 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
1178 st1 {v12.8b - v15.8b}, [sp], 32 | 1112 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
1179 st1 {v16.8b - v19.8b}, [sp], 32 | 1113 st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 |
1180 st1 {v20.8b - v23.8b}, [sp], 32 | 1114 st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 |
1181 st1 {v24.8b - v27.8b}, [sp], 32 | 1115 st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 |
1182 st1 {v28.8b - v31.8b}, [sp], 32 | 1116 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 |
1183 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] | 1117 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] |
1184 | 1118 |
1185 /* Load all COEF_BLOCK into NEON registers with the following allocation: | 1119 /* Load all COEF_BLOCK into NEON registers with the following allocation: |
1186 * 0 1 2 3 | 4 5 6 7 | 1120 * 0 1 2 3 | 4 5 6 7 |
1187 * ---------+-------- | 1121 * ---------+-------- |
1188 * 0 | v4.4h | v5.4h | 1122 * 0 | v4.4h | v5.4h |
1189 * 1 | v6.4h | v7.4h | 1123 * 1 | v6.4h | v7.4h |
1190 * 2 | v8.4h | v9.4h | 1124 * 2 | v8.4h | v9.4h |
1191 * 3 | v10.4h | v11.4h | 1125 * 3 | v10.4h | v11.4h |
1192 * 4 | - | - | 1126 * 4 | - | - |
1193 * 5 | v12.4h | v13.4h | 1127 * 5 | v12.4h | v13.4h |
1194 * 6 | v14.4h | v15.4h | 1128 * 6 | v14.4h | v15.4h |
1195 * 7 | v16.4h | v17.4h | 1129 * 7 | v16.4h | v17.4h |
1196 */ | 1130 */ |
1197 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 | 1131 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 |
1198 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 | 1132 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 |
1199 add COEF_BLOCK, COEF_BLOCK, #16 | 1133 add COEF_BLOCK, COEF_BLOCK, #16 |
1200 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 | 1134 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 |
1201 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 | 1135 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 |
1202 /* dequantize */ | 1136 /* dequantize */ |
1203 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 | 1137 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 |
1204 mul v4.4h, v4.4h, v18.4h | 1138 mul v4.4h, v4.4h, v18.4h |
1205 mul v5.4h, v5.4h, v19.4h | 1139 mul v5.4h, v5.4h, v19.4h |
1206 ins v4.2d[1], v5.2d[0] /* 128 bit q4 */ | 1140 ins v4.d[1], v5.d[0] /* 128 bit q4 */ |
1207 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 | 1141 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 |
1208 mul v6.4h, v6.4h, v20.4h | 1142 mul v6.4h, v6.4h, v20.4h |
1209 mul v7.4h, v7.4h, v21.4h | 1143 mul v7.4h, v7.4h, v21.4h |
1210 ins v6.2d[1], v7.2d[0] /* 128 bit q6 */ | 1144 ins v6.d[1], v7.d[0] /* 128 bit q6 */ |
1211 mul v8.4h, v8.4h, v22.4h | 1145 mul v8.4h, v8.4h, v22.4h |
1212 mul v9.4h, v9.4h, v23.4h | 1146 mul v9.4h, v9.4h, v23.4h |
1213 ins v8.2d[1], v9.2d[0] /* 128 bit q8 */ | 1147 ins v8.d[1], v9.d[0] /* 128 bit q8 */ |
1214 add DCT_TABLE, DCT_TABLE, #16 | 1148 add DCT_TABLE, DCT_TABLE, #16 |
1215 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 | 1149 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 |
1216 mul v10.4h, v10.4h, v24.4h | 1150 mul v10.4h, v10.4h, v24.4h |
1217 mul v11.4h, v11.4h, v25.4h | 1151 mul v11.4h, v11.4h, v25.4h |
1218 ins v10.2d[1], v11.2d[0] /* 128 bit q10 */ | 1152 ins v10.d[1], v11.d[0] /* 128 bit q10 */ |
1219 mul v12.4h, v12.4h, v26.4h | 1153 mul v12.4h, v12.4h, v26.4h |
1220 mul v13.4h, v13.4h, v27.4h | 1154 mul v13.4h, v13.4h, v27.4h |
1221 ins v12.2d[1], v13.2d[0] /* 128 bit q12 */ | 1155 ins v12.d[1], v13.d[0] /* 128 bit q12 */ |
1222 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 | 1156 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 |
1223 mul v14.4h, v14.4h, v28.4h | 1157 mul v14.4h, v14.4h, v28.4h |
1224 mul v15.4h, v15.4h, v29.4h | 1158 mul v15.4h, v15.4h, v29.4h |
1225 ins v14.2d[1], v15.2d[0] /* 128 bit q14 */ | 1159 ins v14.d[1], v15.d[0] /* 128 bit q14 */ |
1226 mul v16.4h, v16.4h, v30.4h | 1160 mul v16.4h, v16.4h, v30.4h |
1227 mul v17.4h, v17.4h, v31.4h | 1161 mul v17.4h, v17.4h, v31.4h |
1228 ins v16.2d[1], v17.2d[0] /* 128 bit q16 */ | 1162 ins v16.d[1], v17.d[0] /* 128 bit q16 */ |
1229 | 1163 |
1230 /* Pass 1 */ | 1164 /* Pass 1 */ |
1231 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h | 1165 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \ |
| 1166 v4.4h, v6.4h, v8.4h, v10.4h |
1232 transpose_4x4 v4, v6, v8, v10, v3 | 1167 transpose_4x4 v4, v6, v8, v10, v3 |
1233 ins v10.2d[1], v11.2d[0] | 1168 ins v10.d[1], v11.d[0] |
1234 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h | 1169 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \ |
| 1170 v5.4h, v7.4h, v9.4h, v11.4h |
1235 transpose_4x4 v5, v7, v9, v11, v3 | 1171 transpose_4x4 v5, v7, v9, v11, v3 |
1236 ins v10.2d[1], v11.2d[0] | 1172 ins v10.d[1], v11.d[0] |
| 1173 |
1237 /* Pass 2 */ | 1174 /* Pass 2 */ |
1238 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h | 1175 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \ |
| 1176 v26.4h, v27.4h, v28.4h, v29.4h |
1239 transpose_4x4 v26, v27, v28, v29, v3 | 1177 transpose_4x4 v26, v27, v28, v29, v3 |
1240 | 1178 |
1241 /* Range limit */ | 1179 /* Range limit */ |
1242 movi v30.8h, #0x80 | 1180 movi v30.8h, #0x80 |
1243 ins v26.2d[1], v27.2d[0] | 1181 ins v26.d[1], v27.d[0] |
1244 ins v28.2d[1], v29.2d[0] | 1182 ins v28.d[1], v29.d[0] |
1245 add v26.8h, v26.8h, v30.8h | 1183 add v26.8h, v26.8h, v30.8h |
1246 add v28.8h, v28.8h, v30.8h | 1184 add v28.8h, v28.8h, v30.8h |
1247 sqxtun v26.8b, v26.8h | 1185 sqxtun v26.8b, v26.8h |
1248 sqxtun v27.8b, v28.8h | 1186 sqxtun v27.8b, v28.8h |
1249 | 1187 |
1250 /* Store results to the output buffer */ | 1188 /* Store results to the output buffer */ |
1251 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 1189 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
1252 ldp TMP3, TMP4, [OUTPUT_BUF] | 1190 ldp TMP3, TMP4, [OUTPUT_BUF] |
1253 add TMP1, TMP1, OUTPUT_COL | 1191 add TMP1, TMP1, OUTPUT_COL |
1254 add TMP2, TMP2, OUTPUT_COL | 1192 add TMP2, TMP2, OUTPUT_COL |
(...skipping 24 matching lines...) |
1279 st1 {v27.b}[5], [TMP4], 1 | 1217 st1 {v27.b}[5], [TMP4], 1 |
1280 st1 {v26.b}[6], [TMP2], 1 | 1218 st1 {v26.b}[6], [TMP2], 1 |
1281 st1 {v27.b}[6], [TMP4], 1 | 1219 st1 {v27.b}[6], [TMP4], 1 |
1282 st1 {v26.b}[7], [TMP2], 1 | 1220 st1 {v26.b}[7], [TMP2], 1 |
1283 st1 {v27.b}[7], [TMP4], 1 | 1221 st1 {v27.b}[7], [TMP4], 1 |
1284 #endif | 1222 #endif |
1285 | 1223 |
1286 /* vpop {v8.4h - v15.4h} ; not available */ | 1224 /* vpop {v8.4h - v15.4h} ; not available */ |
1287 sub sp, sp, #272 | 1225 sub sp, sp, #272 |
1288 ldr x15, [sp], 16 | 1226 ldr x15, [sp], 16 |
1289 ld1 {v0.8b - v3.8b}, [sp], 32 | 1227 ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 |
1290 ld1 {v4.8b - v7.8b}, [sp], 32 | 1228 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 |
1291 ld1 {v8.8b - v11.8b}, [sp], 32 | 1229 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
1292 ld1 {v12.8b - v15.8b}, [sp], 32 | 1230 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
1293 ld1 {v16.8b - v19.8b}, [sp], 32 | 1231 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 |
1294 ld1 {v20.8b - v23.8b}, [sp], 32 | 1232 ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 |
1295 ld1 {v24.8b - v27.8b}, [sp], 32 | 1233 ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 |
1296 ld1 {v28.8b - v31.8b}, [sp], 32 | 1234 ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 |
1297 blr x30 | 1235 blr x30 |
1298 | 1236 |
1299 .unreq DCT_TABLE | 1237 .unreq DCT_TABLE |
1300 .unreq COEF_BLOCK | 1238 .unreq COEF_BLOCK |
1301 .unreq OUTPUT_BUF | 1239 .unreq OUTPUT_BUF |
1302 .unreq OUTPUT_COL | 1240 .unreq OUTPUT_COL |
1303 .unreq TMP1 | 1241 .unreq TMP1 |
1304 .unreq TMP2 | 1242 .unreq TMP2 |
1305 .unreq TMP3 | 1243 .unreq TMP3 |
1306 .unreq TMP4 | 1244 .unreq TMP4 |
(...skipping 11 matching lines...) |
1318 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' | 1256 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' |
1319 * function from jpeg-6b (jidctred.c). | 1257 * function from jpeg-6b (jidctred.c). |
1320 * | 1258 * |
1321 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which | 1259 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which |
1322 * requires far fewer arithmetic operations and hence should be faster. | 1260 * requires far fewer arithmetic operations and hence should be faster. |
1323 * The primary purpose of this particular NEON optimized function is | 1261 * The primary purpose of this particular NEON optimized function is |
1324 * bit exact compatibility with jpeg-6b. | 1262 * bit exact compatibility with jpeg-6b. |
1325 */ | 1263 */ |
1326 | 1264 |
1327 .balign 8 | 1265 .balign 8 |
1328 jsimd_idct_2x2_neon_consts: | 1266 Ljsimd_idct_2x2_neon_consts: |
1329 .short -FIX_0_720959822 /* v14[0] */ | 1267 .short -FIX_0_720959822 /* v14[0] */ |
1330 .short FIX_0_850430095 /* v14[1] */ | 1268 .short FIX_0_850430095 /* v14[1] */ |
1331 .short -FIX_1_272758580 /* v14[2] */ | 1269 .short -FIX_1_272758580 /* v14[2] */ |
1332 .short FIX_3_624509785 /* v14[3] */ | 1270 .short FIX_3_624509785 /* v14[3] */ |
1333 | 1271 |
1334 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 | 1272 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 |
1335 sshll v15.4s, \x4, #15 | 1273 sshll v15.4s, \x4, #15 |
1336 smull v26.4s, \x6, v14.4h[3] | 1274 smull v26.4s, \x6, v14.h[3] |
1337 smlal v26.4s, \x10, v14.4h[2] | 1275 smlal v26.4s, \x10, v14.h[2] |
1338 smlal v26.4s, \x12, v14.4h[1] | 1276 smlal v26.4s, \x12, v14.h[1] |
1339 smlal v26.4s, \x16, v14.4h[0] | 1277 smlal v26.4s, \x16, v14.h[0] |
1340 | 1278 |
1341 add v20.4s, v15.4s, v26.4s | 1279 add v20.4s, v15.4s, v26.4s |
1342 sub v15.4s, v15.4s, v26.4s | 1280 sub v15.4s, v15.4s, v26.4s |
1343 | 1281 |
1344 .if \shift > 16 | 1282 .if \shift > 16 |
1345 srshr v20.4s, v20.4s, #\shift | 1283 srshr v20.4s, v20.4s, #\shift |
1346 srshr v15.4s, v15.4s, #\shift | 1284 srshr v15.4s, v15.4s, #\shift |
1347 xtn \y26, v20.4s | 1285 xtn \y26, v20.4s |
1348 xtn \y27, v15.4s | 1286 xtn \y27, v15.4s |
1349 .else | 1287 .else |
1350 rshrn \y26, v20.4s, #\shift | 1288 rshrn \y26, v20.4s, #\shift |
1351 rshrn \y27, v15.4s, #\shift | 1289 rshrn \y27, v15.4s, #\shift |
1352 .endif | 1290 .endif |
1353 | |
1354 .endm | 1291 .endm |
1355 | 1292 |
1356 asm_function jsimd_idct_2x2_neon | 1293 asm_function jsimd_idct_2x2_neon |
1357 | 1294 |
1358 DCT_TABLE .req x0 | 1295 DCT_TABLE .req x0 |
1359 COEF_BLOCK .req x1 | 1296 COEF_BLOCK .req x1 |
1360 OUTPUT_BUF .req x2 | 1297 OUTPUT_BUF .req x2 |
1361 OUTPUT_COL .req x3 | 1298 OUTPUT_COL .req x3 |
1362 TMP1 .req x0 | 1299 TMP1 .req x0 |
1363 TMP2 .req x15 | 1300 TMP2 .req x15 |
1364 | 1301 |
1365 /* vpush {v8.4h - v15.4h} ; not available */ | 1302 /* vpush {v8.4h - v15.4h} ; not available */ |
1366 sub sp, sp, 208 | 1303 sub sp, sp, 208 |
1367 str x15, [sp], 16 | 1304 str x15, [sp], 16 |
1368 | 1305 |
1369 /* Load constants */ | 1306 /* Load constants */ |
1370 adr TMP2, jsimd_idct_2x2_neon_consts | 1307 adr TMP2, Ljsimd_idct_2x2_neon_consts |
1371 st1 {v4.8b - v7.8b}, [sp], 32 | 1308 st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 |
1372 st1 {v8.8b - v11.8b}, [sp], 32 | 1309 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
1373 st1 {v12.8b - v15.8b}, [sp], 32 | 1310 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
1374 st1 {v16.8b - v19.8b}, [sp], 32 | 1311 st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 |
1375 st1 {v21.8b - v22.8b}, [sp], 16 | 1312 st1 {v21.8b, v22.8b}, [sp], 16 |
1376 st1 {v24.8b - v27.8b}, [sp], 32 | 1313 st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 |
1377 st1 {v30.8b - v31.8b}, [sp], 16 | 1314 st1 {v30.8b, v31.8b}, [sp], 16 |
1378 ld1 {v14.4h}, [TMP2] | 1315 ld1 {v14.4h}, [TMP2] |
1379 | 1316 |
1380 /* Load all COEF_BLOCK into NEON registers with the following allocation: | 1317 /* Load all COEF_BLOCK into NEON registers with the following allocation: |
1381 * 0 1 2 3 | 4 5 6 7 | 1318 * 0 1 2 3 | 4 5 6 7 |
1382 * ---------+-------- | 1319 * ---------+-------- |
1383 * 0 | v4.4h | v5.4h | 1320 * 0 | v4.4h | v5.4h |
1384 * 1 | v6.4h | v7.4h | 1321 * 1 | v6.4h | v7.4h |
1385 * 2 | - | - | 1322 * 2 | - | - |
1386 * 3 | v10.4h | v11.4h | 1323 * 3 | v10.4h | v11.4h |
1387 * 4 | - | - | 1324 * 4 | - | - |
1388 * 5 | v12.4h | v13.4h | 1325 * 5 | v12.4h | v13.4h |
1389 * 6 | - | - | 1326 * 6 | - | - |
1390 * 7 | v16.4h | v17.4h | 1327 * 7 | v16.4h | v17.4h |
1391 */ | 1328 */ |
1392 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 | 1329 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 |
1393 add COEF_BLOCK, COEF_BLOCK, #16 | 1330 add COEF_BLOCK, COEF_BLOCK, #16 |
1394 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 | 1331 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 |
1395 add COEF_BLOCK, COEF_BLOCK, #16 | 1332 add COEF_BLOCK, COEF_BLOCK, #16 |
1396 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 | 1333 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 |
1397 add COEF_BLOCK, COEF_BLOCK, #16 | 1334 add COEF_BLOCK, COEF_BLOCK, #16 |
1398 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 | 1335 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 |
1399 /* Dequantize */ | 1336 /* Dequantize */ |
1400 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 | 1337 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 |
1401 mul v4.4h, v4.4h, v18.4h | 1338 mul v4.4h, v4.4h, v18.4h |
1402 mul v5.4h, v5.4h, v19.4h | 1339 mul v5.4h, v5.4h, v19.4h |
1403 ins v4.2d[1], v5.2d[0] | 1340 ins v4.d[1], v5.d[0] |
1404 mul v6.4h, v6.4h, v20.4h | 1341 mul v6.4h, v6.4h, v20.4h |
1405 mul v7.4h, v7.4h, v21.4h | 1342 mul v7.4h, v7.4h, v21.4h |
1406 ins v6.2d[1], v7.2d[0] | 1343 ins v6.d[1], v7.d[0] |
1407 add DCT_TABLE, DCT_TABLE, #16 | 1344 add DCT_TABLE, DCT_TABLE, #16 |
1408 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 | 1345 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 |
1409 mul v10.4h, v10.4h, v24.4h | 1346 mul v10.4h, v10.4h, v24.4h |
1410 mul v11.4h, v11.4h, v25.4h | 1347 mul v11.4h, v11.4h, v25.4h |
1411 ins v10.2d[1], v11.2d[0] | 1348 ins v10.d[1], v11.d[0] |
1412 add DCT_TABLE, DCT_TABLE, #16 | 1349 add DCT_TABLE, DCT_TABLE, #16 |
1413 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 | 1350 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 |
1414 mul v12.4h, v12.4h, v26.4h | 1351 mul v12.4h, v12.4h, v26.4h |
1415 mul v13.4h, v13.4h, v27.4h | 1352 mul v13.4h, v13.4h, v27.4h |
1416 ins v12.2d[1], v13.2d[0] | 1353 ins v12.d[1], v13.d[0] |
1417 add DCT_TABLE, DCT_TABLE, #16 | 1354 add DCT_TABLE, DCT_TABLE, #16 |
1418 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 | 1355 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 |
1419 mul v16.4h, v16.4h, v30.4h | 1356 mul v16.4h, v16.4h, v30.4h |
1420 mul v17.4h, v17.4h, v31.4h | 1357 mul v17.4h, v17.4h, v31.4h |
1421 ins v16.2d[1], v17.2d[0] | 1358 ins v16.d[1], v17.d[0] |
1422 | 1359 |
1423 /* Pass 1 */ | 1360 /* Pass 1 */ |
1424 #if 0 | 1361 #if 0 |
1425 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h | 1362 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h |
1426 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h | 1363 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h |
1427 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h | 1364 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h |
1428 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h | 1365 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h |
1429 #else | 1366 #else |
1430 smull v26.4s, v6.4h, v14.4h[3] | 1367 smull v26.4s, v6.4h, v14.h[3] |
1431 smlal v26.4s, v10.4h, v14.4h[2] | 1368 smlal v26.4s, v10.4h, v14.h[2] |
1432 smlal v26.4s, v12.4h, v14.4h[1] | 1369 smlal v26.4s, v12.4h, v14.h[1] |
1433 smlal v26.4s, v16.4h, v14.4h[0] | 1370 smlal v26.4s, v16.4h, v14.h[0] |
1434 smull v24.4s, v7.4h, v14.4h[3] | 1371 smull v24.4s, v7.4h, v14.h[3] |
1435 smlal v24.4s, v11.4h, v14.4h[2] | 1372 smlal v24.4s, v11.4h, v14.h[2] |
1436 smlal v24.4s, v13.4h, v14.4h[1] | 1373 smlal v24.4s, v13.4h, v14.h[1] |
1437 smlal v24.4s, v17.4h, v14.4h[0] | 1374 smlal v24.4s, v17.4h, v14.h[0] |
1438 sshll v15.4s, v4.4h, #15 | 1375 sshll v15.4s, v4.4h, #15 |
1439 sshll v30.4s, v5.4h, #15 | 1376 sshll v30.4s, v5.4h, #15 |
1440 add v20.4s, v15.4s, v26.4s | 1377 add v20.4s, v15.4s, v26.4s |
1441 sub v15.4s, v15.4s, v26.4s | 1378 sub v15.4s, v15.4s, v26.4s |
1442 rshrn v4.4h, v20.4s, #13 | 1379 rshrn v4.4h, v20.4s, #13 |
1443 rshrn v6.4h, v15.4s, #13 | 1380 rshrn v6.4h, v15.4s, #13 |
1444 add v20.4s, v30.4s, v24.4s | 1381 add v20.4s, v30.4s, v24.4s |
1445 sub v15.4s, v30.4s, v24.4s | 1382 sub v15.4s, v30.4s, v24.4s |
1446 rshrn v5.4h, v20.4s, #13 | 1383 rshrn v5.4h, v20.4s, #13 |
1447 rshrn v7.4h, v15.4s, #13 | 1384 rshrn v7.4h, v15.4s, #13 |
1448 ins v4.2d[1], v5.2d[0] | 1385 ins v4.d[1], v5.d[0] |
1449 ins v6.2d[1], v7.2d[0] | 1386 ins v6.d[1], v7.d[0] |
1450 transpose v4, v6, v3, .16b, .8h | 1387 transpose v4, v6, v3, .16b, .8h |
1451 transpose v6, v10, v3, .16b, .4s | 1388 transpose v6, v10, v3, .16b, .4s |
1452 ins v11.2d[0], v10.2d[1] | 1389 ins v11.d[0], v10.d[1] |
1453 ins v7.2d[0], v6.2d[1] | 1390 ins v7.d[0], v6.d[1] |
1454 #endif | 1391 #endif |
1455 | 1392 |
1456 /* Pass 2 */ | 1393 /* Pass 2 */ |
1457 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h | 1394 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h |
1458 | 1395 |
1459 /* Range limit */ | 1396 /* Range limit */ |
1460 movi v30.8h, #0x80 | 1397 movi v30.8h, #0x80 |
1461 ins v26.2d[1], v27.2d[0] | 1398 ins v26.d[1], v27.d[0] |
1462 add v26.8h, v26.8h, v30.8h | 1399 add v26.8h, v26.8h, v30.8h |
1463 sqxtun v30.8b, v26.8h | 1400 sqxtun v30.8b, v26.8h |
1464 ins v26.2d[0], v30.2d[0] | 1401 ins v26.d[0], v30.d[0] |
1465 sqxtun v27.8b, v26.8h | 1402 sqxtun v27.8b, v26.8h |
1466 | 1403 |
1467 /* Store results to the output buffer */ | 1404 /* Store results to the output buffer */ |
1468 ldp TMP1, TMP2, [OUTPUT_BUF] | 1405 ldp TMP1, TMP2, [OUTPUT_BUF] |
1469 add TMP1, TMP1, OUTPUT_COL | 1406 add TMP1, TMP1, OUTPUT_COL |
1470 add TMP2, TMP2, OUTPUT_COL | 1407 add TMP2, TMP2, OUTPUT_COL |
1471 | 1408 |
1472 st1 {v26.b}[0], [TMP1], 1 | 1409 st1 {v26.b}[0], [TMP1], 1 |
1473 st1 {v27.b}[4], [TMP1], 1 | 1410 st1 {v27.b}[4], [TMP1], 1 |
1474 st1 {v26.b}[1], [TMP2], 1 | 1411 st1 {v26.b}[1], [TMP2], 1 |
1475 st1 {v27.b}[5], [TMP2], 1 | 1412 st1 {v27.b}[5], [TMP2], 1 |
1476 | 1413 |
1477 sub sp, sp, #208 | 1414 sub sp, sp, #208 |
1478 ldr x15, [sp], 16 | 1415 ldr x15, [sp], 16 |
1479 ld1 {v4.8b - v7.8b}, [sp], 32 | 1416 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 |
1480 ld1 {v8.8b - v11.8b}, [sp], 32 | 1417 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
1481 ld1 {v12.8b - v15.8b}, [sp], 32 | 1418 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
1482 ld1 {v16.8b - v19.8b}, [sp], 32 | 1419 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 |
1483 ld1 {v21.8b - v22.8b}, [sp], 16 | 1420 ld1 {v21.8b, v22.8b}, [sp], 16 |
1484 ld1 {v24.8b - v27.8b}, [sp], 32 | 1421 ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 |
1485 ld1 {v30.8b - v31.8b}, [sp], 16 | 1422 ld1 {v30.8b, v31.8b}, [sp], 16 |
1486 blr x30 | 1423 blr x30 |
1487 | 1424 |
1488 .unreq DCT_TABLE | 1425 .unreq DCT_TABLE |
1489 .unreq COEF_BLOCK | 1426 .unreq COEF_BLOCK |
1490 .unreq OUTPUT_BUF | 1427 .unreq OUTPUT_BUF |
1491 .unreq OUTPUT_COL | 1428 .unreq OUTPUT_COL |
1492 .unreq TMP1 | 1429 .unreq TMP1 |
1493 .unreq TMP2 | 1430 .unreq TMP2 |
1494 | 1431 |
1495 .purgem idct_helper | 1432 .purgem idct_helper |
1496 | 1433 |
1497 | 1434 |
1498 /*****************************************************************************/ | 1435 /*****************************************************************************/ |
1499 | 1436 |
1500 /* | 1437 /* |
1501 * jsimd_ycc_extrgb_convert_neon | 1438 * jsimd_ycc_extrgb_convert_neon |
1502 * jsimd_ycc_extbgr_convert_neon | 1439 * jsimd_ycc_extbgr_convert_neon |
1503 * jsimd_ycc_extrgbx_convert_neon | 1440 * jsimd_ycc_extrgbx_convert_neon |
1504 * jsimd_ycc_extbgrx_convert_neon | 1441 * jsimd_ycc_extbgrx_convert_neon |
1505 * jsimd_ycc_extxbgr_convert_neon | 1442 * jsimd_ycc_extxbgr_convert_neon |
1506 * jsimd_ycc_extxrgb_convert_neon | 1443 * jsimd_ycc_extxrgb_convert_neon |
1507 * | 1444 * |
1508 * Colorspace conversion YCbCr -> RGB | 1445 * Colorspace conversion YCbCr -> RGB |
1509 */ | 1446 */ |
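
(For reference, the per-pixel arithmetic these routines implement reduces to the C sketch below. This is an editorial scalar model, not code from the file: the constants match the jsimd_ycc_*_neon_consts tables, the shifts match the rshrn instructions in do_yuv_to_rgb_stage2, and clamp255() stands in for sqxtun. It assumes arithmetic right shift on negative ints.)

/* Scalar model of the fixed-point YCbCr->RGB conversion (illustrative). */
static unsigned char clamp255(int x)
{
  return (unsigned char)(x < 0 ? 0 : (x > 255 ? 255 : x));
}

static void ycc_to_rgb_pixel(int y, int cb, int cr, unsigned char rgb[3])
{
  int u = cb - 128, v = cr - 128;              /* the v2.8h = -128 uaddw step */
  rgb[0] = clamp255(y + ((22971 * v + 8192) >> 14));                /* R */
  rgb[1] = clamp255(y + ((-11277 * u - 23401 * v + 16384) >> 15));  /* G */
  rgb[2] = clamp255(y + ((29033 * u + 8192) >> 14));                /* B */
}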
1510 | 1447 |
1511 | |
1512 .macro do_load size | 1448 .macro do_load size |
| 1449 .if \size == 8 |
| 1450 ld1 {v4.8b}, [U], 8 |
| 1451 ld1 {v5.8b}, [V], 8 |
| 1452 ld1 {v0.8b}, [Y], 8 |
| 1453 prfm pldl1keep, [U, #64] |
| 1454 prfm pldl1keep, [V, #64] |
| 1455 prfm pldl1keep, [Y, #64] |
| 1456 .elseif \size == 4 |
| 1457 ld1 {v4.b}[0], [U], 1 |
| 1458 ld1 {v4.b}[1], [U], 1 |
| 1459 ld1 {v4.b}[2], [U], 1 |
| 1460 ld1 {v4.b}[3], [U], 1 |
| 1461 ld1 {v5.b}[0], [V], 1 |
| 1462 ld1 {v5.b}[1], [V], 1 |
| 1463 ld1 {v5.b}[2], [V], 1 |
| 1464 ld1 {v5.b}[3], [V], 1 |
| 1465 ld1 {v0.b}[0], [Y], 1 |
| 1466 ld1 {v0.b}[1], [Y], 1 |
| 1467 ld1 {v0.b}[2], [Y], 1 |
| 1468 ld1 {v0.b}[3], [Y], 1 |
| 1469 .elseif \size == 2 |
| 1470 ld1 {v4.b}[4], [U], 1 |
| 1471 ld1 {v4.b}[5], [U], 1 |
| 1472 ld1 {v5.b}[4], [V], 1 |
| 1473 ld1 {v5.b}[5], [V], 1 |
| 1474 ld1 {v0.b}[4], [Y], 1 |
| 1475 ld1 {v0.b}[5], [Y], 1 |
| 1476 .elseif \size == 1 |
| 1477 ld1 {v4.b}[6], [U], 1 |
| 1478 ld1 {v5.b}[6], [V], 1 |
| 1479 ld1 {v0.b}[6], [Y], 1 |
| 1480 .else |
| 1481 .error unsupported macroblock size |
| 1482 .endif |
| 1483 .endm |
| 1484 |
| 1485 .macro do_store bpp, size, fast_st3 |
| 1486 .if \bpp == 24 |
1513 .if \size == 8 | 1487 .if \size == 8 |
1514 ld1 {v4.8b}, [U], 8 | 1488 .if \fast_st3 == 1 |
1515 ld1 {v5.8b}, [V], 8 | 1489 st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 |
1516 ld1 {v0.8b}, [Y], 8 | 1490 .else |
1517 prfm PLDL1KEEP, [U, #64] | 1491 st1 {v10.b}[0], [RGB], #1 |
1518 prfm PLDL1KEEP, [V, #64] | 1492 st1 {v11.b}[0], [RGB], #1 |
1519 prfm PLDL1KEEP, [Y, #64] | 1493 st1 {v12.b}[0], [RGB], #1 |
| 1494 |
| 1495 st1 {v10.b}[1], [RGB], #1 |
| 1496 st1 {v11.b}[1], [RGB], #1 |
| 1497 st1 {v12.b}[1], [RGB], #1 |
| 1498 |
| 1499 st1 {v10.b}[2], [RGB], #1 |
| 1500 st1 {v11.b}[2], [RGB], #1 |
| 1501 st1 {v12.b}[2], [RGB], #1 |
| 1502 |
| 1503 st1 {v10.b}[3], [RGB], #1 |
| 1504 st1 {v11.b}[3], [RGB], #1 |
| 1505 st1 {v12.b}[3], [RGB], #1 |
| 1506 |
| 1507 st1 {v10.b}[4], [RGB], #1 |
| 1508 st1 {v11.b}[4], [RGB], #1 |
| 1509 st1 {v12.b}[4], [RGB], #1 |
| 1510 |
| 1511 st1 {v10.b}[5], [RGB], #1 |
| 1512 st1 {v11.b}[5], [RGB], #1 |
| 1513 st1 {v12.b}[5], [RGB], #1 |
| 1514 |
| 1515 st1 {v10.b}[6], [RGB], #1 |
| 1516 st1 {v11.b}[6], [RGB], #1 |
| 1517 st1 {v12.b}[6], [RGB], #1 |
| 1518 |
| 1519 st1 {v10.b}[7], [RGB], #1 |
| 1520 st1 {v11.b}[7], [RGB], #1 |
| 1521 st1 {v12.b}[7], [RGB], #1 |
| 1522 .endif |
1520 .elseif \size == 4 | 1523 .elseif \size == 4 |
1521 ld1 {v4.b}[0], [U], 1 | 1524 st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 |
1522 ld1 {v4.b}[1], [U], 1 | 1525 st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 |
1523 ld1 {v4.b}[2], [U], 1 | 1526 st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 |
1524 ld1 {v4.b}[3], [U], 1 | 1527 st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 |
1525 ld1 {v5.b}[0], [V], 1 | |
1526 ld1 {v5.b}[1], [V], 1 | |
1527 ld1 {v5.b}[2], [V], 1 | |
1528 ld1 {v5.b}[3], [V], 1 | |
1529 ld1 {v0.b}[0], [Y], 1 | |
1530 ld1 {v0.b}[1], [Y], 1 | |
1531 ld1 {v0.b}[2], [Y], 1 | |
1532 ld1 {v0.b}[3], [Y], 1 | |
1533 .elseif \size == 2 | 1528 .elseif \size == 2 |
1534 ld1 {v4.b}[4], [U], 1 | 1529 st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 |
1535 ld1 {v4.b}[5], [U], 1 | 1530 st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 |
1536 ld1 {v5.b}[4], [V], 1 | |
1537 ld1 {v5.b}[5], [V], 1 | |
1538 ld1 {v0.b}[4], [Y], 1 | |
1539 ld1 {v0.b}[5], [Y], 1 | |
1540 .elseif \size == 1 | 1531 .elseif \size == 1 |
1541 ld1 {v4.b}[6], [U], 1 | 1532 st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 |
1542 ld1 {v5.b}[6], [V], 1 | |
1543 ld1 {v0.b}[6], [Y], 1 | |
1544 .else | 1533 .else |
1545 .error unsupported macroblock size | 1534 .error unsupported macroblock size |
1546 .endif | 1535 .endif |
1547 .endm | 1536 .elseif \bpp == 32 |
1548 | 1537 .if \size == 8 |
1549 .macro do_store bpp, size | 1538 st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 |
1550 .if \bpp == 24 | 1539 .elseif \size == 4 |
1551 .if \size == 8 | 1540 st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 |
1552 st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 | 1541 st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 |
1553 .elseif \size == 4 | 1542 st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 |
1554 st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 | 1543 st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 |
1555 st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 | 1544 .elseif \size == 2 |
1556 st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 | 1545 st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 |
1557 st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 | 1546 st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 |
1558 .elseif \size == 2 | 1547 .elseif \size == 1 |
1559 st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 | 1548 st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 |
1560 st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 | 1549 .else |
1561 .elseif \size == 1 | 1550 .error unsupported macroblock size |
1562 st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 | |
1563 .else | |
1564 .error unsupported macroblock size | |
1565 .endif | |
1566 .elseif \bpp == 32 | |
1567 .if \size == 8 | |
1568 st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 | |
1569 .elseif \size == 4 | |
1570 st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 | |
1571 st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 | |
1572 st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 | |
1573 st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 | |
1574 .elseif \size == 2 | |
1575 st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 | |
1576 st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 | |
1577 .elseif \size == 1 | |
1578 st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 | |
1579 .else | |
1580 .error unsupported macroblock size | |
1581 .endif | |
1582 .elseif \bpp==16 | |
1583 .if \size == 8 | |
1584 st1 {v25.8h}, [RGB],16 | |
1585 .elseif \size == 4 | |
1586 st1 {v25.4h}, [RGB],8 | |
1587 .elseif \size == 2 | |
1588 st1 {v25.h}[4], [RGB],2 | |
1589 st1 {v25.h}[5], [RGB],2 | |
1590 .elseif \size == 1 | |
1591 st1 {v25.h}[6], [RGB],2 | |
1592 .else | |
1593 .error unsupported macroblock size | |
1594 .endif | |
1595 .else | |
1596 .error unsupported bpp | |
1597 .endif | 1551 .endif |
1598 .endm | 1552 .elseif \bpp==16 |
1599 | 1553 .if \size == 8 |
1600 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize | 1554 st1 {v25.8h}, [RGB], 8 |
| 1555 .elseif \size == 4 |
| 1556 st1 {v25.4h}, [RGB], 8 |
| 1557 .elseif \size == 2 |
| 1558 st1 {v25.h}[4], [RGB], 2 |
| 1559 st1 {v25.h}[5], [RGB], 2 |
| 1560 .elseif \size == 1 |
| 1561 st1 {v25.h}[6], [RGB], 2 |
| 1562 .else |
| 1563 .error unsupported macroblock size |
| 1564 .endif |
| 1565 .else |
| 1566 .error unsupported bpp |
| 1567 .endif |
| 1568 .endm |
| 1569 |
| 1570 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \ |
| 1571 g_offs, gsize, b_offs, bsize, \ |
| 1572 defsize, fast_st3 |
1601 | 1573 |
1602 /* | 1574 /* |
1603 * 2-stage pipelined YCbCr->RGB conversion | 1575 * 2-stage pipelined YCbCr->RGB conversion |
1604 */ | 1576 */ |
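
(The "2-stage pipelined" structure is easiest to see as a C sketch; this is an editorial illustration with stand-in functions for the do_load / do_yuv_to_rgb_stage* macros, not code from the file.)

/* Software-pipelined loop shape: stage 1 of block i overlaps with
 * stage 2 + store of block i-1, hiding the multiply latency. */
typedef struct { int id; } Block;
static void load(Block *b, int i)  { b->id = i; }
static void stage1(Block *b)       { (void)b; }
static void stage2_store(Block *b) { (void)b; }

static void convert_row(int nblocks)
{
  Block b;
  load(&b, 0); stage1(&b);              /* prologue */
  for (int i = 1; i < nblocks; i++) {
    stage2_store(&b);                   /* one loop body corresponds to */
    load(&b, i); stage1(&b);            /* do_yuv_to_rgb_stage2_store_load_stage1 */
  }
  stage2_store(&b);                     /* epilogue: stage2 + do_store */
}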
1605 | 1577 |
1606 .macro do_yuv_to_rgb_stage1 | 1578 .macro do_yuv_to_rgb_stage1 |
1607 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ | 1579 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ |
1608 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ | 1580 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ |
1609 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | 1581 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ |
1610 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | 1582 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ |
1611 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | 1583 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ |
1612 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | 1584 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ |
1613 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | 1585 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ |
1614 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | 1586 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ |
1615 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ | 1587 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ |
1616 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ | 1588 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ |
1617 .endm | 1589 .endm |
1618 | 1590 |
1619 .macro do_yuv_to_rgb_stage2 | 1591 .macro do_yuv_to_rgb_stage2 |
1620 rshrn v20.4h, v20.4s, #15 | 1592 rshrn v20.4h, v20.4s, #15 |
1621 rshrn2 v20.8h, v22.4s, #15 | 1593 rshrn2 v20.8h, v22.4s, #15 |
1622 rshrn v24.4h, v24.4s, #14 | 1594 rshrn v24.4h, v24.4s, #14 |
1623 rshrn2 v24.8h, v26.4s, #14 | 1595 rshrn2 v24.8h, v26.4s, #14 |
1624 rshrn v28.4h, v28.4s, #14 | 1596 rshrn v28.4h, v28.4s, #14 |
1625 rshrn2 v28.8h, v30.4s, #14 | 1597 rshrn2 v28.8h, v30.4s, #14 |
1626 uaddw v20.8h, v20.8h, v0.8b | 1598 uaddw v20.8h, v20.8h, v0.8b |
1627 uaddw v24.8h, v24.8h, v0.8b | 1599 uaddw v24.8h, v24.8h, v0.8b |
1628 uaddw v28.8h, v28.8h, v0.8b | 1600 uaddw v28.8h, v28.8h, v0.8b |
1629 .if \bpp != 16 | 1601 .if \bpp != 16 |
1630 sqxtun v1\g_offs\defsize, v20.8h | 1602 sqxtun v1\g_offs\defsize, v20.8h |
1631 sqxtun v1\r_offs\defsize, v24.8h | 1603 sqxtun v1\r_offs\defsize, v24.8h |
1632 sqxtun v1\b_offs\defsize, v28.8h | 1604 sqxtun v1\b_offs\defsize, v28.8h |
1633 .else | 1605 .else |
1634 sqshlu v21.8h, v20.8h, #8 | 1606 sqshlu v21.8h, v20.8h, #8 |
1635 sqshlu v25.8h, v24.8h, #8 | 1607 sqshlu v25.8h, v24.8h, #8 |
1636 sqshlu v29.8h, v28.8h, #8 | 1608 sqshlu v29.8h, v28.8h, #8 |
1637 sri v25.8h, v21.8h, #5 | 1609 sri v25.8h, v21.8h, #5 |
1638 sri v25.8h, v29.8h, #11 | 1610 sri v25.8h, v29.8h, #11 |
1639 .endif | 1611 .endif |
1640 | 1612 .endm |
1641 .endm | 1613 |
1642 | 1614 .macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3 |
1643 .macro do_yuv_to_rgb_stage2_store_load_stage1 | 1615 rshrn v20.4h, v20.4s, #15 |
1644 rshrn v20.4h, v20.4s, #15 | 1616 rshrn v24.4h, v24.4s, #14 |
1645 rshrn v24.4h, v24.4s, #14 | 1617 rshrn v28.4h, v28.4s, #14 |
1646 rshrn v28.4h, v28.4s, #14 | 1618 ld1 {v4.8b}, [U], 8 |
1647 ld1 {v4.8b}, [U], 8 | 1619 rshrn2 v20.8h, v22.4s, #15 |
1648 rshrn2 v20.8h, v22.4s, #15 | 1620 rshrn2 v24.8h, v26.4s, #14 |
1649 rshrn2 v24.8h, v26.4s, #14 | 1621 rshrn2 v28.8h, v30.4s, #14 |
1650 rshrn2 v28.8h, v30.4s, #14 | 1622 ld1 {v5.8b}, [V], 8 |
1651 ld1 {v5.8b}, [V], 8 | 1623 uaddw v20.8h, v20.8h, v0.8b |
1652 uaddw v20.8h, v20.8h, v0.8b | 1624 uaddw v24.8h, v24.8h, v0.8b |
1653 uaddw v24.8h, v24.8h, v0.8b | 1625 uaddw v28.8h, v28.8h, v0.8b |
1654 uaddw v28.8h, v28.8h, v0.8b | 1626 .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ |
1655 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/ | 1627 sqxtun v1\g_offs\defsize, v20.8h |
1656 sqxtun v1\g_offs\defsize, v20.8h | 1628 ld1 {v0.8b}, [Y], 8 |
1657 ld1 {v0.8b}, [Y], 8 | 1629 sqxtun v1\r_offs\defsize, v24.8h |
1658 sqxtun v1\r_offs\defsize, v24.8h | 1630 prfm pldl1keep, [U, #64] |
1659 prfm PLDL1KEEP, [U, #64] | 1631 prfm pldl1keep, [V, #64] |
1660 prfm PLDL1KEEP, [V, #64] | 1632 prfm pldl1keep, [Y, #64] |
1661 prfm PLDL1KEEP, [Y, #64] | 1633 sqxtun v1\b_offs\defsize, v28.8h |
1662 sqxtun v1\b_offs\defsize, v28.8h | 1634 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ |
1663 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ | 1634 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ |
1664 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ | 1635 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ |
1665 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | 1637 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ |
1666 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | 1638 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ |
1667 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | 1639 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ |
1668 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | 1640 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ |
1669 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | 1641 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ |
1670 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | 1642 .else /**************************** rgb565 ********************************/ |
1671 .else /**************************** rgb565 ***********************************/ | 1643 sqshlu v21.8h, v20.8h, #8 |
1672 sqshlu v21.8h, v20.8h, #8 | 1644 sqshlu v25.8h, v24.8h, #8 |
1673 sqshlu v25.8h, v24.8h, #8 | 1645 sqshlu v29.8h, v28.8h, #8 |
1674 sqshlu v29.8h, v28.8h, #8 | 1646 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ |
1675 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ | 1646 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ |
1676 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ | 1647 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ |
1677 ld1 {v0.8b}, [Y], 8 | 1649 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ |
1678 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | 1650 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ |
1679 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | 1651 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ |
1680 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | 1652 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ |
1681 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | 1653 sri v25.8h, v21.8h, #5 |
1682 sri v25.8h, v21.8h, #5 | 1654 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ |
1683 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | 1655 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ |
1684 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | 1656 prfm pldl1keep, [U, #64] |
1685 prfm PLDL1KEEP, [U, #64] | 1657 prfm pldl1keep, [V, #64] |
1686 prfm PLDL1KEEP, [V, #64] | 1658 prfm pldl1keep, [Y, #64] |
1687 prfm PLDL1KEEP, [Y, #64] | 1659 sri v25.8h, v29.8h, #11 |
1688 sri v25.8h, v29.8h, #11 | 1660 .endif |
1689 .endif | 1661 do_store \bpp, 8, \fast_st3 |
1690 do_store \bpp, 8 | 1662 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ |
1691 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ | 1663 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ |
1692 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ | |
1693 .endm | 1664 .endm |
1694 | 1665 |
1695 .macro do_yuv_to_rgb | 1666 .macro do_yuv_to_rgb |
1696 do_yuv_to_rgb_stage1 | 1667 do_yuv_to_rgb_stage1 |
1697 do_yuv_to_rgb_stage2 | 1668 do_yuv_to_rgb_stage2 |
1698 .endm | 1669 .endm |
1699 | 1670 |
1700 /* Apple gas crashes on adrl; work around that by using adr. | 1671 /* Apple gas crashes on adrl; work around that by using adr. |
1701 * But this requires a copy of these constants for each function. | 1672 * But this requires a copy of these constants for each function. |
1702 */ | 1673 */ |
1703 | 1674 |
1704 .balign 16 | 1675 .balign 16 |
1705 jsimd_ycc_\colorid\()_neon_consts: | 1676 .if \fast_st3 == 1 |
1706 .short 0, 0, 0, 0 | 1677 Ljsimd_ycc_\colorid\()_neon_consts: |
1707 .short 22971, -11277, -23401, 29033 | 1678 .else |
1708 .short -128, -128, -128, -128 | 1679 Ljsimd_ycc_\colorid\()_neon_slowst3_consts: |
1709 .short -128, -128, -128, -128 | 1680 .endif |
| 1681 .short 0, 0, 0, 0 |
| 1682 .short 22971, -11277, -23401, 29033 |
| 1683 .short -128, -128, -128, -128 |
| 1684 .short -128, -128, -128, -128 |
1710 | 1685 |
| 1686 .if \fast_st3 == 1 |
1711 asm_function jsimd_ycc_\colorid\()_convert_neon | 1687 asm_function jsimd_ycc_\colorid\()_convert_neon |
| 1688 .else |
| 1689 asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 |
| 1690 .endif |
1712 OUTPUT_WIDTH .req x0 | 1691 OUTPUT_WIDTH .req x0 |
1713 INPUT_BUF .req x1 | 1692 INPUT_BUF .req x1 |
1714 INPUT_ROW .req x2 | 1693 INPUT_ROW .req x2 |
1715 OUTPUT_BUF .req x3 | 1694 OUTPUT_BUF .req x3 |
1716 NUM_ROWS .req x4 | 1695 NUM_ROWS .req x4 |
1717 | 1696 |
1718 INPUT_BUF0 .req x5 | 1697 INPUT_BUF0 .req x5 |
1719 INPUT_BUF1 .req x6 | 1698 INPUT_BUF1 .req x6 |
1720 INPUT_BUF2 .req INPUT_BUF | 1699 INPUT_BUF2 .req x1 |
1721 | 1700 |
1722 RGB .req x7 | 1701 RGB .req x7 |
1723 Y .req x8 | 1702 Y .req x8 |
1724 U .req x9 | 1703 U .req x9 |
1725 V .req x10 | 1704 V .req x10 |
1726 N .req x15 | 1705 N .req x15 |
1727 | 1706 |
1728 sub sp, sp, 336 | 1707 sub sp, sp, 336 |
1729 str x15, [sp], 16 | 1708 str x15, [sp], 16 |
| 1709 |
1730 /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ | 1710 /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ |
1731 adr x15, jsimd_ycc_\colorid\()_neon_consts | 1711 .if \fast_st3 == 1 |
| 1712 adr x15, Ljsimd_ycc_\colorid\()_neon_consts |
| 1713 .else |
| 1714 adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts |
| 1715 .endif |
| 1716 |
1732 /* Save NEON registers */ | 1717 /* Save NEON registers */ |
1733 st1 {v0.8b - v3.8b}, [sp], 32 | 1718 st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 |
1734 st1 {v4.8b - v7.8b}, [sp], 32 | 1719 st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 |
1735 st1 {v8.8b - v11.8b}, [sp], 32 | 1720 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
1736 st1 {v12.8b - v15.8b}, [sp], 32 | 1721 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
1737 st1 {v16.8b - v19.8b}, [sp], 32 | 1722 st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 |
1738 st1 {v20.8b - v23.8b}, [sp], 32 | 1723 st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 |
1739 st1 {v24.8b - v27.8b}, [sp], 32 | 1724 st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 |
1740 st1 {v28.8b - v31.8b}, [sp], 32 | 1725 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 |
1741 ld1 {v0.4h, v1.4h}, [x15], 16 | 1726 ld1 {v0.4h, v1.4h}, [x15], 16 |
1742 ld1 {v2.8h}, [x15] | 1727 ld1 {v2.8h}, [x15] |
1743 | 1728 |
1744 /* Save ARM registers and handle input arguments */ | 1729 /* Save ARM registers and handle input arguments */ |
1745 /* push {x4, x5, x6, x7, x8, x9, x10, x30} */ | 1730 /* push {x4, x5, x6, x7, x8, x9, x10, x30} */ |
1746 stp x4, x5, [sp], 16 | 1731 stp x4, x5, [sp], 16 |
1747 stp x6, x7, [sp], 16 | 1732 stp x6, x7, [sp], 16 |
1748 stp x8, x9, [sp], 16 | 1733 stp x8, x9, [sp], 16 |
1749 stp x10, x30, [sp], 16 | 1734 stp x10, x30, [sp], 16 |
1750 ldr INPUT_BUF0, [INPUT_BUF] | 1735 ldr INPUT_BUF0, [INPUT_BUF] |
1751 ldr INPUT_BUF1, [INPUT_BUF, 8] | 1736 ldr INPUT_BUF1, [INPUT_BUF, #8] |
1752 ldr INPUT_BUF2, [INPUT_BUF, 16] | 1737 ldr INPUT_BUF2, [INPUT_BUF, #16] |
1753 .unreq INPUT_BUF | 1738 .unreq INPUT_BUF |
1754 | 1739 |
1755 /* Initially set v10.16b and v13.16b to 0xFF */ | 1740 /* Initially set v10.16b and v13.16b to 0xFF */ |
1756 movi v10.16b, #255 | 1741 movi v10.16b, #255 |
1757 movi v13.16b, #255 | 1742 movi v13.16b, #255 |
1758 | 1743 |
1759 /* Outer loop over scanlines */ | 1744 /* Outer loop over scanlines */ |
1760 cmp NUM_ROWS, #1 | 1745 cmp NUM_ROWS, #1 |
1761 blt 9f | 1746 b.lt 9f |
1762 0: | 1747 0: |
1763 lsl x16, INPUT_ROW, #3 | 1748 lsl x16, INPUT_ROW, #3 |
1764 ldr Y, [INPUT_BUF0, x16] | 1749 ldr Y, [INPUT_BUF0, x16] |
1765 ldr U, [INPUT_BUF1, x16] | 1750 ldr U, [INPUT_BUF1, x16] |
1766 mov N, OUTPUT_WIDTH | 1751 mov N, OUTPUT_WIDTH |
1767 ldr V, [INPUT_BUF2, x16] | 1752 ldr V, [INPUT_BUF2, x16] |
1768 add INPUT_ROW, INPUT_ROW, #1 | 1753 add INPUT_ROW, INPUT_ROW, #1 |
1769 ldr RGB, [OUTPUT_BUF], #8 | 1754 ldr RGB, [OUTPUT_BUF], #8 |
1770 | 1755 |
1771 /* Inner loop over pixels */ | 1756 /* Inner loop over pixels */ |
1772 subs N, N, #8 | 1757 subs N, N, #8 |
1773 blt 3f | 1758 b.lt 3f |
1774 do_load 8 | 1759 do_load 8 |
1775 do_yuv_to_rgb_stage1 | 1760 do_yuv_to_rgb_stage1 |
1776 subs N, N, #8 | 1761 subs N, N, #8 |
1777 blt 2f | 1762 b.lt 2f |
1778 1: | 1763 1: |
1779 do_yuv_to_rgb_stage2_store_load_stage1 | 1764 do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3 |
1780 subs N, N, #8 | 1765 subs N, N, #8 |
1781 bge 1b | 1766 b.ge 1b |
1782 2: | 1767 2: |
1783 do_yuv_to_rgb_stage2 | 1768 do_yuv_to_rgb_stage2 |
1784 do_store \bpp, 8 | 1769 do_store \bpp, 8, \fast_st3 |
1785 tst N, #7 | 1770 tst N, #7 |
1786 beq 8f | 1771 b.eq 8f |
1787 3: | 1772 3: |
1788 tst N, #4 | 1773 tst N, #4 |
1789 beq 3f | 1774 b.eq 3f |
1790 do_load 4 | 1775 do_load 4 |
1791 3: | 1776 3: |
1792 tst N, #2 | 1777 tst N, #2 |
1793 beq 4f | 1778 b.eq 4f |
1794 do_load 2 | 1779 do_load 2 |
1795 4: | 1780 4: |
1796 tst N, #1 | 1781 tst N, #1 |
1797 beq 5f | 1782 b.eq 5f |
1798 do_load 1 | 1783 do_load 1 |
1799 5: | 1784 5: |
1800 do_yuv_to_rgb | 1785 do_yuv_to_rgb |
1801 tst N, #4 | 1786 tst N, #4 |
1802 beq 6f | 1787 b.eq 6f |
1803 do_store \bpp, 4 | 1788 do_store \bpp, 4, \fast_st3 |
1804 6: | 1789 6: |
1805 tst N, #2 | 1790 tst N, #2 |
1806 beq 7f | 1791 b.eq 7f |
1807 do_store \bpp, 2 | 1792 do_store \bpp, 2, \fast_st3 |
1808 7: | 1793 7: |
1809 tst N, #1 | 1794 tst N, #1 |
1810 beq 8f | 1795 b.eq 8f |
1811 do_store \bpp, 1 | 1796 do_store \bpp, 1, \fast_st3 |
1812 8: | 1797 8: |
1813 subs NUM_ROWS, NUM_ROWS, #1 | 1798 subs NUM_ROWS, NUM_ROWS, #1 |
1814 bgt 0b | 1799 b.gt 0b |
1815 9: | 1800 9: |
1816 /* Restore all registers and return */ | 1801 /* Restore all registers and return */ |
1817 sub sp, sp, #336 | 1802 sub sp, sp, #336 |
1818 ldr x15, [sp], 16 | 1803 ldr x15, [sp], 16 |
1819 ld1 {v0.8b - v3.8b}, [sp], 32 | 1804 ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 |
1820 ld1 {v4.8b - v7.8b}, [sp], 32 | 1805 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 |
1821 ld1 {v8.8b - v11.8b}, [sp], 32 | 1806 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
1822 ld1 {v12.8b - v15.8b}, [sp], 32 | 1807 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
1823 ld1 {v16.8b - v19.8b}, [sp], 32 | 1808 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 |
1824 ld1 {v20.8b - v23.8b}, [sp], 32 | 1809 ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 |
1825 ld1 {v24.8b - v27.8b}, [sp], 32 | 1810 ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 |
1826 ld1 {v28.8b - v31.8b}, [sp], 32 | 1811 ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 |
1827 /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */ | 1812 /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */ |
1828 ldp x4, x5, [sp], 16 | 1813 ldp x4, x5, [sp], 16 |
1829 ldp x6, x7, [sp], 16 | 1814 ldp x6, x7, [sp], 16 |
1830 ldp x8, x9, [sp], 16 | 1815 ldp x8, x9, [sp], 16 |
1831 ldp x10, x30, [sp], 16 | 1816 ldp x10, x30, [sp], 16 |
1832 br x30 | 1817 br x30 |
1833 .unreq OUTPUT_WIDTH | 1818 .unreq OUTPUT_WIDTH |
1834 .unreq INPUT_ROW | 1819 .unreq INPUT_ROW |
1835 .unreq OUTPUT_BUF | 1820 .unreq OUTPUT_BUF |
1836 .unreq NUM_ROWS | 1821 .unreq NUM_ROWS |
1837 .unreq INPUT_BUF0 | 1822 .unreq INPUT_BUF0 |
1838 .unreq INPUT_BUF1 | 1823 .unreq INPUT_BUF1 |
1839 .unreq INPUT_BUF2 | 1824 .unreq INPUT_BUF2 |
1840 .unreq RGB | 1825 .unreq RGB |
1841 .unreq Y | 1826 .unreq Y |
1842 .unreq U | 1827 .unreq U |
1843 .unreq V | 1828 .unreq V |
1844 .unreq N | 1829 .unreq N |
1845 | 1830 |
1846 .purgem do_yuv_to_rgb | 1831 .purgem do_yuv_to_rgb |
1847 .purgem do_yuv_to_rgb_stage1 | 1832 .purgem do_yuv_to_rgb_stage1 |
1848 .purgem do_yuv_to_rgb_stage2 | 1833 .purgem do_yuv_to_rgb_stage2 |
1849 .purgem do_yuv_to_rgb_stage2_store_load_stage1 | 1834 .purgem do_yuv_to_rgb_stage2_store_load_stage1 |
| 1835 |
1850 .endm | 1836 .endm |
1851 | 1837 |
1852 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */ | 1838 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3 */ |
1853 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b | 1839 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1 |
1854 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b | 1840 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1 |
1855 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b | 1841 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1 |
1856 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b | 1842 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1 |
1857 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b | 1843 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1 |
1858 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b | 1844 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1 |
1859 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b | 1845 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1 |
 | 1846 |
 | 1847 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0 |
 | 1848 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0 |
 | 1849 |
1860 .purgem do_load | 1850 .purgem do_load |
1861 .purgem do_store | 1851 .purgem do_store |
| 1852 |
| 1853 |
| 1854 /*****************************************************************************/ |
| 1855 |
| 1856 /* |
| 1857 * jsimd_extrgb_ycc_convert_neon |
| 1858 * jsimd_extbgr_ycc_convert_neon |
| 1859 * jsimd_extrgbx_ycc_convert_neon |
| 1860 * jsimd_extbgrx_ycc_convert_neon |
| 1861 * jsimd_extxbgr_ycc_convert_neon |
| 1862 * jsimd_extxrgb_ycc_convert_neon |
| 1863 * |
| 1864 * Colorspace conversion RGB -> YCbCr |
| 1865 */ |
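
(As above, an editorial scalar model of the per-pixel math, not code from the file: the constants are the 16.16 fixed-point factors from the consts tables below, e.g. 19595 = round(0.29900 * 65536), and (128 << 16) + 32767 is the rounded 128.5 bias that the rev64-initialized accumulators start from.)

/* Scalar model of the fixed-point RGB->YCbCr conversion (illustrative). */
static void rgb_to_ycc_pixel(int r, int g, int b, unsigned char ycc[3])
{
  ycc[0] = (unsigned char)((19595 * r + 38470 * g + 7471 * b
                            + 32768) >> 16);                 /* Y, rounded */
  ycc[1] = (unsigned char)((-11059 * r - 21709 * g + 32768 * b
                            + (128 << 16) + 32767) >> 16);   /* Cb */
  ycc[2] = (unsigned char)((32768 * r - 27439 * g - 5329 * b
                            + (128 << 16) + 32767) >> 16);   /* Cr */
}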
| 1866 |
| 1867 .macro do_store size |
| 1868 .if \size == 8 |
| 1869 st1 {v20.8b}, [Y], #8 |
| 1870 st1 {v21.8b}, [U], #8 |
| 1871 st1 {v22.8b}, [V], #8 |
| 1872 .elseif \size == 4 |
| 1873 st1 {v20.b}[0], [Y], #1 |
| 1874 st1 {v20.b}[1], [Y], #1 |
| 1875 st1 {v20.b}[2], [Y], #1 |
| 1876 st1 {v20.b}[3], [Y], #1 |
| 1877 st1 {v21.b}[0], [U], #1 |
| 1878 st1 {v21.b}[1], [U], #1 |
| 1879 st1 {v21.b}[2], [U], #1 |
| 1880 st1 {v21.b}[3], [U], #1 |
| 1881 st1 {v22.b}[0], [V], #1 |
| 1882 st1 {v22.b}[1], [V], #1 |
| 1883 st1 {v22.b}[2], [V], #1 |
| 1884 st1 {v22.b}[3], [V], #1 |
| 1885 .elseif \size == 2 |
| 1886 st1 {v20.b}[4], [Y], #1 |
| 1887 st1 {v20.b}[5], [Y], #1 |
| 1888 st1 {v21.b}[4], [U], #1 |
| 1889 st1 {v21.b}[5], [U], #1 |
| 1890 st1 {v22.b}[4], [V], #1 |
| 1891 st1 {v22.b}[5], [V], #1 |
| 1892 .elseif \size == 1 |
| 1893 st1 {v20.b}[6], [Y], #1 |
| 1894 st1 {v21.b}[6], [U], #1 |
| 1895 st1 {v22.b}[6], [V], #1 |
| 1896 .else |
| 1897 .error unsupported macroblock size |
| 1898 .endif |
| 1899 .endm |
| 1900 |
| 1901 .macro do_load bpp, size, fast_ld3 |
| 1902 .if \bpp == 24 |
| 1903 .if \size == 8 |
| 1904 .if \fast_ld3 == 1 |
| 1905 ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24 |
| 1906 .else |
| 1907 ld1 {v10.b}[0], [RGB], #1 |
| 1908 ld1 {v11.b}[0], [RGB], #1 |
| 1909 ld1 {v12.b}[0], [RGB], #1 |
| 1910 |
| 1911 ld1 {v10.b}[1], [RGB], #1 |
| 1912 ld1 {v11.b}[1], [RGB], #1 |
| 1913 ld1 {v12.b}[1], [RGB], #1 |
| 1914 |
| 1915 ld1 {v10.b}[2], [RGB], #1 |
| 1916 ld1 {v11.b}[2], [RGB], #1 |
| 1917 ld1 {v12.b}[2], [RGB], #1 |
| 1918 |
| 1919 ld1 {v10.b}[3], [RGB], #1 |
| 1920 ld1 {v11.b}[3], [RGB], #1 |
| 1921 ld1 {v12.b}[3], [RGB], #1 |
| 1922 |
| 1923 ld1 {v10.b}[4], [RGB], #1 |
| 1924 ld1 {v11.b}[4], [RGB], #1 |
| 1925 ld1 {v12.b}[4], [RGB], #1 |
| 1926 |
| 1927 ld1 {v10.b}[5], [RGB], #1 |
| 1928 ld1 {v11.b}[5], [RGB], #1 |
| 1929 ld1 {v12.b}[5], [RGB], #1 |
| 1930 |
| 1931 ld1 {v10.b}[6], [RGB], #1 |
| 1932 ld1 {v11.b}[6], [RGB], #1 |
| 1933 ld1 {v12.b}[6], [RGB], #1 |
| 1934 |
| 1935 ld1 {v10.b}[7], [RGB], #1 |
| 1936 ld1 {v11.b}[7], [RGB], #1 |
| 1937 ld1 {v12.b}[7], [RGB], #1 |
| 1938 .endif |
| 1939 prfm pldl1keep, [RGB, #128] |
| 1940 .elseif \size == 4 |
| 1941 ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3 |
| 1942 ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3 |
| 1943 ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3 |
| 1944 ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3 |
| 1945 .elseif \size == 2 |
| 1946 ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3 |
| 1947 ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3 |
| 1948 .elseif \size == 1 |
| 1949 ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3 |
| 1950 .else |
| 1951 .error unsupported macroblock size |
| 1952 .endif |
| 1953 .elseif \bpp == 32 |
| 1954 .if \size == 8 |
| 1955 ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32 |
| 1956 prfm pldl1keep, [RGB, #128] |
| 1957 .elseif \size == 4 |
| 1958 ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4 |
| 1959 ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4 |
| 1960 ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4 |
| 1961 ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4 |
| 1962 .elseif \size == 2 |
| 1963 ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4 |
| 1964 ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4 |
| 1965 .elseif \size == 1 |
| 1966 ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4 |
| 1967 .else |
| 1968 .error unsupported macroblock size |
| 1969 .endif |
| 1970 .else |
| 1971 .error unsupported bpp |
| 1972 .endif |
| 1973 .endm |
| 1974 |
| 1975 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \ |
| 1976 b_offs, fast_ld3 |
| 1977 |
| 1978 /* |
| 1979 * 2-stage pipelined RGB->YCbCr conversion |
| 1980 */ |
| 1981 |
| 1982 .macro do_rgb_to_yuv_stage1 |
| 1983 ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */ |
| 1984 ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */ |
| 1985 ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */ |
| 1986 rev64 v18.4s, v1.4s |
| 1987 rev64 v26.4s, v1.4s |
| 1988 rev64 v28.4s, v1.4s |
| 1989 rev64 v30.4s, v1.4s |
| 1990 umull v14.4s, v4.4h, v0.h[0] |
| 1991 umull2 v16.4s, v4.8h, v0.h[0] |
| 1992 umlsl v18.4s, v4.4h, v0.h[3] |
| 1993 umlsl2 v26.4s, v4.8h, v0.h[3] |
| 1994 umlal v28.4s, v4.4h, v0.h[5] |
| 1995 umlal2 v30.4s, v4.8h, v0.h[5] |
| 1996 umlal v14.4s, v6.4h, v0.h[1] |
| 1997 umlal2 v16.4s, v6.8h, v0.h[1] |
| 1998 umlsl v18.4s, v6.4h, v0.h[4] |
| 1999 umlsl2 v26.4s, v6.8h, v0.h[4] |
| 2000 umlsl v28.4s, v6.4h, v0.h[6] |
| 2001 umlsl2 v30.4s, v6.8h, v0.h[6] |
| 2002 umlal v14.4s, v8.4h, v0.h[2] |
| 2003 umlal2 v16.4s, v8.8h, v0.h[2] |
| 2004 umlal v18.4s, v8.4h, v0.h[5] |
| 2005 umlal2 v26.4s, v8.8h, v0.h[5] |
| 2006 umlsl v28.4s, v8.4h, v0.h[7] |
| 2007 umlsl2 v30.4s, v8.8h, v0.h[7] |
| 2008 .endm |
| 2009 |
| 2010 .macro do_rgb_to_yuv_stage2 |
| 2011 rshrn v20.4h, v14.4s, #16 |
| 2012 shrn v22.4h, v18.4s, #16 |
| 2013 shrn v24.4h, v28.4s, #16 |
| 2014 rshrn2 v20.8h, v16.4s, #16 |
| 2015 shrn2 v22.8h, v26.4s, #16 |
| 2016 shrn2 v24.8h, v30.4s, #16 |
| 2017 xtn v20.8b, v20.8h /* v20 = y */ |
| 2018 xtn v21.8b, v22.8h /* v21 = u */ |
| 2019 xtn v22.8b, v24.8h /* v22 = v */ |
| 2020 .endm |
| 2021 |
| 2022 .macro do_rgb_to_yuv |
| 2023 do_rgb_to_yuv_stage1 |
| 2024 do_rgb_to_yuv_stage2 |
| 2025 .endm |
| 2026 |
 | 2027 /* TODO: expand macros and interleave instructions if some in-order |
 | 2028 * ARM64 processor can actually dual-issue LOAD/STORE with ALU ops */ |
| 2029 .macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3 |
| 2030 do_rgb_to_yuv_stage2 |
| 2031 do_load \bpp, 8, \fast_ld3 |
| 2032 st1 {v20.8b}, [Y], #8 |
| 2033 st1 {v21.8b}, [U], #8 |
| 2034 st1 {v22.8b}, [V], #8 |
| 2035 do_rgb_to_yuv_stage1 |
| 2036 .endm |
| 2037 |
| 2038 .balign 16 |
| 2039 .if \fast_ld3 == 1 |
| 2040 Ljsimd_\colorid\()_ycc_neon_consts: |
| 2041 .else |
| 2042 Ljsimd_\colorid\()_ycc_neon_slowld3_consts: |
| 2043 .endif |
| 2044 .short 19595, 38470, 7471, 11059 |
| 2045 .short 21709, 32768, 27439, 5329 |
| 2046 .short 32767, 128, 32767, 128 |
| 2047 .short 32767, 128, 32767, 128 |
| 2048 |
| 2049 .if \fast_ld3 == 1 |
| 2050 asm_function jsimd_\colorid\()_ycc_convert_neon |
| 2051 .else |
| 2052 asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 |
| 2053 .endif |
| 2054 OUTPUT_WIDTH .req w0 |
| 2055 INPUT_BUF .req x1 |
| 2056 OUTPUT_BUF .req x2 |
| 2057 OUTPUT_ROW .req x3 |
| 2058 NUM_ROWS .req x4 |
| 2059 |
| 2060 OUTPUT_BUF0 .req x5 |
| 2061 OUTPUT_BUF1 .req x6 |
| 2062 OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */ |
| 2063 |
| 2064 RGB .req x7 |
| 2065 Y .req x9 |
| 2066 U .req x10 |
| 2067 V .req x11 |
| 2068 N .req w12 |
| 2069 |
| 2070 /* Load constants to d0, d1, d2, d3 */ |
| 2071 .if \fast_ld3 == 1 |
| 2072 adr x13, Ljsimd_\colorid\()_ycc_neon_consts |
| 2073 .else |
| 2074 adr x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts |
| 2075 .endif |
| 2076 ld1 {v0.8h, v1.8h}, [x13] |
| 2077 |
| 2078 ldr OUTPUT_BUF0, [OUTPUT_BUF] |
| 2079 ldr OUTPUT_BUF1, [OUTPUT_BUF, #8] |
| 2080 ldr OUTPUT_BUF2, [OUTPUT_BUF, #16] |
| 2081 .unreq OUTPUT_BUF |
| 2082 |
| 2083 /* Save NEON registers */ |
| 2084 sub sp, sp, #64 |
| 2085 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
| 2086 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
| 2087 |
| 2088 /* Outer loop over scanlines */ |
| 2089 cmp NUM_ROWS, #1 |
| 2090 b.lt 9f |
| 2091 0: |
| 2092 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3] |
| 2093 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3] |
| 2094 mov N, OUTPUT_WIDTH |
| 2095 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3] |
| 2096 add OUTPUT_ROW, OUTPUT_ROW, #1 |
| 2097 ldr RGB, [INPUT_BUF], #8 |
| 2098 |
| 2099 /* Inner loop over pixels */ |
| 2100 subs N, N, #8 |
| 2101 b.lt 3f |
| 2102 do_load \bpp, 8, \fast_ld3 |
| 2103 do_rgb_to_yuv_stage1 |
| 2104 subs N, N, #8 |
| 2105 b.lt 2f |
| 2106 1: |
| 2107 do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3 |
| 2108 subs N, N, #8 |
| 2109 b.ge 1b |
| 2110 2: |
| 2111 do_rgb_to_yuv_stage2 |
| 2112 do_store 8 |
| 2113 tst N, #7 |
| 2114 b.eq 8f |
| 2115 3: |
| 2116 tbz N, #2, 3f |
| 2117 do_load \bpp, 4, \fast_ld3 |
| 2118 3: |
| 2119 tbz N, #1, 4f |
| 2120 do_load \bpp, 2, \fast_ld3 |
| 2121 4: |
| 2122 tbz N, #0, 5f |
| 2123 do_load \bpp, 1, \fast_ld3 |
| 2124 5: |
| 2125 do_rgb_to_yuv |
| 2126 tbz N, #2, 6f |
| 2127 do_store 4 |
| 2128 6: |
| 2129 tbz N, #1, 7f |
| 2130 do_store 2 |
| 2131 7: |
| 2132 tbz N, #0, 8f |
| 2133 do_store 1 |
| 2134 8: |
| 2135 subs NUM_ROWS, NUM_ROWS, #1 |
| 2136 b.gt 0b |
| 2137 9: |
| 2138 /* Restore all registers and return */ |
| 2139 sub sp, sp, #64 |
| 2140 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
| 2141 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
| 2142 br x30 |
| 2143 |
| 2144 .unreq OUTPUT_WIDTH |
| 2145 .unreq OUTPUT_ROW |
| 2146 .unreq INPUT_BUF |
| 2147 .unreq NUM_ROWS |
| 2148 .unreq OUTPUT_BUF0 |
| 2149 .unreq OUTPUT_BUF1 |
| 2150 .unreq OUTPUT_BUF2 |
| 2151 .unreq RGB |
| 2152 .unreq Y |
| 2153 .unreq U |
| 2154 .unreq V |
| 2155 .unreq N |
| 2156 |
| 2157 .purgem do_rgb_to_yuv |
| 2158 .purgem do_rgb_to_yuv_stage1 |
| 2159 .purgem do_rgb_to_yuv_stage2 |
| 2160 .purgem do_rgb_to_yuv_stage2_store_load_stage1 |
| 2161 |
| 2162 .endm |
| 2163 |
| 2164 /*--------------------------------- id ----- bpp R G B Fast LD3 */ |
| 2165 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1 |
| 2166 generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1 |
| 2167 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1 |
| 2168 generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1 |
| 2169 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1 |
| 2170 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1 |
| 2171 |
| 2172 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0 |
| 2173 generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0 |
| 2174 |
| 2175 .purgem do_load |
| 2176 .purgem do_store |
| 2177 |
| 2178 |
| 2179 /*****************************************************************************/ |
| 2180 |
| 2181 /* |
| 2182 * Load data into workspace, applying unsigned->signed conversion |
| 2183 * |
| 2184 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get |
| 2185 * rid of VST1.16 instructions |
| 2186 */ |
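
(What this routine computes, as an editorial C sketch: gather an 8x8 block from eight row pointers and recenter the unsigned samples around zero. The NEON code below does the same with usubl against a 128-filled register.)

/* Scalar model of jsimd_convsamp_neon (illustrative). */
static void convsamp(const unsigned char *const sample_rows[8],
                     unsigned start_col, short *workspace)
{
  for (int row = 0; row < 8; row++)
    for (int col = 0; col < 8; col++)
      *workspace++ = (short)(sample_rows[row][start_col + col] - 128);
}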
| 2187 |
| 2188 asm_function jsimd_convsamp_neon |
| 2189 SAMPLE_DATA .req x0 |
| 2190 START_COL .req x1 |
| 2191 WORKSPACE .req x2 |
| 2192 TMP1 .req x9 |
| 2193 TMP2 .req x10 |
| 2194 TMP3 .req x11 |
| 2195 TMP4 .req x12 |
| 2196 TMP5 .req x13 |
| 2197 TMP6 .req x14 |
| 2198 TMP7 .req x15 |
| 2199 TMP8 .req x4 |
| 2200 TMPDUP .req w3 |
| 2201 |
| 2202 mov TMPDUP, #128 |
| 2203 ldp TMP1, TMP2, [SAMPLE_DATA], 16 |
| 2204 ldp TMP3, TMP4, [SAMPLE_DATA], 16 |
| 2205 dup v0.8b, TMPDUP |
| 2206 add TMP1, TMP1, START_COL |
| 2207 add TMP2, TMP2, START_COL |
| 2208 ldp TMP5, TMP6, [SAMPLE_DATA], 16 |
| 2209 add TMP3, TMP3, START_COL |
| 2210 add TMP4, TMP4, START_COL |
| 2211 ldp TMP7, TMP8, [SAMPLE_DATA], 16 |
| 2212 add TMP5, TMP5, START_COL |
| 2213 add TMP6, TMP6, START_COL |
| 2214 ld1 {v16.8b}, [TMP1] |
| 2215 add TMP7, TMP7, START_COL |
| 2216 add TMP8, TMP8, START_COL |
| 2217 ld1 {v17.8b}, [TMP2] |
| 2218 usubl v16.8h, v16.8b, v0.8b |
| 2219 ld1 {v18.8b}, [TMP3] |
| 2220 usubl v17.8h, v17.8b, v0.8b |
| 2221 ld1 {v19.8b}, [TMP4] |
| 2222 usubl v18.8h, v18.8b, v0.8b |
| 2223 ld1 {v20.8b}, [TMP5] |
| 2224 usubl v19.8h, v19.8b, v0.8b |
| 2225 ld1 {v21.8b}, [TMP6] |
| 2226 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64 |
| 2227 usubl v20.8h, v20.8b, v0.8b |
| 2228 ld1 {v22.8b}, [TMP7] |
| 2229 usubl v21.8h, v21.8b, v0.8b |
| 2230 ld1 {v23.8b}, [TMP8] |
| 2231 usubl v22.8h, v22.8b, v0.8b |
| 2232 usubl v23.8h, v23.8b, v0.8b |
| 2233 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64 |
| 2234 |
| 2235 br x30 |
| 2236 |
| 2237 .unreq SAMPLE_DATA |
| 2238 .unreq START_COL |
| 2239 .unreq WORKSPACE |
| 2240 .unreq TMP1 |
| 2241 .unreq TMP2 |
| 2242 .unreq TMP3 |
| 2243 .unreq TMP4 |
| 2244 .unreq TMP5 |
| 2245 .unreq TMP6 |
| 2246 .unreq TMP7 |
| 2247 .unreq TMP8 |
| 2248 .unreq TMPDUP |
| 2249 |
| 2250 /*****************************************************************************/ |
| 2251 |
| 2252 /* |
| 2253 * jsimd_fdct_islow_neon |
| 2254 * |
 | 2255 * This function contains a slow-but-accurate integer implementation of |
 | 2256 * the forward DCT (Discrete Cosine Transform). The following code is |
 | 2257 * based directly on the IJG's original jfdctint.c; see jfdctint.c for |
 | 2258 * more details. |
| 2259 * |
| 2260 * TODO: can be combined with 'jsimd_convsamp_neon' to get |
| 2261 * rid of a bunch of VLD1.16 instructions |
| 2262 */ |
| 2263 |
| 2264 #define CONST_BITS 13 |
| 2265 #define PASS1_BITS 2 |
| 2266 |
| 2267 #define DESCALE_P1 (CONST_BITS-PASS1_BITS) |
| 2268 #define DESCALE_P2 (CONST_BITS+PASS1_BITS) |
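
(The F_* constants below and the DESCALE_P* shift counts follow the standard IJG fixed-point conventions from jfdctint.c; an editorial C restatement:)

/* Constants are real values scaled by 2^CONST_BITS, and DESCALE is a rounded
 * right shift -- exactly what the rshrn/srshr instructions below implement.
 * E.g. FIX(0.541196100) == 4433 == F_0_541. */
#define FIX(x)        ((int)((x) * (1 << CONST_BITS) + 0.5))
#define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))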
| 2269 |
| 2270 #define F_0_298 2446 /* FIX(0.298631336) */ |
| 2271 #define F_0_390 3196 /* FIX(0.390180644) */ |
| 2272 #define F_0_541 4433 /* FIX(0.541196100) */ |
| 2273 #define F_0_765 6270 /* FIX(0.765366865) */ |
| 2274 #define F_0_899 7373 /* FIX(0.899976223) */ |
| 2275 #define F_1_175 9633 /* FIX(1.175875602) */ |
| 2276 #define F_1_501 12299 /* FIX(1.501321110) */ |
| 2277 #define F_1_847 15137 /* FIX(1.847759065) */ |
| 2278 #define F_1_961 16069 /* FIX(1.961570560) */ |
| 2279 #define F_2_053 16819 /* FIX(2.053119869) */ |
| 2280 #define F_2_562 20995 /* FIX(2.562915447) */ |
| 2281 #define F_3_072 25172 /* FIX(3.072711026) */ |
| 2282 |
| 2283 .balign 16 |
| 2284 Ljsimd_fdct_islow_neon_consts: |
| 2285 .short F_0_298 |
| 2286 .short -F_0_390 |
| 2287 .short F_0_541 |
| 2288 .short F_0_765 |
 | 2289 .short -F_0_899 |
| 2290 .short F_1_175 |
| 2291 .short F_1_501 |
 | 2292 .short -F_1_847 |
 | 2293 .short -F_1_961 |
 | 2294 .short F_2_053 |
 | 2295 .short -F_2_562 |
| 2296 .short F_3_072 |
| 2297 .short 0 /* padding */ |
| 2298 .short 0 |
| 2299 .short 0 |
| 2300 .short 0 |
| 2301 |
| 2302 #undef F_0_298 |
| 2303 #undef F_0_390 |
| 2304 #undef F_0_541 |
| 2305 #undef F_0_765 |
| 2306 #undef F_0_899 |
| 2307 #undef F_1_175 |
| 2308 #undef F_1_501 |
| 2309 #undef F_1_847 |
| 2310 #undef F_1_961 |
| 2311 #undef F_2_053 |
| 2312 #undef F_2_562 |
| 2313 #undef F_3_072 |
| 2314 #define XFIX_P_0_298 v0.h[0] |
| 2315 #define XFIX_N_0_390 v0.h[1] |
| 2316 #define XFIX_P_0_541 v0.h[2] |
| 2317 #define XFIX_P_0_765 v0.h[3] |
| 2318 #define XFIX_N_0_899 v0.h[4] |
| 2319 #define XFIX_P_1_175 v0.h[5] |
| 2320 #define XFIX_P_1_501 v0.h[6] |
| 2321 #define XFIX_N_1_847 v0.h[7] |
| 2322 #define XFIX_N_1_961 v1.h[0] |
| 2323 #define XFIX_P_2_053 v1.h[1] |
| 2324 #define XFIX_N_2_562 v1.h[2] |
| 2325 #define XFIX_P_3_072 v1.h[3] |
| 2326 |
| 2327 asm_function jsimd_fdct_islow_neon |
| 2328 |
| 2329 DATA .req x0 |
| 2330 TMP .req x9 |
| 2331 |
| 2332 /* Load constants */ |
| 2333 adr TMP, Ljsimd_fdct_islow_neon_consts |
| 2334 ld1 {v0.8h, v1.8h}, [TMP] |
| 2335 |
| 2336 /* Save NEON registers */ |
| 2337 sub sp, sp, #64 |
| 2338 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
| 2339 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
| 2340 |
| 2341 /* Load all DATA into NEON registers with the following allocation: |
| 2342 * 0 1 2 3 | 4 5 6 7 |
| 2343 * ---------+-------- |
| 2344 * 0 | d16 | d17 | v16.8h |
| 2345 * 1 | d18 | d19 | v17.8h |
| 2346 * 2 | d20 | d21 | v18.8h |
| 2347 * 3 | d22 | d23 | v19.8h |
| 2348 * 4 | d24 | d25 | v20.8h |
| 2349 * 5 | d26 | d27 | v21.8h |
| 2350 * 6 | d28 | d29 | v22.8h |
| 2351 * 7 | d30 | d31 | v23.8h |
| 2352 */ |
| 2353 |
| 2354 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 |
| 2355 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] |
| 2356 sub DATA, DATA, #64 |
| 2357 |
| 2358 /* Transpose */ |
| 2359 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 |
| 2360 /* 1-D FDCT */ |
 | 2361 add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ |
 | 2362 sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ |
 | 2363 add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ |
 | 2364 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ |
 | 2365 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ |
 | 2366 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ |
 | 2367 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ |
 | 2368 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ |
| 2369 |
| 2370 /* even part */ |
| 2371 |
| 2372 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ |
| 2373 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ |
| 2374 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ |
| 2375 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ |
| 2376 |
| 2377 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ |
| 2378 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ |
| 2379 |
| 2380 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ |
| 2381 |
 | 2382 shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */ |
 | 2383 shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */ |
| 2384 |
 | 2385 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ |
 | 2386 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ |
| 2387 mov v22.16b, v18.16b |
| 2388 mov v25.16b, v24.16b |
| 2389 |
 | 2390 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ |
 | 2391 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ |
 | 2392 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ |
 | 2393 smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ |
| 2394 |
| 2395 rshrn v18.4h, v18.4s, #DESCALE_P1 |
| 2396 rshrn v22.4h, v22.4s, #DESCALE_P1 |
 | 2397 rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ |
 | 2398 rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ |
| 2399 |
| 2400 /* Odd part */ |
| 2401 |
| 2402 add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ |
| 2403 add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ |
| 2404 add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ |
| 2405 add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ |
 | 2406 smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ |
 | 2407 smull2 v5.4s, v10.8h, XFIX_P_1_175 |
 | 2408 smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ |
| 2409 smlal2 v5.4s, v11.8h, XFIX_P_1_175 |
| 2410 |
| 2411 smull2 v24.4s, v28.8h, XFIX_P_0_298 |
| 2412 smull2 v25.4s, v29.8h, XFIX_P_2_053 |
| 2413 smull2 v26.4s, v30.8h, XFIX_P_3_072 |
| 2414 smull2 v27.4s, v31.8h, XFIX_P_1_501 |
 | 2415 smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ |
 | 2416 smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ |
 | 2417 smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ |
 | 2418 smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ |
| 2419 |
| 2420 smull2 v12.4s, v8.8h, XFIX_N_0_899 |
| 2421 smull2 v13.4s, v9.8h, XFIX_N_2_562 |
| 2422 smull2 v14.4s, v10.8h, XFIX_N_1_961 |
| 2423 smull2 v15.4s, v11.8h, XFIX_N_0_390 |
 | 2424 smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */ |
 | 2425 smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */ |
 | 2426 smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */ |
 | 2427 smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */ |
| 2428 |
| 2429 add v10.4s, v10.4s, v4.4s /* z3 += z5 */ |
| 2430 add v14.4s, v14.4s, v5.4s |
| 2431 add v11.4s, v11.4s, v4.4s /* z4 += z5 */ |
| 2432 add v15.4s, v15.4s, v5.4s |
| 2433 |
| 2434 add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ |
| 2435 add v24.4s, v24.4s, v12.4s |
| 2436 add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ |
| 2437 add v25.4s, v25.4s, v13.4s |
| 2438 add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ |
| 2439 add v26.4s, v26.4s, v14.4s |
| 2440 add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ |
| 2441 add v27.4s, v27.4s, v15.4s |
| 2442 |
| 2443 add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ |
| 2444 add v24.4s, v24.4s, v14.4s |
| 2445 add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ |
| 2446 add v25.4s, v25.4s, v15.4s |
| 2447 add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ |
| 2448 add v26.4s, v26.4s, v13.4s |
| 2449 add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ |
| 2450 add v27.4s, v27.4s, v12.4s |
| 2451 |
| 2452 rshrn v23.4h, v28.4s, #DESCALE_P1 |
| 2453 rshrn v21.4h, v29.4s, #DESCALE_P1 |
| 2454 rshrn v19.4h, v30.4s, #DESCALE_P1 |
| 2455 rshrn v17.4h, v31.4s, #DESCALE_P1 |
 | 2456 rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ |
 | 2457 rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ |
 | 2458 rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ |
 | 2459 rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ |
| 2460 |
| 2461 /* Transpose */ |
| 2462 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 |
| 2463 |
| 2464 /* 1-D FDCT */ |
 | 2465 add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ |
 | 2466 sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ |
 | 2467 add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ |
 | 2468 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ |
 | 2469 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ |
 | 2470 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ |
 | 2471 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ |
 | 2472 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ |
| 2473 |
| 2474 /* even part */ |
| 2475 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ |
| 2476 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ |
| 2477 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ |
| 2478 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ |
| 2479 |
| 2480 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ |
| 2481 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ |
| 2482 |
| 2483 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ |
| 2484 |
 | 2485 srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */ |
 | 2486 srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */ |
| 2487 |
 | 2488 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ |
 | 2489 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ |
| 2490 mov v22.16b, v18.16b |
| 2491 mov v25.16b, v24.16b |
| 2492 |
 | 2493 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ |
 | 2494 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ |
 | 2495 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ |
 | 2496 smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ |
| 2497 |
| 2498 rshrn v18.4h, v18.4s, #DESCALE_P2 |
| 2499 rshrn v22.4h, v22.4s, #DESCALE_P2 |
 | 2500 rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ |
 | 2501 rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ |
| 2502 |
| 2503 /* Odd part */ |
| 2504 add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ |
| 2505 add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ |
| 2506 add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ |
| 2507 add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ |
| 2508 |
 | 2509 smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ |
 | 2510 smull2 v5.4s, v10.8h, XFIX_P_1_175 |
 | 2511 smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ |
| 2512 smlal2 v5.4s, v11.8h, XFIX_P_1_175 |
| 2513 |
| 2514 smull2 v24.4s, v28.8h, XFIX_P_0_298 |
| 2515 smull2 v25.4s, v29.8h, XFIX_P_2_053 |
| 2516 smull2 v26.4s, v30.8h, XFIX_P_3_072 |
| 2517 smull2 v27.4s, v31.8h, XFIX_P_1_501 |
 | 2518 smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ |
 | 2519 smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ |
 | 2520 smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ |
 | 2521 smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ |
| 2522 |
| 2523 smull2 v12.4s, v8.8h, XFIX_N_0_899 |
| 2524 smull2 v13.4s, v9.8h, XFIX_N_2_562 |
| 2525 smull2 v14.4s, v10.8h, XFIX_N_1_961 |
| 2526 smull2 v15.4s, v11.8h, XFIX_N_0_390 |
 | 2527 smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */ |
 | 2528 smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */ |
 | 2529 smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */ |
 | 2530 smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */ |
| 2531 |
| 2532 add v10.4s, v10.4s, v4.4s |
| 2533 add v14.4s, v14.4s, v5.4s |
| 2534 add v11.4s, v11.4s, v4.4s |
| 2535 add v15.4s, v15.4s, v5.4s |
| 2536 |
| 2537 add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ |
| 2538 add v24.4s, v24.4s, v12.4s |
| 2539 add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ |
| 2540 add v25.4s, v25.4s, v13.4s |
| 2541 add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ |
| 2542 add v26.4s, v26.4s, v14.4s |
| 2543 add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ |
| 2544 add v27.4s, v27.4s, v15.4s |
| 2545 |
| 2546 add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ |
| 2547 add v24.4s, v24.4s, v14.4s |
| 2548 add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ |
| 2549 add v25.4s, v25.4s, v15.4s |
| 2550 add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ |
| 2551 add v26.4s, v26.4s, v13.4s |
| 2552 add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ |
| 2553 add v27.4s, v27.4s, v12.4s |
| 2554 |
| 2555 rshrn v23.4h, v28.4s, #DESCALE_P2 |
| 2556 rshrn v21.4h, v29.4s, #DESCALE_P2 |
| 2557 rshrn v19.4h, v30.4s, #DESCALE_P2 |
| 2558 rshrn v17.4h, v31.4s, #DESCALE_P2 |
 | 2559 rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ |
 | 2560 rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ |
 | 2561 rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ |
 | 2562 rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ |
| 2563 |
| 2564 /* store results */ |
| 2565 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 |
| 2566 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] |
| 2567 |
| 2568 /* Restore NEON registers */ |
| 2569 sub sp, sp, #64 |
| 2570 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
| 2571 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
| 2572 |
| 2573 br x30 |
| 2574 |
| 2575 .unreq DATA |
| 2576 .unreq TMP |
| 2577 |
| 2578 #undef XFIX_P_0_298 |
| 2579 #undef XFIX_N_0_390 |
| 2580 #undef XFIX_P_0_541 |
| 2581 #undef XFIX_P_0_765 |
| 2582 #undef XFIX_N_0_899 |
| 2583 #undef XFIX_P_1_175 |
| 2584 #undef XFIX_P_1_501 |
| 2585 #undef XFIX_N_1_847 |
| 2586 #undef XFIX_N_1_961 |
| 2587 #undef XFIX_P_2_053 |
| 2588 #undef XFIX_N_2_562 |
| 2589 #undef XFIX_P_3_072 |
| 2590 |
| 2591 |
| 2592 /*****************************************************************************/ |
| 2593 |
| 2594 /* |
| 2595 * jsimd_fdct_ifast_neon |
| 2596 * |
 | 2597 * This function contains a fast, not-so-accurate integer implementation of |
 | 2598 * the forward DCT (Discrete Cosine Transform). It uses the same calculations |
 | 2599 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' |
 | 2600 * function from jfdctfst.c. |
| 2601 * |
| 2602 * TODO: can be combined with 'jsimd_convsamp_neon' to get |
| 2603 * rid of a bunch of VLD1.16 instructions |
| 2604 */ |
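
(A note on the constant encoding used here: sqdmulh computes the saturated high half of 2*a*b, i.e. multiplication by b/32768, so the jfdctfst.c multipliers are stored scaled by 2^15. An editorial scalar equivalent:)

/* Scalar equivalent of the sqdmulh-based multiplies below (illustrative,
 * ignoring saturation): e.g. 98 * 128 == 12544 and 12544 / 32768 ~= 0.3828,
 * which approximates 0.382683433. */
static short mul_fix15(short x, short c)
{
  return (short)(((int)x * c) >> 15);
}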
| 2605 |
| 2606 #undef XFIX_0_541196100 |
| 2607 #define XFIX_0_382683433 v0.h[0] |
| 2608 #define XFIX_0_541196100 v0.h[1] |
| 2609 #define XFIX_0_707106781 v0.h[2] |
| 2610 #define XFIX_1_306562965 v0.h[3] |
| 2611 |
| 2612 .balign 16 |
| 2613 Ljsimd_fdct_ifast_neon_consts: |
| 2614 .short (98 * 128) /* XFIX_0_382683433 */ |
| 2615 .short (139 * 128) /* XFIX_0_541196100 */ |
| 2616 .short (181 * 128) /* XFIX_0_707106781 */ |
| 2617 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ |
| 2618 |
| 2619 asm_function jsimd_fdct_ifast_neon |
| 2620 |
| 2621 DATA .req x0 |
| 2622 TMP .req x9 |
| 2623 |
| 2624 /* Load constants */ |
| 2625 adr TMP, Ljsimd_fdct_ifast_neon_consts |
| 2626 ld1 {v0.4h}, [TMP] |
| 2627 |
| 2628 /* Load all DATA into NEON registers with the following allocation: |
| 2629 * 0 1 2 3 | 4 5 6 7 |
| 2630 * ---------+-------- |
 | 2631 * 0 | d16 | d17 | v16.8h |
 | 2632 * 1 | d18 | d19 | v17.8h |
 | 2633 * 2 | d20 | d21 | v18.8h |
 | 2634 * 3 | d22 | d23 | v19.8h |
 | 2635 * 4 | d24 | d25 | v20.8h |
 | 2636 * 5 | d26 | d27 | v21.8h |
 | 2637 * 6 | d28 | d29 | v22.8h |
 | 2638 * 7 | d30 | d31 | v23.8h |
| 2639 */ |
| 2640 |
| 2641 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 |
| 2642 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] |
| 2643 mov TMP, #2 |
| 2644 sub DATA, DATA, #64 |
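| /* TMP counts the two 1-D passes: each iteration first transposes the |
|  * block, then runs the same lane-parallel 1-D FDCT across the eight |
|  * vectors, so the second pass effectively transforms the columns. */ |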
| 2645 1: |
| 2646 /* Transpose */ |
| 2647 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4 |
| 2648 subs TMP, TMP, #1 |
| 2649 /* 1-D FDCT */ |
| 2650 add v4.8h, v19.8h, v20.8h |
| 2651 sub v20.8h, v19.8h, v20.8h |
| 2652 sub v28.8h, v18.8h, v21.8h |
| 2653 add v18.8h, v18.8h, v21.8h |
| 2654 sub v29.8h, v17.8h, v22.8h |
| 2655 add v17.8h, v17.8h, v22.8h |
| 2656 sub v21.8h, v16.8h, v23.8h |
| 2657 add v16.8h, v16.8h, v23.8h |
| 2658 sub v6.8h, v17.8h, v18.8h |
| 2659 sub v7.8h, v16.8h, v4.8h |
| 2660 add v5.8h, v17.8h, v18.8h |
| 2661 add v6.8h, v6.8h, v7.8h |
| 2662 add v4.8h, v16.8h, v4.8h |
| 2663 sqdmulh v6.8h, v6.8h, XFIX_0_707106781 |
| 2664 add v19.8h, v20.8h, v28.8h |
| 2665 add v16.8h, v4.8h, v5.8h |
| 2666 sub v20.8h, v4.8h, v5.8h |
| 2667 add v5.8h, v28.8h, v29.8h |
| 2668 add v29.8h, v29.8h, v21.8h |
| 2669 sqdmulh v5.8h, v5.8h, XFIX_0_707106781 |
| 2670 sub v28.8h, v19.8h, v29.8h |
| 2671 add v18.8h, v7.8h, v6.8h |
| 2672 sqdmulh v28.8h, v28.8h, XFIX_0_382683433 |
| 2673 sub v22.8h, v7.8h, v6.8h |
| 2674 sqdmulh v19.8h, v19.8h, XFIX_0_541196100 |
| 2675 sqdmulh v7.8h, v29.8h, XFIX_1_306562965 |
| 2676 add v6.8h, v21.8h, v5.8h |
| 2677 sub v5.8h, v21.8h, v5.8h |
| 2678 add v29.8h, v29.8h, v28.8h |
| 2679 add v19.8h, v19.8h, v28.8h |
| 2680 add v29.8h, v29.8h, v7.8h |
| 2681 add v21.8h, v5.8h, v19.8h |
| 2682 sub v19.8h, v5.8h, v19.8h |
| 2683 add v17.8h, v6.8h, v29.8h |
| 2684 sub v23.8h, v6.8h, v29.8h |
| 2685 |
| 2686 b.ne 1b |
| 2687 |
| 2688 /* store results */ |
| 2689 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 |
| 2690 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] |
| 2691 |
| 2692 br x30 |
| 2693 |
| 2694 .unreq DATA |
| 2695 .unreq TMP |
| 2696 #undef XFIX_0_382683433 |
| 2697 #undef XFIX_0_541196100 |
| 2698 #undef XFIX_0_707106781 |
| 2699 #undef XFIX_1_306562965 |
| 2700 |
| 2701 |
| 2702 /*****************************************************************************/ |
| 2703 |
| 2704 /* |
| 2705 * GLOBAL(void) |
| 2706 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors, |
| 2707 * DCTELEM *workspace); |
| 2708 * |
| 2709 */ |
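| /* |
|  * Rough per-coefficient C model of the vector loop below (a sketch, not |
|  * the upstream scalar code; recip/corr/shift name the three 64-entry |
|  * tables packed consecutively into *divisors): |
|  * |
|  *   int16_t v     = workspace[i]; |
|  *   int16_t sign  = v >> 15;                        // 0 or -1 |
|  *   uint16_t mag  = (uint16_t)abs(v) + corr[i];     // rounding correction |
|  *   mag = (uint16_t)(((uint32_t)mag * recip[i]) >> 16); |
|  *   mag >>= shift[i];                               // final descale |
|  *   coef_block[i] = (int16_t)((mag ^ sign) - sign); // re-apply sign |
|  */ |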
| 2710 asm_function jsimd_quantize_neon |
| 2711 |
| 2712 COEF_BLOCK .req x0 |
| 2713 DIVISORS .req x1 |
| 2714 WORKSPACE .req x2 |
| 2715 |
| 2716 RECIPROCAL .req DIVISORS |
| 2717 CORRECTION .req x9 |
| 2718 SHIFT .req x10 |
| 2719 LOOP_COUNT .req x11 |
| 2720 |
| 2721 mov LOOP_COUNT, #2 |
| 2722 add CORRECTION, DIVISORS, #(64 * 2) |
| 2723 add SHIFT, DIVISORS, #(64 * 6) |
| 2724 1: |
| 2725 subs LOOP_COUNT, LOOP_COUNT, #1 |
| 2726 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64 |
| 2727 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64 |
| 2728 abs v20.8h, v0.8h |
| 2729 abs v21.8h, v1.8h |
| 2730 abs v22.8h, v2.8h |
| 2731 abs v23.8h, v3.8h |
| 2732 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64 |
| 2733 add v20.8h, v20.8h, v4.8h /* add correction */ |
| 2734 add v21.8h, v21.8h, v5.8h |
| 2735 add v22.8h, v22.8h, v6.8h |
| 2736 add v23.8h, v23.8h, v7.8h |
| 2737 umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */ |
| 2738 umull2 v16.4s, v20.8h, v28.8h |
| 2739 umull v5.4s, v21.4h, v29.4h |
| 2740 umull2 v17.4s, v21.8h, v29.8h |
| 2741 umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */ |
| 2742 umull2 v18.4s, v22.8h, v30.8h |
| 2743 umull v7.4s, v23.4h, v31.4h |
| 2744 umull2 v19.4s, v23.8h, v31.8h |
| 2745 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64 |
| 2746 shrn v4.4h, v4.4s, #16 |
| 2747 shrn v5.4h, v5.4s, #16 |
| 2748 shrn v6.4h, v6.4s, #16 |
| 2749 shrn v7.4h, v7.4s, #16 |
| 2750 shrn2 v4.8h, v16.4s, #16 |
| 2751 shrn2 v5.8h, v17.4s, #16 |
| 2752 shrn2 v6.8h, v18.4s, #16 |
| 2753 shrn2 v7.8h, v19.4s, #16 |
| 2754 neg v24.8h, v24.8h |
| 2755 neg v25.8h, v25.8h |
| 2756 neg v26.8h, v26.8h |
| 2757 neg v27.8h, v27.8h |
| 2758 sshr v0.8h, v0.8h, #15 /* extract sign */ |
| 2759 sshr v1.8h, v1.8h, #15 |
| 2760 sshr v2.8h, v2.8h, #15 |
| 2761 sshr v3.8h, v3.8h, #15 |
| 2762 ushl v4.8h, v4.8h, v24.8h /* shift */ |
| 2763 ushl v5.8h, v5.8h, v25.8h |
| 2764 ushl v6.8h, v6.8h, v26.8h |
| 2765 ushl v7.8h, v7.8h, v27.8h |
| 2766 |
| 2767 eor v4.16b, v4.16b, v0.16b /* restore sign */ |
| 2768 eor v5.16b, v5.16b, v1.16b |
| 2769 eor v6.16b, v6.16b, v2.16b |
| 2770 eor v7.16b, v7.16b, v3.16b |
| 2771 sub v4.8h, v4.8h, v0.8h |
| 2772 sub v5.8h, v5.8h, v1.8h |
| 2773 sub v6.8h, v6.8h, v2.8h |
| 2774 sub v7.8h, v7.8h, v3.8h |
| 2775 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64 |
| 2776 |
| 2777 b.ne 1b |
| 2778 |
| 2779 br x30 /* return */ |
| 2780 |
| 2781 .unreq COEF_BLOCK |
| 2782 .unreq DIVISORS |
| 2783 .unreq WORKSPACE |
| 2784 .unreq RECIPROCAL |
| 2785 .unreq CORRECTION |
| 2786 .unreq SHIFT |
| 2787 .unreq LOOP_COUNT |
| 2788 |
| 2789 |
| 2790 /*****************************************************************************/ |
| 2791 |
| 2792 /* |
| 2793 * Downsample pixel values of a single component. |
| 2794 * This version handles the common case of 2:1 horizontal and 1:1 vertical, |
| 2795 * without smoothing. |
| 2796 * |
| 2797 * GLOBAL(void) |
| 2798 * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor, |
| 2799 * JDIMENSION v_samp_factor, |
| 2800 * JDIMENSION width_blocks, JSAMPARRAY input_data, |
| 2801 * JSAMPARRAY output_data); |
| 2802 */ |
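| /* |
|  * Per output pixel this computes, in C terms (sketch; the alternating |
|  * bias is what the 0x10000 splat below yields in the halfword lanes): |
|  * |
|  *   out[j] = (in[2*j] + in[2*j + 1] + (j & 1)) >> 1; |
|  * |
|  * matching the 0, 1, 0, 1, ... rounding bias of the scalar downsampler |
|  * in jcsample.c. |
|  */ |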
| 2803 |
| 2804 .balign 16 |
| 2805 Ljsimd_h2_downsample_neon_consts: |
| 2806 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2807 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */ |
| 2808 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2809 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */ |
| 2810 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2811 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */ |
| 2812 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2813 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */ |
| 2814 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2815 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */ |
| 2816 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2817 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */ |
| 2818 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2819 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */ |
| 2820 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2821 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */ |
| 2822 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2823 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */ |
| 2824 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \ |
| 2825 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */ |
| 2826 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \ |
| 2827 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */ |
| 2828 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \ |
| 2829 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */ |
| 2830 .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \ |
| 2831 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */ |
| 2832 .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \ |
| 2833 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */ |
| 2834 .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \ |
| 2835 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */ |
| 2836 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ |
| 2837 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */ |
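| /* The 16 shuffle vectors above drive the 'tbl' in the last-columns path: |
|  * entry 'diff N' repeats the right-most valid byte N times, so a partial |
|  * final block is padded by replicating its last pixel before averaging. */ |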
| 2838 |
| 2839 asm_function jsimd_h2v1_downsample_neon |
| 2840 IMAGE_WIDTH .req x0 |
| 2841 MAX_V_SAMP .req x1 |
| 2842 V_SAMP .req x2 |
| 2843 BLOCK_WIDTH .req x3 |
| 2844 INPUT_DATA .req x4 |
| 2845 OUTPUT_DATA .req x5 |
| 2846 OUTPTR .req x9 |
| 2847 INPTR .req x10 |
| 2848 TMP1 .req x11 |
| 2849 TMP2 .req x12 |
| 2850 TMP3 .req x13 |
| 2851 TMPDUP .req w15 |
| 2852 |
| 2853 mov TMPDUP, #0x10000 |
| 2854 lsl TMP2, BLOCK_WIDTH, #4 |
| 2855 sub TMP2, TMP2, IMAGE_WIDTH |
| 2856 adr TMP3, Ljsimd_h2_downsample_neon_consts |
| 2857 add TMP3, TMP3, TMP2, lsl #4 |
| 2858 dup v16.4s, TMPDUP |
| 2859 ld1 {v18.16b}, [TMP3] |
| 2860 |
| 2861 1: /* row loop */ |
| 2862 ldr INPTR, [INPUT_DATA], #8 |
| 2863 ldr OUTPTR, [OUTPUT_DATA], #8 |
| 2864 subs TMP1, BLOCK_WIDTH, #1 |
| 2865 b.eq 3f |
| 2866 2: /* columns */ |
| 2867 ld1 {v0.16b}, [INPTR], #16 |
| 2868 mov v4.16b, v16.16b |
| 2869 subs TMP1, TMP1, #1 |
| 2870 uadalp v4.8h, v0.16b |
| 2871 shrn v6.8b, v4.8h, #1 |
| 2872 st1 {v6.8b}, [OUTPTR], #8 |
| 2873 b.ne 2b |
| 2874 3: /* last columns */ |
| 2875 ld1 {v0.16b}, [INPTR] |
| 2876 mov v4.16b, v16.16b |
| 2877 subs V_SAMP, V_SAMP, #1 |
| 2878 /* expand right */ |
| 2879 tbl v2.16b, {v0.16b}, v18.16b |
| 2880 uadalp v4.8h, v2.16b |
| 2881 shrn v6.8b, v4.8h, #1 |
| 2882 st1 {v6.8b}, [OUTPTR], #8 |
| 2883 b.ne 1b |
| 2884 |
| 2885 br x30 |
| 2886 |
| 2887 .unreq IMAGE_WIDTH |
| 2888 .unreq MAX_V_SAMP |
| 2889 .unreq V_SAMP |
| 2890 .unreq BLOCK_WIDTH |
| 2891 .unreq INPUT_DATA |
| 2892 .unreq OUTPUT_DATA |
| 2893 .unreq OUTPTR |
| 2894 .unreq INPTR |
| 2895 .unreq TMP1 |
| 2896 .unreq TMP2 |
| 2897 .unreq TMP3 |
| 2898 .unreq TMPDUP |
| 2899 |
| 2900 |
| 2901 /*****************************************************************************/ |
| 2902 |
| 2903 /* |
| 2904 * Downsample pixel values of a single component. |
| 2905 * This version handles the common case of 2:1 horizontal and 2:1 vertical, |
| 2906 * without smoothing. |
| 2907 * |
| 2908 * GLOBAL(void) |
| 2909 * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor, |
| 2910 * JDIMENSION v_samp_factor, JDIMENSION width_blocks, |
| 2911 * JSAMPARRAY input_data, JSAMPARRAY output_data); |
| 2912 */ |
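| /* |
|  * Same scheme as h2v1 above, but averaging a 2x2 block per output pixel: |
|  * |
|  *   out[j] = (in0[2*j] + in0[2*j+1] + in1[2*j] + in1[2*j+1] + bias) >> 2; |
|  * |
|  * with bias alternating 1, 2, 1, 2, ... per column -- the 0x00020001 |
|  * splat built from TMPDUP below. |
|  */ |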
| 2913 |
| 2914 .balign 16 |
| 2915 asm_function jsimd_h2v2_downsample_neon |
| 2916 IMAGE_WIDTH .req x0 |
| 2917 MAX_V_SAMP .req x1 |
| 2918 V_SAMP .req x2 |
| 2919 BLOCK_WIDTH .req x3 |
| 2920 INPUT_DATA .req x4 |
| 2921 OUTPUT_DATA .req x5 |
| 2922 OUTPTR .req x9 |
| 2923 INPTR0 .req x10 |
| 2924 INPTR1 .req x14 |
| 2925 TMP1 .req x11 |
| 2926 TMP2 .req x12 |
| 2927 TMP3 .req x13 |
| 2928 TMPDUP .req w15 |
| 2929 |
| 2930 mov TMPDUP, #1 |
| 2931 lsl TMP2, BLOCK_WIDTH, #4 |
| 2932 lsl TMPDUP, TMPDUP, #17 |
| 2933 sub TMP2, TMP2, IMAGE_WIDTH |
| 2934 adr TMP3, Ljsimd_h2_downsample_neon_consts |
| 2935 orr TMPDUP, TMPDUP, #1 |
| 2936 add TMP3, TMP3, TMP2, lsl #4 |
| 2937 dup v16.4s, TMPDUP |
| 2938 ld1 {v18.16b}, [TMP3] |
| 2939 |
| 2940 1: /* row loop */ |
| 2941 ldr INPTR0, [INPUT_DATA], #8 |
| 2942 ldr OUTPTR, [OUTPUT_DATA], #8 |
| 2943 ldr INPTR1, [INPUT_DATA], #8 |
| 2944 subs TMP1, BLOCK_WIDTH, #1 |
| 2945 b.eq 3f |
| 2946 2: /* columns */ |
| 2947 ld1 {v0.16b}, [INPTR0], #16 |
| 2948 ld1 {v1.16b}, [INPTR1], #16 |
| 2949 mov v4.16b, v16.16b |
| 2950 subs TMP1, TMP1, #1 |
| 2951 uadalp v4.8h, v0.16b |
| 2952 uadalp v4.8h, v1.16b |
| 2953 shrn v6.8b, v4.8h, #2 |
| 2954 st1 {v6.8b}, [OUTPTR], #8 |
| 2955 b.ne 2b |
| 2956 3: /* last columns */ |
| 2957 ld1 {v0.16b}, [INPTR0], #16 |
| 2958 ld1 {v1.16b}, [INPTR1], #16 |
| 2959 mov v4.16b, v16.16b |
| 2960 subs V_SAMP, V_SAMP, #1 |
| 2961 /* expand right */ |
| 2962 tbl v2.16b, {v0.16b}, v18.16b |
| 2963 tbl v3.16b, {v1.16b}, v18.16b |
| 2964 uadalp v4.8h, v2.16b |
| 2965 uadalp v4.8h, v3.16b |
| 2966 shrn v6.8b, v4.8h, #2 |
| 2967 st1 {v6.8b}, [OUTPTR], #8 |
| 2968 b.ne 1b |
| 2969 |
| 2970 br x30 |
| 2971 |
| 2972 .unreq IMAGE_WIDTH |
| 2973 .unreq MAX_V_SAMP |
| 2974 .unreq V_SAMP |
| 2975 .unreq BLOCK_WIDTH |
| 2976 .unreq INPUT_DATA |
| 2977 .unreq OUTPUT_DATA |
| 2978 .unreq OUTPTR |
| 2979 .unreq INPTR0 |
| 2980 .unreq INPTR1 |
| 2981 .unreq TMP1 |
| 2982 .unreq TMP2 |
| 2983 .unreq TMP3 |
| 2984 .unreq TMPDUP |
| 2985 |
| 2986 |
| 2987 /*****************************************************************************/ |
| 2988 |
| 2989 /* |
| 2990 * GLOBAL(JOCTET*) |
| 2991 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer, |
| 2992 * JCOEFPTR block, int last_dc_val, |
| 2993 * c_derived_tbl *dctbl, c_derived_tbl *actbl) |
| 2994 * |
| 2995 */ |
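| /* |
|  * Bit-buffer model behind the macros below (C sketch, illustrative only): |
|  * codes are shifted into a 64-bit accumulator and whole bytes are flushed |
|  * from its top, with the JPEG 0x00 stuffing byte after any emitted 0xff: |
|  * |
|  *   // put_bits: |
|  *   put_buffer = (put_buffer << size) | code;  put_bits += size; |
|  * |
|  *   // emit_byte: |
|  *   put_bits -= 8; |
|  *   uint8_t b = (uint8_t)(put_buffer >> put_bits); |
|  *   *(++buffer) = b; |
|  *   if (b == 0xff) |
|  *     *(++buffer) = 0; |
|  */ |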
| 2996 |
| 2997 BUFFER .req x1 |
| 2998 PUT_BUFFER .req x6 |
| 2999 PUT_BITS .req x7 |
| 3000 PUT_BITSw .req w7 |
| 3001 |
| 3002 .macro emit_byte |
| 3003 sub PUT_BITS, PUT_BITS, #0x8 |
| 3004 lsr x19, PUT_BUFFER, PUT_BITS |
| 3005 uxtb w19, w19 |
| 3006 strb w19, [BUFFER, #1]! |
| 3007 cmp w19, #0xff |
| 3008 b.ne 14f |
| 3009 strb wzr, [BUFFER, #1]! |
| 3010 14: |
| 3011 .endm |
| 3012 .macro put_bits CODE, SIZE |
| 3013 lsl PUT_BUFFER, PUT_BUFFER, \SIZE |
| 3014 add PUT_BITS, PUT_BITS, \SIZE |
| 3015 orr PUT_BUFFER, PUT_BUFFER, \CODE |
| 3016 .endm |
| 3017 .macro checkbuf31 |
| 3018 cmp PUT_BITS, #0x20 |
| 3019 b.lt 31f |
| 3020 emit_byte |
| 3021 emit_byte |
| 3022 emit_byte |
| 3023 emit_byte |
| 3024 31: |
| 3025 .endm |
| 3026 .macro checkbuf47 |
| 3027 cmp PUT_BITS, #0x30 |
| 3028 b.lt 47f |
| 3029 emit_byte |
| 3030 emit_byte |
| 3031 emit_byte |
| 3032 emit_byte |
| 3033 emit_byte |
| 3034 emit_byte |
| 3035 47: |
| 3036 .endm |
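| /* checkbuf31/checkbuf47 keep the accumulator from overflowing: they leave |
|  * fewer than 32 (resp. 48) bits pending, so a following put_bits of up to |
|  * 32 (resp. 16) bits always fits in the 64-bit PUT_BUFFER. */ |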
| 3037 |
| 3038 .macro generate_jsimd_huff_encode_one_block fast_tbl |
| 3039 |
| 3040 .balign 16 |
| 3041 .if \fast_tbl == 1 |
| 3042 Ljsimd_huff_encode_one_block_neon_consts: |
| 3043 .else |
| 3044 Ljsimd_huff_encode_one_block_neon_slowtbl_consts: |
| 3045 .endif |
| 3046 .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \ |
| 3047 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 |
| 3048 .if \fast_tbl == 1 |
| 3049 .byte 0, 1, 2, 3, 16, 17, 32, 33, \ |
| 3050 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */ |
| 3051 .byte 34, 35, 48, 49, 255, 255, 50, 51, \ |
| 3052 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */ |
| 3053 .byte 8, 9, 22, 23, 36, 37, 50, 51, \ |
| 3054 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */ |
| 3055 .byte 54, 55, 40, 41, 26, 27, 12, 13, \ |
| 3056 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */ |
| 3057 .byte 6, 7, 20, 21, 34, 35, 48, 49, \ |
| 3058 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */ |
| 3059 .byte 42, 43, 28, 29, 14, 15, 30, 31, \ |
| 3060 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */ |
| 3061 .byte 255, 255, 255, 255, 56, 57, 42, 43, \ |
| 3062 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */ |
| 3063 .byte 26, 27, 40, 41, 42, 43, 28, 29, \ |
| 3064 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */ |
| 3065 .byte 255, 255, 255, 255, 0, 1, 255, 255, \ |
| 3066 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */ |
| 3067 .byte 255, 255, 255, 255, 255, 255, 255, 255, \ |
| 3068 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */ |
| 3069 .byte 255, 255, 255, 255, 255, 255, 255, 255, \ |
| 3070 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */ |
| 3071 .byte 4, 5, 6, 7, 255, 255, 255, 255, \ |
| 3072 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */ |
| 3073 .endif |
| 3074 |
| 3075 .if \fast_tbl == 1 |
| 3076 asm_function jsimd_huff_encode_one_block_neon |
| 3077 .else |
| 3078 asm_function jsimd_huff_encode_one_block_neon_slowtbl |
| 3079 .endif |
| 3080 sub sp, sp, 272 |
| 3081 sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ |
| 3082 /* Save ARM registers */ |
| 3083 stp x19, x20, [sp], 16 |
| 3084 .if \fast_tbl == 1 |
| 3085 adr x15, Ljsimd_huff_encode_one_block_neon_consts |
| 3086 .else |
| 3087 adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts |
| 3088 .endif |
| 3089 ldr PUT_BUFFER, [x0, #0x10] |
| 3090 ldr PUT_BITSw, [x0, #0x18] |
| 3091 ldrsh w12, [x2] /* load DC coeff in w12 */ |
| 3092 /* prepare data */ |
| 3093 .if \fast_tbl == 1 |
| 3094 ld1 {v23.16b}, [x15], #16 |
| 3095 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64 |
| 3096 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64 |
| 3097 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64 |
| 3098 ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64 |
| 3099 ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64 |
| 3100 sub w12, w12, w3 /* last_dc_val, not used afterwards */ |
| 3101 /* ZigZag 8x8 */ |
| 3102 tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b |
| 3103 tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b |
| 3104 tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b |
| 3105 tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b |
| 3106 tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b |
| 3107 tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b |
| 3108 tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b |
| 3109 tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b |
| 3110 ins v0.h[0], w12 |
| 3111 tbx v1.16b, {v28.16b}, v16.16b |
| 3112 tbx v2.16b, {v29.16b, v30.16b}, v17.16b |
| 3113 tbx v5.16b, {v29.16b, v30.16b}, v18.16b |
| 3114 tbx v6.16b, {v31.16b}, v19.16b |
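| /* v0..v7 now hold the 64 coefficients in zigzag scan order: the tbl/tbx |
|  * pairs do the reordering as byte-table lookups (a 0xff index selects no |
|  * source byte, so tbx patches those lanes from a second register set), |
|  * and ins has already replaced the DC term with the DC difference. */ |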
| 3115 .else |
| 3116 add x13, x2, #0x22 |
| 3117 sub w12, w12, w3 /* last_dc_val, not used afterwards */ |
| 3118 ld1 {v23.16b}, [x15] |
| 3119 add x14, x2, #0x18 |
| 3120 add x3, x2, #0x36 |
| 3121 ins v0.h[0], w12 |
| 3122 add x9, x2, #0x2 |
| 3123 ld1 {v1.h}[0], [x13] |
| 3124 add x15, x2, #0x30 |
| 3125 ld1 {v2.h}[0], [x14] |
| 3126 add x19, x2, #0x26 |
| 3127 ld1 {v3.h}[0], [x3] |
| 3128 add x20, x2, #0x28 |
| 3129 ld1 {v0.h}[1], [x9] |
| 3130 add x12, x2, #0x10 |
| 3131 ld1 {v1.h}[1], [x15] |
| 3132 add x13, x2, #0x40 |
| 3133 ld1 {v2.h}[1], [x19] |
| 3134 add x14, x2, #0x34 |
| 3135 ld1 {v3.h}[1], [x20] |
| 3136 add x3, x2, #0x1a |
| 3137 ld1 {v0.h}[2], [x12] |
| 3138 add x9, x2, #0x20 |
| 3139 ld1 {v1.h}[2], [x13] |
| 3140 add x15, x2, #0x32 |
| 3141 ld1 {v2.h}[2], [x14] |
| 3142 add x19, x2, #0x42 |
| 3143 ld1 {v3.h}[2], [x3] |
| 3144 add x20, x2, #0xc |
| 3145 ld1 {v0.h}[3], [x9] |
| 3146 add x12, x2, #0x12 |
| 3147 ld1 {v1.h}[3], [x15] |
| 3148 add x13, x2, #0x24 |
| 3149 ld1 {v2.h}[3], [x19] |
| 3150 add x14, x2, #0x50 |
| 3151 ld1 {v3.h}[3], [x20] |
| 3152 add x3, x2, #0xe |
| 3153 ld1 {v0.h}[4], [x12] |
| 3154 add x9, x2, #0x4 |
| 3155 ld1 {v1.h}[4], [x13] |
| 3156 add x15, x2, #0x16 |
| 3157 ld1 {v2.h}[4], [x14] |
| 3158 add x19, x2, #0x60 |
| 3159 ld1 {v3.h}[4], [x3] |
| 3160 add x20, x2, #0x1c |
| 3161 ld1 {v0.h}[5], [x9] |
| 3162 add x12, x2, #0x6 |
| 3163 ld1 {v1.h}[5], [x15] |
| 3164 add x13, x2, #0x8 |
| 3165 ld1 {v2.h}[5], [x19] |
| 3166 add x14, x2, #0x52 |
| 3167 ld1 {v3.h}[5], [x20] |
| 3168 add x3, x2, #0x2a |
| 3169 ld1 {v0.h}[6], [x12] |
| 3170 add x9, x2, #0x14 |
| 3171 ld1 {v1.h}[6], [x13] |
| 3172 add x15, x2, #0xa |
| 3173 ld1 {v2.h}[6], [x14] |
| 3174 add x19, x2, #0x44 |
| 3175 ld1 {v3.h}[6], [x3] |
| 3176 add x20, x2, #0x38 |
| 3177 ld1 {v0.h}[7], [x9] |
| 3178 add x12, x2, #0x46 |
| 3179 ld1 {v1.h}[7], [x15] |
| 3180 add x13, x2, #0x3a |
| 3181 ld1 {v2.h}[7], [x19] |
| 3182 add x14, x2, #0x74 |
| 3183 ld1 {v3.h}[7], [x20] |
| 3184 add x3, x2, #0x6a |
| 3185 ld1 {v4.h}[0], [x12] |
| 3186 add x9, x2, #0x54 |
| 3187 ld1 {v5.h}[0], [x13] |
| 3188 add x15, x2, #0x2c |
| 3189 ld1 {v6.h}[0], [x14] |
| 3190 add x19, x2, #0x76 |
| 3191 ld1 {v7.h}[0], [x3] |
| 3192 add x20, x2, #0x78 |
| 3193 ld1 {v4.h}[1], [x9] |
| 3194 add x12, x2, #0x62 |
| 3195 ld1 {v5.h}[1], [x15] |
| 3196 add x13, x2, #0x1e |
| 3197 ld1 {v6.h}[1], [x19] |
| 3198 add x14, x2, #0x68 |
| 3199 ld1 {v7.h}[1], [x20] |
| 3200 add x3, x2, #0x7a |
| 3201 ld1 {v4.h}[2], [x12] |
| 3202 add x9, x2, #0x70 |
| 3203 ld1 {v5.h}[2], [x13] |
| 3204 add x15, x2, #0x2e |
| 3205 ld1 {v6.h}[2], [x14] |
| 3206 add x19, x2, #0x5a |
| 3207 ld1 {v7.h}[2], [x3] |
| 3208 add x20, x2, #0x6c |
| 3209 ld1 {v4.h}[3], [x9] |
| 3210 add x12, x2, #0x72 |
| 3211 ld1 {v5.h}[3], [x15] |
| 3212 add x13, x2, #0x3c |
| 3213 ld1 {v6.h}[3], [x19] |
| 3214 add x14, x2, #0x4c |
| 3215 ld1 {v7.h}[3], [x20] |
| 3216 add x3, x2, #0x5e |
| 3217 ld1 {v4.h}[4], [x12] |
| 3218 add x9, x2, #0x64 |
| 3219 ld1 {v5.h}[4], [x13] |
| 3220 add x15, x2, #0x4a |
| 3221 ld1 {v6.h}[4], [x14] |
| 3222 add x19, x2, #0x3e |
| 3223 ld1 {v7.h}[4], [x3] |
| 3224 add x20, x2, #0x6e |
| 3225 ld1 {v4.h}[5], [x9] |
| 3226 add x12, x2, #0x56 |
| 3227 ld1 {v5.h}[5], [x15] |
| 3228 add x13, x2, #0x58 |
| 3229 ld1 {v6.h}[5], [x19] |
| 3230 add x14, x2, #0x4e |
| 3231 ld1 {v7.h}[5], [x20] |
| 3232 add x3, x2, #0x7c |
| 3233 ld1 {v4.h}[6], [x12] |
| 3234 add x9, x2, #0x48 |
| 3235 ld1 {v5.h}[6], [x13] |
| 3236 add x15, x2, #0x66 |
| 3237 ld1 {v6.h}[6], [x14] |
| 3238 add x19, x2, #0x5c |
| 3239 ld1 {v7.h}[6], [x3] |
| 3240 add x20, x2, #0x7e |
| 3241 ld1 {v4.h}[7], [x9] |
| 3242 ld1 {v5.h}[7], [x15] |
| 3243 ld1 {v6.h}[7], [x19] |
| 3244 ld1 {v7.h}[7], [x20] |
| 3245 .endif |
| 3246 cmlt v24.8h, v0.8h, #0 |
| 3247 cmlt v25.8h, v1.8h, #0 |
| 3248 cmlt v26.8h, v2.8h, #0 |
| 3249 cmlt v27.8h, v3.8h, #0 |
| 3250 cmlt v28.8h, v4.8h, #0 |
| 3251 cmlt v29.8h, v5.8h, #0 |
| 3252 cmlt v30.8h, v6.8h, #0 |
| 3253 cmlt v31.8h, v7.8h, #0 |
| 3254 abs v0.8h, v0.8h |
| 3255 abs v1.8h, v1.8h |
| 3256 abs v2.8h, v2.8h |
| 3257 abs v3.8h, v3.8h |
| 3258 abs v4.8h, v4.8h |
| 3259 abs v5.8h, v5.8h |
| 3260 abs v6.8h, v6.8h |
| 3261 abs v7.8h, v7.8h |
| 3262 eor v24.16b, v24.16b, v0.16b |
| 3263 eor v25.16b, v25.16b, v1.16b |
| 3264 eor v26.16b, v26.16b, v2.16b |
| 3265 eor v27.16b, v27.16b, v3.16b |
| 3266 eor v28.16b, v28.16b, v4.16b |
| 3267 eor v29.16b, v29.16b, v5.16b |
| 3268 eor v30.16b, v30.16b, v6.16b |
| 3269 eor v31.16b, v31.16b, v7.16b |
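| /* JPEG transmits a negative coefficient v as the bit pattern of v - 1 |
|  * (== ~abs(v)), so v24..v31 = abs(v) ^ (v < 0 ? -1 : 0) are the bits to |
|  * emit, while v0..v7 keep abs(v) for the size (bit-length) computation. */ |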
| 3270 cmeq v16.8h, v0.8h, #0 |
| 3271 cmeq v17.8h, v1.8h, #0 |
| 3272 cmeq v18.8h, v2.8h, #0 |
| 3273 cmeq v19.8h, v3.8h, #0 |
| 3274 cmeq v20.8h, v4.8h, #0 |
| 3275 cmeq v21.8h, v5.8h, #0 |
| 3276 cmeq v22.8h, v6.8h, #0 |
| 3277 xtn v16.8b, v16.8h |
| 3278 xtn v18.8b, v18.8h |
| 3279 xtn v20.8b, v20.8h |
| 3280 xtn v22.8b, v22.8h |
| 3281 umov w14, v0.h[0] |
| 3282 xtn2 v16.16b, v17.8h |
| 3283 umov w13, v24.h[0] |
| 3284 xtn2 v18.16b, v19.8h |
| 3285 clz w14, w14 |
| 3286 xtn2 v20.16b, v21.8h |
| 3287 lsl w13, w13, w14 |
| 3288 cmeq v17.8h, v7.8h, #0 |
| 3289 sub w12, w14, #32 |
| 3290 xtn2 v22.16b, v17.8h |
| 3291 lsr w13, w13, w14 |
| 3292 and v16.16b, v16.16b, v23.16b |
| 3293 neg w12, w12 |
| 3294 and v18.16b, v18.16b, v23.16b |
| 3295 add x3, x4, #0x400 /* x3 = dctbl->ehufsi */ |
| 3296 and v20.16b, v20.16b, v23.16b |
| 3297 add x15, sp, #0x80 /* x15 = t2 */ |
| 3298 and v22.16b, v22.16b, v23.16b |
| 3299 ldr w10, [x4, x12, lsl #2] |
| 3300 addp v16.16b, v16.16b, v18.16b |
| 3301 ldrb w11, [x3, x12] |
| 3302 addp v20.16b, v20.16b, v22.16b |
| 3303 checkbuf47 |
| 3304 addp v16.16b, v16.16b, v20.16b |
| 3305 put_bits x10, x11 |
| 3306 addp v16.16b, v16.16b, v18.16b |
| 3307 checkbuf47 |
| 3308 umov x9, v16.D[0] |
| 3309 put_bits x13, x12 |
| 3310 cnt v17.8b, v16.8b |
| 3311 mvn x9, x9 |
| 3312 addv B18, v17.8b |
| 3313 add x4, x5, #0x400 /* x4 = actbl->ehufsi */ |
| 3314 umov w12, v18.b[0] |
| 3315 lsr x9, x9, #0x1 /* drop DC bit, keep AC coeffs only */ |
| 3316 ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */ |
| 3317 rbit x9, x9 /* x9 = index0 */ |
| 3318 ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ |
| 3319 cmp w12, #(64-8) |
| 3320 mov x11, sp |
| 3321 b.lt 4f |
| 3322 cbz x9, 6f |
| 3323 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 |
| 3324 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 |
| 3325 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 |
| 3326 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 |
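| /* Encode loop: x9 is a bitmap of the nonzero AC coefficients in zigzag |
|  * order, so clz yields each zero-run length directly; every full run of |
|  * 16 zeros emits the ZRL symbol (0xf0), then the run/size symbol from |
|  * actbl is emitted, followed by the significant bits of the coefficient. */ |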
| 3327 1: |
| 3328 clz x2, x9 |
| 3329 add x15, x15, x2, lsl #1 |
| 3330 lsl x9, x9, x2 |
| 3331 ldrh w20, [x15, #-126] |
| 3332 2: |
| 3333 cmp x2, #0x10 |
| 3334 b.lt 3f |
| 3335 sub x2, x2, #0x10 |
| 3336 checkbuf47 |
| 3337 put_bits x13, x14 |
| 3338 b 2b |
| 3339 3: |
| 3340 clz w20, w20 |
| 3341 ldrh w3, [x15, #2]! |
| 3342 sub w11, w20, #32 |
| 3343 lsl w3, w3, w20 |
| 3344 neg w11, w11 |
| 3345 lsr w3, w3, w20 |
| 3346 add x2, x11, x2, lsl #4 |
| 3347 lsl x9, x9, #0x1 |
| 3348 ldr w12, [x5, x2, lsl #2] |
| 3349 ldrb w10, [x4, x2] |
| 3350 checkbuf31 |
| 3351 put_bits x12, x10 |
| 3352 put_bits x3, x11 |
| 3353 cbnz x9, 1b |
| 3354 b 6f |
| 3355 4: |
| 3356 movi v21.8h, #0x0010 |
| 3357 clz v0.8h, v0.8h |
| 3358 clz v1.8h, v1.8h |
| 3359 clz v2.8h, v2.8h |
| 3360 clz v3.8h, v3.8h |
| 3361 clz v4.8h, v4.8h |
| 3362 clz v5.8h, v5.8h |
| 3363 clz v6.8h, v6.8h |
| 3364 clz v7.8h, v7.8h |
| 3365 ushl v24.8h, v24.8h, v0.8h |
| 3366 ushl v25.8h, v25.8h, v1.8h |
| 3367 ushl v26.8h, v26.8h, v2.8h |
| 3368 ushl v27.8h, v27.8h, v3.8h |
| 3369 ushl v28.8h, v28.8h, v4.8h |
| 3370 ushl v29.8h, v29.8h, v5.8h |
| 3371 ushl v30.8h, v30.8h, v6.8h |
| 3372 ushl v31.8h, v31.8h, v7.8h |
| 3373 neg v0.8h, v0.8h |
| 3374 neg v1.8h, v1.8h |
| 3375 neg v2.8h, v2.8h |
| 3376 neg v3.8h, v3.8h |
| 3377 neg v4.8h, v4.8h |
| 3378 neg v5.8h, v5.8h |
| 3379 neg v6.8h, v6.8h |
| 3380 neg v7.8h, v7.8h |
| 3381 ushl v24.8h, v24.8h, v0.8h |
| 3382 ushl v25.8h, v25.8h, v1.8h |
| 3383 ushl v26.8h, v26.8h, v2.8h |
| 3384 ushl v27.8h, v27.8h, v3.8h |
| 3385 ushl v28.8h, v28.8h, v4.8h |
| 3386 ushl v29.8h, v29.8h, v5.8h |
| 3387 ushl v30.8h, v30.8h, v6.8h |
| 3388 ushl v31.8h, v31.8h, v7.8h |
| 3389 add v0.8h, v21.8h, v0.8h |
| 3390 add v1.8h, v21.8h, v1.8h |
| 3391 add v2.8h, v21.8h, v2.8h |
| 3392 add v3.8h, v21.8h, v3.8h |
| 3393 add v4.8h, v21.8h, v4.8h |
| 3394 add v5.8h, v21.8h, v5.8h |
| 3395 add v6.8h, v21.8h, v6.8h |
| 3396 add v7.8h, v21.8h, v7.8h |
| 3397 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 |
| 3398 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 |
| 3399 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 |
| 3400 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 |
| 3401 1: |
| 3402 clz x2, x9 |
| 3403 add x15, x15, x2, lsl #1 |
| 3404 lsl x9, x9, x2 |
| 3405 ldrh w11, [x15, #-126] |
| 3406 2: |
| 3407 cmp x2, #0x10 |
| 3408 b.lt 3f |
| 3409 sub x2, x2, #0x10 |
| 3410 checkbuf47 |
| 3411 put_bits x13, x14 |
| 3412 b 2b |
| 3413 3: |
| 3414 ldrh w3, [x15, #2]! |
| 3415 add x2, x11, x2, lsl #4 |
| 3416 lsl x9, x9, #0x1 |
| 3417 ldr w12, [x5, x2, lsl #2] |
| 3418 ldrb w10, [x4, x2] |
| 3419 checkbuf31 |
| 3420 put_bits x12, x10 |
| 3421 put_bits x3, x11 |
| 3422 cbnz x9, 1b |
| 3423 6: |
| 3424 add x13, sp, #0xfe |
| 3425 cmp x15, x13 |
| 3426 b.hs 1f |
| 3427 ldr w12, [x5] |
| 3428 ldrb w14, [x4] |
| 3429 checkbuf47 |
| 3430 put_bits x12, x14 |
| 3431 1: |
| 3432 sub sp, sp, 16 |
| 3433 str PUT_BUFFER, [x0, #0x10] |
| 3434 str PUT_BITSw, [x0, #0x18] |
| 3435 ldp x19, x20, [sp], 16 |
| 3436 add x0, BUFFER, #0x1 |
| 3437 add sp, sp, 256 |
| 3438 br x30 |
| 3439 |
| 3440 .endm |
| 3441 |
| 3442 generate_jsimd_huff_encode_one_block 1 |
| 3443 generate_jsimd_huff_encode_one_block 0 |
| 3444 |
| 3445 .unreq BUFFER |
| 3446 .unreq PUT_BUFFER |
| 3447 .unreq PUT_BITS |
| 3448 .unreq PUT_BITSw |
| 3449 |
| 3450 .purgem emit_byte |
| 3451 .purgem put_bits |
| 3452 .purgem checkbuf31 |
| 3453 .purgem checkbuf47 |