OLD | NEW |
1 /* | 1 /* |
2 * ARMv8 NEON optimizations for libjpeg-turbo | 2 * ARMv8 NEON optimizations for libjpeg-turbo |
3 * | 3 * |
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
5 * All rights reserved. | 5 * All rights reserved. |
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
7 * Copyright (C) 2013-2014, Linaro Limited | 7 * Copyright (C) 2013-2014, Linaro Limited |
8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> | 8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> |
| 9 * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved. |
| 10 * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved. |
| 11 * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved. |
9 * | 12 * |
10 * This software is provided 'as-is', without any express or implied | 13 * This software is provided 'as-is', without any express or implied |
11 * warranty. In no event will the authors be held liable for any damages | 14 * warranty. In no event will the authors be held liable for any damages |
12 * arising from the use of this software. | 15 * arising from the use of this software. |
13 * | 16 * |
14 * Permission is granted to anyone to use this software for any purpose, | 17 * Permission is granted to anyone to use this software for any purpose, |
15 * including commercial applications, and to alter it and redistribute it | 18 * including commercial applications, and to alter it and redistribute it |
16 * freely, subject to the following restrictions: | 19 * freely, subject to the following restrictions: |
17 * | 20 * |
18 * 1. The origin of this software must not be misrepresented; you must not | 21 * 1. The origin of this software must not be misrepresented; you must not |
19 * claim that you wrote the original software. If you use this software | 22 * claim that you wrote the original software. If you use this software |
20 * in a product, an acknowledgment in the product documentation would be | 23 * in a product, an acknowledgment in the product documentation would be |
21 * appreciated but is not required. | 24 * appreciated but is not required. |
22 * 2. Altered source versions must be plainly marked as such, and must not be | 25 * 2. Altered source versions must be plainly marked as such, and must not be |
23 * misrepresented as being the original software. | 26 * misrepresented as being the original software. |
24 * 3. This notice may not be removed or altered from any source distribution. | 27 * 3. This notice may not be removed or altered from any source distribution. |
25 */ | 28 */ |
26 | 29 |
27 #if defined(__linux__) && defined(__ELF__) | 30 #if defined(__linux__) && defined(__ELF__) |
28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ | 31 .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ |
29 #endif | 32 #endif |
30 | 33 |
31 .text | 34 .text |
32 .arch armv8-a+fp+simd | |
33 | 35 |
34 | 36 |
35 #define RESPECT_STRICT_ALIGNMENT 1 | 37 #define RESPECT_STRICT_ALIGNMENT 1 |
36 | 38 |
37 | 39 |
38 /*****************************************************************************/ | 40 /*****************************************************************************/ |
39 | 41 |
40 /* Supplementary macro for setting function attributes */ | 42 /* Supplementary macro for setting function attributes */ |
41 .macro asm_function fname | 43 .macro asm_function fname |
42 #ifdef __APPLE__ | 44 #ifdef __APPLE__ |
43 .globl _\fname | 45 .globl _\fname |
44 _\fname: | 46 _\fname: |
45 #else | 47 #else |
46 .global \fname | 48 .global \fname |
47 #ifdef __ELF__ | 49 #ifdef __ELF__ |
48 .hidden \fname | 50 .hidden \fname |
49 .type \fname, %function | 51 .type \fname, %function |
50 #endif | 52 #endif |
51 \fname: | 53 \fname: |
52 #endif | 54 #endif |
53 .endm | 55 .endm |
54 | 56 |
55 /* Transpose elements of a single 128-bit register */ | 57 /* Transpose elements of a single 128-bit register */ |
56 .macro transpose_single x0,x1,xi,xilen,literal | 58 .macro transpose_single x0, x1, xi, xilen, literal |
57 ins \xi\xilen[0], \x0\xilen[0] | 59 ins \xi\xilen[0], \x0\xilen[0] |
58 ins \x1\xilen[0], \x0\xilen[1] | 60 ins \x1\xilen[0], \x0\xilen[1] |
59 trn1 \x0\literal, \x0\literal, \x1\literal | 61 trn1 \x0\literal, \x0\literal, \x1\literal |
60 trn2 \x1\literal, \xi\literal, \x1\literal | 62 trn2 \x1\literal, \xi\literal, \x1\literal |
61 .endm | 63 .endm |
62 | 64 |
63 /* Transpose elements of 2 different registers */ | 65 /* Transpose elements of 2 different registers */ |
64 .macro transpose x0,x1,xi,xilen,literal | 66 .macro transpose x0, x1, xi, xilen, literal |
65 mov \xi\xilen, \x0\xilen | 67 mov \xi\xilen, \x0\xilen |
66 trn1 \x0\literal, \x0\literal, \x1\literal | 68 trn1 \x0\literal, \x0\literal, \x1\literal |
67 trn2 \x1\literal, \xi\literal, \x1\literal | 69 trn2 \x1\literal, \xi\literal, \x1\literal |
68 .endm | 70 .endm |
69 | 71 |
70 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ | 72 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ |
71 .macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen | 73 .macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen |
72 mov \xi\xilen, \x0\xilen | 74 mov \xi\xilen, \x0\xilen |
73 trn1 \x0\x0len, \x0\x0len, \x2\x2len | 75 trn1 \x0\x0len, \x0\x0len, \x2\x2len |
74 trn2 \x2\x2len, \xi\x0len, \x2\x2len | 76 trn2 \x2\x2len, \xi\x0len, \x2\x2len |
75 mov \xi\xilen, \x1\xilen | 77 mov \xi\xilen, \x1\xilen |
76 trn1 \x1\x1len, \x1\x1len, \x3\x3len | 78 trn1 \x1\x1len, \x1\x1len, \x3\x3len |
77 trn2 \x3\x3len, \xi\x1len, \x3\x3len | 79 trn2 \x3\x3len, \xi\x1len, \x3\x3len |
78 .endm | 80 .endm |
79 | 81 |
80 .macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen | 82 .macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen |
81 mov \xi\xilen, \x0\xilen | 83 mov \xi\xilen, \x0\xilen |
82 trn1 \x0\x0len, \x0\x0len, \x1\x1len | 84 trn1 \x0\x0len, \x0\x0len, \x1\x1len |
83 trn2 \x1\x2len, \xi\x0len, \x1\x2len | 85 trn2 \x1\x2len, \xi\x0len, \x1\x2len |
84 mov \xi\xilen, \x2\xilen | 86 mov \xi\xilen, \x2\xilen |
85 trn1 \x2\x2len, \x2\x2len, \x3\x3len | 87 trn1 \x2\x2len, \x2\x2len, \x3\x3len |
86 trn2 \x3\x2len, \xi\x1len, \x3\x3len | 88 trn2 \x3\x2len, \xi\x1len, \x3\x3len |
87 .endm | 89 .endm |
88 | 90 |
89 .macro transpose_4x4 x0, x1, x2, x3,x5 | 91 .macro transpose_4x4 x0, x1, x2, x3, x5 |
90 transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b | 92 transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b |
91 transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b | 93 transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b |
| 94 .endm |
| 95 |
| 96 .macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3 |
| 97 trn1 \t0\().8h, \l0\().8h, \l1\().8h |
| 98 trn1 \t1\().8h, \l2\().8h, \l3\().8h |
| 99 trn1 \t2\().8h, \l4\().8h, \l5\().8h |
| 100 trn1 \t3\().8h, \l6\().8h, \l7\().8h |
| 101 trn2 \l1\().8h, \l0\().8h, \l1\().8h |
| 102 trn2 \l3\().8h, \l2\().8h, \l3\().8h |
| 103 trn2 \l5\().8h, \l4\().8h, \l5\().8h |
| 104 trn2 \l7\().8h, \l6\().8h, \l7\().8h |
| 105 |
| 106 trn1 \l4\().4s, \t2\().4s, \t3\().4s |
| 107 trn2 \t3\().4s, \t2\().4s, \t3\().4s |
| 108 trn1 \t2\().4s, \t0\().4s, \t1\().4s |
| 109 trn2 \l2\().4s, \t0\().4s, \t1\().4s |
| 110 trn1 \t0\().4s, \l1\().4s, \l3\().4s |
| 111 trn2 \l3\().4s, \l1\().4s, \l3\().4s |
| 112 trn2 \t1\().4s, \l5\().4s, \l7\().4s |
| 113 trn1 \l5\().4s, \l5\().4s, \l7\().4s |
| 114 |
| 115 trn2 \l6\().2d, \l2\().2d, \t3\().2d |
| 116 trn1 \l0\().2d, \t2\().2d, \l4\().2d |
| 117 trn1 \l1\().2d, \t0\().2d, \l5\().2d |
| 118 trn2 \l7\().2d, \l3\().2d, \t1\().2d |
| 119 trn1 \l2\().2d, \l2\().2d, \t3\().2d |
| 120 trn2 \l4\().2d, \t2\().2d, \l4\().2d |
| 121 trn1 \l3\().2d, \l3\().2d, \t1\().2d |
| 122 trn2 \l5\().2d, \t0\().2d, \l5\().2d |
92 .endm | 123 .endm |
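The trn1/trn2 ladders above are easier to follow with a scalar model in mind: trn1 interleaves the even-numbered lanes of its two inputs, trn2 the odd-numbered ones, and applying the pair at 16-, then 32-, then 64-bit granularity (as transpose_8x8 does) composes into a full 8x8 transpose. A minimal C sketch of one 16-bit-lane step (illustrative only; trn_8h is a made-up helper, not part of either revision):

#include <stdint.h>

/* NEON "trn1 t1.8h, a.8h, b.8h" / "trn2 t2.8h, a.8h, b.8h" on 16-bit lanes:
   t1 takes the even lanes of a and b, t2 the odd lanes. */
static void trn_8h(const int16_t a[8], const int16_t b[8],
                   int16_t t1[8], int16_t t2[8])
{
  for (int i = 0; i < 8; i += 2) {
    t1[i] = a[i];      t1[i + 1] = b[i];      /* trn1: even lanes */
    t2[i] = a[i + 1];  t2[i + 1] = b[i + 1];  /* trn2: odd lanes */
  }
}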
93 | 124 |
94 | 125 |
95 #define CENTERJSAMPLE 128 | 126 #define CENTERJSAMPLE 128 |
96 | 127 |
97 /*****************************************************************************/ | 128 /*****************************************************************************/ |
98 | 129 |
99 /* | 130 /* |
100 * Perform dequantization and inverse DCT on one block of coefficients. | 131 * Perform dequantization and inverse DCT on one block of coefficients. |
101 * | 132 * |
102 * GLOBAL(void) | 133 * GLOBAL(void) |
103 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, | 134 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block, |
104 * JSAMPARRAY output_buf, JDIMENSION output_col) | 135 * JSAMPARRAY output_buf, JDIMENSION output_col) |
105 */ | 136 */ |
106 | 137 |
107 #define FIX_0_298631336 (2446) | 138 #define CONST_BITS 13 |
108 #define FIX_0_390180644 (3196) | 139 #define PASS1_BITS 2 |
109 #define FIX_0_541196100 (4433) | 140 |
110 #define FIX_0_765366865 (6270) | 141 #define F_0_298 2446 /* FIX(0.298631336) */ |
111 #define FIX_0_899976223 (7373) | 142 #define F_0_390 3196 /* FIX(0.390180644) */ |
112 #define FIX_1_175875602 (9633) | 143 #define F_0_541 4433 /* FIX(0.541196100) */ |
113 #define FIX_1_501321110 (12299) | 144 #define F_0_765 6270 /* FIX(0.765366865) */ |
114 #define FIX_1_847759065 (15137) | 145 #define F_0_899 7373 /* FIX(0.899976223) */ |
115 #define FIX_1_961570560 (16069) | 146 #define F_1_175 9633 /* FIX(1.175875602) */ |
116 #define FIX_2_053119869 (16819) | 147 #define F_1_501 12299 /* FIX(1.501321110) */ |
117 #define FIX_2_562915447 (20995) | 148 #define F_1_847 15137 /* FIX(1.847759065) */ |
118 #define FIX_3_072711026 (25172) | 149 #define F_1_961 16069 /* FIX(1.961570560) */ |
119 | 150 #define F_2_053 16819 /* FIX(2.053119869) */ |
120 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) | 151 #define F_2_562 20995 /* FIX(2.562915447) */ |
121 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) | 152 #define F_3_072 25172 /* FIX(3.072711026) */ |
122 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) | |
123 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) | |
124 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) | |
125 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) | |
126 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) | |
127 #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) | |
128 | |
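Both constant tables above encode the same values: libjpeg's 13-bit fixed-point representation, i.e. FIX(x) scaled by 2^CONST_BITS. For orientation, this is essentially the jidctint.c definition, with the first new constant checked by hand:

#define CONST_BITS 13
#define FIX(x) ((INT32)((x) * (1L << CONST_BITS) + 0.5))
/* FIX(0.298631336) = (INT32)(0.298631336 * 8192 + 0.5) = 2446, i.e. F_0_298 */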
129 /* | |
130 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. | |
131 * Uses some ideas from the comments in 'simd/jiss2int-64.asm' | |
132 */ | |
133 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ | |
134 { \ | |
135 DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ | |
136 INT32 q1, q2, q3, q4, q5, q6, q7; \ | |
137 INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ | |
138 \ | |
139 /* 1-D iDCT input data */ \ | |
140 row0 = xrow0; \ | |
141 row1 = xrow1; \ | |
142 row2 = xrow2; \ | |
143 row3 = xrow3; \ | |
144 row4 = xrow4; \ | |
145 row5 = xrow5; \ | |
146 row6 = xrow6; \ | |
147 row7 = xrow7; \ | |
148 \ | |
149 q5 = row7 + row3; \ | |
150 q4 = row5 + row1; \ | |
151 q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ | |
152 MULTIPLY(q4, FIX_1_175875602); \ | |
153 q7 = MULTIPLY(q5, FIX_1_175875602) + \ | |
154 MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ | |
155 q2 = MULTIPLY(row2, FIX_0_541196100) + \ | |
156 MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ | |
157 q4 = q6; \ | |
158 q3 = ((INT32) row0 - (INT32) row4) << 13; \ | |
159 q6 += MULTIPLY(row5, -FIX_2_562915447) + \ | |
160 MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ | |
161 /* now we can use q1 (reloadable constants have been used up) */ \ | |
162 q1 = q3 + q2; \ | |
163 q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ | |
164 MULTIPLY(row1, -FIX_0_899976223); \ | |
165 q5 = q7; \ | |
166 q1 = q1 + q6; \ | |
167 q7 += MULTIPLY(row7, -FIX_0_899976223) + \ | |
168 MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ | |
169 \ | |
170 /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ | |
171 tmp11_plus_tmp2 = q1; \ | |
172 row1 = 0; \ | |
173 \ | |
174 q1 = q1 - q6; \ | |
175 q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ | |
176 MULTIPLY(row3, -FIX_2_562915447); \ | |
177 q1 = q1 - q6; \ | |
178 q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ | |
179 MULTIPLY(row6, FIX_0_541196100); \ | |
180 q3 = q3 - q2; \ | |
181 \ | |
182 /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ | |
183 tmp11_minus_tmp2 = q1; \ | |
184 \ | |
185 q1 = ((INT32) row0 + (INT32) row4) << 13; \ | |
186 q2 = q1 + q6; \ | |
187 q1 = q1 - q6; \ | |
188 \ | |
189 /* pick up the results */ \ | |
190 tmp0 = q4; \ | |
191 tmp1 = q5; \ | |
192 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ | |
193 tmp3 = q7; \ | |
194 tmp10 = q2; \ | |
195 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ | |
196 tmp12 = q3; \ | |
197 tmp13 = q1; \ | |
198 } | |
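Each 1-D pass of this reference code leaves its outputs scaled up by 2^CONST_BITS; the assembly removes that scaling with rounding right shifts. For orientation, this is essentially libjpeg's DESCALE definition, which the rshrn #11 instructions in pass 1 implement for n = CONST_BITS - PASS1_BITS = 13 - 2 = 11:

#define DESCALE(x, n) (((x) + (1L << ((n) - 1))) >> (n))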
199 | |
200 #define XFIX_0_899976223 v0.4h[0] | |
201 #define XFIX_0_541196100 v0.4h[1] | |
202 #define XFIX_2_562915447 v0.4h[2] | |
203 #define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3] | |
204 #define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0] | |
205 #define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1] | |
206 #define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2] | |
207 #define XFIX_1_175875602 v1.4h[3] | |
208 #define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0] | |
209 #define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1] | |
210 #define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2] | |
211 #define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3] | |
212 | 153 |
213 .balign 16 | 154 .balign 16 |
214 jsimd_idct_islow_neon_consts: | 155 Ljsimd_idct_islow_neon_consts: |
215 .short FIX_0_899976223 /* d0[0] */ | 156 .short F_0_298 |
216 .short FIX_0_541196100 /* d0[1] */ | 157 .short -F_0_390 |
217 .short FIX_2_562915447 /* d0[2] */ | 158 .short F_0_541 |
218 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ | 159 .short F_0_765 |
219 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ | 160 .short -F_0_899 |
220 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ | 161 .short F_1_175 |
221 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ | 162 .short F_1_501 |
222 .short FIX_1_175875602 /* d1[3] */ | 163 .short -F_1_847 |
223 /* reloadable constants */ | 164 .short -F_1_961 |
224 .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ | 165 .short F_2_053 |
225 .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ | 166 .short -F_2_562 |
226 .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ | 167 .short F_3_072 |
227 .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ | 168 .short 0 /* padding */ |
| 169 .short 0 |
| 170 .short 0 |
| 171 .short 0 |
| 172 |
| 173 #undef F_0_298 |
| 174 #undef F_0_390 |
| 175 #undef F_0_541 |
| 176 #undef F_0_765 |
| 177 #undef F_0_899 |
| 178 #undef F_1_175 |
| 179 #undef F_1_501 |
| 180 #undef F_1_847 |
| 181 #undef F_1_961 |
| 182 #undef F_2_053 |
| 183 #undef F_2_562 |
| 184 #undef F_3_072 |
| 185 |
| 186 #define XFIX_P_0_298 v0.h[0] |
| 187 #define XFIX_N_0_390 v0.h[1] |
| 188 #define XFIX_P_0_541 v0.h[2] |
| 189 #define XFIX_P_0_765 v0.h[3] |
| 190 #define XFIX_N_0_899 v0.h[4] |
| 191 #define XFIX_P_1_175 v0.h[5] |
| 192 #define XFIX_P_1_501 v0.h[6] |
| 193 #define XFIX_N_1_847 v0.h[7] |
| 194 #define XFIX_N_1_961 v1.h[0] |
| 195 #define XFIX_P_2_053 v1.h[1] |
| 196 #define XFIX_N_2_562 v1.h[2] |
| 197 #define XFIX_P_3_072 v1.h[3] |
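These lane references let smull/smlal/smlsl use one constant per multiply without loading it into a separate register. A plain-C model of one such widening multiply (illustrative; smull_4h_model is a made-up name, not from either revision):

#include <stdint.h>

/* Model of "smull vD.4s, vN.4h, XFIX_*": widen each signed 16-bit lane and
   multiply by the 13-bit fixed-point constant, yielding 32-bit products --
   the MULTIPLY(coef, FIX(...)) step of jidctint.c. */
static void smull_4h_model(int32_t d[4], const int16_t n[4], int16_t xfix)
{
  for (int i = 0; i < 4; i++)
    d[i] = (int32_t)n[i] * (int32_t)xfix;
}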
228 | 198 |
229 asm_function jsimd_idct_islow_neon | 199 asm_function jsimd_idct_islow_neon |
230 | |
231 DCT_TABLE .req x0 | 200 DCT_TABLE .req x0 |
232 COEF_BLOCK .req x1 | 201 COEF_BLOCK .req x1 |
233 OUTPUT_BUF .req x2 | 202 OUTPUT_BUF .req x2 |
234 OUTPUT_COL .req x3 | 203 OUTPUT_COL .req x3 |
235 TMP1 .req x0 | 204 TMP1 .req x0 |
236 TMP2 .req x1 | 205 TMP2 .req x1 |
237 TMP3 .req x2 | 206 TMP3 .req x9 |
238 TMP4 .req x15 | 207 TMP4 .req x10 |
239 | 208 TMP5 .req x11 |
240 ROW0L .req v16 | 209 TMP6 .req x12 |
241 ROW0R .req v17 | 210 TMP7 .req x13 |
242 ROW1L .req v18 | 211 TMP8 .req x14 |
243 ROW1R .req v19 | 212 |
244 ROW2L .req v20 | 213 sub sp, sp, #64 |
245 ROW2R .req v21 | 214 adr x15, Ljsimd_idct_islow_neon_consts |
246 ROW3L .req v22 | 215 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 |
247 ROW3R .req v23 | 216 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 |
248 ROW4L .req v24 | 217 ld1 {v0.8h, v1.8h}, [x15] |
249 ROW4R .req v25 | 218 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64 |
250 ROW5L .req v26 | 219 ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64 |
251 ROW5R .req v27 | 220 ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64 |
252 ROW6L .req v28 | 221 ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64 |
253 ROW6R .req v29 | 222 |
254 ROW7L .req v30 | 223 cmeq v16.8h, v3.8h, #0 |
255 ROW7R .req v31 | 224 cmeq v26.8h, v4.8h, #0 |
256 /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */ | 225 cmeq v27.8h, v5.8h, #0 |
257 sub sp, sp, 272 | 226 cmeq v28.8h, v6.8h, #0 |
258 str x15, [sp], 16 | 227 cmeq v29.8h, v7.8h, #0 |
259 adr x15, jsimd_idct_islow_neon_consts | 228 cmeq v30.8h, v8.8h, #0 |
260 st1 {v0.8b - v3.8b}, [sp], 32 | 229 cmeq v31.8h, v9.8h, #0 |
261 st1 {v4.8b - v7.8b}, [sp], 32 | 230 |
262 st1 {v8.8b - v11.8b}, [sp], 32 | 231 and v10.16b, v16.16b, v26.16b |
263 st1 {v12.8b - v15.8b}, [sp], 32 | 232 and v11.16b, v27.16b, v28.16b |
264 st1 {v16.8b - v19.8b}, [sp], 32 | 233 and v12.16b, v29.16b, v30.16b |
265 st1 {v20.8b - v23.8b}, [sp], 32 | 234 and v13.16b, v31.16b, v10.16b |
266 st1 {v24.8b - v27.8b}, [sp], 32 | 235 and v14.16b, v11.16b, v12.16b |
267 st1 {v28.8b - v31.8b}, [sp], 32 | 236 mul v2.8h, v2.8h, v18.8h |
268 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 | 237 and v15.16b, v13.16b, v14.16b |
269 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 | 238 shl v10.8h, v2.8h, #(PASS1_BITS) |
270 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 | 239 sqxtn v16.8b, v15.8h |
271 mul v16.4h, v16.4h, v0.4h | 240 mov TMP1, v16.d[0] |
272 mul v17.4h, v17.4h, v1.4h | 241 sub sp, sp, #64 |
273 ins v16.2d[1], v17.2d[0] /* 128 bit q8 */ | 242 mvn TMP2, TMP1 |
274 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 | 243 |
275 mul v18.4h, v18.4h, v2.4h | 244 cbnz TMP2, 2f |
276 mul v19.4h, v19.4h, v3.4h | 245 /* case all AC coeffs are zeros */ |
277 ins v18.2d[1], v19.2d[0] /* 128 bit q9 */ | 246 dup v2.2d, v10.d[0] |
278 ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 | 247 dup v6.2d, v10.d[1] |
279 mul v20.4h, v20.4h, v4.4h | 248 mov v3.16b, v2.16b |
280 mul v21.4h, v21.4h, v5.4h | 249 mov v7.16b, v6.16b |
281 ins v20.2d[1], v21.2d[0] /* 128 bit q10 */ | 250 mov v4.16b, v2.16b |
282 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 | 251 mov v8.16b, v6.16b |
283 mul v22.4h, v22.4h, v6.4h | 252 mov v5.16b, v2.16b |
284 mul v23.4h, v23.4h, v7.4h | 253 mov v9.16b, v6.16b |
285 ins v22.2d[1], v23.2d[0] /* 128 bit q11 */ | 254 1: |
286 ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] | 255 /* for this transpose, we should organise data like this: |
287 mul v24.4h, v24.4h, v0.4h | 256 * 00, 01, 02, 03, 40, 41, 42, 43 |
288 mul v25.4h, v25.4h, v1.4h | 257 * 10, 11, 12, 13, 50, 51, 52, 53 |
289 ins v24.2d[1], v25.2d[0] /* 128 bit q12 */ | 258 * 20, 21, 22, 23, 60, 61, 62, 63 |
290 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 | 259 * 30, 31, 32, 33, 70, 71, 72, 73 |
291 mul v28.4h, v28.4h, v4.4h | 260 * 04, 05, 06, 07, 44, 45, 46, 47 |
292 mul v29.4h, v29.4h, v5.4h | 261 * 14, 15, 16, 17, 54, 55, 56, 57 |
293 ins v28.2d[1], v29.2d[0] /* 128 bit q14 */ | 262 * 24, 25, 26, 27, 64, 65, 66, 67 |
294 mul v26.4h, v26.4h, v2.4h | 263 * 34, 35, 36, 37, 74, 75, 76, 77 |
295 mul v27.4h, v27.4h, v3.4h | 264 */ |
296 ins v26.2d[1], v27.2d[0] /* 128 bit q13 */ | 265 trn1 v28.8h, v2.8h, v3.8h |
297 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */ | 266 trn1 v29.8h, v4.8h, v5.8h |
298 add x15, x15, #16 | 267 trn1 v30.8h, v6.8h, v7.8h |
299 mul v30.4h, v30.4h, v6.4h | 268 trn1 v31.8h, v8.8h, v9.8h |
300 mul v31.4h, v31.4h, v7.4h | 269 trn2 v16.8h, v2.8h, v3.8h |
301 ins v30.2d[1], v31.2d[0] /* 128 bit q15 */ | 270 trn2 v17.8h, v4.8h, v5.8h |
302 /* Go to the bottom of the stack */ | 271 trn2 v18.8h, v6.8h, v7.8h |
303 sub sp, sp, 352 | 272 trn2 v19.8h, v8.8h, v9.8h |
304 stp x4, x5, [sp], 16 | 273 trn1 v2.4s, v28.4s, v29.4s |
305 st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ | 274 trn1 v6.4s, v30.4s, v31.4s |
306 st1 {v12.4h - v15.4h}, [sp], 32 | 275 trn1 v3.4s, v16.4s, v17.4s |
307 /* 1-D IDCT, pass 1, left 4x8 half */ | 276 trn1 v7.4s, v18.4s, v19.4s |
308 add v4.4h, ROW7L.4h, ROW3L.4h | 277 trn2 v4.4s, v28.4s, v29.4s |
309 add v5.4h, ROW5L.4h, ROW1L.4h | 278 trn2 v8.4s, v30.4s, v31.4s |
310 smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 | 279 trn2 v5.4s, v16.4s, v17.4s |
311 smlal v12.4s, v5.4h, XFIX_1_175875602 | 280 trn2 v9.4s, v18.4s, v19.4s |
312 smull v14.4s, v4.4h, XFIX_1_175875602 | 281 /* Even part: reverse the even part of the forward DCT. */ |
313 /* Check for the zero coefficients in the right 4x8 half */ | 282 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ |
314 smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644 | 283 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
315 ssubl v6.4s, ROW0L.4h, ROW4L.4h | 284 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ |
316 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] | 285 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
317 smull v4.4s, ROW2L.4h, XFIX_0_541196100 | 286 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ |
318 smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065 | 287 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ |
319 orr x0, x4, x5 | 288 mov v21.16b, v19.16b /* tmp3 = z1 */ |
320 mov v8.16b, v12.16b | 289 mov v20.16b, v18.16b /* tmp3 = z1 */ |
321 smlsl v12.4s, ROW5L.4h, XFIX_2_562915447 | 290 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ |
322 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] | 291 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ |
323 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 | 292 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ |
324 shl v6.4s, v6.4s, #13 | 293 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ |
325 orr x0, x0, x4 | 294 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ |
326 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 | 295 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ |
327 orr x0, x0 , x5 | 296 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ |
328 add v2.4s, v6.4s, v4.4s | 297 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ |
329 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] | 298 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ |
330 mov v10.16b, v14.16b | 299 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ |
331 add v2.4s, v2.4s, v12.4s | 300 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ |
332 orr x0, x0, x4 | 301 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ |
333 smlsl v14.4s, ROW7L.4h, XFIX_0_899976223 | 302 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ |
334 orr x0, x0, x5 | 303 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ |
335 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 | 304 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ |
336 rshrn ROW1L.4h, v2.4s, #11 | 305 |
337 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] | 306 /* Odd part per figure 8; the matrix is unitary and hence its |
338 sub v2.4s, v2.4s, v12.4s | 307 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. |
339 smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447 | 308 */ |
340 orr x0, x0, x4 | 309 |
341 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 | 310 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
342 orr x0, x0, x5 | 311 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
343 sub v2.4s, v2.4s, v12.4s | 312 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
344 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 | 313 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
345 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] | 314 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ |
346 smlal v12.4s, ROW6L.4h, XFIX_0_541196100 | 315 |
347 sub v6.4s, v6.4s, v4.4s | 316 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ |
348 orr x0, x0, x4 | 317 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ |
349 rshrn ROW6L.4h, v2.4s, #11 | 318 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ |
350 orr x0, x0, x5 | 319 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ |
351 add v2.4s, v6.4s, v10.4s | 320 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ |
352 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] | 321 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ |
353 sub v6.4s, v6.4s, v10.4s | 322 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ |
354 saddl v10.4s, ROW0L.4h, ROW4L.4h | 323 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ |
355 orr x0, x0, x4 | 324 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ |
356 rshrn ROW2L.4h, v2.4s, #11 | 325 |
357 orr x0, x0, x5 | 326 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ |
358 rshrn ROW5L.4h, v6.4s, #11 | 327 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ |
359 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] | 328 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ |
360 shl v10.4s, v10.4s, #13 | 329 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ |
361 smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223 | 330 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ |
362 orr x0, x0, x4 | 331 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ |
363 add v4.4s, v10.4s, v12.4s | 332 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ |
364 orr x0, x0, x5 | 333 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ |
365 cmp x0, #0 /* orrs instruction removed */ | 334 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ |
366 sub v2.4s, v10.4s, v12.4s | 335 |
367 add v12.4s, v4.4s, v14.4s | 336 add v23.4s, v23.4s, v27.4s /* z3 += z5 */ |
368 ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] | 337 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ |
369 sub v4.4s, v4.4s, v14.4s | 338 add v25.4s, v25.4s, v27.4s /* z4 += z5 */ |
370 add v10.4s, v2.4s, v8.4s | 339 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ |
371 orr x0, x4, x5 | 340 |
372 sub v6.4s, v2.4s, v8.4s | 341 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ |
373 /* pop {x4, x5} */ | 342 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ |
374 sub sp, sp, 80 | 343 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ |
375 ldp x4, x5, [sp], 16 | 344 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ |
376 rshrn ROW7L.4h, v4.4s, #11 | 345 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ |
377 rshrn ROW3L.4h, v10.4s, #11 | 346 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ |
378 rshrn ROW0L.4h, v12.4s, #11 | 347 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ |
379 rshrn ROW4L.4h, v6.4s, #11 | 348 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ |
380 | 349 |
381 beq 3f /* Go to do some special handling for the sparse right 4x8 half */ | 350 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ |
382 | 351 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ |
383 /* 1-D IDCT, pass 1, right 4x8 half */ | 352 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ |
384 ld1 {v2.4h}, [x15] /* reload constants */ | 353 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ |
385 add v10.4h, ROW7R.4h, ROW3R.4h | 354 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ |
386 add v8.4h, ROW5R.4h, ROW1R.4h | 355 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ |
387 /* Transpose ROW6L <-> ROW7L (v3 available free register) */ | 356 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ |
388 transpose ROW6L, ROW7L, v3, .16b, .4h | 357 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ |
389 smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560 | 358 |
390 smlal v12.4s, v8.4h, XFIX_1_175875602 | 359 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
391 /* Transpose ROW2L <-> ROW3L (v3 available free register) */ | 360 |
392 transpose ROW2L, ROW3L, v3, .16b, .4h | 361 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ |
393 smull v14.4s, v10.4h, XFIX_1_175875602 | 362 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ |
394 smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644 | 363 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ |
395 /* Transpose ROW0L <-> ROW1L (v3 available free register) */ | 364 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ |
396 transpose ROW0L, ROW1L, v3, .16b, .4h | 365 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ |
397 ssubl v6.4s, ROW0R.4h, ROW4R.4h | 366 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ |
398 smull v4.4s, ROW2R.4h, XFIX_0_541196100 | 367 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ |
399 smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 | 368 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ |
400 /* Transpose ROW4L <-> ROW5L (v3 available free register) */ | 369 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ |
401 transpose ROW4L, ROW5L, v3, .16b, .4h | 370 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ |
402 mov v8.16b, v12.16b | 371 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ |
403 smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 | 372 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ |
404 smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447 | 373 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ |
405 /* Transpose ROW1L <-> ROW3L (v3 available free register) */ | 374 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ |
406 transpose ROW1L, ROW3L, v3, .16b, .2s | 375 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ |
407 shl v6.4s, v6.4s, #13 | 376 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ |
408 smlsl v8.4s, ROW1R.4h, XFIX_0_899976223 | 377 |
409 /* Transpose ROW4L <-> ROW6L (v3 available free register) */ | 378 shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ |
410 transpose ROW4L, ROW6L, v3, .16b, .2s | 379 shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ |
411 add v2.4s, v6.4s, v4.4s | 380 shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ |
412 mov v10.16b, v14.16b | 381 shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ |
413 add v2.4s, v2.4s, v12.4s | 382 shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ |
414 /* Transpose ROW0L <-> ROW2L (v3 available free register) */ | 383 shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ |
415 transpose ROW0L, ROW2L, v3, .16b, .2s | 384 shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ |
416 smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 | 385 shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ |
417 smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223 | 386 shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ |
418 rshrn ROW1R.4h, v2.4s, #11 | 387 shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ |
419 /* Transpose ROW5L <-> ROW7L (v3 available free register) */ | 388 shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ |
420 transpose ROW5L, ROW7L, v3, .16b, .2s | 389 shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ |
421 sub v2.4s, v2.4s, v12.4s | 390 shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ |
422 smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 | 391 shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ |
423 smlsl v10.4s, ROW3R.4h, XFIX_2_562915447 | 392 shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ |
424 sub v2.4s, v2.4s, v12.4s | 393 shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ |
425 smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865 | 394 movi v0.16b, #(CENTERJSAMPLE) |
426 smlal v12.4s, ROW6R.4h, XFIX_0_541196100 | 395 /* Prepare pointers (dual-issue with NEON instructions) */ |
427 sub v6.4s, v6.4s, v4.4s | 396 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
428 rshrn ROW6R.4h, v2.4s, #11 | 397 sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16) |
429 add v2.4s, v6.4s, v10.4s | 398 ldp TMP3, TMP4, [OUTPUT_BUF], 16 |
430 sub v6.4s, v6.4s, v10.4s | 399 sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16) |
431 saddl v10.4s, ROW0R.4h, ROW4R.4h | 400 add TMP1, TMP1, OUTPUT_COL |
432 rshrn ROW2R.4h, v2.4s, #11 | 401 sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16) |
433 rshrn ROW5R.4h, v6.4s, #11 | 402 add TMP2, TMP2, OUTPUT_COL |
434 shl v10.4s, v10.4s, #13 | 403 sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16) |
435 smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 | 404 add TMP3, TMP3, OUTPUT_COL |
436 add v4.4s, v10.4s, v12.4s | 405 sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16) |
437 sub v2.4s, v10.4s, v12.4s | 406 add TMP4, TMP4, OUTPUT_COL |
438 add v12.4s, v4.4s, v14.4s | 407 sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16) |
439 sub v4.4s, v4.4s, v14.4s | 408 ldp TMP5, TMP6, [OUTPUT_BUF], 16 |
440 add v10.4s, v2.4s, v8.4s | 409 sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16) |
441 sub v6.4s, v2.4s, v8.4s | 410 ldp TMP7, TMP8, [OUTPUT_BUF], 16 |
442 rshrn ROW7R.4h, v4.4s, #11 | 411 sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16) |
443 rshrn ROW3R.4h, v10.4s, #11 | 412 add TMP5, TMP5, OUTPUT_COL |
444 rshrn ROW0R.4h, v12.4s, #11 | 413 add v16.16b, v28.16b, v0.16b |
445 rshrn ROW4R.4h, v6.4s, #11 | 414 add TMP6, TMP6, OUTPUT_COL |
446 /* Transpose right 4x8 half */ | 415 add v18.16b, v29.16b, v0.16b |
447 transpose ROW6R, ROW7R, v3, .16b, .4h | 416 add TMP7, TMP7, OUTPUT_COL |
448 transpose ROW2R, ROW3R, v3, .16b, .4h | 417 add v20.16b, v30.16b, v0.16b |
449 transpose ROW0R, ROW1R, v3, .16b, .4h | 418 add TMP8, TMP8, OUTPUT_COL |
450 transpose ROW4R, ROW5R, v3, .16b, .4h | 419 add v22.16b, v31.16b, v0.16b |
451 transpose ROW1R, ROW3R, v3, .16b, .2s | 420 |
452 transpose ROW4R, ROW6R, v3, .16b, .2s | 421 /* Transpose the final 8-bit samples */ |
453 transpose ROW0R, ROW2R, v3, .16b, .2s | 422 trn1 v28.16b, v16.16b, v18.16b |
454 transpose ROW5R, ROW7R, v3, .16b, .2s | 423 trn1 v30.16b, v20.16b, v22.16b |
455 | 424 trn2 v29.16b, v16.16b, v18.16b |
456 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ | 425 trn2 v31.16b, v20.16b, v22.16b |
457 ld1 {v2.4h}, [x15] /* reload constants */ | 426 |
458 smull v12.4S, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ | 427 trn1 v16.8h, v28.8h, v30.8h |
459 smlal v12.4s, ROW1L.4h, XFIX_1_175875602 | 428 trn2 v18.8h, v28.8h, v30.8h |
460 smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ | 429 trn1 v20.8h, v29.8h, v31.8h |
461 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 | 430 trn2 v22.8h, v29.8h, v31.8h |
462 smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ | 431 |
463 smlal v14.4s, ROW3L.4h, XFIX_1_175875602 | 432 uzp1 v28.4s, v16.4s, v18.4s |
464 smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */ | 433 uzp2 v30.4s, v16.4s, v18.4s |
465 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 | 434 uzp1 v29.4s, v20.4s, v22.4s |
466 ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ | 435 uzp2 v31.4s, v20.4s, v22.4s |
467 smull v4.4s, ROW2L.4h, XFIX_0_541196100 | 436 |
468 smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */ |
469 mov v8.16b, v12.16b | |
470 smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */ |
471 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 | |
472 shl v6.4s, v6.4s, #13 | |
473 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 | |
474 add v2.4s, v6.4s, v4.4s | |
475 mov v10.16b, v14.16b | |
476 add v2.4s, v2.4s, v12.4s | |
477 smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */ |
478 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 | |
479 shrn ROW1L.4h, v2.4s, #16 | |
480 sub v2.4s, v2.4s, v12.4s | |
481 smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */ |
482 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 | |
483 sub v2.4s, v2.4s, v12.4s | |
484 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 | |
485 smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */ |
486 sub v6.4s, v6.4s, v4.4s | |
487 shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | |
488 add v2.4s, v6.4s, v10.4s | |
489 sub v6.4s, v6.4s, v10.4s | |
490 saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
491 shrn ROW2L.4h, v2.4s, #16 | |
492 shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | |
493 shl v10.4s, v10.4s, #13 | |
494 smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */ |
495 add v4.4s, v10.4s, v12.4s | |
496 sub v2.4s, v10.4s, v12.4s | |
497 add v12.4s, v4.4s, v14.4s | |
498 sub v4.4s, v4.4s, v14.4s | |
499 add v10.4s, v2.4s, v8.4s | |
500 sub v6.4s, v2.4s, v8.4s | |
501 shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | |
502 shrn ROW3L.4h, v10.4s, #16 | |
503 shrn ROW0L.4h, v12.4s, #16 | |
504 shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | |
505 /* 1-D IDCT, pass 2, right 4x8 half */ | |
506 ld1 {v2.4h}, [x15] /* reload constants */ | |
507 smull v12.4s, ROW5R.4h, XFIX_1_175875602 | |
508 smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ |
509 smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560 | |
510 smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ |
511 smull v14.4s, ROW7R.4h, XFIX_1_175875602 | |
512 smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ |
513 smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644 | |
514 smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */ |
515 ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
516 smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */ |
517 smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 | |
518 mov v8.16b, v12.16b | |
519 smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 | |
520 smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */ |
521 shl v6.4s, v6.4s, #13 | |
522 smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */ |
523 add v2.4s, v6.4s, v4.4s | |
524 mov v10.16b, v14.16b | |
525 add v2.4s, v2.4s, v12.4s | |
526 smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 | |
527 smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */ |
528 shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | |
529 sub v2.4s, v2.4s, v12.4s | |
530 smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 | |
531 smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */ |
532 sub v2.4s, v2.4s, v12.4s | |
533 smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */ |
534 smlal v12.4s, ROW6R.4h, XFIX_0_541196100 | |
535 sub v6.4s, v6.4s, v4.4s | |
536 shrn ROW6R.4h, v2.4s, #16 | |
537 add v2.4s, v6.4s, v10.4s | |
538 sub v6.4s, v6.4s, v10.4s | |
539 saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
540 shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | |
541 shrn ROW5R.4h, v6.4s, #16 | |
542 shl v10.4s, v10.4s, #13 | |
543 smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 | |
544 add v4.4s, v10.4s, v12.4s | |
545 sub v2.4s, v10.4s, v12.4s | |
546 add v12.4s, v4.4s, v14.4s | |
547 sub v4.4s, v4.4s, v14.4s | |
548 add v10.4s, v2.4s, v8.4s | |
549 sub v6.4s, v2.4s, v8.4s | |
550 shrn ROW7R.4h, v4.4s, #16 | |
551 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | |
552 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | |
553 shrn ROW4R.4h, v6.4s, #16 | |
554 | |
555 2: /* Descale to 8-bit and range limit */ | |
556 ins v16.2d[1], v17.2d[0] | |
557 ins v18.2d[1], v19.2d[0] | |
558 ins v20.2d[1], v21.2d[0] | |
559 ins v22.2d[1], v23.2d[0] | |
560 sqrshrn v16.8b, v16.8h, #2 | |
561 sqrshrn2 v16.16b, v18.8h, #2 | |
562 sqrshrn v18.8b, v20.8h, #2 | |
563 sqrshrn2 v18.16b, v22.8h, #2 | |
564 | |
565 /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */ | |
566 ld1 {v8.4h - v11.4h}, [sp], 32 | |
567 ld1 {v12.4h - v15.4h}, [sp], 32 | |
568 ins v24.2d[1], v25.2d[0] | |
569 | |
570 sqrshrn v20.8b, v24.8h, #2 | |
571 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ | |
572 /* trn1 v16.8h, v16.8h, v18.8h */ | |
573 transpose v16, v18, v3, .16b, .8h | |
574 ins v26.2d[1], v27.2d[0] | |
575 ins v28.2d[1], v29.2d[0] | |
576 ins v30.2d[1], v31.2d[0] | |
577 sqrshrn2 v20.16b, v26.8h, #2 | |
578 sqrshrn v22.8b, v28.8h, #2 | |
579 movi v0.16b, #(CENTERJSAMPLE) | |
580 sqrshrn2 v22.16b, v30.8h, #2 | |
581 transpose_single v16, v17, v3, .2d, .8b | |
582 transpose_single v18, v19, v3, .2d, .8b | |
583 add v16.8b, v16.8b, v0.8b | |
584 add v17.8b, v17.8b, v0.8b | |
585 add v18.8b, v18.8b, v0.8b | |
586 add v19.8b, v19.8b, v0.8b | |
587 transpose v20, v22, v3, .16b, .8h | |
588 /* Store results to the output buffer */ | 437 /* Store results to the output buffer */ |
589 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 438 st1 {v28.d}[0], [TMP1] |
590 add TMP1, TMP1, OUTPUT_COL | 439 st1 {v29.d}[0], [TMP2] |
591 add TMP2, TMP2, OUTPUT_COL | 440 st1 {v28.d}[1], [TMP3] |
592 st1 {v16.8b}, [TMP1] | 441 st1 {v29.d}[1], [TMP4] |
593 transpose_single v20, v21, v3, .2d, .8b | 442 st1 {v30.d}[0], [TMP5] |
594 st1 {v17.8b}, [TMP2] | 443 st1 {v31.d}[0], [TMP6] |
595 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 444 st1 {v30.d}[1], [TMP7] |
596 add TMP1, TMP1, OUTPUT_COL | 445 st1 {v31.d}[1], [TMP8] |
597 add TMP2, TMP2, OUTPUT_COL | 446 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 |
598 st1 {v18.8b}, [TMP1] | 447 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 |
599 add v20.8b, v20.8b, v0.8b | |
600 add v21.8b, v21.8b, v0.8b | |
601 st1 {v19.8b}, [TMP2] | |
602 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
603 ldp TMP3, TMP4, [OUTPUT_BUF] | |
604 add TMP1, TMP1, OUTPUT_COL | |
605 add TMP2, TMP2, OUTPUT_COL | |
606 add TMP3, TMP3, OUTPUT_COL | |
607 add TMP4, TMP4, OUTPUT_COL | |
608 transpose_single v22, v23, v3, .2d, .8b | |
609 st1 {v20.8b}, [TMP1] | |
610 add v22.8b, v22.8b, v0.8b | |
611 add v23.8b, v23.8b, v0.8b | |
612 st1 {v21.8b}, [TMP2] | |
613 st1 {v22.8b}, [TMP3] | |
614 st1 {v23.8b}, [TMP4] | |
615 ldr x15, [sp], 16 | |
616 ld1 {v0.8b - v3.8b}, [sp], 32 | |
617 ld1 {v4.8b - v7.8b}, [sp], 32 | |
618 ld1 {v8.8b - v11.8b}, [sp], 32 | |
619 ld1 {v12.8b - v15.8b}, [sp], 32 | |
620 ld1 {v16.8b - v19.8b}, [sp], 32 | |
621 ld1 {v20.8b - v23.8b}, [sp], 32 | |
622 ld1 {v24.8b - v27.8b}, [sp], 32 | |
623 ld1 {v28.8b - v31.8b}, [sp], 32 | |
624 blr x30 | 448 blr x30 |
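The much shorter epilogue on the new side is deliberate: AAPCS64 requires a callee to preserve only the low 64 bits of v8-v15, so saving and restoring {v8.8b-v15.8b} (64 bytes) is sufficient, where the old code conservatively spilled all 32 NEON registers plus x15 (272 bytes).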
625 | 449 |
626 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ | 450 .balign 16 |
627 | 451 2: |
628 /* Transpose left 4x8 half */ | 452 mul v3.8h, v3.8h, v19.8h |
629 transpose ROW6L, ROW7L, v3, .16b, .4h | 453 mul v4.8h, v4.8h, v20.8h |
630 transpose ROW2L, ROW3L, v3, .16b, .4h | 454 mul v5.8h, v5.8h, v21.8h |
631 transpose ROW0L, ROW1L, v3, .16b, .4h | 455 add TMP4, xzr, TMP2, LSL #32 |
632 transpose ROW4L, ROW5L, v3, .16b, .4h | 456 mul v6.8h, v6.8h, v22.8h |
633 shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ | 457 mul v7.8h, v7.8h, v23.8h |
634 transpose ROW1L, ROW3L, v3, .16b, .2s | 458 adds TMP3, xzr, TMP2, LSR #32 |
635 transpose ROW4L, ROW6L, v3, .16b, .2s | 459 mul v8.8h, v8.8h, v24.8h |
636 transpose ROW0L, ROW2L, v3, .16b, .2s | 460 mul v9.8h, v9.8h, v25.8h |
637 transpose ROW5L, ROW7L, v3, .16b, .2s | 461 b.ne 3f |
638 cmp x0, #0 | 462 /* Right AC coef is zero */ |
639 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ | 463 dup v15.2d, v10.d[1] |
640 | 464 /* Even part: reverse the even part of the forward DCT. */ |
641 /* Only row 0 is non-zero for the right 4x8 half */ | 465 add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ |
642 dup ROW1R.4h, ROW0R.4h[1] | 466 add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
643 dup ROW2R.4h, ROW0R.4h[2] | 467 sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
644 dup ROW3R.4h, ROW0R.4h[3] | 468 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ |
645 dup ROW4R.4h, ROW0R.4h[0] | 469 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ |
646 dup ROW5R.4h, ROW0R.4h[1] | 470 mov v20.16b, v18.16b /* tmp3 = z1 */ |
647 dup ROW6R.4h, ROW0R.4h[2] | 471 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ |
648 dup ROW7R.4h, ROW0R.4h[3] | 472 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ |
649 dup ROW0R.4h, ROW0R.4h[0] | 473 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ |
650 b 1b /* Go to 'normal' second pass */ | 474 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ |
651 | 475 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ |
652 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ | 476 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ |
653 ld1 {v2.4h}, [x15] /* reload constants */ | 477 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ |
654 smull v12.4s, ROW1L.4h, XFIX_1_175875602 | 478 |
655 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 | 479 /* Odd part per figure 8; the matrix is unitary and hence its |
656 smull v14.4s, ROW3L.4h, XFIX_1_175875602 | 480 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. |
657 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 | 481 */ |
658 smull v4.4s, ROW2L.4h, XFIX_0_541196100 | 482 |
659 sshll v6.4s, ROW0L.4h, #13 | 483 add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
660 mov v8.16b, v12.16b | 484 add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
661 smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 | 485 add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
662 smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 | 486 add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
663 add v2.4s, v6.4s, v4.4s | 487 add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */ |
664 mov v10.16b, v14.16b | 488 |
665 smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 | 489 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ |
666 add v2.4s, v2.4s, v12.4s | 490 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ |
667 add v12.4s, v12.4s, v12.4s | 491 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ |
668 smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 | 492 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ |
669 shrn ROW1L.4h, v2.4s, #16 | 493 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ |
670 sub v2.4s, v2.4s, v12.4s | 494 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ |
671 smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 | 495 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ |
672 sub v6.4s, v6.4s, v4.4s | 496 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ |
673 shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | 497 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ |
674 add v2.4s, v6.4s, v10.4s | 498 |
675 sub v6.4s, v6.4s, v10.4s | 499 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ |
676 sshll v10.4s, ROW0L.4h, #13 | 500 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ |
677 shrn ROW2L.4h, v2.4s, #16 | 501 |
678 shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | 502 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ |
679 add v4.4s, v10.4s, v12.4s | 503 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ |
680 sub v2.4s, v10.4s, v12.4s | 504 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ |
681 add v12.4s, v4.4s, v14.4s | 505 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ |
682 sub v4.4s, v4.4s, v14.4s | 506 |
683 add v10.4s, v2.4s, v8.4s | 507 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ |
684 sub v6.4s, v2.4s, v8.4s | 508 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ |
685 shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | 509 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ |
686 shrn ROW3L.4h, v10.4s, #16 | 510 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ |
687 shrn ROW0L.4h, v12.4s, #16 | 511 |
688 shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | 512 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
689 /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ | 513 |
690 ld1 {v2.4h}, [x15] /* reload constants */ | 514 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ |
691 smull v12.4s, ROW5L.4h, XFIX_1_175875602 | 515 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ |
692 smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 | 516 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ |
693 smull v14.4s, ROW7L.4h, XFIX_1_175875602 | 517 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ |
694 smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 | 518 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ |
695 smull v4.4s, ROW6L.4h, XFIX_0_541196100 | 519 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ |
696 sshll v6.4s, ROW4L.4h, #13 | 520 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ |
697 mov v8.16b, v12.16b | 521 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ |
698 smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 | 522 |
699 smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 | 523 rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ |
700 add v2.4s, v6.4s, v4.4s | 524 rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ |
701 mov v10.16b, v14.16b | 525 rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ |
702 smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 | 526 rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ |
703 add v2.4s, v2.4s, v12.4s | 527 rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ |
704 add v12.4s, v12.4s, v12.4s | 528 rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ |
705 smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 | 529 rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ |
706 shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | 530 rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ |
707 sub v2.4s, v2.4s, v12.4s | 531 mov v6.16b, v15.16b |
708 smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 | 532 mov v7.16b, v15.16b |
709 sub v6.4s, v6.4s, v4.4s | 533 mov v8.16b, v15.16b |
710 shrn ROW6R.4h, v2.4s, #16 | 534 mov v9.16b, v15.16b |
711 add v2.4s, v6.4s, v10.4s | 535 b 1b |
712 sub v6.4s, v6.4s, v10.4s | 536 |
713 sshll v10.4s, ROW4L.4h, #13 | 537 .balign 16 |
714 shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | 538 3: |
715 shrn ROW5R.4h, v6.4s, #16 | 539 cbnz TMP4, 4f |
716 add v4.4s, v10.4s, v12.4s | 540 /* Left AC coef is zero */ |
717 sub v2.4s, v10.4s, v12.4s | 541 dup v14.2d, v10.d[0] |
718 add v12.4s, v4.4s, v14.4s | 542 /* Even part: reverse the even part of the forward DCT. */ |
719 sub v4.4s, v4.4s, v14.4s | 543 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ |
720 add v10.4s, v2.4s, v8.4s | 544 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
721 sub v6.4s, v2.4s, v8.4s | 545 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ |
722 shrn ROW7R.4h, v4.4s, #16 | 546 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
723 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | 547 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ |
724 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | 548 mov v21.16b, v19.16b /* tmp3 = z1 */ |
725 shrn ROW4R.4h, v6.4s, #16 | 549 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ |
726 b 2b /* Go to epilogue */ | 550 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ |
| 551 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ |
| 552 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ |
| 553 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ |
| 554 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ |
| 555 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ |
| 556 |
| 557 /* Odd part per figure 8; the matrix is unitary and hence its |
| 558 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. |
| 559 */ |
| 560 |
| 561 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
| 562 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
| 563 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
| 564 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
| 565 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ |
| 566 |
| 567 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0
_298631336) */ |
| 568 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2
_053119869) */ |
| 569 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3
_072711026) */ |
| 570 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1
_501321110) */ |
| 571 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4,
FIX_1_175875602) */ |
| 572 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_9
61570560) */ |
| 573 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_3
90180644) */ |
| 574 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_8
99976223) */ |
| 575 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_5
62915447) */ |
| 576 |
| 577 add v23.4s, v23.4s, v27.4s /* z3 += z5 */ |
| 578 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ |
| 579 add v25.4s, v25.4s, v27.4s /* z4 += z5 */ |
| 580 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ |
| 581 |
| 582 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ |
| 583 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ |
| 584 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ |
| 585 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ |
| 586 |
| 587 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ |
| 588 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ |
| 589 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ |
| 590 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ |
| 591 |
| 592 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
| 593 |
| 594 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ |
| 595 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ |
| 596 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ |
| 597 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ |
| 598 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ |
| 599 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ |
| 600 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ |
| 601 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ |
| 602 |
| 603 mov v2.16b, v14.16b |
| 604 mov v3.16b, v14.16b |
| 605 mov v4.16b, v14.16b |
| 606 mov v5.16b, v14.16b |
| 607 rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0]
= (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ |
| 608 rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1]
= (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ |
| 609 rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2]
= (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ |
| 610 rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3]
= (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ |
| 611 rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4]
= (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ |
| 612 rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5]
= (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ |
| 613 rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6]
= (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ |
| 614 rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7]
= (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ |
| 615 b 1b |
| 616 |
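In rough scalar form (a sketch with illustrative names only; the actual flag is carried in TMP4), the dispatch between the shortcut path above and the full path below is:

    /* Sketch of the zero-AC shortcut dispatch (names hypothetical). */
    if (left_half_ac_all_zero) {      /* label 3 taken with TMP4 == 0      */
      /* The left four columns reduce to their replicated DC-only value,   */
      /* so only the upper four lanes are computed (smull2/smlal2 forms).  */
      fill_left_outputs_with_dc();    /* dup v14.2d, v10.d[0] + mov v2..v5 */
      idct_columns_4_to_7();
    } else {
      idct_all_columns();             /* label 4: full even/odd parts      */
    }
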
| 617 .balign 16 |
| 618 4: |
| 619 /* "No" AC coef is zero, i.e. neither half of the block is all-zero */ |
| 620 /* Even part: reverse the even part of the forward DCT. */ |
| 621 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ |
| 622 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
| 623 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ |
| 624 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ |
| 625 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ |
| 626 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ |
| 627 mov v21.16b, v19.16b /* tmp3 = z1 */ |
| 628 mov v20.16b, v18.16b /* tmp3 = z1 */ |
| 629 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ |
| 630 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */ |
| 631 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ |
| 632 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ |
| 633 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ |
| 634 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ |
| 635 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ |
| 636 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ |
| 637 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ |
| 638 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ |
| 639 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ |
| 640 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ |
| 641 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ |
| 642 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ |
| 643 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ |
| 644 |
| 645 /* Odd part per figure 8; the matrix is unitary and hence its |
| 646 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. |
| 647 */ |
| 648 |
| 649 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
| 650 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
| 651 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ |
| 652 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ |
| 653 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ |
| 654 |
| 655 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ |
| 656 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ |
| 657 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ |
| 658 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ |
| 659 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ |
| 660 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ |
| 661 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ |
| 662 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ |
| 663 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ |
| 664 |
| 665 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ |
| 666 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ |
| 667 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ |
| 668 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ |
| 669 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ |
| 670 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */ |
| 671 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */ |
| 672 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */ |
| 673 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */ |
| 674 |
| 675 add v23.4s, v23.4s, v27.4s /* z3 += z5 */ |
| 676 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ |
| 677 add v25.4s, v25.4s, v27.4s /* z4 += z5 */ |
| 678 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ |
| 679 |
| 680 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ |
| 681 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ |
| 682 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ |
| 683 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ |
| 684 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ |
| 685 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ |
| 686 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ |
| 687 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ |
| 688 |
| 689 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ |
| 690 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ |
| 691 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ |
| 692 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ |
| 693 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ |
| 694 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ |
| 695 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ |
| 696 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ |
| 697 |
| 698 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
| 699 |
| 700 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ |
| 701 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ |
| 702 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ |
| 703 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ |
| 704 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ |
| 705 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ |
| 706 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ |
| 707 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ |
| 708 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ |
| 709 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ |
| 710 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ |
| 711 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ |
| 712 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ |
| 713 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ |
| 714 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ |
| 715 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ |
| 716 |
| 717 rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ |
| 718 rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ |
| 719 rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ |
| 720 rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ |
| 721 rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ |
| 722 rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ |
| 723 rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ |
| 724 rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ |
| 725 rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ |
| 726 rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ |
| 727 rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ |
| 728 rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ |
| 729 rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ |
| 730 rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ |
| 731 rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ |
| 732 rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ |
| 733 b 1b |
727 | 734 |
728 .unreq DCT_TABLE | 735 .unreq DCT_TABLE |
729 .unreq COEF_BLOCK | 736 .unreq COEF_BLOCK |
730 .unreq OUTPUT_BUF | 737 .unreq OUTPUT_BUF |
731 .unreq OUTPUT_COL | 738 .unreq OUTPUT_COL |
732 .unreq TMP1 | 739 .unreq TMP1 |
733 .unreq TMP2 | 740 .unreq TMP2 |
734 .unreq TMP3 | 741 .unreq TMP3 |
735 .unreq TMP4 | 742 .unreq TMP4 |
| 743 .unreq TMP5 |
| 744 .unreq TMP6 |
| 745 .unreq TMP7 |
| 746 .unreq TMP8 |
736 | 747 |
737 .unreq ROW0L | 748 #undef CENTERJSAMPLE |
738 .unreq ROW0R | 749 #undef CONST_BITS |
739 .unreq ROW1L | 750 #undef PASS1_BITS |
740 .unreq ROW1R | 751 #undef XFIX_P_0_298 |
741 .unreq ROW2L | 752 #undef XFIX_N_0_390 |
742 .unreq ROW2R | 753 #undef XFIX_P_0_541 |
743 .unreq ROW3L | 754 #undef XFIX_P_0_765 |
744 .unreq ROW3R | 755 #undef XFIX_N_0_899 |
745 .unreq ROW4L | 756 #undef XFIX_P_1_175 |
746 .unreq ROW4R | 757 #undef XFIX_P_1_501 |
747 .unreq ROW5L | 758 #undef XFIX_N_1_847 |
748 .unreq ROW5R | 759 #undef XFIX_N_1_961 |
749 .unreq ROW6L | 760 #undef XFIX_P_2_053 |
750 .unreq ROW6R | 761 #undef XFIX_N_2_562 |
751 .unreq ROW7L | 762 #undef XFIX_P_3_072 |
752 .unreq ROW7R | |
753 | 763 |
754 | 764 |
755 /*****************************************************************************/ | 765 /*****************************************************************************/ |
756 | 766 |
757 /* | 767 /* |
758 * jsimd_idct_ifast_neon | 768 * jsimd_idct_ifast_neon |
759 * | 769 * |
760 * This function contains a fast, less accurate integer implementation of | 770 * This function contains a fast, less accurate integer implementation of |
761 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations | 771 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations |
762 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' | 772 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' |
763 * function from jidctfst.c. | 773 * function from jidctfst.c. |
764 * | 774 * |
765 * Normally, a 1-D AAN DCT needs 5 multiplications and 29 additions. | 775 * Normally, a 1-D AAN DCT needs 5 multiplications and 29 additions. |
766 * In the ARM NEON case, however, some extra additions are required because | 776 * In the ARM NEON case, however, some extra additions are required because |
767 * the VQDMULH instruction can't handle constants larger than 1. Expressions | 777 * the VQDMULH instruction can't handle constants larger than 1. Expressions |
768 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", | 778 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", |
769 * which introduces an extra addition. Overall, there are 6 extra additions | 779 * which introduces an extra addition. Overall, there are 6 extra additions |
770 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions. | 780 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions. |
771 */ | 781 */ |
772 | 782 |
773 #define XFIX_1_082392200 v0.4h[0] | 783 #define XFIX_1_082392200 v0.h[0] |
774 #define XFIX_1_414213562 v0.4h[1] | 784 #define XFIX_1_414213562 v0.h[1] |
775 #define XFIX_1_847759065 v0.4h[2] | 785 #define XFIX_1_847759065 v0.h[2] |
776 #define XFIX_2_613125930 v0.4h[3] | 786 #define XFIX_2_613125930 v0.h[3] |
777 | 787 |
778 .balign 16 | 788 .balign 16 |
779 jsimd_idct_ifast_neon_consts: | 789 Ljsimd_idct_ifast_neon_consts: |
780 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ | 790 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ |
781 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ | 791 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ |
782 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ | 792 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ |
783 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ | 793 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ |
784 | 794 |
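These encodings follow from the VQDMULH constraint described above: on signed 16-bit lanes, SQDMULH computes (2*a*b) >> 16, i.e. a Q15 multiply by b/32768, so only the fractional part of each constant is stored and the integer part is restored with plain additions. For instance, (277 * 128 - 256 * 128) == 2688, and 2688/32768 == 0.08203125, an approximation of 0.082392200. A scalar C sketch (illustration only; saturation omitted):

    #include <stdint.h>

    /* What sqdmulh does per signed 16-bit lane, minus the saturation. */
    static int16_t sqdmulh16(int16_t a, int16_t b)
    {
      return (int16_t)(((int32_t)a * b * 2) >> 16);  /* == (a * b) >> 15 */
    }

    /* "x * 1.082392200" via the XFIX_1_082392200 entry above. */
    static int16_t mul_1_082392200(int16_t x)
    {
      return (int16_t)(x + sqdmulh16(x, 277 * 128 - 256 * 128));
    }
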
785 asm_function jsimd_idct_ifast_neon | 795 asm_function jsimd_idct_ifast_neon |
786 | 796 |
787 DCT_TABLE .req x0 | 797 DCT_TABLE .req x0 |
788 COEF_BLOCK .req x1 | 798 COEF_BLOCK .req x1 |
789 OUTPUT_BUF .req x2 | 799 OUTPUT_BUF .req x2 |
790 OUTPUT_COL .req x3 | 800 OUTPUT_COL .req x3 |
791 TMP1 .req x0 | 801 TMP1 .req x0 |
792 TMP2 .req x1 | 802 TMP2 .req x1 |
793 TMP3 .req x2 | 803 TMP3 .req x9 |
794 TMP4 .req x22 | 804 TMP4 .req x10 |
795 TMP5 .req x23 | 805 TMP5 .req x11 |
| 806 TMP6 .req x12 |
| 807 TMP7 .req x13 |
| 808 TMP8 .req x14 |
796 | 809 |
797 /* Load and dequantize coefficients into NEON registers | 810 /* Load and dequantize coefficients into NEON registers |
798 * with the following allocation: | 811 * with the following allocation: |
799 * 0 1 2 3 | 4 5 6 7 | 812 * 0 1 2 3 | 4 5 6 7 |
800 * ---------+-------- | 813 * ---------+-------- |
801 * 0 | d16 | d17 ( v8.8h ) | 814 * 0 | d16 | d17 ( v16.8h ) |
802 * 1 | d18 | d19 ( v9.8h ) | 815 * 1 | d18 | d19 ( v17.8h ) |
803 * 2 | d20 | d21 ( v10.8h ) | 816 * 2 | d20 | d21 ( v18.8h ) |
804 * 3 | d22 | d23 ( v11.8h ) | 817 * 3 | d22 | d23 ( v19.8h ) |
805 * 4 | d24 | d25 ( v12.8h ) | 818 * 4 | d24 | d25 ( v20.8h ) |
806 * 5 | d26 | d27 ( v13.8h ) | 819 * 5 | d26 | d27 ( v21.8h ) |
807 * 6 | d28 | d29 ( v14.8h ) | 820 * 6 | d28 | d29 ( v22.8h ) |
808 * 7 | d30 | d31 ( v15.8h ) | 821 * 7 | d30 | d31 ( v23.8h ) |
809 */ | 822 */ |
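(This allocation is what lets the rewritten version on the right drop the old stack save/restore entirely: under AAPCS64 only the low 64 bits of v8-v15 are callee-saved, and the new code keeps everything in v0-v7 and v16-v31.)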
810 /* Save NEON registers used in fast IDCT */ | 823 /* Save NEON registers used in fast IDCT */ |
811 sub sp, sp, #176 | 824 adr TMP5, Ljsimd_idct_ifast_neon_consts |
812 stp x22, x23, [sp], 16 | 825 ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32 |
813 adr x23, jsimd_idct_ifast_neon_consts | |
814 st1 {v0.8b - v3.8b}, [sp], 32 | |
815 st1 {v4.8b - v7.8b}, [sp], 32 | |
816 st1 {v8.8b - v11.8b}, [sp], 32 | |
817 st1 {v12.8b - v15.8b}, [sp], 32 | |
818 st1 {v16.8b - v19.8b}, [sp], 32 | |
819 ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32 | |
820 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 | 826 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 |
821 ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32 | 827 ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32 |
822 mul v8.8h, v8.8h, v0.8h | 828 mul v16.8h, v16.8h, v0.8h |
823 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 | 829 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 |
824 mul v9.8h, v9.8h, v1.8h | 830 mul v17.8h, v17.8h, v1.8h |
825 ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32 | 831 ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32 |
826 mul v10.8h, v10.8h, v2.8h | 832 mul v18.8h, v18.8h, v2.8h |
827 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 | 833 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 |
828 mul v11.8h, v11.8h, v3.8h | 834 mul v19.8h, v19.8h, v3.8h |
829 ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32 | 835 ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32 |
830 mul v12.8h, v12.8h, v0.8h | 836 mul v20.8h, v20.8h, v0.8h |
831 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 | 837 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 |
832 mul v14.8h, v14.8h, v2.8h | 838 mul v22.8h, v22.8h, v2.8h |
833 mul v13.8h, v13.8h, v1.8h | 839 mul v21.8h, v21.8h, v1.8h |
834 ld1 {v0.4h}, [x23] /* load constants */ | 840 ld1 {v0.4h}, [TMP5] /* load constants */ |
835 mul v15.8h, v15.8h, v3.8h | 841 mul v23.8h, v23.8h, v3.8h |
836 | 842 |
837 /* 1-D IDCT, pass 1 */ | 843 /* 1-D IDCT, pass 1 */ |
838 sub v2.8h, v10.8h, v14.8h | 844 sub v2.8h, v18.8h, v22.8h |
839 add v14.8h, v10.8h, v14.8h | 845 add v22.8h, v18.8h, v22.8h |
840 sub v1.8h, v11.8h, v13.8h | 846 sub v1.8h, v19.8h, v21.8h |
841 add v13.8h, v11.8h, v13.8h | 847 add v21.8h, v19.8h, v21.8h |
842 sub v5.8h, v9.8h, v15.8h | 848 sub v5.8h, v17.8h, v23.8h |
843 add v15.8h, v9.8h, v15.8h | 849 add v23.8h, v17.8h, v23.8h |
844 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 | 850 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 |
845 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 | 851 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 |
846 add v3.8h, v1.8h, v1.8h | 852 add v3.8h, v1.8h, v1.8h |
847 sub v1.8h, v5.8h, v1.8h | 853 sub v1.8h, v5.8h, v1.8h |
848 add v10.8h, v2.8h, v4.8h | 854 add v18.8h, v2.8h, v4.8h |
849 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 | 855 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 |
850 sub v2.8h, v15.8h, v13.8h | 856 sub v2.8h, v23.8h, v21.8h |
851 add v3.8h, v3.8h, v6.8h | 857 add v3.8h, v3.8h, v6.8h |
852 sqdmulh v6.8h, v2.8h, XFIX_1_414213562 | 858 sqdmulh v6.8h, v2.8h, XFIX_1_414213562 |
853 add v1.8h, v1.8h, v4.8h | 859 add v1.8h, v1.8h, v4.8h |
854 sqdmulh v4.8h, v5.8h, XFIX_1_082392200 | 860 sqdmulh v4.8h, v5.8h, XFIX_1_082392200 |
855 sub v10.8h, v10.8h, v14.8h | 861 sub v18.8h, v18.8h, v22.8h |
856 add v2.8h, v2.8h, v6.8h | 862 add v2.8h, v2.8h, v6.8h |
857 sub v6.8h, v8.8h, v12.8h | 863 sub v6.8h, v16.8h, v20.8h |
858 add v12.8h, v8.8h, v12.8h | 864 add v20.8h, v16.8h, v20.8h |
859 add v9.8h, v5.8h, v4.8h | 865 add v17.8h, v5.8h, v4.8h |
860 add v5.8h, v6.8h, v10.8h | 866 add v5.8h, v6.8h, v18.8h |
861 sub v10.8h, v6.8h, v10.8h | 867 sub v18.8h, v6.8h, v18.8h |
862 add v6.8h, v15.8h, v13.8h | 868 add v6.8h, v23.8h, v21.8h |
863 add v8.8h, v12.8h, v14.8h | 869 add v16.8h, v20.8h, v22.8h |
864 sub v3.8h, v6.8h, v3.8h | 870 sub v3.8h, v6.8h, v3.8h |
865 sub v12.8h, v12.8h, v14.8h | 871 sub v20.8h, v20.8h, v22.8h |
866 sub v3.8h, v3.8h, v1.8h | 872 sub v3.8h, v3.8h, v1.8h |
867 sub v1.8h, v9.8h, v1.8h | 873 sub v1.8h, v17.8h, v1.8h |
868 add v2.8h, v3.8h, v2.8h | 874 add v2.8h, v3.8h, v2.8h |
869 sub v15.8h, v8.8h, v6.8h | 875 sub v23.8h, v16.8h, v6.8h |
870 add v1.8h, v1.8h, v2.8h | 876 add v1.8h, v1.8h, v2.8h |
871 add v8.8h, v8.8h, v6.8h | 877 add v16.8h, v16.8h, v6.8h |
872 add v14.8h, v5.8h, v3.8h | 878 add v22.8h, v5.8h, v3.8h |
873 sub v9.8h, v5.8h, v3.8h | 879 sub v17.8h, v5.8h, v3.8h |
874 sub v13.8h, v10.8h, v2.8h | 880 sub v21.8h, v18.8h, v2.8h |
875 add v10.8h, v10.8h, v2.8h | 881 add v18.8h, v18.8h, v2.8h |
876 /* Transpose q8-q9 */ | 882 sub v19.8h, v20.8h, v1.8h |
877 mov v18.16b, v8.16b | 883 add v20.8h, v20.8h, v1.8h |
878 trn1 v8.8h, v8.8h, v9.8h | 884 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31 |
879 trn2 v9.8h, v18.8h, v9.8h | 885 /* 1-D IDCT, pass 2 */ |
880 sub v11.8h, v12.8h, v1.8h | 886 sub v2.8h, v18.8h, v22.8h |
881 /* Transpose q14-q15 */ | 887 add v22.8h, v18.8h, v22.8h |
882 mov v18.16b, v14.16b | 888 sub v1.8h, v19.8h, v21.8h |
883 trn1 v14.8h, v14.8h, v15.8h | 889 add v21.8h, v19.8h, v21.8h |
884 trn2 v15.8h, v18.8h, v15.8h | 890 sub v5.8h, v17.8h, v23.8h |
885 add v12.8h, v12.8h, v1.8h | 891 add v23.8h, v17.8h, v23.8h |
886 /* Transpose q10-q11 */ | 892 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 |
887 mov v18.16b, v10.16b | 893 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 |
888 trn1 v10.8h, v10.8h, v11.8h | 894 add v3.8h, v1.8h, v1.8h |
889 trn2 v11.8h, v18.8h, v11.8h | 895 sub v1.8h, v5.8h, v1.8h |
890 /* Transpose q12-q13 */ | 896 add v18.8h, v2.8h, v4.8h |
891 mov v18.16b, v12.16b | 897 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 |
892 trn1 v12.8h, v12.8h, v13.8h | 898 sub v2.8h, v23.8h, v21.8h |
893 trn2 v13.8h, v18.8h, v13.8h | 899 add v3.8h, v3.8h, v6.8h |
894 /* Transpose q9-q11 */ | 900 sqdmulh v6.8h, v2.8h, XFIX_1_414213562 |
895 mov v18.16b, v9.16b | 901 add v1.8h, v1.8h, v4.8h |
896 trn1 v9.4s, v9.4s, v11.4s | 902 sqdmulh v4.8h, v5.8h, XFIX_1_082392200 |
897 trn2 v11.4s, v18.4s, v11.4s | 903 sub v18.8h, v18.8h, v22.8h |
898 /* Transpose q12-q14 */ | 904 add v2.8h, v2.8h, v6.8h |
899 mov v18.16b, v12.16b | 905 sub v6.8h, v16.8h, v20.8h |
900 trn1 v12.4s, v12.4s, v14.4s | 906 add v20.8h, v16.8h, v20.8h |
901 trn2 v14.4s, v18.4s, v14.4s | 907 add v17.8h, v5.8h, v4.8h |
902 /* Transpose q8-q10 */ | 908 add v5.8h, v6.8h, v18.8h |
903 mov v18.16b, v8.16b | 909 sub v18.8h, v6.8h, v18.8h |
904 trn1 v8.4s, v8.4s, v10.4s | 910 add v6.8h, v23.8h, v21.8h |
905 trn2 v10.4s, v18.4s, v10.4s | 911 add v16.8h, v20.8h, v22.8h |
906 /* Transpose q13-q15 */ | 912 sub v3.8h, v6.8h, v3.8h |
907 mov v18.16b, v13.16b | 913 sub v20.8h, v20.8h, v22.8h |
908 trn1 v13.4s, v13.4s, v15.4s | 914 sub v3.8h, v3.8h, v1.8h |
909 trn2 v15.4s, v18.4s, v15.4s | 915 sub v1.8h, v17.8h, v1.8h |
910 /* vswp v14.4h, v10-MSB.4h */ | 916 add v2.8h, v3.8h, v2.8h |
911 umov x22, v14.d[0] | 917 sub v23.8h, v16.8h, v6.8h |
912 ins v14.2d[0], v10.2d[1] | 918 add v1.8h, v1.8h, v2.8h |
913 ins v10.2d[1], x22 | 919 add v16.8h, v16.8h, v6.8h |
914 /* vswp v13.4h, v9MSB.4h */ | 920 add v22.8h, v5.8h, v3.8h |
| 921 sub v17.8h, v5.8h, v3.8h |
| 922 sub v21.8h, v18.8h, v2.8h |
| 923 add v18.8h, v18.8h, v2.8h |
| 924 sub v19.8h, v20.8h, v1.8h |
| 925 add v20.8h, v20.8h, v1.8h |
| 926 /* Descale to 8-bit and range limit */ |
| 927 movi v0.16b, #0x80 |
| 928 /* Prepare pointers (dual-issue with NEON instructions) */ |
| 929 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 930 sqshrn v28.8b, v16.8h, #5 |
| 931 ldp TMP3, TMP4, [OUTPUT_BUF], 16 |
| 932 sqshrn v29.8b, v17.8h, #5 |
| 933 add TMP1, TMP1, OUTPUT_COL |
| 934 sqshrn v30.8b, v18.8h, #5 |
| 935 add TMP2, TMP2, OUTPUT_COL |
| 936 sqshrn v31.8b, v19.8h, #5 |
| 937 add TMP3, TMP3, OUTPUT_COL |
| 938 sqshrn2 v28.16b, v20.8h, #5 |
| 939 add TMP4, TMP4, OUTPUT_COL |
| 940 sqshrn2 v29.16b, v21.8h, #5 |
| 941 ldp TMP5, TMP6, [OUTPUT_BUF], 16 |
| 942 sqshrn2 v30.16b, v22.8h, #5 |
| 943 ldp TMP7, TMP8, [OUTPUT_BUF], 16 |
| 944 sqshrn2 v31.16b, v23.8h, #5 |
| 945 add TMP5, TMP5, OUTPUT_COL |
| 946 add v16.16b, v28.16b, v0.16b |
| 947 add TMP6, TMP6, OUTPUT_COL |
| 948 add v18.16b, v29.16b, v0.16b |
| 949 add TMP7, TMP7, OUTPUT_COL |
| 950 add v20.16b, v30.16b, v0.16b |
| 951 add TMP8, TMP8, OUTPUT_COL |
| 952 add v22.16b, v31.16b, v0.16b |
915 | 953 |
916 umov x22, v13.d[0] | |
917 ins v13.2d[0], v9.2d[1] | |
918 ins v9.2d[1], x22 | |
919 /* 1-D IDCT, pass 2 */ | |
920 sub v2.8h, v10.8h, v14.8h | |
921 /* vswp v15.4h, v11MSB.4h */ | |
922 umov x22, v15.d[0] | |
923 ins v15.2d[0], v11.2d[1] | |
924 ins v11.2d[1], x22 | |
925 add v14.8h, v10.8h, v14.8h | |
926 /* vswp v12.4h, v8-MSB.4h */ | |
927 umov x22, v12.d[0] | |
928 ins v12.2d[0], v8.2d[1] | |
929 ins v8.2d[1], x22 | |
930 sub v1.8h, v11.8h, v13.8h | |
931 add v13.8h, v11.8h, v13.8h | |
932 sub v5.8h, v9.8h, v15.8h | |
933 add v15.8h, v9.8h, v15.8h | |
934 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 | |
935 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 | |
936 add v3.8h, v1.8h, v1.8h | |
937 sub v1.8h, v5.8h, v1.8h | |
938 add v10.8h, v2.8h, v4.8h | |
939 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 | |
940 sub v2.8h, v15.8h, v13.8h | |
941 add v3.8h, v3.8h, v6.8h | |
942 sqdmulh v6.8h, v2.8h, XFIX_1_414213562 | |
943 add v1.8h, v1.8h, v4.8h | |
944 sqdmulh v4.8h, v5.8h, XFIX_1_082392200 | |
945 sub v10.8h, v10.8h, v14.8h | |
946 add v2.8h, v2.8h, v6.8h | |
947 sub v6.8h, v8.8h, v12.8h | |
948 add v12.8h, v8.8h, v12.8h | |
949 add v9.8h, v5.8h, v4.8h | |
950 add v5.8h, v6.8h, v10.8h | |
951 sub v10.8h, v6.8h, v10.8h | |
952 add v6.8h, v15.8h, v13.8h | |
953 add v8.8h, v12.8h, v14.8h | |
954 sub v3.8h, v6.8h, v3.8h | |
955 sub v12.8h, v12.8h, v14.8h | |
956 sub v3.8h, v3.8h, v1.8h | |
957 sub v1.8h, v9.8h, v1.8h | |
958 add v2.8h, v3.8h, v2.8h | |
959 sub v15.8h, v8.8h, v6.8h | |
960 add v1.8h, v1.8h, v2.8h | |
961 add v8.8h, v8.8h, v6.8h | |
962 add v14.8h, v5.8h, v3.8h | |
963 sub v9.8h, v5.8h, v3.8h | |
964 sub v13.8h, v10.8h, v2.8h | |
965 add v10.8h, v10.8h, v2.8h | |
966 sub v11.8h, v12.8h, v1.8h | |
967 add v12.8h, v12.8h, v1.8h | |
968 /* Descale to 8-bit and range limit */ | |
969 movi v0.16b, #0x80 | |
970 sqshrn v8.8b, v8.8h, #5 | |
971 sqshrn2 v8.16b, v9.8h, #5 | |
972 sqshrn v9.8b, v10.8h, #5 | |
973 sqshrn2 v9.16b, v11.8h, #5 | |
974 sqshrn v10.8b, v12.8h, #5 | |
975 sqshrn2 v10.16b, v13.8h, #5 | |
976 sqshrn v11.8b, v14.8h, #5 | |
977 sqshrn2 v11.16b, v15.8h, #5 | |
978 add v8.16b, v8.16b, v0.16b | |
979 add v9.16b, v9.16b, v0.16b | |
980 add v10.16b, v10.16b, v0.16b | |
981 add v11.16b, v11.16b, v0.16b | |
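Per sample, the descale-and-range-limit step above (present in both columns) amounts to the following scalar sketch, assuming the shift of 5 corresponds to jidctfst.c's PASS1_BITS + 3 with PASS1_BITS == 2:

    #include <stdint.h>

    static uint8_t descale_and_range_limit(int16_t x)
    {
      int32_t v = x >> 5;         /* sqshrn #5: arithmetic shift right...  */
      if (v < -128) v = -128;     /* ...with signed saturation to 8 bits   */
      if (v >  127) v =  127;
      return (uint8_t)(v + 128);  /* add #0x80: recenter into 0..255       */
    }
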
982 /* Transpose the final 8-bit samples */ | 954 /* Transpose the final 8-bit samples */ |
983 /* Transpose q8-q9 */ | 955 trn1 v28.16b, v16.16b, v18.16b |
984 mov v18.16b, v8.16b | 956 trn1 v30.16b, v20.16b, v22.16b |
985 trn1 v8.8h, v8.8h, v9.8h | 957 trn2 v29.16b, v16.16b, v18.16b |
986 trn2 v9.8h, v18.8h, v9.8h | 958 trn2 v31.16b, v20.16b, v22.16b |
987 /* Transpose q10-q11 */ | 959 |
988 mov v18.16b, v10.16b | 960 trn1 v16.8h, v28.8h, v30.8h |
989 trn1 v10.8h, v10.8h, v11.8h | 961 trn2 v18.8h, v28.8h, v30.8h |
990 trn2 v11.8h, v18.8h, v11.8h | 962 trn1 v20.8h, v29.8h, v31.8h |
991 /* Transpose q8-q10 */ | 963 trn2 v22.8h, v29.8h, v31.8h |
992 mov v18.16b, v8.16b | 964 |
993 trn1 v8.4s, v8.4s, v10.4s | 965 uzp1 v28.4s, v16.4s, v18.4s |
994 trn2 v10.4s, v18.4s, v10.4s | 966 uzp2 v30.4s, v16.4s, v18.4s |
995 /* Transpose q9-q11 */ | 967 uzp1 v29.4s, v20.4s, v22.4s |
996 mov v18.16b, v9.16b | 968 uzp2 v31.4s, v20.4s, v22.4s |
997 trn1 v9.4s, v9.4s, v11.4s | 969 |
998 trn2 v11.4s, v18.4s, v11.4s | |
999 /* make copy */ | |
1000 ins v17.2d[0], v8.2d[1] | |
1001 /* Transpose d16-d17-msb */ | |
1002 mov v18.16b, v8.16b | |
1003 trn1 v8.8b, v8.8b, v17.8b | |
1004 trn2 v17.8b, v18.8b, v17.8b | |
1005 /* make copy */ | |
1006 ins v19.2d[0], v9.2d[1] | |
1007 mov v18.16b, v9.16b | |
1008 trn1 v9.8b, v9.8b, v19.8b | |
1009 trn2 v19.8b, v18.8b, v19.8b | |
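The right-hand column replaces the old pairwise trn/ins shuffling with a three-stage trn1/trn2 (bytes), trn1/trn2 (halfwords), uzp1/uzp2 (words) network; with two 8-sample rows per vector, its net effect is an ordinary 8x8 byte-matrix transpose (reference sketch only):

    #include <stdint.h>

    static void transpose_8x8_ref(const uint8_t in[8][8], uint8_t out[8][8])
    {
      for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++)
          out[r][c] = in[c][r];   /* columns of pass-2 results become the  */
    }                             /* pixel rows stored through TMP1..TMP8  */
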
1010 /* Store results to the output buffer */ | 970 /* Store results to the output buffer */ |
1011 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 971 st1 {v28.d}[0], [TMP1] |
1012 add TMP1, TMP1, OUTPUT_COL | 972 st1 {v29.d}[0], [TMP2] |
1013 add TMP2, TMP2, OUTPUT_COL | 973 st1 {v28.d}[1], [TMP3] |
1014 st1 {v8.8b}, [TMP1] | 974 st1 {v29.d}[1], [TMP4] |
1015 st1 {v17.8b}, [TMP2] | 975 st1 {v30.d}[0], [TMP5] |
1016 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 976 st1 {v31.d}[0], [TMP6] |
1017 add TMP1, TMP1, OUTPUT_COL | 977 st1 {v30.d}[1], [TMP7] |
1018 add TMP2, TMP2, OUTPUT_COL | 978 st1 {v31.d}[1], [TMP8] |
1019 st1 {v9.8b}, [TMP1] | |
1020 /* make copy */ | |
1021 ins v7.2d[0], v10.2d[1] | |
1022 mov v18.16b, v10.16b | |
1023 trn1 v10.8b, v10.8b, v7.8b | |
1024 trn2 v7.8b, v18.8b, v7.8b | |
1025 st1 {v19.8b}, [TMP2] | |
1026 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
1027 ldp TMP4, TMP5, [OUTPUT_BUF], 16 | |
1028 add TMP1, TMP1, OUTPUT_COL | |
1029 add TMP2, TMP2, OUTPUT_COL | |
1030 add TMP4, TMP4, OUTPUT_COL | |
1031 add TMP5, TMP5, OUTPUT_COL | |
1032 st1 {v10.8b}, [TMP1] | |
1033 /* make copy */ | |
1034 ins v16.2d[0], v11.2d[1] | |
1035 mov v18.16b, v11.16b | |
1036 trn1 v11.8b, v11.8b, v16.8b | |
1037 trn2 v16.8b, v18.8b, v16.8b | |
1038 st1 {v7.8b}, [TMP2] | |
1039 st1 {v11.8b}, [TMP4] | |
1040 st1 {v16.8b}, [TMP5] | |
1041 sub sp, sp, #176 | |
1042 ldp x22, x23, [sp], 16 | |
1043 ld1 {v0.8b - v3.8b}, [sp], 32 | |
1044 ld1 {v4.8b - v7.8b}, [sp], 32 | |
1045 ld1 {v8.8b - v11.8b}, [sp], 32 | |
1046 ld1 {v12.8b - v15.8b}, [sp], 32 | |
1047 ld1 {v16.8b - v19.8b}, [sp], 32 | |
1048 blr x30 | 979 blr x30 |
1049 | 980 |
1050 .unreq DCT_TABLE | 981 .unreq DCT_TABLE |
1051 .unreq COEF_BLOCK | 982 .unreq COEF_BLOCK |
1052 .unreq OUTPUT_BUF | 983 .unreq OUTPUT_BUF |
1053 .unreq OUTPUT_COL | 984 .unreq OUTPUT_COL |
1054 .unreq TMP1 | 985 .unreq TMP1 |
1055 .unreq TMP2 | 986 .unreq TMP2 |
1056 .unreq TMP3 | 987 .unreq TMP3 |
1057 .unreq TMP4 | 988 .unreq TMP4 |
| 989 .unreq TMP5 |
| 990 .unreq TMP6 |
| 991 .unreq TMP7 |
| 992 .unreq TMP8 |
1058 | 993 |
1059 | 994 |
1060 /*****************************************************************************/ | 995 /*****************************************************************************/ |
1061 | 996 |
1062 /* | 997 /* |
1063 * jsimd_idct_4x4_neon | 998 * jsimd_idct_4x4_neon |
1064 * | 999 * |
1065 * This function contains inverse-DCT code for getting reduced-size | 1000 * This function contains inverse-DCT code for getting reduced-size |
1066 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations | 1001 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations |
1067 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' | 1002 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' |
1068 * function from jpeg-6b (jidctred.c). | 1003 * function from jpeg-6b (jidctred.c). |
1069 * | 1004 * |
1070 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which | 1005 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which |
1071 * requires far fewer arithmetic operations and hence should be faster. | 1006 * requires far fewer arithmetic operations and hence should be faster. |
1072 * The primary purpose of this particular NEON optimized function is | 1007 * The primary purpose of this particular NEON optimized function is |
1073 * bit exact compatibility with jpeg-6b. | 1008 * bit exact compatibility with jpeg-6b. |
1074 * | 1009 * |
1075 * TODO: slightly better instruction scheduling can be achieved by expanding | 1010 * TODO: slightly better instruction scheduling can be achieved by expanding |
1076 * idct_helper/transpose_4x4 macros and reordering instructions, | 1011 * idct_helper/transpose_4x4 macros and reordering instructions, |
1077 * but readability will suffer somewhat. | 1012 * but readability will suffer somewhat. |
1078 */ | 1013 */ |
1079 | 1014 |
1080 #define CONST_BITS 13 | 1015 #define CONST_BITS 13 |
1081 | 1016 |
1082 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */ | 1017 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */ |
1083 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */ | 1018 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */ |
1084 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */ | 1019 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */ |
1085 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */ | 1020 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */ |
1086 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */ | 1021 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */ |
1087 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */ | 1022 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */ |
1088 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */ | 1023 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */ |
1089 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ | 1024 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ |
1090 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ | 1025 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ |
1091 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ | 1026 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ |
1092 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ | 1027 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ |
1093 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ | 1028 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ |
1094 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ | 1029 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ |
1095 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ | 1030 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ |
1096 | 1031 |
1097 .balign 16 | 1032 .balign 16 |
1098 jsimd_idct_4x4_neon_consts: | 1033 Ljsimd_idct_4x4_neon_consts: |
1099 .short FIX_1_847759065 /* v0.4h[0] */ | 1034 .short FIX_1_847759065 /* v0.h[0] */ |
1100 .short -FIX_0_765366865 /* v0.4h[1] */ | 1035 .short -FIX_0_765366865 /* v0.h[1] */ |
1101 .short -FIX_0_211164243 /* v0.4h[2] */ | 1036 .short -FIX_0_211164243 /* v0.h[2] */ |
1102 .short FIX_1_451774981 /* v0.4h[3] */ | 1037 .short FIX_1_451774981 /* v0.h[3] */ |
1103 .short -FIX_2_172734803 /* d1[0] */ | 1038 .short -FIX_2_172734803 /* d1[0] */ |
1104 .short FIX_1_061594337 /* d1[1] */ | 1039 .short FIX_1_061594337 /* d1[1] */ |
1105 .short -FIX_0_509795579 /* d1[2] */ | 1040 .short -FIX_0_509795579 /* d1[2] */ |
1106 .short -FIX_0_601344887 /* d1[3] */ | 1041 .short -FIX_0_601344887 /* d1[3] */ |
1107 .short FIX_0_899976223 /* v2.4h[0] */ | 1042 .short FIX_0_899976223 /* v2.h[0] */ |
1108 .short FIX_2_562915447 /* v2.4h[1] */ | 1043 .short FIX_2_562915447 /* v2.h[1] */ |
1109 .short 1 << (CONST_BITS+1) /* v2.4h[2] */ | 1044 .short 1 << (CONST_BITS+1) /* v2.h[2] */ |
1110 .short 0 /* v2.4h[3] */ | 1045 .short 0 /* v2.h[3] */ |
1111 | 1046 |
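As elsewhere in libjpeg, FIX(x) denotes round(x * 2^CONST_BITS), with CONST_BITS == 13 here; for example:

    FIX(0.211164243) = round(0.211164243 * 8192) = round(1729.86)  = 1730
    FIX(2.562915447) = round(2.562915447 * 8192) = round(20995.40) = 20995
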
1112 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 | 1047 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 |
1113 smull v28.4s, \x4, v2.4h[2] | 1048 smull v28.4s, \x4, v2.h[2] |
1114 smlal v28.4s, \x8, v0.4h[0] | 1049 smlal v28.4s, \x8, v0.h[0] |
1115 smlal v28.4s, \x14, v0.4h[1] | 1050 smlal v28.4s, \x14, v0.h[1] |
1116 | 1051 |
1117 smull v26.4s, \x16, v1.4h[2] | 1052 smull v26.4s, \x16, v1.h[2] |
1118 smlal v26.4s, \x12, v1.4h[3] | 1053 smlal v26.4s, \x12, v1.h[3] |
1119 smlal v26.4s, \x10, v2.4h[0] | 1054 smlal v26.4s, \x10, v2.h[0] |
1120 smlal v26.4s, \x6, v2.4h[1] | 1055 smlal v26.4s, \x6, v2.h[1] |
1121 | 1056 |
1122 smull v30.4s, \x4, v2.4h[2] | 1057 smull v30.4s, \x4, v2.h[2] |
1123 smlsl v30.4s, \x8, v0.4h[0] | 1058 smlsl v30.4s, \x8, v0.h[0] |
1124 smlsl v30.4s, \x14, v0.4h[1] | 1059 smlsl v30.4s, \x14, v0.h[1] |
1125 | 1060 |
1126 smull v24.4s, \x16, v0.4h[2] | 1061 smull v24.4s, \x16, v0.h[2] |
1127 smlal v24.4s, \x12, v0.4h[3] | 1062 smlal v24.4s, \x12, v0.h[3] |
1128 smlal v24.4s, \x10, v1.4h[0] | 1063 smlal v24.4s, \x10, v1.h[0] |
1129 smlal v24.4s, \x6, v1.4h[1] | 1064 smlal v24.4s, \x6, v1.h[1] |
1130 | 1065 |
1131 add v20.4s, v28.4s, v26.4s | 1066 add v20.4s, v28.4s, v26.4s |
1132 sub v28.4s, v28.4s, v26.4s | 1067 sub v28.4s, v28.4s, v26.4s |
1133 | 1068 |
1134 .if \shift > 16 | 1069 .if \shift > 16 |
1135 srshr v20.4s, v20.4s, #\shift | 1070 srshr v20.4s, v20.4s, #\shift |
1136 srshr v28.4s, v28.4s, #\shift | 1071 srshr v28.4s, v28.4s, #\shift |
1137 xtn \y26, v20.4s | 1072 xtn \y26, v20.4s |
1138 xtn \y29, v28.4s | 1073 xtn \y29, v28.4s |
1139 .else | 1074 .else |
1140 rshrn \y26, v20.4s, #\shift | 1075 rshrn \y26, v20.4s, #\shift |
1141 rshrn \y29, v28.4s, #\shift | 1076 rshrn \y29, v28.4s, #\shift |
1142 .endif | 1077 .endif |
1143 | 1078 |
1144 add v20.4s, v30.4s, v24.4s | 1079 add v20.4s, v30.4s, v24.4s |
1145 sub v30.4s, v30.4s, v24.4s | 1080 sub v30.4s, v30.4s, v24.4s |
1146 | 1081 |
1147 .if \shift > 16 | 1082 .if \shift > 16 |
1148 srshr v20.4s, v20.4s, #\shift | 1083 srshr v20.4s, v20.4s, #\shift |
1149 srshr v30.4s, v30.4s, #\shift | 1084 srshr v30.4s, v30.4s, #\shift |
1150 xtn \y27, v20.4s | 1085 xtn \y27, v20.4s |
1151 xtn \y28, v30.4s | 1086 xtn \y28, v30.4s |
1152 .else | 1087 .else |
1153 rshrn \y27, v20.4s, #\shift | 1088 rshrn \y27, v20.4s, #\shift |
1154 rshrn \y28, v30.4s, #\shift | 1089 rshrn \y28, v30.4s, #\shift |
1155 .endif | 1090 .endif |
1156 | |
1157 .endm | 1091 .endm |
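The \shift > 16 split above exists only because a narrowing rounding shift such as rshrn accepts immediates in the range 1..16 when narrowing 32-bit lanes to 16-bit ones: pass 1 of this function descales by 12, which fits, but pass 2 descales by 19 and must therefore use a full-width rounding shift (srshr) followed by a separate narrow (xtn). Both branches compute libjpeg's DESCALE, i.e. in C:

    #include <stdint.h>

    /* DESCALE(x, n) from jpegint.h: right shift with round-to-nearest */
    static int16_t descale(int32_t x, int n)
    {
      return (int16_t)((x + ((int32_t)1 << (n - 1))) >> n);
    }
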
1158 | 1092 |
1159 asm_function jsimd_idct_4x4_neon | 1093 asm_function jsimd_idct_4x4_neon |
1160 | 1094 |
1161 DCT_TABLE .req x0 | 1095 DCT_TABLE .req x0 |
1162 COEF_BLOCK .req x1 | 1096 COEF_BLOCK .req x1 |
1163 OUTPUT_BUF .req x2 | 1097 OUTPUT_BUF .req x2 |
1164 OUTPUT_COL .req x3 | 1098 OUTPUT_COL .req x3 |
1165 TMP1 .req x0 | 1099 TMP1 .req x0 |
1166 TMP2 .req x1 | 1100 TMP2 .req x1 |
1167 TMP3 .req x2 | 1101 TMP3 .req x2 |
1168 TMP4 .req x15 | 1102 TMP4 .req x15 |
1169 | 1103 |
1170 /* Save all used NEON registers */ | 1104 /* Save all used NEON registers */ |
1171 sub sp, sp, 272 | 1105 sub sp, sp, 272 |
1172 str x15, [sp], 16 | 1106 str x15, [sp], 16 |
1173 /* Load constants (v3.4h is just used for padding) */ | 1107 /* Load constants (v3.4h is just used for padding) */ |
1174 adr TMP4, jsimd_idct_4x4_neon_consts | 1108 adr TMP4, Ljsimd_idct_4x4_neon_consts |
1175 st1 {v0.8b - v3.8b}, [sp], 32 | 1109 st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 |
1176 st1 {v4.8b - v7.8b}, [sp], 32 | 1110 st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 |
1177 st1 {v8.8b - v11.8b}, [sp], 32 | 1111 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
1178 st1 {v12.8b - v15.8b}, [sp], 32 | 1112 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
1179 st1 {v16.8b - v19.8b}, [sp], 32 | 1113 st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 |
1180 st1 {v20.8b - v23.8b}, [sp], 32 | 1114 st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 |
1181 st1 {v24.8b - v27.8b}, [sp], 32 | 1115 st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 |
1182 st1 {v28.8b - v31.8b}, [sp], 32 | 1116 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 |
1183 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] | 1117 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] |
1184 | 1118 |
1185 /* Load all COEF_BLOCK into NEON registers with the following allocation: | 1119 /* Load all COEF_BLOCK into NEON registers with the following allocation: |
1186 * 0 1 2 3 | 4 5 6 7 | 1120 * 0 1 2 3 | 4 5 6 7 |
1187 * ---------+-------- | 1121 * ---------+-------- |
1188 * 0 | v4.4h | v5.4h | 1122 * 0 | v4.4h | v5.4h |
1189 * 1 | v6.4h | v7.4h | 1123 * 1 | v6.4h | v7.4h |
1190 * 2 | v8.4h | v9.4h | 1124 * 2 | v8.4h | v9.4h |
1191 * 3 | v10.4h | v11.4h | 1125 * 3 | v10.4h | v11.4h |
1192 * 4 | - | - | 1126 * 4 | - | - |
1193 * 5 | v12.4h | v13.4h | 1127 * 5 | v12.4h | v13.4h |
1194 * 6 | v14.4h | v15.4h | 1128 * 6 | v14.4h | v15.4h |
1195 * 7 | v16.4h | v17.4h | 1129 * 7 | v16.4h | v17.4h |
1196 */ | 1130 */ |
1197 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 | 1131 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 |
1198 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 | 1132 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 |
1199 add COEF_BLOCK, COEF_BLOCK, #16 | 1133 add COEF_BLOCK, COEF_BLOCK, #16 |
1200 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 | 1134 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 |
1201 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 | 1135 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 |
1202 /* dequantize */ | 1136 /* dequantize */ |
1203 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 | 1137 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 |
1204 mul v4.4h, v4.4h, v18.4h | 1138 mul v4.4h, v4.4h, v18.4h |
1205 mul v5.4h, v5.4h, v19.4h | 1139 mul v5.4h, v5.4h, v19.4h |
1206 ins v4.2d[1], v5.2d[0] /* 128 bit q4 */ | 1140 ins v4.d[1], v5.d[0] /* 128 bit q4 */ |
1207 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 | 1141 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 |
1208 mul v6.4h, v6.4h, v20.4h | 1142 mul v6.4h, v6.4h, v20.4h |
1209 mul v7.4h, v7.4h, v21.4h | 1143 mul v7.4h, v7.4h, v21.4h |
1210 ins v6.2d[1], v7.2d[0] /* 128 bit q6 */ | 1144 ins v6.d[1], v7.d[0] /* 128 bit q6 */ |
1211 mul v8.4h, v8.4h, v22.4h | 1145 mul v8.4h, v8.4h, v22.4h |
1212 mul v9.4h, v9.4h, v23.4h | 1146 mul v9.4h, v9.4h, v23.4h |
1213 ins v8.2d[1], v9.2d[0] /* 128 bit q8 */ | 1147 ins v8.d[1], v9.d[0] /* 128 bit q8 */ |
1214 add DCT_TABLE, DCT_TABLE, #16 | 1148 add DCT_TABLE, DCT_TABLE, #16 |
1215 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 | 1149 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 |
1216 mul v10.4h, v10.4h, v24.4h | 1150 mul v10.4h, v10.4h, v24.4h |
1217 mul v11.4h, v11.4h, v25.4h | 1151 mul v11.4h, v11.4h, v25.4h |
1218 ins v10.2d[1], v11.2d[0] /* 128 bit q10 */ | 1152 ins v10.d[1], v11.d[0] /* 128 bit q10 */ |
1219 mul v12.4h, v12.4h, v26.4h | 1153 mul v12.4h, v12.4h, v26.4h |
1220 mul v13.4h, v13.4h, v27.4h | 1154 mul v13.4h, v13.4h, v27.4h |
1221 ins v12.2d[1], v13.2d[0] /* 128 bit q12 */ | 1155 ins v12.d[1], v13.d[0] /* 128 bit q12 */ |
1222 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 | 1156 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 |
1223 mul v14.4h, v14.4h, v28.4h | 1157 mul v14.4h, v14.4h, v28.4h |
1224 mul v15.4h, v15.4h, v29.4h | 1158 mul v15.4h, v15.4h, v29.4h |
1225 ins v14.2d[1], v15.2d[0] /* 128 bit q14 */ | 1159 ins v14.d[1], v15.d[0] /* 128 bit q14 */ |
1226 mul v16.4h, v16.4h, v30.4h | 1160 mul v16.4h, v16.4h, v30.4h |
1227 mul v17.4h, v17.4h, v31.4h | 1161 mul v17.4h, v17.4h, v31.4h |
1228 ins v16.2d[1], v17.2d[0] /* 128 bit q16 */ | 1162 ins v16.d[1], v17.d[0] /* 128 bit q16 */ |
1229 | 1163 |
1230 /* Pass 1 */ | 1164 /* Pass 1 */ |
1231 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h | 1165 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \ |
| 1166 v4.4h, v6.4h, v8.4h, v10.4h |
1232 transpose_4x4 v4, v6, v8, v10, v3 | 1167 transpose_4x4 v4, v6, v8, v10, v3 |
1233 ins v10.2d[1], v11.2d[0] | 1168 ins v10.d[1], v11.d[0] |
1234 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h | 1169 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \ |
| 1170 v5.4h, v7.4h, v9.4h, v11.4h |
1235 transpose_4x4 v5, v7, v9, v11, v3 | 1171 transpose_4x4 v5, v7, v9, v11, v3 |
1236 ins v10.2d[1], v11.2d[0] | 1172 ins v10.d[1], v11.d[0] |
| 1173 |
1237 /* Pass 2 */ | 1174 /* Pass 2 */ |
1238 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h | 1175 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \ |
| 1176 v26.4h, v27.4h, v28.4h, v29.4h |
1239 transpose_4x4 v26, v27, v28, v29, v3 | 1177 transpose_4x4 v26, v27, v28, v29, v3 |
1240 | 1178 |
1241 /* Range limit */ | 1179 /* Range limit */ |
1242 movi v30.8h, #0x80 | 1180 movi v30.8h, #0x80 |
1243 ins v26.2d[1], v27.2d[0] | 1181 ins v26.d[1], v27.d[0] |
1244 ins v28.2d[1], v29.2d[0] | 1182 ins v28.d[1], v29.d[0] |
1245 add v26.8h, v26.8h, v30.8h | 1183 add v26.8h, v26.8h, v30.8h |
1246 add v28.8h, v28.8h, v30.8h | 1184 add v28.8h, v28.8h, v30.8h |
1247 sqxtun v26.8b, v26.8h | 1185 sqxtun v26.8b, v26.8h |
1248 sqxtun v27.8b, v28.8h | 1186 sqxtun v27.8b, v28.8h |
1249 | 1187 |
1250 /* Store results to the output buffer */ | 1188 /* Store results to the output buffer */ |
1251 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 1189 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
1252 ldp TMP3, TMP4, [OUTPUT_BUF] | 1190 ldp TMP3, TMP4, [OUTPUT_BUF] |
1253 add TMP1, TMP1, OUTPUT_COL | 1191 add TMP1, TMP1, OUTPUT_COL |
1254 add TMP2, TMP2, OUTPUT_COL | 1192 add TMP2, TMP2, OUTPUT_COL |
(...skipping 24 matching lines...) |
1279 st1 {v27.b}[5], [TMP4], 1 | 1217 st1 {v27.b}[5], [TMP4], 1 |
1280 st1 {v26.b}[6], [TMP2], 1 | 1218 st1 {v26.b}[6], [TMP2], 1 |
1281 st1 {v27.b}[6], [TMP4], 1 | 1219 st1 {v27.b}[6], [TMP4], 1 |
1282 st1 {v26.b}[7], [TMP2], 1 | 1220 st1 {v26.b}[7], [TMP2], 1 |
1283 st1 {v27.b}[7], [TMP4], 1 | 1221 st1 {v27.b}[7], [TMP4], 1 |
1284 #endif | 1222 #endif |
1285 | 1223 |
1286 /* vpop {v8.4h - v15.4h} ; not available */ | 1224 /* vpop {v8.4h - v15.4h} ; not available */ |
1287 sub sp, sp, #272 | 1225 sub sp, sp, #272 |
1288 ldr x15, [sp], 16 | 1226 ldr x15, [sp], 16 |
1289 ld1 {v0.8b - v3.8b}, [sp], 32 | 1227 ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 |
1290 ld1 {v4.8b - v7.8b}, [sp], 32 | 1228 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 |
1291 ld1 {v8.8b - v11.8b}, [sp], 32 | 1229 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
1292 ld1 {v12.8b - v15.8b}, [sp], 32 | 1230 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
1293 ld1 {v16.8b - v19.8b}, [sp], 32 | 1231 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 |
1294 ld1 {v20.8b - v23.8b}, [sp], 32 | 1232 ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 |
1295 ld1 {v24.8b - v27.8b}, [sp], 32 | 1233 ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 |
1296 ld1 {v28.8b - v31.8b}, [sp], 32 | 1234 ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 |
1297 blr x30 | 1235 blr x30 |
1298 | 1236 |
1299 .unreq DCT_TABLE | 1237 .unreq DCT_TABLE |
1300 .unreq COEF_BLOCK | 1238 .unreq COEF_BLOCK |
1301 .unreq OUTPUT_BUF | 1239 .unreq OUTPUT_BUF |
1302 .unreq OUTPUT_COL | 1240 .unreq OUTPUT_COL |
1303 .unreq TMP1 | 1241 .unreq TMP1 |
1304 .unreq TMP2 | 1242 .unreq TMP2 |
1305 .unreq TMP3 | 1243 .unreq TMP3 |
1306 .unreq TMP4 | 1244 .unreq TMP4 |
(...skipping 11 matching lines...) |
1318 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' | 1256 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' |
1319 * function from jpeg-6b (jidctred.c). | 1257 * function from jpeg-6b (jidctred.c). |
1320 * | 1258 * |
1321 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which | 1259 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which |
1322 * requires far fewer arithmetic operations and hence should be faster. | 1260 * requires far fewer arithmetic operations and hence should be faster. |
1323 * The primary purpose of this particular NEON optimized function is | 1261 * The primary purpose of this particular NEON optimized function is |
1324 * bit exact compatibility with jpeg-6b. | 1262 * bit exact compatibility with jpeg-6b. |
1325 */ | 1263 */ |
1326 | 1264 |
1327 .balign 8 | 1265 .balign 8 |
1328 jsimd_idct_2x2_neon_consts: | 1266 Ljsimd_idct_2x2_neon_consts: |
1329 .short -FIX_0_720959822 /* v14[0] */ | 1267 .short -FIX_0_720959822 /* v14[0] */ |
1330 .short FIX_0_850430095 /* v14[1] */ | 1268 .short FIX_0_850430095 /* v14[1] */ |
1331 .short -FIX_1_272758580 /* v14[2] */ | 1269 .short -FIX_1_272758580 /* v14[2] */ |
1332 .short FIX_3_624509785 /* v14[3] */ | 1270 .short FIX_3_624509785 /* v14[3] */ |
1333 | 1271 |
1334 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 | 1272 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 |
1335 sshll v15.4s, \x4, #15 | 1273 sshll v15.4s, \x4, #15 |
1336 smull v26.4s, \x6, v14.4h[3] | 1274 smull v26.4s, \x6, v14.h[3] |
1337 smlal v26.4s, \x10, v14.4h[2] | 1275 smlal v26.4s, \x10, v14.h[2] |
1338 smlal v26.4s, \x12, v14.4h[1] | 1276 smlal v26.4s, \x12, v14.h[1] |
1339 smlal v26.4s, \x16, v14.4h[0] | 1277 smlal v26.4s, \x16, v14.h[0] |
1340 | 1278 |
1341 add v20.4s, v15.4s, v26.4s | 1279 add v20.4s, v15.4s, v26.4s |
1342 sub v15.4s, v15.4s, v26.4s | 1280 sub v15.4s, v15.4s, v26.4s |
1343 | 1281 |
1344 .if \shift > 16 | 1282 .if \shift > 16 |
1345 srshr v20.4s, v20.4s, #\shift | 1283 srshr v20.4s, v20.4s, #\shift |
1346 srshr v15.4s, v15.4s, #\shift | 1284 srshr v15.4s, v15.4s, #\shift |
1347 xtn \y26, v20.4s | 1285 xtn \y26, v20.4s |
1348 xtn \y27, v15.4s | 1286 xtn \y27, v15.4s |
1349 .else | 1287 .else |
1350 rshrn \y26, v20.4s, #\shift | 1288 rshrn \y26, v20.4s, #\shift |
1351 rshrn \y27, v15.4s, #\shift | 1289 rshrn \y27, v15.4s, #\shift |
1352 .endif | 1290 .endif |
1353 | |
1354 .endm | 1291 .endm |
1355 | 1292 |
1356 asm_function jsimd_idct_2x2_neon | 1293 asm_function jsimd_idct_2x2_neon |
1357 | 1294 |
1358 DCT_TABLE .req x0 | 1295 DCT_TABLE .req x0 |
1359 COEF_BLOCK .req x1 | 1296 COEF_BLOCK .req x1 |
1360 OUTPUT_BUF .req x2 | 1297 OUTPUT_BUF .req x2 |
1361 OUTPUT_COL .req x3 | 1298 OUTPUT_COL .req x3 |
1362 TMP1 .req x0 | 1299 TMP1 .req x0 |
1363 TMP2 .req x15 | 1300 TMP2 .req x15 |
1364 | 1301 |
1365 /* vpush {v8.4h - v15.4h} ; not available */ | 1302 /* vpush {v8.4h - v15.4h} ; not available */ |
1366 sub sp, sp, 208 | 1303 sub sp, sp, 208 |
1367 str x15, [sp], 16 | 1304 str x15, [sp], 16 |
1368 | 1305 |
1369 /* Load constants */ | 1306 /* Load constants */ |
1370 adr TMP2, jsimd_idct_2x2_neon_consts | 1307 adr TMP2, Ljsimd_idct_2x2_neon_consts |
1371 st1 {v4.8b - v7.8b}, [sp], 32 | 1308 st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 |
1372 st1 {v8.8b - v11.8b}, [sp], 32 | 1309 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
1373 st1 {v12.8b - v15.8b}, [sp], 32 | 1310 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
1374 st1 {v16.8b - v19.8b}, [sp], 32 | 1311 st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 |
1375 st1 {v21.8b - v22.8b}, [sp], 16 | 1312 st1 {v21.8b, v22.8b}, [sp], 16 |
1376 st1 {v24.8b - v27.8b}, [sp], 32 | 1313 st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 |
1377 st1 {v30.8b - v31.8b}, [sp], 16 | 1314 st1 {v30.8b, v31.8b}, [sp], 16 |
1378 ld1 {v14.4h}, [TMP2] | 1315 ld1 {v14.4h}, [TMP2] |
1379 | 1316 |
1380 /* Load all COEF_BLOCK into NEON registers with the following allocation: | 1317 /* Load all COEF_BLOCK into NEON registers with the following allocation: |
1381 * 0 1 2 3 | 4 5 6 7 | 1318 * 0 1 2 3 | 4 5 6 7 |
1382 * ---------+-------- | 1319 * ---------+-------- |
1383 * 0 | v4.4h | v5.4h | 1320 * 0 | v4.4h | v5.4h |
1384 * 1 | v6.4h | v7.4h | 1321 * 1 | v6.4h | v7.4h |
1385 * 2 | - | - | 1322 * 2 | - | - |
1386 * 3 | v10.4h | v11.4h | 1323 * 3 | v10.4h | v11.4h |
1387 * 4 | - | - | 1324 * 4 | - | - |
1388 * 5 | v12.4h | v13.4h | 1325 * 5 | v12.4h | v13.4h |
1389 * 6 | - | - | 1326 * 6 | - | - |
1390 * 7 | v16.4h | v17.4h | 1327 * 7 | v16.4h | v17.4h |
1391 */ | 1328 */ |
1392 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 | 1329 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 |
1393 add COEF_BLOCK, COEF_BLOCK, #16 | 1330 add COEF_BLOCK, COEF_BLOCK, #16 |
1394 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 | 1331 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 |
1395 add COEF_BLOCK, COEF_BLOCK, #16 | 1332 add COEF_BLOCK, COEF_BLOCK, #16 |
1396 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 | 1333 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 |
1397 add COEF_BLOCK, COEF_BLOCK, #16 | 1334 add COEF_BLOCK, COEF_BLOCK, #16 |
1398 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 | 1335 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 |
1399 /* Dequantize */ | 1336 /* Dequantize */ |
1400 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 | 1337 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 |
1401 mul v4.4h, v4.4h, v18.4h | 1338 mul v4.4h, v4.4h, v18.4h |
1402 mul v5.4h, v5.4h, v19.4h | 1339 mul v5.4h, v5.4h, v19.4h |
1403 ins v4.2d[1], v5.2d[0] | 1340 ins v4.d[1], v5.d[0] |
1404 mul v6.4h, v6.4h, v20.4h | 1341 mul v6.4h, v6.4h, v20.4h |
1405 mul v7.4h, v7.4h, v21.4h | 1342 mul v7.4h, v7.4h, v21.4h |
1406 ins v6.2d[1], v7.2d[0] | 1343 ins v6.d[1], v7.d[0] |
1407 add DCT_TABLE, DCT_TABLE, #16 | 1344 add DCT_TABLE, DCT_TABLE, #16 |
1408 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 | 1345 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 |
1409 mul v10.4h, v10.4h, v24.4h | 1346 mul v10.4h, v10.4h, v24.4h |
1410 mul v11.4h, v11.4h, v25.4h | 1347 mul v11.4h, v11.4h, v25.4h |
1411 ins v10.2d[1], v11.2d[0] | 1348 ins v10.d[1], v11.d[0] |
1412 add DCT_TABLE, DCT_TABLE, #16 | 1349 add DCT_TABLE, DCT_TABLE, #16 |
1413 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 | 1350 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 |
1414 mul v12.4h, v12.4h, v26.4h | 1351 mul v12.4h, v12.4h, v26.4h |
1415 mul v13.4h, v13.4h, v27.4h | 1352 mul v13.4h, v13.4h, v27.4h |
1416 ins v12.2d[1], v13.2d[0] | 1353 ins v12.d[1], v13.d[0] |
1417 add DCT_TABLE, DCT_TABLE, #16 | 1354 add DCT_TABLE, DCT_TABLE, #16 |
1418 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 | 1355 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 |
1419 mul v16.4h, v16.4h, v30.4h | 1356 mul v16.4h, v16.4h, v30.4h |
1420 mul v17.4h, v17.4h, v31.4h | 1357 mul v17.4h, v17.4h, v31.4h |
1421 ins v16.2d[1], v17.2d[0] | 1358 ins v16.d[1], v17.d[0] |
1422 | 1359 |
1423 /* Pass 1 */ | 1360 /* Pass 1 */ |
1424 #if 0 | 1361 #if 0 |
1425 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h | 1362 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h |
1426 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h | 1363 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h |
1427 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h | 1364 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h |
1428 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h | 1365 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h |
1429 #else | 1366 #else |
1430 smull v26.4s, v6.4h, v14.4h[3] | 1367 smull v26.4s, v6.4h, v14.h[3] |
1431 smlal v26.4s, v10.4h, v14.4h[2] | 1368 smlal v26.4s, v10.4h, v14.h[2] |
1432 smlal v26.4s, v12.4h, v14.4h[1] | 1369 smlal v26.4s, v12.4h, v14.h[1] |
1433 smlal v26.4s, v16.4h, v14.4h[0] | 1370 smlal v26.4s, v16.4h, v14.h[0] |
1434 smull v24.4s, v7.4h, v14.4h[3] | 1371 smull v24.4s, v7.4h, v14.h[3] |
1435 smlal v24.4s, v11.4h, v14.4h[2] | 1372 smlal v24.4s, v11.4h, v14.h[2] |
1436 smlal v24.4s, v13.4h, v14.4h[1] | 1373 smlal v24.4s, v13.4h, v14.h[1] |
1437 smlal v24.4s, v17.4h, v14.4h[0] | 1374 smlal v24.4s, v17.4h, v14.h[0] |
1438 sshll v15.4s, v4.4h, #15 | 1375 sshll v15.4s, v4.4h, #15 |
1439 sshll v30.4s, v5.4h, #15 | 1376 sshll v30.4s, v5.4h, #15 |
1440 add v20.4s, v15.4s, v26.4s | 1377 add v20.4s, v15.4s, v26.4s |
1441 sub v15.4s, v15.4s, v26.4s | 1378 sub v15.4s, v15.4s, v26.4s |
1442 rshrn v4.4h, v20.4s, #13 | 1379 rshrn v4.4h, v20.4s, #13 |
1443 rshrn v6.4h, v15.4s, #13 | 1380 rshrn v6.4h, v15.4s, #13 |
1444 add v20.4s, v30.4s, v24.4s | 1381 add v20.4s, v30.4s, v24.4s |
1445 sub v15.4s, v30.4s, v24.4s | 1382 sub v15.4s, v30.4s, v24.4s |
1446 rshrn v5.4h, v20.4s, #13 | 1383 rshrn v5.4h, v20.4s, #13 |
1447 rshrn v7.4h, v15.4s, #13 | 1384 rshrn v7.4h, v15.4s, #13 |
1448 ins v4.2d[1], v5.2d[0] | 1385 ins v4.d[1], v5.d[0] |
1449 ins v6.2d[1], v7.2d[0] | 1386 ins v6.d[1], v7.d[0] |
1450 transpose v4, v6, v3, .16b, .8h | 1387 transpose v4, v6, v3, .16b, .8h |
1451 transpose v6, v10, v3, .16b, .4s | 1388 transpose v6, v10, v3, .16b, .4s |
1452 ins v11.2d[0], v10.2d[1] | 1389 ins v11.d[0], v10.d[1] |
1453 ins v7.2d[0], v6.2d[1] | 1390 ins v7.d[0], v6.d[1] |
1454 #endif | 1391 #endif |
1455 | 1392 |
1456 /* Pass 2 */ | 1393 /* Pass 2 */ |
1457 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h | 1394 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h |
1458 | 1395 |
1459 /* Range limit */ | 1396 /* Range limit */ |
1460 movi v30.8h, #0x80 | 1397 movi v30.8h, #0x80 |
1461 ins v26.2d[1], v27.2d[0] | 1398 ins v26.d[1], v27.d[0] |
1462 add v26.8h, v26.8h, v30.8h | 1399 add v26.8h, v26.8h, v30.8h |
1463 sqxtun v30.8b, v26.8h | 1400 sqxtun v30.8b, v26.8h |
1464 ins v26.2d[0], v30.2d[0] | 1401 ins v26.d[0], v30.d[0] |
1465 sqxtun v27.8b, v26.8h | 1402 sqxtun v27.8b, v26.8h |
1466 | 1403 |
1467 /* Store results to the output buffer */ | 1404 /* Store results to the output buffer */ |
1468 ldp TMP1, TMP2, [OUTPUT_BUF] | 1405 ldp TMP1, TMP2, [OUTPUT_BUF] |
1469 add TMP1, TMP1, OUTPUT_COL | 1406 add TMP1, TMP1, OUTPUT_COL |
1470 add TMP2, TMP2, OUTPUT_COL | 1407 add TMP2, TMP2, OUTPUT_COL |
1471 | 1408 |
1472 st1 {v26.b}[0], [TMP1], 1 | 1409 st1 {v26.b}[0], [TMP1], 1 |
1473 st1 {v27.b}[4], [TMP1], 1 | 1410 st1 {v27.b}[4], [TMP1], 1 |
1474 st1 {v26.b}[1], [TMP2], 1 | 1411 st1 {v26.b}[1], [TMP2], 1 |
1475 st1 {v27.b}[5], [TMP2], 1 | 1412 st1 {v27.b}[5], [TMP2], 1 |
1476 | 1413 |
1477 sub sp, sp, #208 | 1414 sub sp, sp, #208 |
1478 ldr x15, [sp], 16 | 1415 ldr x15, [sp], 16 |
1479 ld1 {v4.8b - v7.8b}, [sp], 32 | 1416 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 |
1480 ld1 {v8.8b - v11.8b}, [sp], 32 | 1417 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
1481 ld1 {v12.8b - v15.8b}, [sp], 32 | 1418 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
1482 ld1 {v16.8b - v19.8b}, [sp], 32 | 1419 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 |
1483 ld1 {v21.8b - v22.8b}, [sp], 16 | 1420 ld1 {v21.8b, v22.8b}, [sp], 16 |
1484 ld1 {v24.8b - v27.8b}, [sp], 32 | 1421 ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 |
1485 ld1 {v30.8b - v31.8b}, [sp], 16 | 1422 ld1 {v30.8b, v31.8b}, [sp], 16 |
1486 blr x30 | 1423 blr x30 |
1487 | 1424 |
1488 .unreq DCT_TABLE | 1425 .unreq DCT_TABLE |
1489 .unreq COEF_BLOCK | 1426 .unreq COEF_BLOCK |
1490 .unreq OUTPUT_BUF | 1427 .unreq OUTPUT_BUF |
1491 .unreq OUTPUT_COL | 1428 .unreq OUTPUT_COL |
1492 .unreq TMP1 | 1429 .unreq TMP1 |
1493 .unreq TMP2 | 1430 .unreq TMP2 |
1494 | 1431 |
1495 .purgem idct_helper | 1432 .purgem idct_helper |
1496 | 1433 |
1497 | 1434 |
1498 /*****************************************************************************/ | 1435 /*****************************************************************************/ |
1499 | 1436 |
1500 /* | 1437 /* |
1501 * jsimd_ycc_extrgb_convert_neon | 1438 * jsimd_ycc_extrgb_convert_neon |
1502 * jsimd_ycc_extbgr_convert_neon | 1439 * jsimd_ycc_extbgr_convert_neon |
1503 * jsimd_ycc_extrgbx_convert_neon | 1440 * jsimd_ycc_extrgbx_convert_neon |
1504 * jsimd_ycc_extbgrx_convert_neon | 1441 * jsimd_ycc_extbgrx_convert_neon |
1505 * jsimd_ycc_extxbgr_convert_neon | 1442 * jsimd_ycc_extxbgr_convert_neon |
1506 * jsimd_ycc_extxrgb_convert_neon | 1443 * jsimd_ycc_extxrgb_convert_neon |
1507 * | 1444 * |
1508 * Colorspace conversion YCbCr -> RGB | 1445 * Colorspace conversion YCbCr -> RGB |
1509 */ | 1446 */ |
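
(For reference, the per-pixel arithmetic these routines implement reduces to the C sketch below. This is an editorial scalar model, not code from the file: the constants match the jsimd_ycc_*_neon_consts tables, the shifts match the rshrn instructions in do_yuv_to_rgb_stage2, and clamp255() stands in for sqxtun. It assumes arithmetic right shift on negative ints.)

/* Scalar model of the fixed-point YCbCr->RGB conversion (illustrative). */
static unsigned char clamp255(int x)
{
  return (unsigned char)(x < 0 ? 0 : (x > 255 ? 255 : x));
}

static void ycc_to_rgb_pixel(int y, int cb, int cr, unsigned char rgb[3])
{
  int u = cb - 128, v = cr - 128;              /* the v2.8h = -128 uaddw step */
  rgb[0] = clamp255(y + ((22971 * v + 8192) >> 14));                /* R */
  rgb[1] = clamp255(y + ((-11277 * u - 23401 * v + 16384) >> 15));  /* G */
  rgb[2] = clamp255(y + ((29033 * u + 8192) >> 14));                /* B */
}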
1510 | 1447 |
1511 | |
1512 .macro do_load size | 1448 .macro do_load size |
| 1449 .if \size == 8 |
| 1450 ld1 {v4.8b}, [U], 8 |
| 1451 ld1 {v5.8b}, [V], 8 |
| 1452 ld1 {v0.8b}, [Y], 8 |
| 1453 prfm pldl1keep, [U, #64] |
| 1454 prfm pldl1keep, [V, #64] |
| 1455 prfm pldl1keep, [Y, #64] |
| 1456 .elseif \size == 4 |
| 1457 ld1 {v4.b}[0], [U], 1 |
| 1458 ld1 {v4.b}[1], [U], 1 |
| 1459 ld1 {v4.b}[2], [U], 1 |
| 1460 ld1 {v4.b}[3], [U], 1 |
| 1461 ld1 {v5.b}[0], [V], 1 |
| 1462 ld1 {v5.b}[1], [V], 1 |
| 1463 ld1 {v5.b}[2], [V], 1 |
| 1464 ld1 {v5.b}[3], [V], 1 |
| 1465 ld1 {v0.b}[0], [Y], 1 |
| 1466 ld1 {v0.b}[1], [Y], 1 |
| 1467 ld1 {v0.b}[2], [Y], 1 |
| 1468 ld1 {v0.b}[3], [Y], 1 |
| 1469 .elseif \size == 2 |
| 1470 ld1 {v4.b}[4], [U], 1 |
| 1471 ld1 {v4.b}[5], [U], 1 |
| 1472 ld1 {v5.b}[4], [V], 1 |
| 1473 ld1 {v5.b}[5], [V], 1 |
| 1474 ld1 {v0.b}[4], [Y], 1 |
| 1475 ld1 {v0.b}[5], [Y], 1 |
| 1476 .elseif \size == 1 |
| 1477 ld1 {v4.b}[6], [U], 1 |
| 1478 ld1 {v5.b}[6], [V], 1 |
| 1479 ld1 {v0.b}[6], [Y], 1 |
| 1480 .else |
| 1481 .error unsupported macroblock size |
| 1482 .endif |
| 1483 .endm |
| 1484 |
| 1485 .macro do_store bpp, size, fast_st3 |
| 1486 .if \bpp == 24 |
1513 .if \size == 8 | 1487 .if \size == 8 |
1514 ld1 {v4.8b}, [U], 8 | 1488 .if \fast_st3 == 1 |
1515 ld1 {v5.8b}, [V], 8 | 1489 st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 |
1516 ld1 {v0.8b}, [Y], 8 | 1490 .else |
1517 prfm PLDL1KEEP, [U, #64] | 1491 st1 {v10.b}[0], [RGB], #1 |
1518 prfm PLDL1KEEP, [V, #64] | 1492 st1 {v11.b}[0], [RGB], #1 |
1519 prfm PLDL1KEEP, [Y, #64] | 1493 st1 {v12.b}[0], [RGB], #1 |
| 1494 |
| 1495 st1 {v10.b}[1], [RGB], #1 |
| 1496 st1 {v11.b}[1], [RGB], #1 |
| 1497 st1 {v12.b}[1], [RGB], #1 |
| 1498 |
| 1499 st1 {v10.b}[2], [RGB], #1 |
| 1500 st1 {v11.b}[2], [RGB], #1 |
| 1501 st1 {v12.b}[2], [RGB], #1 |
| 1502 |
| 1503 st1 {v10.b}[3], [RGB], #1 |
| 1504 st1 {v11.b}[3], [RGB], #1 |
| 1505 st1 {v12.b}[3], [RGB], #1 |
| 1506 |
| 1507 st1 {v10.b}[4], [RGB], #1 |
| 1508 st1 {v11.b}[4], [RGB], #1 |
| 1509 st1 {v12.b}[4], [RGB], #1 |
| 1510 |
| 1511 st1 {v10.b}[5], [RGB], #1 |
| 1512 st1 {v11.b}[5], [RGB], #1 |
| 1513 st1 {v12.b}[5], [RGB], #1 |
| 1514 |
| 1515 st1 {v10.b}[6], [RGB], #1 |
| 1516 st1 {v11.b}[6], [RGB], #1 |
| 1517 st1 {v12.b}[6], [RGB], #1 |
| 1518 |
| 1519 st1 {v10.b}[7], [RGB], #1 |
| 1520 st1 {v11.b}[7], [RGB], #1 |
| 1521 st1 {v12.b}[7], [RGB], #1 |
| 1522 .endif |
1520 .elseif \size == 4 | 1523 .elseif \size == 4 |
1521 ld1 {v4.b}[0], [U], 1 | 1524 st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 |
1522 ld1 {v4.b}[1], [U], 1 | 1525 st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 |
1523 ld1 {v4.b}[2], [U], 1 | 1526 st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 |
1524 ld1 {v4.b}[3], [U], 1 | 1527 st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 |
1525 ld1 {v5.b}[0], [V], 1 | |
1526 ld1 {v5.b}[1], [V], 1 | |
1527 ld1 {v5.b}[2], [V], 1 | |
1528 ld1 {v5.b}[3], [V], 1 | |
1529 ld1 {v0.b}[0], [Y], 1 | |
1530 ld1 {v0.b}[1], [Y], 1 | |
1531 ld1 {v0.b}[2], [Y], 1 | |
1532 ld1 {v0.b}[3], [Y], 1 | |
1533 .elseif \size == 2 | 1528 .elseif \size == 2 |
1534 ld1 {v4.b}[4], [U], 1 | 1529 st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 |
1535 ld1 {v4.b}[5], [U], 1 | 1530 st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 |
1536 ld1 {v5.b}[4], [V], 1 | |
1537 ld1 {v5.b}[5], [V], 1 | |
1538 ld1 {v0.b}[4], [Y], 1 | |
1539 ld1 {v0.b}[5], [Y], 1 | |
1540 .elseif \size == 1 | 1531 .elseif \size == 1 |
1541 ld1 {v4.b}[6], [U], 1 | 1532 st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 |
1542 ld1 {v5.b}[6], [V], 1 | |
1543 ld1 {v0.b}[6], [Y], 1 | |
1544 .else | 1533 .else |
1545 .error unsupported macroblock size | 1534 .error unsupported macroblock size |
1546 .endif | 1535 .endif |
1547 .endm | 1536 .elseif \bpp == 32 |
1548 | 1537 .if \size == 8 |
1549 .macro do_store bpp, size | 1538 st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 |
1550 .if \bpp == 24 | 1539 .elseif \size == 4 |
1551 .if \size == 8 | 1540 st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 |
1552 st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 | 1541 st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 |
1553 .elseif \size == 4 | 1542 st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 |
1554 st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 | 1543 st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 |
1555 st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 | 1544 .elseif \size == 2 |
1556 st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 | 1545 st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 |
1557 st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 | 1546 st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 |
1558 .elseif \size == 2 | 1547 .elseif \size == 1 |
1559 st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 | 1548 st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 |
1560 st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 | 1549 .else |
1561 .elseif \size == 1 | 1550 .error unsupported macroblock size |
1562 st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 | |
1563 .else | |
1564 .error unsupported macroblock size | |
1565 .endif | |
1566 .elseif \bpp == 32 | |
1567 .if \size == 8 | |
1568 st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 | |
1569 .elseif \size == 4 | |
1570 st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 | |
1571 st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 | |
1572 st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 | |
1573 st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 | |
1574 .elseif \size == 2 | |
1575 st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 | |
1576 st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 | |
1577 .elseif \size == 1 | |
1578 st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 | |
1579 .else | |
1580 .error unsupported macroblock size | |
1581 .endif | |
1582 .elseif \bpp==16 | |
1583 .if \size == 8 | |
1584 st1 {v25.8h}, [RGB],16 | |
1585 .elseif \size == 4 | |
1586 st1 {v25.4h}, [RGB],8 | |
1587 .elseif \size == 2 | |
1588 st1 {v25.h}[4], [RGB],2 | |
1589 st1 {v25.h}[5], [RGB],2 | |
1590 .elseif \size == 1 | |
1591 st1 {v25.h}[6], [RGB],2 | |
1592 .else | |
1593 .error unsupported macroblock size | |
1594 .endif | |
1595 .else | |
1596 .error unsupported bpp | |
1597 .endif | 1551 .endif |
1598 .endm | 1552 .elseif \bpp==16 |
1599 | 1553 .if \size == 8 |
1600 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize | 1554 st1 {v25.8h}, [RGB], 8 |
| 1555 .elseif \size == 4 |
| 1556 st1 {v25.4h}, [RGB], 8 |
| 1557 .elseif \size == 2 |
| 1558 st1 {v25.h}[4], [RGB], 2 |
| 1559 st1 {v25.h}[5], [RGB], 2 |
| 1560 .elseif \size == 1 |
| 1561 st1 {v25.h}[6], [RGB], 2 |
| 1562 .else |
| 1563 .error unsupported macroblock size |
| 1564 .endif |
| 1565 .else |
| 1566 .error unsupported bpp |
| 1567 .endif |
| 1568 .endm |
| 1569 |
| 1570 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \ |
| 1571 g_offs, gsize, b_offs, bsize, \ |
| 1572 defsize, fast_st3 |
1601 | 1573 |
1602 /* | 1574 /* |
1603 * 2-stage pipelined YCbCr->RGB conversion | 1575 * 2-stage pipelined YCbCr->RGB conversion |
1604 */ | 1576 */ |
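
(The "2-stage pipelined" structure is easiest to see as a C sketch; this is an editorial illustration with stand-in functions for the do_load / do_yuv_to_rgb_stage* macros, not code from the file.)

/* Software-pipelined loop shape: stage 1 of block i overlaps with
 * stage 2 + store of block i-1, hiding the multiply latency. */
typedef struct { int id; } Block;
static void load(Block *b, int i)  { b->id = i; }
static void stage1(Block *b)       { (void)b; }
static void stage2_store(Block *b) { (void)b; }

static void convert_row(int nblocks)
{
  Block b;
  load(&b, 0); stage1(&b);              /* prologue */
  for (int i = 1; i < nblocks; i++) {
    stage2_store(&b);                   /* one loop body corresponds to */
    load(&b, i); stage1(&b);            /* do_yuv_to_rgb_stage2_store_load_stage1 */
  }
  stage2_store(&b);                     /* epilogue: stage2 + do_store */
}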
1605 | 1577 |
1606 .macro do_yuv_to_rgb_stage1 | 1578 .macro do_yuv_to_rgb_stage1 |
1607 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ | 1579 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ |
1608 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ | 1580 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ |
1609 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | 1581 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ |
1610 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | 1582 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ |
1611 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | 1583 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ |
1612 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | 1584 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ |
1613 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | 1585 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ |
1614 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | 1586 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ |
1615 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ | 1587 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ |
1616 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ | 1588 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ |
1617 .endm | 1589 .endm |
1618 | 1590 |
1619 .macro do_yuv_to_rgb_stage2 | 1591 .macro do_yuv_to_rgb_stage2 |
1620 rshrn v20.4h, v20.4s, #15 | 1592 rshrn v20.4h, v20.4s, #15 |
1621 rshrn2 v20.8h, v22.4s, #15 | 1593 rshrn2 v20.8h, v22.4s, #15 |
1622 rshrn v24.4h, v24.4s, #14 | 1594 rshrn v24.4h, v24.4s, #14 |
1623 rshrn2 v24.8h, v26.4s, #14 | 1595 rshrn2 v24.8h, v26.4s, #14 |
1624 rshrn v28.4h, v28.4s, #14 | 1596 rshrn v28.4h, v28.4s, #14 |
1625 rshrn2 v28.8h, v30.4s, #14 | 1597 rshrn2 v28.8h, v30.4s, #14 |
1626 uaddw v20.8h, v20.8h, v0.8b | 1598 uaddw v20.8h, v20.8h, v0.8b |
1627 uaddw v24.8h, v24.8h, v0.8b | 1599 uaddw v24.8h, v24.8h, v0.8b |
1628 uaddw v28.8h, v28.8h, v0.8b | 1600 uaddw v28.8h, v28.8h, v0.8b |
1629 .if \bpp != 16 | 1601 .if \bpp != 16 |
1630 sqxtun v1\g_offs\defsize, v20.8h | 1602 sqxtun v1\g_offs\defsize, v20.8h |
1631 sqxtun v1\r_offs\defsize, v24.8h | 1603 sqxtun v1\r_offs\defsize, v24.8h |
1632 sqxtun v1\b_offs\defsize, v28.8h | 1604 sqxtun v1\b_offs\defsize, v28.8h |
1633 .else | 1605 .else |
1634 sqshlu v21.8h, v20.8h, #8 | 1606 sqshlu v21.8h, v20.8h, #8 |
1635 sqshlu v25.8h, v24.8h, #8 | 1607 sqshlu v25.8h, v24.8h, #8 |
1636 sqshlu v29.8h, v28.8h, #8 | 1608 sqshlu v29.8h, v28.8h, #8 |
1637 sri v25.8h, v21.8h, #5 | 1609 sri v25.8h, v21.8h, #5 |
1638 sri v25.8h, v29.8h, #11 | 1610 sri v25.8h, v29.8h, #11 |
1639 .endif | 1611 .endif |
1640 | 1612 .endm |
1641 .endm | 1613 |
1642 | 1614 .macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3 |
1643 .macro do_yuv_to_rgb_stage2_store_load_stage1 | 1615 rshrn v20.4h, v20.4s, #15 |
1644 rshrn v20.4h, v20.4s, #15 | 1616 rshrn v24.4h, v24.4s, #14 |
1645 rshrn v24.4h, v24.4s, #14 | 1617 rshrn v28.4h, v28.4s, #14 |
1646 rshrn v28.4h, v28.4s, #14 | 1618 ld1 {v4.8b}, [U], 8 |
1647 ld1 {v4.8b}, [U], 8 | 1619 rshrn2 v20.8h, v22.4s, #15 |
1648 rshrn2 v20.8h, v22.4s, #15 | 1620 rshrn2 v24.8h, v26.4s, #14 |
1649 rshrn2 v24.8h, v26.4s, #14 | 1621 rshrn2 v28.8h, v30.4s, #14 |
1650 rshrn2 v28.8h, v30.4s, #14 | 1622 ld1 {v5.8b}, [V], 8 |
1651 ld1 {v5.8b}, [V], 8 | 1623 uaddw v20.8h, v20.8h, v0.8b |
1652 uaddw v20.8h, v20.8h, v0.8b | 1624 uaddw v24.8h, v24.8h, v0.8b |
1653 uaddw v24.8h, v24.8h, v0.8b | 1625 uaddw v28.8h, v28.8h, v0.8b |
1654 uaddw v28.8h, v28.8h, v0.8b | 1626 .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ |
1655 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/ | 1627 sqxtun v1\g_offs\defsize, v20.8h |
1656 sqxtun v1\g_offs\defsize, v20.8h | 1628 ld1 {v0.8b}, [Y], 8 |
1657 ld1 {v0.8b}, [Y], 8 | 1629 sqxtun v1\r_offs\defsize, v24.8h |
1658 sqxtun v1\r_offs\defsize, v24.8h | 1630 prfm pldl1keep, [U, #64] |
1659 prfm PLDL1KEEP, [U, #64] | 1631 prfm pldl1keep, [V, #64] |
1660 prfm PLDL1KEEP, [V, #64] | 1632 prfm pldl1keep, [Y, #64] |
1661 prfm PLDL1KEEP, [Y, #64] | 1633 sqxtun v1\b_offs\defsize, v28.8h |
1662 sqxtun v1\b_offs\defsize, v28.8h | 1634 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ |
1663 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ | 1634 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ |
1664 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ | 1635 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ |
1665 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | 1637 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ |
1666 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | 1638 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ |
1667 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | 1639 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ |
1668 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | 1640 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ |
1669 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | 1641 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ |
1670 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | 1642 .else /**************************** rgb565 ********************************/ |
1671 .else /**************************** rgb565 ***********************************/ | 1643 sqshlu v21.8h, v20.8h, #8 |
1672 sqshlu v21.8h, v20.8h, #8 | 1644 sqshlu v25.8h, v24.8h, #8 |
1673 sqshlu v25.8h, v24.8h, #8 | 1645 sqshlu v29.8h, v28.8h, #8 |
1674 sqshlu v29.8h, v28.8h, #8 | 1646 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ |
1675 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ | 1646 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ |
1676 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ | 1647 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ |
1677 ld1 {v0.8b}, [Y], 8 | 1649 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ |
1678 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | 1650 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ |
1679 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | 1651 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ |
1680 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | 1652 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ |
1681 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | 1653 sri v25.8h, v21.8h, #5 |
1682 sri v25.8h, v21.8h, #5 | 1654 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ |
1683 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | 1655 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ |
1684 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | 1656 prfm pldl1keep, [U, #64] |
1685 prfm PLDL1KEEP, [U, #64] | 1657 prfm pldl1keep, [V, #64] |
1686 prfm PLDL1KEEP, [V, #64] | 1658 prfm pldl1keep, [Y, #64] |
1687 prfm PLDL1KEEP, [Y, #64] | 1659 sri v25.8h, v29.8h, #11 |
1688 sri v25.8h, v29.8h, #11 | 1660 .endif |
1689 .endif | 1661 do_store \bpp, 8, \fast_st3 |
1690 do_store \bpp, 8 | 1662 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ |
1691 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ | 1663 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ |
1692 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ | |
1693 .endm | 1664 .endm |
1694 | 1665 |
1695 .macro do_yuv_to_rgb | 1666 .macro do_yuv_to_rgb |
1696 do_yuv_to_rgb_stage1 | 1667 do_yuv_to_rgb_stage1 |
1697 do_yuv_to_rgb_stage2 | 1668 do_yuv_to_rgb_stage2 |
1698 .endm | 1669 .endm |
1699 | 1670 |
1700 /* Apple gas crashes on adrl; work around that by using adr. | 1671 /* Apple gas crashes on adrl; work around that by using adr. |
1701 * But this requires a copy of these constants for each function. | 1672 * But this requires a copy of these constants for each function. |
1702 */ | 1673 */ |
1703 | 1674 |
1704 .balign 16 | 1675 .balign 16 |
1705 jsimd_ycc_\colorid\()_neon_consts: | 1676 .if \fast_st3 == 1 |
1706 .short 0, 0, 0, 0 | 1677 Ljsimd_ycc_\colorid\()_neon_consts: |
1707 .short 22971, -11277, -23401, 29033 | 1678 .else |
1708 .short -128, -128, -128, -128 | 1679 Ljsimd_ycc_\colorid\()_neon_slowst3_consts: |
1709 .short -128, -128, -128, -128 | 1680 .endif |
| 1681 .short 0, 0, 0, 0 |
| 1682 .short 22971, -11277, -23401, 29033 |
| 1683 .short -128, -128, -128, -128 |
| 1684 .short -128, -128, -128, -128 |
1710 | 1685 |
| 1686 .if \fast_st3 == 1 |
1711 asm_function jsimd_ycc_\colorid\()_convert_neon | 1687 asm_function jsimd_ycc_\colorid\()_convert_neon |
| 1688 .else |
| 1689 asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 |
| 1690 .endif |
1712 OUTPUT_WIDTH .req x0 | 1691 OUTPUT_WIDTH .req x0 |
1713 INPUT_BUF .req x1 | 1692 INPUT_BUF .req x1 |
1714 INPUT_ROW .req x2 | 1693 INPUT_ROW .req x2 |
1715 OUTPUT_BUF .req x3 | 1694 OUTPUT_BUF .req x3 |
1716 NUM_ROWS .req x4 | 1695 NUM_ROWS .req x4 |
1717 | 1696 |
1718 INPUT_BUF0 .req x5 | 1697 INPUT_BUF0 .req x5 |
1719 INPUT_BUF1 .req x6 | 1698 INPUT_BUF1 .req x6 |
1720 INPUT_BUF2 .req INPUT_BUF | 1699 INPUT_BUF2 .req x1 |
1721 | 1700 |
1722 RGB .req x7 | 1701 RGB .req x7 |
1723 Y .req x8 | 1702 Y .req x8 |
1724 U .req x9 | 1703 U .req x9 |
1725 V .req x10 | 1704 V .req x10 |
1726 N .req x15 | 1705 N .req x15 |
1727 | 1706 |
1728 sub sp, sp, 336 | 1707 sub sp, sp, 336 |
1729 str x15, [sp], 16 | 1708 str x15, [sp], 16 |
| 1709 |
1730 /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ | 1710 /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ |
1731 adr x15, jsimd_ycc_\colorid\()_neon_consts | 1711 .if \fast_st3 == 1 |
| 1712 adr x15, Ljsimd_ycc_\colorid\()_neon_consts |
| 1713 .else |
| 1714 adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts |
| 1715 .endif |
| 1716 |
1732 /* Save NEON registers */ | 1717 /* Save NEON registers */ |
1733 st1 {v0.8b - v3.8b}, [sp], 32 | 1718 st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 |
1734 st1 {v4.8b - v7.8b}, [sp], 32 | 1719 st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 |
1735 st1 {v8.8b - v11.8b}, [sp], 32 | 1720 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
1736 st1 {v12.8b - v15.8b}, [sp], 32 | 1721 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
1737 st1 {v16.8b - v19.8b}, [sp], 32 | 1722 st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 |
1738 st1 {v20.8b - v23.8b}, [sp], 32 | 1723 st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 |
1739 st1 {v24.8b - v27.8b}, [sp], 32 | 1724 st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 |
1740 st1 {v28.8b - v31.8b}, [sp], 32 | 1725 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 |
1741 ld1 {v0.4h, v1.4h}, [x15], 16 | 1726 ld1 {v0.4h, v1.4h}, [x15], 16 |
1742 ld1 {v2.8h}, [x15] | 1727 ld1 {v2.8h}, [x15] |
1743 | 1728 |
1744 /* Save ARM registers and handle input arguments */ | 1729 /* Save ARM registers and handle input arguments */ |
1745 /* push {x4, x5, x6, x7, x8, x9, x10, x30} */ | 1730 /* push {x4, x5, x6, x7, x8, x9, x10, x30} */ |
1746 stp x4, x5, [sp], 16 | 1731 stp x4, x5, [sp], 16 |
1747 stp x6, x7, [sp], 16 | 1732 stp x6, x7, [sp], 16 |
1748 stp x8, x9, [sp], 16 | 1733 stp x8, x9, [sp], 16 |
1749 stp x10, x30, [sp], 16 | 1734 stp x10, x30, [sp], 16 |
1750 ldr INPUT_BUF0, [INPUT_BUF] | 1735 ldr INPUT_BUF0, [INPUT_BUF] |
1751 ldr INPUT_BUF1, [INPUT_BUF, 8] | 1736 ldr INPUT_BUF1, [INPUT_BUF, #8] |
1752 ldr INPUT_BUF2, [INPUT_BUF, 16] | 1737 ldr INPUT_BUF2, [INPUT_BUF, #16] |
1753 .unreq INPUT_BUF | 1738 .unreq INPUT_BUF |
1754 | 1739 |
1755 /* Initially set v10.16b and v13.16b to 0xFF */ | 1740 /* Initially set v10.16b and v13.16b to 0xFF */ |
1756 movi v10.16b, #255 | 1741 movi v10.16b, #255 |
1757 movi v13.16b, #255 | 1742 movi v13.16b, #255 |
1758 | 1743 |
1759 /* Outer loop over scanlines */ | 1744 /* Outer loop over scanlines */ |
1760 cmp NUM_ROWS, #1 | 1745 cmp NUM_ROWS, #1 |
1761 blt 9f | 1746 b.lt 9f |
1762 0: | 1747 0: |
1763 lsl x16, INPUT_ROW, #3 | 1748 lsl x16, INPUT_ROW, #3 |
1764 ldr Y, [INPUT_BUF0, x16] | 1749 ldr Y, [INPUT_BUF0, x16] |
1765 ldr U, [INPUT_BUF1, x16] | 1750 ldr U, [INPUT_BUF1, x16] |
1766 mov N, OUTPUT_WIDTH | 1751 mov N, OUTPUT_WIDTH |
1767 ldr V, [INPUT_BUF2, x16] | 1752 ldr V, [INPUT_BUF2, x16] |
1768 add INPUT_ROW, INPUT_ROW, #1 | 1753 add INPUT_ROW, INPUT_ROW, #1 |
1769 ldr RGB, [OUTPUT_BUF], #8 | 1754 ldr RGB, [OUTPUT_BUF], #8 |
1770 | 1755 |
1771 /* Inner loop over pixels */ | 1756 /* Inner loop over pixels */ |
1772 subs N, N, #8 | 1757 subs N, N, #8 |
1773 blt 3f | 1758 b.lt 3f |
1774 do_load 8 | 1759 do_load 8 |
1775 do_yuv_to_rgb_stage1 | 1760 do_yuv_to_rgb_stage1 |
1776 subs N, N, #8 | 1761 subs N, N, #8 |
1777 blt 2f | 1762 b.lt 2f |
1778 1: | 1763 1: |
1779 do_yuv_to_rgb_stage2_store_load_stage1 | 1764 do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3 |
1780 subs N, N, #8 | 1765 subs N, N, #8 |
1781 bge 1b | 1766 b.ge 1b |
1782 2: | 1767 2: |
1783 do_yuv_to_rgb_stage2 | 1768 do_yuv_to_rgb_stage2 |
1784 do_store \bpp, 8 | 1769 do_store \bpp, 8, \fast_st3 |
1785 tst N, #7 | 1770 tst N, #7 |
1786 beq 8f | 1771 b.eq 8f |
1787 3: | 1772 3: |
1788 tst N, #4 | 1773 tst N, #4 |
1789 beq 3f | 1774 b.eq 3f |
1790 do_load 4 | 1775 do_load 4 |
1791 3: | 1776 3: |
1792 tst N, #2 | 1777 tst N, #2 |
1793 beq 4f | 1778 b.eq 4f |
1794 do_load 2 | 1779 do_load 2 |
1795 4: | 1780 4: |
1796 tst N, #1 | 1781 tst N, #1 |
1797 beq 5f | 1782 b.eq 5f |
1798 do_load 1 | 1783 do_load 1 |
1799 5: | 1784 5: |
1800 do_yuv_to_rgb | 1785 do_yuv_to_rgb |
1801 tst N, #4 | 1786 tst N, #4 |
1802 beq 6f | 1787 b.eq 6f |
1803 do_store \bpp, 4 | 1788 do_store \bpp, 4, \fast_st3 |
1804 6: | 1789 6: |
1805 tst N, #2 | 1790 tst N, #2 |
1806 beq 7f | 1791 b.eq 7f |
1807 do_store \bpp, 2 | 1792 do_store \bpp, 2, \fast_st3 |
1808 7: | 1793 7: |
1809 tst N, #1 | 1794 tst N, #1 |
1810 beq 8f | 1795 b.eq 8f |
1811 do_store \bpp, 1 | 1796 do_store \bpp, 1, \fast_st3 |
1812 8: | 1797 8: |
1813 subs NUM_ROWS, NUM_ROWS, #1 | 1798 subs NUM_ROWS, NUM_ROWS, #1 |
1814 bgt 0b | 1799 b.gt 0b |
1815 9: | 1800 9: |
1816 /* Restore all registers and return */ | 1801 /* Restore all registers and return */ |
1817 sub sp, sp, #336 | 1802 sub sp, sp, #336 |
1818 ldr x15, [sp], 16 | 1803 ldr x15, [sp], 16 |
1819 ld1 {v0.8b - v3.8b}, [sp], 32 | 1804 ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32 |
1820 ld1 {v4.8b - v7.8b}, [sp], 32 | 1805 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32 |
1821 ld1 {v8.8b - v11.8b}, [sp], 32 | 1806 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
1822 ld1 {v12.8b - v15.8b}, [sp], 32 | 1807 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
1823 ld1 {v16.8b - v19.8b}, [sp], 32 | 1808 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32 |
1824 ld1 {v20.8b - v23.8b}, [sp], 32 | 1809 ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32 |
1825 ld1 {v24.8b - v27.8b}, [sp], 32 | 1810 ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32 |
1826 ld1 {v28.8b - v31.8b}, [sp], 32 | 1811 ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32 |
1827 /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */ | 1812 /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */ |
1828 ldp x4, x5, [sp], 16 | 1813 ldp x4, x5, [sp], 16 |
1829 ldp x6, x7, [sp], 16 | 1814 ldp x6, x7, [sp], 16 |
1830 ldp x8, x9, [sp], 16 | 1815 ldp x8, x9, [sp], 16 |
1831 ldp x10, x30, [sp], 16 | 1816 ldp x10, x30, [sp], 16 |
1832 br x30 | 1817 br x30 |
1833 .unreq OUTPUT_WIDTH | 1818 .unreq OUTPUT_WIDTH |
1834 .unreq INPUT_ROW | 1819 .unreq INPUT_ROW |
1835 .unreq OUTPUT_BUF | 1820 .unreq OUTPUT_BUF |
1836 .unreq NUM_ROWS | 1821 .unreq NUM_ROWS |
1837 .unreq INPUT_BUF0 | 1822 .unreq INPUT_BUF0 |
1838 .unreq INPUT_BUF1 | 1823 .unreq INPUT_BUF1 |
1839 .unreq INPUT_BUF2 | 1824 .unreq INPUT_BUF2 |
1840 .unreq RGB | 1825 .unreq RGB |
1841 .unreq Y | 1826 .unreq Y |
1842 .unreq U | 1827 .unreq U |
1843 .unreq V | 1828 .unreq V |
1844 .unreq N | 1829 .unreq N |
1845 | 1830 |
1846 .purgem do_yuv_to_rgb | 1831 .purgem do_yuv_to_rgb |
1847 .purgem do_yuv_to_rgb_stage1 | 1832 .purgem do_yuv_to_rgb_stage1 |
1848 .purgem do_yuv_to_rgb_stage2 | 1833 .purgem do_yuv_to_rgb_stage2 |
1849 .purgem do_yuv_to_rgb_stage2_store_load_stage1 | 1834 .purgem do_yuv_to_rgb_stage2_store_load_stage1 |
| 1835 |
1850 .endm | 1836 .endm |
1851 | 1837 |
1852 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */ | 1838 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3 */ |
1853 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b | 1839 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1 |
1854 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b | 1840 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1 |
1855 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b | 1841 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1 |
1856 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b | 1842 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1 |
1857 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b | 1843 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1 |
1858 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b | 1844 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1 |
1859 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b | 1845 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1 |
 | 1846 |
 | 1847 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0 |
 | 1848 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0 |
 | 1849 |
1860 .purgem do_load | 1850 .purgem do_load |
1861 .purgem do_store | 1851 .purgem do_store |
| 1852 |
| 1853 |
| 1854 /*****************************************************************************/ |
| 1855 |
| 1856 /* |
| 1857 * jsimd_extrgb_ycc_convert_neon |
| 1858 * jsimd_extbgr_ycc_convert_neon |
| 1859 * jsimd_extrgbx_ycc_convert_neon |
| 1860 * jsimd_extbgrx_ycc_convert_neon |
| 1861 * jsimd_extxbgr_ycc_convert_neon |
| 1862 * jsimd_extxrgb_ycc_convert_neon |
| 1863 * |
| 1864 * Colorspace conversion RGB -> YCbCr |
| 1865 */ |
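
(As above, an editorial scalar model of the per-pixel math, not code from the file: the constants are the 16.16 fixed-point factors from the consts tables below, e.g. 19595 = round(0.29900 * 65536), and (128 << 16) + 32767 is the rounded 128.5 bias that the rev64-initialized accumulators start from.)

/* Scalar model of the fixed-point RGB->YCbCr conversion (illustrative). */
static void rgb_to_ycc_pixel(int r, int g, int b, unsigned char ycc[3])
{
  ycc[0] = (unsigned char)((19595 * r + 38470 * g + 7471 * b
                            + 32768) >> 16);                 /* Y, rounded */
  ycc[1] = (unsigned char)((-11059 * r - 21709 * g + 32768 * b
                            + (128 << 16) + 32767) >> 16);   /* Cb */
  ycc[2] = (unsigned char)((32768 * r - 27439 * g - 5329 * b
                            + (128 << 16) + 32767) >> 16);   /* Cr */
}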
| 1866 |
| 1867 .macro do_store size |
| 1868 .if \size == 8 |
| 1869 st1 {v20.8b}, [Y], #8 |
| 1870 st1 {v21.8b}, [U], #8 |
| 1871 st1 {v22.8b}, [V], #8 |
| 1872 .elseif \size == 4 |
| 1873 st1 {v20.b}[0], [Y], #1 |
| 1874 st1 {v20.b}[1], [Y], #1 |
| 1875 st1 {v20.b}[2], [Y], #1 |
| 1876 st1 {v20.b}[3], [Y], #1 |
| 1877 st1 {v21.b}[0], [U], #1 |
| 1878 st1 {v21.b}[1], [U], #1 |
| 1879 st1 {v21.b}[2], [U], #1 |
| 1880 st1 {v21.b}[3], [U], #1 |
| 1881 st1 {v22.b}[0], [V], #1 |
| 1882 st1 {v22.b}[1], [V], #1 |
| 1883 st1 {v22.b}[2], [V], #1 |
| 1884 st1 {v22.b}[3], [V], #1 |
| 1885 .elseif \size == 2 |
| 1886 st1 {v20.b}[4], [Y], #1 |
| 1887 st1 {v20.b}[5], [Y], #1 |
| 1888 st1 {v21.b}[4], [U], #1 |
| 1889 st1 {v21.b}[5], [U], #1 |
| 1890 st1 {v22.b}[4], [V], #1 |
| 1891 st1 {v22.b}[5], [V], #1 |
| 1892 .elseif \size == 1 |
| 1893 st1 {v20.b}[6], [Y], #1 |
| 1894 st1 {v21.b}[6], [U], #1 |
| 1895 st1 {v22.b}[6], [V], #1 |
| 1896 .else |
| 1897 .error unsupported macroblock size |
| 1898 .endif |
| 1899 .endm |
| 1900 |
| 1901 .macro do_load bpp, size, fast_ld3 |
| 1902 .if \bpp == 24 |
| 1903 .if \size == 8 |
| 1904 .if \fast_ld3 == 1 |
| 1905 ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24 |
| 1906 .else |
| 1907 ld1 {v10.b}[0], [RGB], #1 |
| 1908 ld1 {v11.b}[0], [RGB], #1 |
| 1909 ld1 {v12.b}[0], [RGB], #1 |
| 1910 |
| 1911 ld1 {v10.b}[1], [RGB], #1 |
| 1912 ld1 {v11.b}[1], [RGB], #1 |
| 1913 ld1 {v12.b}[1], [RGB], #1 |
| 1914 |
| 1915 ld1 {v10.b}[2], [RGB], #1 |
| 1916 ld1 {v11.b}[2], [RGB], #1 |
| 1917 ld1 {v12.b}[2], [RGB], #1 |
| 1918 |
| 1919 ld1 {v10.b}[3], [RGB], #1 |
| 1920 ld1 {v11.b}[3], [RGB], #1 |
| 1921 ld1 {v12.b}[3], [RGB], #1 |
| 1922 |
| 1923 ld1 {v10.b}[4], [RGB], #1 |
| 1924 ld1 {v11.b}[4], [RGB], #1 |
| 1925 ld1 {v12.b}[4], [RGB], #1 |
| 1926 |
| 1927 ld1 {v10.b}[5], [RGB], #1 |
| 1928 ld1 {v11.b}[5], [RGB], #1 |
| 1929 ld1 {v12.b}[5], [RGB], #1 |
| 1930 |
| 1931 ld1 {v10.b}[6], [RGB], #1 |
| 1932 ld1 {v11.b}[6], [RGB], #1 |
| 1933 ld1 {v12.b}[6], [RGB], #1 |
| 1934 |
| 1935 ld1 {v10.b}[7], [RGB], #1 |
| 1936 ld1 {v11.b}[7], [RGB], #1 |
| 1937 ld1 {v12.b}[7], [RGB], #1 |
| 1938 .endif |
| 1939 prfm pldl1keep, [RGB, #128] |
| 1940 .elseif \size == 4 |
| 1941 ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3 |
| 1942 ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3 |
| 1943 ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3 |
| 1944 ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3 |
| 1945 .elseif \size == 2 |
| 1946 ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3 |
| 1947 ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3 |
| 1948 .elseif \size == 1 |
| 1949 ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3 |
| 1950 .else |
| 1951 .error unsupported macroblock size |
| 1952 .endif |
| 1953 .elseif \bpp == 32 |
| 1954 .if \size == 8 |
| 1955 ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32 |
| 1956 prfm pldl1keep, [RGB, #128] |
| 1957 .elseif \size == 4 |
| 1958 ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4 |
| 1959 ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4 |
| 1960 ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4 |
| 1961 ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4 |
| 1962 .elseif \size == 2 |
| 1963 ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4 |
| 1964 ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4 |
| 1965 .elseif \size == 1 |
| 1966 ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4 |
| 1967 .else |
| 1968 .error unsupported macroblock size |
| 1969 .endif |
| 1970 .else |
| 1971 .error unsupported bpp |
| 1972 .endif |
| 1973 .endm |
| 1974 |
| 1975 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \ |
| 1976 b_offs, fast_ld3 |
| 1977 |
| 1978 /* |
| 1979 * 2-stage pipelined RGB->YCbCr conversion |
| 1980 */ |
| 1981 |
| 1982 .macro do_rgb_to_yuv_stage1 |
| 1983 ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */ |
| 1984 ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */ |
| 1985 ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */ |
| 1986 rev64 v18.4s, v1.4s |
| 1987 rev64 v26.4s, v1.4s |
| 1988 rev64 v28.4s, v1.4s |
| 1989 rev64 v30.4s, v1.4s |
| 1990 umull v14.4s, v4.4h, v0.h[0] |
| 1991 umull2 v16.4s, v4.8h, v0.h[0] |
| 1992 umlsl v18.4s, v4.4h, v0.h[3] |
| 1993 umlsl2 v26.4s, v4.8h, v0.h[3] |
| 1994 umlal v28.4s, v4.4h, v0.h[5] |
| 1995 umlal2 v30.4s, v4.8h, v0.h[5] |
| 1996 umlal v14.4s, v6.4h, v0.h[1] |
| 1997 umlal2 v16.4s, v6.8h, v0.h[1] |
| 1998 umlsl v18.4s, v6.4h, v0.h[4] |
| 1999 umlsl2 v26.4s, v6.8h, v0.h[4] |
| 2000 umlsl v28.4s, v6.4h, v0.h[6] |
| 2001 umlsl2 v30.4s, v6.8h, v0.h[6] |
| 2002 umlal v14.4s, v8.4h, v0.h[2] |
| 2003 umlal2 v16.4s, v8.8h, v0.h[2] |
| 2004 umlal v18.4s, v8.4h, v0.h[5] |
| 2005 umlal2 v26.4s, v8.8h, v0.h[5] |
| 2006 umlsl v28.4s, v8.4h, v0.h[7] |
| 2007 umlsl2 v30.4s, v8.8h, v0.h[7] |
| 2008 .endm |
| 2009 |
| 2010 .macro do_rgb_to_yuv_stage2 |
| 2011 rshrn v20.4h, v14.4s, #16 |
| 2012 shrn v22.4h, v18.4s, #16 |
| 2013 shrn v24.4h, v28.4s, #16 |
| 2014 rshrn2 v20.8h, v16.4s, #16 |
| 2015 shrn2 v22.8h, v26.4s, #16 |
| 2016 shrn2 v24.8h, v30.4s, #16 |
| 2017 xtn v20.8b, v20.8h /* v20 = y */ |
| 2018 xtn v21.8b, v22.8h /* v21 = u */ |
| 2019 xtn v22.8b, v24.8h /* v22 = v */ |
| 2020 .endm |
| 2021 |
| 2022 .macro do_rgb_to_yuv |
| 2023 do_rgb_to_yuv_stage1 |
| 2024 do_rgb_to_yuv_stage2 |
| 2025 .endm |
| 2026 |
 | 2027 /* TODO: expand macros and interleave instructions if some in-order |
 | 2028 * ARM64 processor can actually dual-issue LOAD/STORE with ALU ops */ |
| 2029 .macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3 |
| 2030 do_rgb_to_yuv_stage2 |
| 2031 do_load \bpp, 8, \fast_ld3 |
| 2032 st1 {v20.8b}, [Y], #8 |
| 2033 st1 {v21.8b}, [U], #8 |
| 2034 st1 {v22.8b}, [V], #8 |
| 2035 do_rgb_to_yuv_stage1 |
| 2036 .endm |
| 2037 |
| 2038 .balign 16 |
| 2039 .if \fast_ld3 == 1 |
| 2040 Ljsimd_\colorid\()_ycc_neon_consts: |
| 2041 .else |
| 2042 Ljsimd_\colorid\()_ycc_neon_slowld3_consts: |
| 2043 .endif |
| 2044 .short 19595, 38470, 7471, 11059 |
| 2045 .short 21709, 32768, 27439, 5329 |
| 2046 .short 32767, 128, 32767, 128 |
| 2047 .short 32767, 128, 32767, 128 |
| 2048 |
| 2049 .if \fast_ld3 == 1 |
| 2050 asm_function jsimd_\colorid\()_ycc_convert_neon |
| 2051 .else |
| 2052 asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 |
| 2053 .endif |
| 2054 OUTPUT_WIDTH .req w0 |
| 2055 INPUT_BUF .req x1 |
| 2056 OUTPUT_BUF .req x2 |
| 2057 OUTPUT_ROW .req x3 |
| 2058 NUM_ROWS .req x4 |
| 2059 |
| 2060 OUTPUT_BUF0 .req x5 |
| 2061 OUTPUT_BUF1 .req x6 |
| 2062 OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */ |
| 2063 |
| 2064 RGB .req x7 |
| 2065 Y .req x9 |
| 2066 U .req x10 |
| 2067 V .req x11 |
| 2068 N .req w12 |
| 2069 |
| 2070 /* Load constants to d0, d1, d2, d3 */ |
| 2071 .if \fast_ld3 == 1 |
| 2072 adr x13, Ljsimd_\colorid\()_ycc_neon_consts |
| 2073 .else |
| 2074 adr x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts |
| 2075 .endif |
| 2076 ld1 {v0.8h, v1.8h}, [x13] |
| 2077 |
| 2078 ldr OUTPUT_BUF0, [OUTPUT_BUF] |
| 2079 ldr OUTPUT_BUF1, [OUTPUT_BUF, #8] |
| 2080 ldr OUTPUT_BUF2, [OUTPUT_BUF, #16] |
| 2081 .unreq OUTPUT_BUF |
| 2082 |
| 2083 /* Save NEON registers */ |
| 2084 sub sp, sp, #64 |
| 2085 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
| 2086 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
| 2087 |
| 2088 /* Outer loop over scanlines */ |
| 2089 cmp NUM_ROWS, #1 |
| 2090 b.lt 9f |
| 2091 0: |
| 2092 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3] |
| 2093 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3] |
| 2094 mov N, OUTPUT_WIDTH |
| 2095 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3] |
| 2096 add OUTPUT_ROW, OUTPUT_ROW, #1 |
| 2097 ldr RGB, [INPUT_BUF], #8 |
| 2098 |
| 2099 /* Inner loop over pixels */ |
| 2100 subs N, N, #8 |
| 2101 b.lt 3f |
| 2102 do_load \bpp, 8, \fast_ld3 |
| 2103 do_rgb_to_yuv_stage1 |
| 2104 subs N, N, #8 |
| 2105 b.lt 2f |
| 2106 1: |
| 2107 do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3 |
| 2108 subs N, N, #8 |
| 2109 b.ge 1b |
| 2110 2: |
| 2111 do_rgb_to_yuv_stage2 |
| 2112 do_store 8 |
| 2113 tst N, #7 |
| 2114 b.eq 8f |
| 2115 3: |
| 2116 tbz N, #2, 3f |
| 2117 do_load \bpp, 4, \fast_ld3 |
| 2118 3: |
| 2119 tbz N, #1, 4f |
| 2120 do_load \bpp, 2, \fast_ld3 |
| 2121 4: |
| 2122 tbz N, #0, 5f |
| 2123 do_load \bpp, 1, \fast_ld3 |
| 2124 5: |
| 2125 do_rgb_to_yuv |
| 2126 tbz N, #2, 6f |
| 2127 do_store 4 |
| 2128 6: |
| 2129 tbz N, #1, 7f |
| 2130 do_store 2 |
| 2131 7: |
| 2132 tbz N, #0, 8f |
| 2133 do_store 1 |
| 2134 8: |
| 2135 subs NUM_ROWS, NUM_ROWS, #1 |
| 2136 b.gt 0b |
| 2137 9: |
| 2138 /* Restore all registers and return */ |
| 2139 sub sp, sp, #64 |
| 2140 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
| 2141 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
| 2142 br x30 |
| 2143 |
| 2144 .unreq OUTPUT_WIDTH |
| 2145 .unreq OUTPUT_ROW |
| 2146 .unreq INPUT_BUF |
| 2147 .unreq NUM_ROWS |
| 2148 .unreq OUTPUT_BUF0 |
| 2149 .unreq OUTPUT_BUF1 |
| 2150 .unreq OUTPUT_BUF2 |
| 2151 .unreq RGB |
| 2152 .unreq Y |
| 2153 .unreq U |
| 2154 .unreq V |
| 2155 .unreq N |
| 2156 |
| 2157 .purgem do_rgb_to_yuv |
| 2158 .purgem do_rgb_to_yuv_stage1 |
| 2159 .purgem do_rgb_to_yuv_stage2 |
| 2160 .purgem do_rgb_to_yuv_stage2_store_load_stage1 |
| 2161 |
| 2162 .endm |
| 2163 |
| 2164 /*--------------------------------- id ----- bpp R G B Fast LD3 */ |
| 2165 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1 |
| 2166 generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1 |
| 2167 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1 |
| 2168 generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1 |
| 2169 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1 |
| 2170 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1 |
| 2171 |
| 2172 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0 |
| 2173 generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0 |
| 2174 |
| 2175 .purgem do_load |
| 2176 .purgem do_store |
| 2177 |
| 2178 |
| 2179 /*****************************************************************************/ |
| 2180 |
| 2181 /* |
| 2182 * Load data into workspace, applying unsigned->signed conversion |
| 2183 * |
| 2184 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get |
| 2185 * rid of VST1.16 instructions |
| 2186 */ |
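
(What this routine computes, as an editorial C sketch: gather an 8x8 block from eight row pointers and recenter the unsigned samples around zero. The NEON code below does the same with usubl against a 128-filled register.)

/* Scalar model of jsimd_convsamp_neon (illustrative). */
static void convsamp(const unsigned char *const sample_rows[8],
                     unsigned start_col, short *workspace)
{
  for (int row = 0; row < 8; row++)
    for (int col = 0; col < 8; col++)
      *workspace++ = (short)(sample_rows[row][start_col + col] - 128);
}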
| 2187 |
| 2188 asm_function jsimd_convsamp_neon |
| 2189 SAMPLE_DATA .req x0 |
| 2190 START_COL .req x1 |
| 2191 WORKSPACE .req x2 |
| 2192 TMP1 .req x9 |
| 2193 TMP2 .req x10 |
| 2194 TMP3 .req x11 |
| 2195 TMP4 .req x12 |
| 2196 TMP5 .req x13 |
| 2197 TMP6 .req x14 |
| 2198 TMP7 .req x15 |
| 2199 TMP8 .req x4 |
| 2200 TMPDUP .req w3 |
| 2201 |
| 2202 mov TMPDUP, #128 |
| 2203 ldp TMP1, TMP2, [SAMPLE_DATA], 16 |
| 2204 ldp TMP3, TMP4, [SAMPLE_DATA], 16 |
| 2205 dup v0.8b, TMPDUP |
| 2206 add TMP1, TMP1, START_COL |
| 2207 add TMP2, TMP2, START_COL |
| 2208 ldp TMP5, TMP6, [SAMPLE_DATA], 16 |
| 2209 add TMP3, TMP3, START_COL |
| 2210 add TMP4, TMP4, START_COL |
| 2211 ldp TMP7, TMP8, [SAMPLE_DATA], 16 |
| 2212 add TMP5, TMP5, START_COL |
| 2213 add TMP6, TMP6, START_COL |
| 2214 ld1 {v16.8b}, [TMP1] |
| 2215 add TMP7, TMP7, START_COL |
| 2216 add TMP8, TMP8, START_COL |
| 2217 ld1 {v17.8b}, [TMP2] |
| 2218 usubl v16.8h, v16.8b, v0.8b |
| 2219 ld1 {v18.8b}, [TMP3] |
| 2220 usubl v17.8h, v17.8b, v0.8b |
| 2221 ld1 {v19.8b}, [TMP4] |
| 2222 usubl v18.8h, v18.8b, v0.8b |
| 2223 ld1 {v20.8b}, [TMP5] |
| 2224 usubl v19.8h, v19.8b, v0.8b |
| 2225 ld1 {v21.8b}, [TMP6] |
| 2226 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64 |
| 2227 usubl v20.8h, v20.8b, v0.8b |
| 2228 ld1 {v22.8b}, [TMP7] |
| 2229 usubl v21.8h, v21.8b, v0.8b |
| 2230 ld1 {v23.8b}, [TMP8] |
| 2231 usubl v22.8h, v22.8b, v0.8b |
| 2232 usubl v23.8h, v23.8b, v0.8b |
| 2233 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64 |
| 2234 |
| 2235 br x30 |
| 2236 |
| 2237 .unreq SAMPLE_DATA |
| 2238 .unreq START_COL |
| 2239 .unreq WORKSPACE |
| 2240 .unreq TMP1 |
| 2241 .unreq TMP2 |
| 2242 .unreq TMP3 |
| 2243 .unreq TMP4 |
| 2244 .unreq TMP5 |
| 2245 .unreq TMP6 |
| 2246 .unreq TMP7 |
| 2247 .unreq TMP8 |
| 2248 .unreq TMPDUP |
| 2249 |
| 2250 /*****************************************************************************/ |
| 2251 |
| 2252 /* |
| 2253 * jsimd_fdct_islow_neon |
| 2254 * |
 | 2255 * This function contains a slow-but-accurate integer implementation of |
 | 2256 * the forward DCT (Discrete Cosine Transform). The following code is |
 | 2257 * based directly on the IJG's original jfdctint.c; see jfdctint.c for |
 | 2258 * more details. |
| 2259 * |
| 2260 * TODO: can be combined with 'jsimd_convsamp_neon' to get |
| 2261 * rid of a bunch of VLD1.16 instructions |
| 2262 */ |
| 2263 |
| 2264 #define CONST_BITS 13 |
| 2265 #define PASS1_BITS 2 |
| 2266 |
| 2267 #define DESCALE_P1 (CONST_BITS-PASS1_BITS) |
| 2268 #define DESCALE_P2 (CONST_BITS+PASS1_BITS) |
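
(The F_* constants below and the DESCALE_P* shift counts follow the standard IJG fixed-point conventions from jfdctint.c; an editorial C restatement:)

/* Constants are real values scaled by 2^CONST_BITS, and DESCALE is a rounded
 * right shift -- exactly what the rshrn/srshr instructions below implement.
 * E.g. FIX(0.541196100) == 4433 == F_0_541. */
#define FIX(x)        ((int)((x) * (1 << CONST_BITS) + 0.5))
#define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))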
| 2269 |
| 2270 #define F_0_298 2446 /* FIX(0.298631336) */ |
| 2271 #define F_0_390 3196 /* FIX(0.390180644) */ |
| 2272 #define F_0_541 4433 /* FIX(0.541196100) */ |
| 2273 #define F_0_765 6270 /* FIX(0.765366865) */ |
| 2274 #define F_0_899 7373 /* FIX(0.899976223) */ |
| 2275 #define F_1_175 9633 /* FIX(1.175875602) */ |
| 2276 #define F_1_501 12299 /* FIX(1.501321110) */ |
| 2277 #define F_1_847 15137 /* FIX(1.847759065) */ |
| 2278 #define F_1_961 16069 /* FIX(1.961570560) */ |
| 2279 #define F_2_053 16819 /* FIX(2.053119869) */ |
| 2280 #define F_2_562 20995 /* FIX(2.562915447) */ |
| 2281 #define F_3_072 25172 /* FIX(3.072711026) */ |
| 2282 |
| 2283 .balign 16 |
| 2284 Ljsimd_fdct_islow_neon_consts: |
| 2285 .short F_0_298 |
| 2286 .short -F_0_390 |
| 2287 .short F_0_541 |
| 2288 .short F_0_765 |
 | 2289 .short -F_0_899 |
| 2290 .short F_1_175 |
| 2291 .short F_1_501 |
 | 2292 .short -F_1_847 |
 | 2293 .short -F_1_961 |
 | 2294 .short F_2_053 |
 | 2295 .short -F_2_562 |
| 2296 .short F_3_072 |
| 2297 .short 0 /* padding */ |
| 2298 .short 0 |
| 2299 .short 0 |
| 2300 .short 0 |
| 2301 |
| 2302 #undef F_0_298 |
| 2303 #undef F_0_390 |
| 2304 #undef F_0_541 |
| 2305 #undef F_0_765 |
| 2306 #undef F_0_899 |
| 2307 #undef F_1_175 |
| 2308 #undef F_1_501 |
| 2309 #undef F_1_847 |
| 2310 #undef F_1_961 |
| 2311 #undef F_2_053 |
| 2312 #undef F_2_562 |
| 2313 #undef F_3_072 |
| 2314 #define XFIX_P_0_298 v0.h[0] |
| 2315 #define XFIX_N_0_390 v0.h[1] |
| 2316 #define XFIX_P_0_541 v0.h[2] |
| 2317 #define XFIX_P_0_765 v0.h[3] |
| 2318 #define XFIX_N_0_899 v0.h[4] |
| 2319 #define XFIX_P_1_175 v0.h[5] |
| 2320 #define XFIX_P_1_501 v0.h[6] |
| 2321 #define XFIX_N_1_847 v0.h[7] |
| 2322 #define XFIX_N_1_961 v1.h[0] |
| 2323 #define XFIX_P_2_053 v1.h[1] |
| 2324 #define XFIX_N_2_562 v1.h[2] |
| 2325 #define XFIX_P_3_072 v1.h[3] |
| 2326 |
| 2327 asm_function jsimd_fdct_islow_neon |
| 2328 |
| 2329 DATA .req x0 |
| 2330 TMP .req x9 |
| 2331 |
| 2332 /* Load constants */ |
| 2333 adr TMP, Ljsimd_fdct_islow_neon_consts |
| 2334 ld1 {v0.8h, v1.8h}, [TMP] |
| 2335 |
| 2336 /* Save NEON registers */ |
| 2337 sub sp, sp, #64 |
| 2338 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
| 2339 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
| 2340 |
| 2341 /* Load all DATA into NEON registers with the following allocation: |
| 2342 * 0 1 2 3 | 4 5 6 7 |
| 2343 * ---------+-------- |
| 2344 * 0 | d16 | d17 | v16.8h |
| 2345 * 1 | d18 | d19 | v17.8h |
| 2346 * 2 | d20 | d21 | v18.8h |
| 2347 * 3 | d22 | d23 | v19.8h |
| 2348 * 4 | d24 | d25 | v20.8h |
| 2349 * 5 | d26 | d27 | v21.8h |
| 2350 * 6 | d28 | d29 | v22.8h |
| 2351 * 7 | d30 | d31 | v23.8h |
| 2352 */ |
| 2353 |
| 2354 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 |
| 2355 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] |
| 2356 sub DATA, DATA, #64 |
| 2357 |
| 2358 /* Transpose */ |
| 2359 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 |
| 2360 /* 1-D FDCT */ |
 | 2361 add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ |
 | 2362 sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ |
 | 2363 add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ |
 | 2364 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ |
 | 2365 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ |
 | 2366 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ |
 | 2367 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ |
 | 2368 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ |
| 2369 |
| 2370 /* even part */ |
| 2371 |
| 2372 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ |
| 2373 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ |
| 2374 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ |
| 2375 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ |
| 2376 |
| 2377 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ |
| 2378 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ |
| 2379 |
| 2380 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ |
| 2381 |
 | 2382 shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */ |
 | 2383 shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */ |
| 2384 |
 | 2385 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ |
 | 2386 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ |
| 2387 mov v22.16b, v18.16b |
| 2388 mov v25.16b, v24.16b |
| 2389 |
 | 2390 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ |
 | 2391 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ |
 | 2392 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ |
 | 2393 smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ |
| 2394 |
| 2395 rshrn v18.4h, v18.4s, #DESCALE_P1 |
| 2396 rshrn v22.4h, v22.4s, #DESCALE_P1 |
 | 2397 rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ |
 | 2398 rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ |
| 2399 |
| 2400 /* Odd part */ |
| 2401 |
| 2402 add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ |
| 2403 add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ |
| 2404 add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ |
| 2405 add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ |
 | 2406 smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ |
 | 2407 smull2 v5.4s, v10.8h, XFIX_P_1_175 |
 | 2408 smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ |
| 2409 smlal2 v5.4s, v11.8h, XFIX_P_1_175 |
| 2410 |
| 2411 smull2 v24.4s, v28.8h, XFIX_P_0_298 |
| 2412 smull2 v25.4s, v29.8h, XFIX_P_2_053 |
| 2413 smull2 v26.4s, v30.8h, XFIX_P_3_072 |
| 2414 smull2 v27.4s, v31.8h, XFIX_P_1_501 |
 | 2415 smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ |
 | 2416 smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ |
 | 2417 smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ |
 | 2418 smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ |
| 2419 |
| 2420 smull2 v12.4s, v8.8h, XFIX_N_0_899 |
| 2421 smull2 v13.4s, v9.8h, XFIX_N_2_562 |
| 2422 smull2 v14.4s, v10.8h, XFIX_N_1_961 |
| 2423 smull2 v15.4s, v11.8h, XFIX_N_0_390 |
 | 2424 smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */ |
 | 2425 smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */ |
 | 2426 smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */ |
 | 2427 smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */ |
| 2428 |
| 2429 add v10.4s, v10.4s, v4.4s /* z3 += z5 */ |
| 2430 add v14.4s, v14.4s, v5.4s |
| 2431 add v11.4s, v11.4s, v4.4s /* z4 += z5 */ |
| 2432 add v15.4s, v15.4s, v5.4s |
| 2433 |
| 2434 add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ |
| 2435 add v24.4s, v24.4s, v12.4s |
| 2436 add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ |
| 2437 add v25.4s, v25.4s, v13.4s |
| 2438 add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ |
| 2439 add v26.4s, v26.4s, v14.4s |
| 2440 add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ |
| 2441 add v27.4s, v27.4s, v15.4s |
| 2442 |
| 2443 add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ |
| 2444 add v24.4s, v24.4s, v14.4s |
| 2445 add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ |
| 2446 add v25.4s, v25.4s, v15.4s |
| 2447 add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ |
| 2448 add v26.4s, v26.4s, v13.4s |
| 2449 add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ |
| 2450 add v27.4s, v27.4s, v12.4s |
| 2451 |
| 2452 rshrn v23.4h, v28.4s, #DESCALE_P1 |
| 2453 rshrn v21.4h, v29.4s, #DESCALE_P1 |
| 2454 rshrn v19.4h, v30.4s, #DESCALE_P1 |
| 2455 rshrn v17.4h, v31.4s, #DESCALE_P1 |
 | 2456 rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ |
 | 2457 rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ |
 | 2458 rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ |
 | 2459 rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ |
| 2460 |
| 2461 /* Transpose */ |
| 2462 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 |
| 2463 |
| 2464 /* 1-D FDCT */ |
 | 2465 add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ |
 | 2466 sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ |
 | 2467 add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ |
 | 2468 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ |
 | 2469 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ |
 | 2470 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ |
 | 2471 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ |
 | 2472 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ |
| 2473 |
| 2474 /* even part */ |
| 2475 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ |
| 2476 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ |
| 2477 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ |
| 2478 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ |
| 2479 |
| 2480 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ |
| 2481 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ |
| 2482 |
| 2483 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ |
| 2484 |
 | 2485 srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */ |
 | 2486 srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */ |
| 2487 |
 | 2488 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ |
 | 2489 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ |
| 2490 mov v22.16b, v18.16b |
| 2491 mov v25.16b, v24.16b |
| 2492 |
 | 2493 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ |
 | 2494 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ |
 | 2495 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ |
 | 2496 smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ |
| 2497 |
| 2498 rshrn v18.4h, v18.4s, #DESCALE_P2 |
| 2499 rshrn v22.4h, v22.4s, #DESCALE_P2 |
 | 2500 rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ |
 | 2501 rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ |
| 2502 |
| 2503 /* Odd part */ |
| 2504 add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ |
| 2505 add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ |
| 2506 add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ |
| 2507 add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ |
| 2508 |
 | 2509 smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ |
 | 2510 smull2 v5.4s, v10.8h, XFIX_P_1_175 |
 | 2511 smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ |
| 2512 smlal2 v5.4s, v11.8h, XFIX_P_1_175 |
| 2513 |
| 2514 smull2 v24.4s, v28.8h, XFIX_P_0_298 |
| 2515 smull2 v25.4s, v29.8h, XFIX_P_2_053 |
| 2516 smull2 v26.4s, v30.8h, XFIX_P_3_072 |
| 2517 smull2 v27.4s, v31.8h, XFIX_P_1_501 |
 | 2518 smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ |
 | 2519 smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ |
 | 2520 smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ |
 | 2521 smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ |
| 2522 |
| 2523 smull2 v12.4s, v8.8h, XFIX_N_0_899 |
| 2524 smull2 v13.4s, v9.8h, XFIX_N_2_562 |
| 2525 smull2 v14.4s, v10.8h, XFIX_N_1_961 |
| 2526 smull2 v15.4s, v11.8h, XFIX_N_0_390 |
 | 2527 smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */ |
 | 2528 smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */ |
 | 2529 smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */ |
 | 2530 smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */ |
| 2531 |
| 2532 add v10.4s, v10.4s, v4.4s |
| 2533 add v14.4s, v14.4s, v5.4s |
| 2534 add v11.4s, v11.4s, v4.4s |
| 2535 add v15.4s, v15.4s, v5.4s |
| 2536 |
| 2537 add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ |
| 2538 add v24.4s, v24.4s, v12.4s |
| 2539 add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ |
| 2540 add v25.4s, v25.4s, v13.4s |
| 2541 add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ |
| 2542 add v26.4s, v26.4s, v14.4s |
| 2543 add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ |
| 2544 add v27.4s, v27.4s, v15.4s |
| 2545 |
| 2546 add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ |
| 2547 add v24.4s, v24.4s, v14.4s |
| 2548 add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ |
| 2549 add v25.4s, v25.4s, v15.4s |
| 2550 add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ |
| 2551 add v26.4s, v26.4s, v13.4s |
| 2552 add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ |
| 2553 add v27.4s, v27.4s, v12.4s |
| 2554 |
| 2555 rshrn v23.4h, v28.4s, #DESCALE_P2 |
| 2556 rshrn v21.4h, v29.4s, #DESCALE_P2 |
| 2557 rshrn v19.4h, v30.4s, #DESCALE_P2 |
| 2558 rshrn v17.4h, v31.4s, #DESCALE_P2 |
 | 2559 rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ |
 | 2560 rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ |
 | 2561 rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ |
 | 2562 rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ |
| 2563 |
| 2564 /* store results */ |
| 2565 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 |
| 2566 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] |
| 2567 |
| 2568 /* Restore NEON registers */ |
| 2569 sub sp, sp, #64 |
| 2570 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 |
| 2571 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 |
| 2572 |
| 2573 br x30 |
| 2574 |
| 2575 .unreq DATA |
| 2576 .unreq TMP |
| 2577 |
| 2578 #undef XFIX_P_0_298 |
| 2579 #undef XFIX_N_0_390 |
| 2580 #undef XFIX_P_0_541 |
| 2581 #undef XFIX_P_0_765 |
| 2582 #undef XFIX_N_0_899 |
| 2583 #undef XFIX_P_1_175 |
| 2584 #undef XFIX_P_1_501 |
| 2585 #undef XFIX_N_1_847 |
| 2586 #undef XFIX_N_1_961 |
| 2587 #undef XFIX_P_2_053 |
| 2588 #undef XFIX_N_2_562 |
| 2589 #undef XFIX_P_3_072 |
| 2590 |
| 2591 |
| 2592 /*****************************************************************************/ |
| 2593 |
| 2594 /* |
| 2595 * jsimd_fdct_ifast_neon |
| 2596 * |
 | 2597 * This function contains a fast, not-so-accurate integer implementation of |
 | 2598 * the forward DCT (Discrete Cosine Transform). It uses the same calculations |
 | 2599 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' |
 | 2600 * function from jfdctfst.c. |
| 2601 * |
| 2602 * TODO: can be combined with 'jsimd_convsamp_neon' to get |
| 2603 * rid of a bunch of VLD1.16 instructions |
| 2604 */ |
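
(A note on the constant encoding used here: sqdmulh computes the saturated high half of 2*a*b, i.e. multiplication by b/32768, so the jfdctfst.c multipliers are stored scaled by 2^15. An editorial scalar equivalent:)

/* Scalar equivalent of the sqdmulh-based multiplies below (illustrative,
 * ignoring saturation): e.g. 98 * 128 == 12544 and 12544 / 32768 ~= 0.3828,
 * which approximates 0.382683433. */
static short mul_fix15(short x, short c)
{
  return (short)(((int)x * c) >> 15);
}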
| 2605 |
| 2606 #undef XFIX_0_541196100 |
| 2607 #define XFIX_0_382683433 v0.h[0] |
| 2608 #define XFIX_0_541196100 v0.h[1] |
| 2609 #define XFIX_0_707106781 v0.h[2] |
| 2610 #define XFIX_1_306562965 v0.h[3] |
| 2611 |
| 2612 .balign 16 |
| 2613 Ljsimd_fdct_ifast_neon_consts: |
| 2614 .short (98 * 128) /* XFIX_0_382683433 */ |
| 2615 .short (139 * 128) /* XFIX_0_541196100 */ |
| 2616 .short (181 * 128) /* XFIX_0_707106781 */ |
| 2617 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ |
| 2618 |
| 2619 asm_function jsimd_fdct_ifast_neon |
| 2620 |
| 2621 DATA .req x0 |
| 2622 TMP .req x9 |
| 2623 |
| 2624 /* Load constants */ |
| 2625 adr TMP, Ljsimd_fdct_ifast_neon_consts |
| 2626 ld1 {v0.4h}, [TMP] |
| 2627 |
| 2628 /* Load all DATA into NEON registers with the following allocation: |
| 2629 * 0 1 2 3 | 4 5 6 7 |
| 2630 * ---------+-------- |
 | 2631 * 0 | d16 | d17 | v16.8h |
 | 2632 * 1 | d18 | d19 | v17.8h |
 | 2633 * 2 | d20 | d21 | v18.8h |
 | 2634 * 3 | d22 | d23 | v19.8h |
 | 2635 * 4 | d24 | d25 | v20.8h |
 | 2636 * 5 | d26 | d27 | v21.8h |
 | 2637 * 6 | d28 | d29 | v22.8h |
 | 2638 * 7 | d30 | d31 | v23.8h |
| 2639 */ |
| 2640 |
| 2641 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 |
| 2642 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] |
| 2643 mov TMP, #2 |
| 2644 sub DATA, DATA, #64 |
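| /* TMP counts the two 1-D passes: each iteration first transposes the |
|  * block, then runs the same lane-parallel 1-D FDCT across the eight |
|  * vectors, so the second pass effectively transforms the columns. */ |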
| 2645 1: |
| 2646 /* Transpose */ |
| 2647 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4 |
| 2648 subs TMP, TMP, #1 |
| 2649 /* 1-D FDCT */ |
| 2650 add v4.8h, v19.8h, v20.8h |
| 2651 sub v20.8h, v19.8h, v20.8h |
| 2652 sub v28.8h, v18.8h, v21.8h |
| 2653 add v18.8h, v18.8h, v21.8h |
| 2654 sub v29.8h, v17.8h, v22.8h |
| 2655 add v17.8h, v17.8h, v22.8h |
| 2656 sub v21.8h, v16.8h, v23.8h |
| 2657 add v16.8h, v16.8h, v23.8h |
| 2658 sub v6.8h, v17.8h, v18.8h |
| 2659 sub v7.8h, v16.8h, v4.8h |
| 2660 add v5.8h, v17.8h, v18.8h |
| 2661 add v6.8h, v6.8h, v7.8h |
| 2662 add v4.8h, v16.8h, v4.8h |
| 2663 sqdmulh v6.8h, v6.8h, XFIX_0_707106781 |
| 2664 add v19.8h, v20.8h, v28.8h |
| 2665 add v16.8h, v4.8h, v5.8h |
| 2666 sub v20.8h, v4.8h, v5.8h |
| 2667 add v5.8h, v28.8h, v29.8h |
| 2668 add v29.8h, v29.8h, v21.8h |
| 2669 sqdmulh v5.8h, v5.8h, XFIX_0_707106781 |
| 2670 sub v28.8h, v19.8h, v29.8h |
| 2671 add v18.8h, v7.8h, v6.8h |
| 2672 sqdmulh v28.8h, v28.8h, XFIX_0_382683433 |
| 2673 sub v22.8h, v7.8h, v6.8h |
| 2674 sqdmulh v19.8h, v19.8h, XFIX_0_541196100 |
| 2675 sqdmulh v7.8h, v29.8h, XFIX_1_306562965 |
| 2676 add v6.8h, v21.8h, v5.8h |
| 2677 sub v5.8h, v21.8h, v5.8h |
| 2678 add v29.8h, v29.8h, v28.8h |
| 2679 add v19.8h, v19.8h, v28.8h |
| 2680 add v29.8h, v29.8h, v7.8h |
| 2681 add v21.8h, v5.8h, v19.8h |
| 2682 sub v19.8h, v5.8h, v19.8h |
| 2683 add v17.8h, v6.8h, v29.8h |
| 2684 sub v23.8h, v6.8h, v29.8h |
| 2685 |
| 2686 b.ne 1b |
| 2687 |
| 2688 /* store results */ |
| 2689 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 |
| 2690 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] |
| 2691 |
| 2692 br x30 |
| 2693 |
| 2694 .unreq DATA |
| 2695 .unreq TMP |
| 2696 #undef XFIX_0_382683433 |
| 2697 #undef XFIX_0_541196100 |
| 2698 #undef XFIX_0_707106781 |
| 2699 #undef XFIX_1_306562965 |
| 2700 |
| 2701 |
| 2702 /*****************************************************************************/ |
| 2703 |
| 2704 /* |
| 2705 * GLOBAL(void) |
| 2706 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors, |
| 2707 * DCTELEM *workspace); |
| 2708 * |
| 2709 */ |
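| /* |
|  * Rough per-coefficient C model of the vector loop below (a sketch, not |
|  * the upstream scalar code; recip/corr/shift name the three 64-entry |
|  * tables packed consecutively into *divisors): |
|  * |
|  *   int16_t v     = workspace[i]; |
|  *   int16_t sign  = v >> 15;                        // 0 or -1 |
|  *   uint16_t mag  = (uint16_t)abs(v) + corr[i];     // rounding correction |
|  *   mag = (uint16_t)(((uint32_t)mag * recip[i]) >> 16); |
|  *   mag >>= shift[i];                               // final descale |
|  *   coef_block[i] = (int16_t)((mag ^ sign) - sign); // re-apply sign |
|  */ |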
| 2710 asm_function jsimd_quantize_neon |
| 2711 |
| 2712 COEF_BLOCK .req x0 |
| 2713 DIVISORS .req x1 |
| 2714 WORKSPACE .req x2 |
| 2715 |
| 2716 RECIPROCAL .req DIVISORS |
| 2717 CORRECTION .req x9 |
| 2718 SHIFT .req x10 |
| 2719 LOOP_COUNT .req x11 |
| 2720 |
| 2721 mov LOOP_COUNT, #2 |
| 2722 add CORRECTION, DIVISORS, #(64 * 2) |
| 2723 add SHIFT, DIVISORS, #(64 * 6) |
| 2724 1: |
| 2725 subs LOOP_COUNT, LOOP_COUNT, #1 |
| 2726 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64 |
| 2727 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64 |
| 2728 abs v20.8h, v0.8h |
| 2729 abs v21.8h, v1.8h |
| 2730 abs v22.8h, v2.8h |
| 2731 abs v23.8h, v3.8h |
| 2732 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64 |
| 2733 add v20.8h, v20.8h, v4.8h /* add correction */ |
| 2734 add v21.8h, v21.8h, v5.8h |
| 2735 add v22.8h, v22.8h, v6.8h |
| 2736 add v23.8h, v23.8h, v7.8h |
| 2737 umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */ |
| 2738 umull2 v16.4s, v20.8h, v28.8h |
| 2739 umull v5.4s, v21.4h, v29.4h |
| 2740 umull2 v17.4s, v21.8h, v29.8h |
| 2741 umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */ |
| 2742 umull2 v18.4s, v22.8h, v30.8h |
| 2743 umull v7.4s, v23.4h, v31.4h |
| 2744 umull2 v19.4s, v23.8h, v31.8h |
| 2745 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64 |
| 2746 shrn v4.4h, v4.4s, #16 |
| 2747 shrn v5.4h, v5.4s, #16 |
| 2748 shrn v6.4h, v6.4s, #16 |
| 2749 shrn v7.4h, v7.4s, #16 |
| 2750 shrn2 v4.8h, v16.4s, #16 |
| 2751 shrn2 v5.8h, v17.4s, #16 |
| 2752 shrn2 v6.8h, v18.4s, #16 |
| 2753 shrn2 v7.8h, v19.4s, #16 |
| 2754 neg v24.8h, v24.8h |
| 2755 neg v25.8h, v25.8h |
| 2756 neg v26.8h, v26.8h |
| 2757 neg v27.8h, v27.8h |
| 2758 sshr v0.8h, v0.8h, #15 /* extract sign */ |
| 2759 sshr v1.8h, v1.8h, #15 |
| 2760 sshr v2.8h, v2.8h, #15 |
| 2761 sshr v3.8h, v3.8h, #15 |
| 2762 ushl v4.8h, v4.8h, v24.8h /* shift */ |
| 2763 ushl v5.8h, v5.8h, v25.8h |
| 2764 ushl v6.8h, v6.8h, v26.8h |
| 2765 ushl v7.8h, v7.8h, v27.8h |
| 2766 |
| 2767 eor v4.16b, v4.16b, v0.16b /* restore sign */ |
| 2768 eor v5.16b, v5.16b, v1.16b |
| 2769 eor v6.16b, v6.16b, v2.16b |
| 2770 eor v7.16b, v7.16b, v3.16b |
| 2771 sub v4.8h, v4.8h, v0.8h |
| 2772 sub v5.8h, v5.8h, v1.8h |
| 2773 sub v6.8h, v6.8h, v2.8h |
| 2774 sub v7.8h, v7.8h, v3.8h |
| 2775 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64 |
| 2776 |
| 2777 b.ne 1b |
| 2778 |
| 2779 br x30 /* return */ |
| 2780 |
| 2781 .unreq COEF_BLOCK |
| 2782 .unreq DIVISORS |
| 2783 .unreq WORKSPACE |
| 2784 .unreq RECIPROCAL |
| 2785 .unreq CORRECTION |
| 2786 .unreq SHIFT |
| 2787 .unreq LOOP_COUNT |
| 2788 |
| 2789 |
| 2790 /*****************************************************************************/ |
| 2791 |
| 2792 /* |
| 2793 * Downsample pixel values of a single component. |
| 2794 * This version handles the common case of 2:1 horizontal and 1:1 vertical, |
| 2795 * without smoothing. |
| 2796 * |
| 2797 * GLOBAL(void) |
| 2798 * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor, |
| 2799 * JDIMENSION v_samp_factor, |
| 2800 * JDIMENSION width_blocks, JSAMPARRAY input_data, |
| 2801 * JSAMPARRAY output_data); |
| 2802 */ |
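| /* |
|  * Per output pixel this computes, in C terms (sketch; the alternating |
|  * bias is what the 0x10000 splat below yields in the halfword lanes): |
|  * |
|  *   out[j] = (in[2*j] + in[2*j + 1] + (j & 1)) >> 1; |
|  * |
|  * matching the 0, 1, 0, 1, ... rounding bias of the scalar downsampler |
|  * in jcsample.c. |
|  */ |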
| 2803 |
| 2804 .balign 16 |
| 2805 Ljsimd_h2_downsample_neon_consts: |
| 2806 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2807 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */ |
| 2808 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2809 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */ |
| 2810 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2811 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */ |
| 2812 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2813 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */ |
| 2814 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2815 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */ |
| 2816 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2817 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */ |
| 2818 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2819 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */ |
| 2820 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2821 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */ |
| 2822 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ |
| 2823 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */ |
| 2824 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \ |
| 2825 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */ |
| 2826 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \ |
| 2827 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */ |
| 2828 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \ |
| 2829 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */ |
| 2830 .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \ |
| 2831 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */ |
| 2832 .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \ |
| 2833 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */ |
| 2834 .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \ |
| 2835 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */ |
| 2836 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ |
| 2837 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */ |
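| /* The 16 shuffle vectors above drive the 'tbl' in the last-columns path: |
|  * entry 'diff N' repeats the right-most valid byte N times, so a partial |
|  * final block is padded by replicating its last pixel before averaging. */ |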
| 2838 |
| 2839 asm_function jsimd_h2v1_downsample_neon |
| 2840 IMAGE_WIDTH .req x0 |
| 2841 MAX_V_SAMP .req x1 |
| 2842 V_SAMP .req x2 |
| 2843 BLOCK_WIDTH .req x3 |
| 2844 INPUT_DATA .req x4 |
| 2845 OUTPUT_DATA .req x5 |
| 2846 OUTPTR .req x9 |
| 2847 INPTR .req x10 |
| 2848 TMP1 .req x11 |
| 2849 TMP2 .req x12 |
| 2850 TMP3 .req x13 |
| 2851 TMPDUP .req w15 |
| 2852 |
| 2853 mov TMPDUP, #0x10000 |
| 2854 lsl TMP2, BLOCK_WIDTH, #4 |
| 2855 sub TMP2, TMP2, IMAGE_WIDTH |
| 2856 adr TMP3, Ljsimd_h2_downsample_neon_consts |
| 2857 add TMP3, TMP3, TMP2, lsl #4 |
| 2858 dup v16.4s, TMPDUP |
| 2859 ld1 {v18.16b}, [TMP3] |
| 2860 |
| 2861 1: /* row loop */ |
| 2862 ldr INPTR, [INPUT_DATA], #8 |
| 2863 ldr OUTPTR, [OUTPUT_DATA], #8 |
| 2864 subs TMP1, BLOCK_WIDTH, #1 |
| 2865 b.eq 3f |
| 2866 2: /* columns */ |
| 2867 ld1 {v0.16b}, [INPTR], #16 |
| 2868 mov v4.16b, v16.16b |
| 2869 subs TMP1, TMP1, #1 |
| 2870 uadalp v4.8h, v0.16b |
| 2871 shrn v6.8b, v4.8h, #1 |
| 2872 st1 {v6.8b}, [OUTPTR], #8 |
| 2873 b.ne 2b |
| 2874 3: /* last columns */ |
| 2875 ld1 {v0.16b}, [INPTR] |
| 2876 mov v4.16b, v16.16b |
| 2877 subs V_SAMP, V_SAMP, #1 |
| 2878 /* expand right */ |
| 2879 tbl v2.16b, {v0.16b}, v18.16b |
| 2880 uadalp v4.8h, v2.16b |
| 2881 shrn v6.8b, v4.8h, #1 |
| 2882 st1 {v6.8b}, [OUTPTR], #8 |
| 2883 b.ne 1b |
| 2884 |
| 2885 br x30 |
| 2886 |
| 2887 .unreq IMAGE_WIDTH |
| 2888 .unreq MAX_V_SAMP |
| 2889 .unreq V_SAMP |
| 2890 .unreq BLOCK_WIDTH |
| 2891 .unreq INPUT_DATA |
| 2892 .unreq OUTPUT_DATA |
| 2893 .unreq OUTPTR |
| 2894 .unreq INPTR |
| 2895 .unreq TMP1 |
| 2896 .unreq TMP2 |
| 2897 .unreq TMP3 |
| 2898 .unreq TMPDUP |
| 2899 |
| 2900 |
| 2901 /*****************************************************************************/ |
| 2902 |
| 2903 /* |
| 2904 * Downsample pixel values of a single component. |
| 2905 * This version handles the common case of 2:1 horizontal and 2:1 vertical, |
| 2906 * without smoothing. |
| 2907 * |
| 2908 * GLOBAL(void) |
| 2909 * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor, |
| 2910 * JDIMENSION v_samp_factor, JDIMENSION width_blocks, |
| 2911 * JSAMPARRAY input_data, JSAMPARRAY output_data); |
| 2912 */ |
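| /* |
|  * Same scheme as h2v1 above, but averaging a 2x2 block per output pixel: |
|  * |
|  *   out[j] = (in0[2*j] + in0[2*j+1] + in1[2*j] + in1[2*j+1] + bias) >> 2; |
|  * |
|  * with bias alternating 1, 2, 1, 2, ... per column -- the 0x00020001 |
|  * splat built from TMPDUP below. |
|  */ |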
| 2913 |
| 2914 .balign 16 |
| 2915 asm_function jsimd_h2v2_downsample_neon |
| 2916 IMAGE_WIDTH .req x0 |
| 2917 MAX_V_SAMP .req x1 |
| 2918 V_SAMP .req x2 |
| 2919 BLOCK_WIDTH .req x3 |
| 2920 INPUT_DATA .req x4 |
| 2921 OUTPUT_DATA .req x5 |
| 2922 OUTPTR .req x9 |
| 2923 INPTR0 .req x10 |
| 2924 INPTR1 .req x14 |
| 2925 TMP1 .req x11 |
| 2926 TMP2 .req x12 |
| 2927 TMP3 .req x13 |
| 2928 TMPDUP .req w15 |
| 2929 |
| 2930 mov TMPDUP, #1 |
| 2931 lsl TMP2, BLOCK_WIDTH, #4 |
| 2932 lsl TMPDUP, TMPDUP, #17 |
| 2933 sub TMP2, TMP2, IMAGE_WIDTH |
| 2934 adr TMP3, Ljsimd_h2_downsample_neon_consts |
| 2935 orr TMPDUP, TMPDUP, #1 |
| 2936 add TMP3, TMP3, TMP2, lsl #4 |
| 2937 dup v16.4s, TMPDUP |
| 2938 ld1 {v18.16b}, [TMP3] |
| 2939 |
| 2940 1: /* row loop */ |
| 2941 ldr INPTR0, [INPUT_DATA], #8 |
| 2942 ldr OUTPTR, [OUTPUT_DATA], #8 |
| 2943 ldr INPTR1, [INPUT_DATA], #8 |
| 2944 subs TMP1, BLOCK_WIDTH, #1 |
| 2945 b.eq 3f |
| 2946 2: /* columns */ |
| 2947 ld1 {v0.16b}, [INPTR0], #16 |
| 2948 ld1 {v1.16b}, [INPTR1], #16 |
| 2949 mov v4.16b, v16.16b |
| 2950 subs TMP1, TMP1, #1 |
| 2951 uadalp v4.8h, v0.16b |
| 2952 uadalp v4.8h, v1.16b |
| 2953 shrn v6.8b, v4.8h, #2 |
| 2954 st1 {v6.8b}, [OUTPTR], #8 |
| 2955 b.ne 2b |
| 2956 3: /* last columns */ |
| 2957 ld1 {v0.16b}, [INPTR0], #16 |
| 2958 ld1 {v1.16b}, [INPTR1], #16 |
| 2959 mov v4.16b, v16.16b |
| 2960 subs V_SAMP, V_SAMP, #1 |
| 2961 /* expand right */ |
| 2962 tbl v2.16b, {v0.16b}, v18.16b |
| 2963 tbl v3.16b, {v1.16b}, v18.16b |
| 2964 uadalp v4.8h, v2.16b |
| 2965 uadalp v4.8h, v3.16b |
| 2966 shrn v6.8b, v4.8h, #2 |
| 2967 st1 {v6.8b}, [OUTPTR], #8 |
| 2968 b.ne 1b |
| 2969 |
| 2970 br x30 |
| 2971 |
| 2972 .unreq IMAGE_WIDTH |
| 2973 .unreq MAX_V_SAMP |
| 2974 .unreq V_SAMP |
| 2975 .unreq BLOCK_WIDTH |
| 2976 .unreq INPUT_DATA |
| 2977 .unreq OUTPUT_DATA |
| 2978 .unreq OUTPTR |
| 2979 .unreq INPTR0 |
| 2980 .unreq INPTR1 |
| 2981 .unreq TMP1 |
| 2982 .unreq TMP2 |
| 2983 .unreq TMP3 |
| 2984 .unreq TMPDUP |
| 2985 |
| 2986 |
| 2987 /*****************************************************************************/ |
| 2988 |
| 2989 /* |
| 2990 * GLOBAL(JOCTET*) |
| 2991 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer, |
| 2992 * JCOEFPTR block, int last_dc_val, |
| 2993 * c_derived_tbl *dctbl, c_derived_tbl *actbl) |
| 2994 * |
| 2995 */ |
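| /* |
|  * Bit-buffer model behind the macros below (C sketch, illustrative only): |
|  * codes are shifted into a 64-bit accumulator and whole bytes are flushed |
|  * from its top, with the JPEG 0x00 stuffing byte after any emitted 0xff: |
|  * |
|  *   // put_bits: |
|  *   put_buffer = (put_buffer << size) | code;  put_bits += size; |
|  * |
|  *   // emit_byte: |
|  *   put_bits -= 8; |
|  *   uint8_t b = (uint8_t)(put_buffer >> put_bits); |
|  *   *(++buffer) = b; |
|  *   if (b == 0xff) |
|  *     *(++buffer) = 0; |
|  */ |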
| 2996 |
| 2997 BUFFER .req x1 |
| 2998 PUT_BUFFER .req x6 |
| 2999 PUT_BITS .req x7 |
| 3000 PUT_BITSw .req w7 |
| 3001 |
| 3002 .macro emit_byte |
| 3003 sub PUT_BITS, PUT_BITS, #0x8 |
| 3004 lsr x19, PUT_BUFFER, PUT_BITS |
| 3005 uxtb w19, w19 |
| 3006 strb w19, [BUFFER, #1]! |
| 3007 cmp w19, #0xff |
| 3008 b.ne 14f |
| 3009 strb wzr, [BUFFER, #1]! |
| 3010 14: |
| 3011 .endm |
| 3012 .macro put_bits CODE, SIZE |
| 3013 lsl PUT_BUFFER, PUT_BUFFER, \SIZE |
| 3014 add PUT_BITS, PUT_BITS, \SIZE |
| 3015 orr PUT_BUFFER, PUT_BUFFER, \CODE |
| 3016 .endm |
| 3017 .macro checkbuf31 |
| 3018 cmp PUT_BITS, #0x20 |
| 3019 b.lt 31f |
| 3020 emit_byte |
| 3021 emit_byte |
| 3022 emit_byte |
| 3023 emit_byte |
| 3024 31: |
| 3025 .endm |
| 3026 .macro checkbuf47 |
| 3027 cmp PUT_BITS, #0x30 |
| 3028 b.lt 47f |
| 3029 emit_byte |
| 3030 emit_byte |
| 3031 emit_byte |
| 3032 emit_byte |
| 3033 emit_byte |
| 3034 emit_byte |
| 3035 47: |
| 3036 .endm |
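| /* checkbuf31/checkbuf47 keep the accumulator from overflowing: they leave |
|  * fewer than 32 (resp. 48) bits pending, so a following put_bits of up to |
|  * 32 (resp. 16) bits always fits in the 64-bit PUT_BUFFER. */ |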
| 3037 |
| 3038 .macro generate_jsimd_huff_encode_one_block fast_tbl |
| 3039 |
| 3040 .balign 16 |
| 3041 .if \fast_tbl == 1 |
| 3042 Ljsimd_huff_encode_one_block_neon_consts: |
| 3043 .else |
| 3044 Ljsimd_huff_encode_one_block_neon_slowtbl_consts: |
| 3045 .endif |
| 3046 .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \ |
| 3047 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 |
| 3048 .if \fast_tbl == 1 |
| 3049 .byte 0, 1, 2, 3, 16, 17, 32, 33, \ |
| 3050 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */ |
| 3051 .byte 34, 35, 48, 49, 255, 255, 50, 51, \ |
| 3052 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */ |
| 3053 .byte 8, 9, 22, 23, 36, 37, 50, 51, \ |
| 3054 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */ |
| 3055 .byte 54, 55, 40, 41, 26, 27, 12, 13, \ |
| 3056 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */ |
| 3057 .byte 6, 7, 20, 21, 34, 35, 48, 49, \ |
| 3058 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */ |
| 3059 .byte 42, 43, 28, 29, 14, 15, 30, 31, \ |
| 3060 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */ |
| 3061 .byte 255, 255, 255, 255, 56, 57, 42, 43, \ |
| 3062 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */ |
| 3063 .byte 26, 27, 40, 41, 42, 43, 28, 29, \ |
| 3064 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */ |
| 3065 .byte 255, 255, 255, 255, 0, 1, 255, 255, \ |
| 3066 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */ |
| 3067 .byte 255, 255, 255, 255, 255, 255, 255, 255, \ |
| 3068 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */ |
| 3069 .byte 255, 255, 255, 255, 255, 255, 255, 255, \ |
| 3070 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */ |
| 3071 .byte 4, 5, 6, 7, 255, 255, 255, 255, \ |
| 3072 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */ |
| 3073 .endif |
| 3074 |
| 3075 .if \fast_tbl == 1 |
| 3076 asm_function jsimd_huff_encode_one_block_neon |
| 3077 .else |
| 3078 asm_function jsimd_huff_encode_one_block_neon_slowtbl |
| 3079 .endif |
| 3080 sub sp, sp, 272 |
| 3081 sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ |
| 3082 /* Save ARM registers */ |
| 3083 stp x19, x20, [sp], 16 |
| 3084 .if \fast_tbl == 1 |
| 3085 adr x15, Ljsimd_huff_encode_one_block_neon_consts |
| 3086 .else |
| 3087 adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts |
| 3088 .endif |
| 3089 ldr PUT_BUFFER, [x0, #0x10] |
| 3090 ldr PUT_BITSw, [x0, #0x18] |
| 3091 ldrsh w12, [x2] /* load DC coeff in w12 */ |
| 3092 /* prepare data */ |
| 3093 .if \fast_tbl == 1 |
| 3094 ld1 {v23.16b}, [x15], #16 |
| 3095 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64 |
| 3096 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64 |
| 3097 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64 |
| 3098 ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64 |
| 3099 ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64 |
| 3100 sub w12, w12, w3 /* last_dc_val, not used afterwards */ |
| 3101 /* ZigZag 8x8 */ |
| 3102 tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b |
| 3103 tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b |
| 3104 tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b |
| 3105 tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b |
| 3106 tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b |
| 3107 tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b |
| 3108 tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b |
| 3109 tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b |
| 3110 ins v0.h[0], w12 |
| 3111 tbx v1.16b, {v28.16b}, v16.16b |
| 3112 tbx v2.16b, {v29.16b, v30.16b}, v17.16b |
| 3113 tbx v5.16b, {v29.16b, v30.16b}, v18.16b |
| 3114 tbx v6.16b, {v31.16b}, v19.16b |
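| /* v0..v7 now hold the 64 coefficients in zigzag scan order: the tbl/tbx |
|  * pairs do the reordering as byte-table lookups (a 0xff index selects no |
|  * source byte, so tbx patches those lanes from a second register set), |
|  * and ins has already replaced the DC term with the DC difference. */ |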
| 3115 .else |
| 3116 add x13, x2, #0x22 |
| 3117 sub w12, w12, w3 /* last_dc_val, not used afterwards */ |
| 3118 ld1 {v23.16b}, [x15] |
| 3119 add x14, x2, #0x18 |
| 3120 add x3, x2, #0x36 |
| 3121 ins v0.h[0], w12 |
| 3122 add x9, x2, #0x2 |
| 3123 ld1 {v1.h}[0], [x13] |
| 3124 add x15, x2, #0x30 |
| 3125 ld1 {v2.h}[0], [x14] |
| 3126 add x19, x2, #0x26 |
| 3127 ld1 {v3.h}[0], [x3] |
| 3128 add x20, x2, #0x28 |
| 3129 ld1 {v0.h}[1], [x9] |
| 3130 add x12, x2, #0x10 |
| 3131 ld1 {v1.h}[1], [x15] |
| 3132 add x13, x2, #0x40 |
| 3133 ld1 {v2.h}[1], [x19] |
| 3134 add x14, x2, #0x34 |
| 3135 ld1 {v3.h}[1], [x20] |
| 3136 add x3, x2, #0x1a |
| 3137 ld1 {v0.h}[2], [x12] |
| 3138 add x9, x2, #0x20 |
| 3139 ld1 {v1.h}[2], [x13] |
| 3140 add x15, x2, #0x32 |
| 3141 ld1 {v2.h}[2], [x14] |
| 3142 add x19, x2, #0x42 |
| 3143 ld1 {v3.h}[2], [x3] |
| 3144 add x20, x2, #0xc |
| 3145 ld1 {v0.h}[3], [x9] |
| 3146 add x12, x2, #0x12 |
| 3147 ld1 {v1.h}[3], [x15] |
| 3148 add x13, x2, #0x24 |
| 3149 ld1 {v2.h}[3], [x19] |
| 3150 add x14, x2, #0x50 |
| 3151 ld1 {v3.h}[3], [x20] |
| 3152 add x3, x2, #0xe |
| 3153 ld1 {v0.h}[4], [x12] |
| 3154 add x9, x2, #0x4 |
| 3155 ld1 {v1.h}[4], [x13] |
| 3156 add x15, x2, #0x16 |
| 3157 ld1 {v2.h}[4], [x14] |
| 3158 add x19, x2, #0x60 |
| 3159 ld1 {v3.h}[4], [x3] |
| 3160 add x20, x2, #0x1c |
| 3161 ld1 {v0.h}[5], [x9] |
| 3162 add x12, x2, #0x6 |
| 3163 ld1 {v1.h}[5], [x15] |
| 3164 add x13, x2, #0x8 |
| 3165 ld1 {v2.h}[5], [x19] |
| 3166 add x14, x2, #0x52 |
| 3167 ld1 {v3.h}[5], [x20] |
| 3168 add x3, x2, #0x2a |
| 3169 ld1 {v0.h}[6], [x12] |
| 3170 add x9, x2, #0x14 |
| 3171 ld1 {v1.h}[6], [x13] |
| 3172 add x15, x2, #0xa |
| 3173 ld1 {v2.h}[6], [x14] |
| 3174 add x19, x2, #0x44 |
| 3175 ld1 {v3.h}[6], [x3] |
| 3176 add x20, x2, #0x38 |
| 3177 ld1 {v0.h}[7], [x9] |
| 3178 add x12, x2, #0x46 |
| 3179 ld1 {v1.h}[7], [x15] |
| 3180 add x13, x2, #0x3a |
| 3181 ld1 {v2.h}[7], [x19] |
| 3182 add x14, x2, #0x74 |
| 3183 ld1 {v3.h}[7], [x20] |
| 3184 add x3, x2, #0x6a |
| 3185 ld1 {v4.h}[0], [x12] |
| 3186 add x9, x2, #0x54 |
| 3187 ld1 {v5.h}[0], [x13] |
| 3188 add x15, x2, #0x2c |
| 3189 ld1 {v6.h}[0], [x14] |
| 3190 add x19, x2, #0x76 |
| 3191 ld1 {v7.h}[0], [x3] |
| 3192 add x20, x2, #0x78 |
| 3193 ld1 {v4.h}[1], [x9] |
| 3194 add x12, x2, #0x62 |
| 3195 ld1 {v5.h}[1], [x15] |
| 3196 add x13, x2, #0x1e |
| 3197 ld1 {v6.h}[1], [x19] |
| 3198 add x14, x2, #0x68 |
| 3199 ld1 {v7.h}[1], [x20] |
| 3200 add x3, x2, #0x7a |
| 3201 ld1 {v4.h}[2], [x12] |
| 3202 add x9, x2, #0x70 |
| 3203 ld1 {v5.h}[2], [x13] |
| 3204 add x15, x2, #0x2e |
| 3205 ld1 {v6.h}[2], [x14] |
| 3206 add x19, x2, #0x5a |
| 3207 ld1 {v7.h}[2], [x3] |
| 3208 add x20, x2, #0x6c |
| 3209 ld1 {v4.h}[3], [x9] |
| 3210 add x12, x2, #0x72 |
| 3211 ld1 {v5.h}[3], [x15] |
| 3212 add x13, x2, #0x3c |
| 3213 ld1 {v6.h}[3], [x19] |
| 3214 add x14, x2, #0x4c |
| 3215 ld1 {v7.h}[3], [x20] |
| 3216 add x3, x2, #0x5e |
| 3217 ld1 {v4.h}[4], [x12] |
| 3218 add x9, x2, #0x64 |
| 3219 ld1 {v5.h}[4], [x13] |
| 3220 add x15, x2, #0x4a |
| 3221 ld1 {v6.h}[4], [x14] |
| 3222 add x19, x2, #0x3e |
| 3223 ld1 {v7.h}[4], [x3] |
| 3224 add x20, x2, #0x6e |
| 3225 ld1 {v4.h}[5], [x9] |
| 3226 add x12, x2, #0x56 |
| 3227 ld1 {v5.h}[5], [x15] |
| 3228 add x13, x2, #0x58 |
| 3229 ld1 {v6.h}[5], [x19] |
| 3230 add x14, x2, #0x4e |
| 3231 ld1 {v7.h}[5], [x20] |
| 3232 add x3, x2, #0x7c |
| 3233 ld1 {v4.h}[6], [x12] |
| 3234 add x9, x2, #0x48 |
| 3235 ld1 {v5.h}[6], [x13] |
| 3236 add x15, x2, #0x66 |
| 3237 ld1 {v6.h}[6], [x14] |
| 3238 add x19, x2, #0x5c |
| 3239 ld1 {v7.h}[6], [x3] |
| 3240 add x20, x2, #0x7e |
| 3241 ld1 {v4.h}[7], [x9] |
| 3242 ld1 {v5.h}[7], [x15] |
| 3243 ld1 {v6.h}[7], [x19] |
| 3244 ld1 {v7.h}[7], [x20] |
| 3245 .endif |
| 3246 cmlt v24.8h, v0.8h, #0 |
| 3247 cmlt v25.8h, v1.8h, #0 |
| 3248 cmlt v26.8h, v2.8h, #0 |
| 3249 cmlt v27.8h, v3.8h, #0 |
| 3250 cmlt v28.8h, v4.8h, #0 |
| 3251 cmlt v29.8h, v5.8h, #0 |
| 3252 cmlt v30.8h, v6.8h, #0 |
| 3253 cmlt v31.8h, v7.8h, #0 |
| 3254 abs v0.8h, v0.8h |
| 3255 abs v1.8h, v1.8h |
| 3256 abs v2.8h, v2.8h |
| 3257 abs v3.8h, v3.8h |
| 3258 abs v4.8h, v4.8h |
| 3259 abs v5.8h, v5.8h |
| 3260 abs v6.8h, v6.8h |
| 3261 abs v7.8h, v7.8h |
| 3262 eor v24.16b, v24.16b, v0.16b |
| 3263 eor v25.16b, v25.16b, v1.16b |
| 3264 eor v26.16b, v26.16b, v2.16b |
| 3265 eor v27.16b, v27.16b, v3.16b |
| 3266 eor v28.16b, v28.16b, v4.16b |
| 3267 eor v29.16b, v29.16b, v5.16b |
| 3268 eor v30.16b, v30.16b, v6.16b |
| 3269 eor v31.16b, v31.16b, v7.16b |
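| /* JPEG transmits a negative coefficient v as the bit pattern of v - 1 |
|  * (== ~abs(v)), so v24..v31 = abs(v) ^ (v < 0 ? -1 : 0) are the bits to |
|  * emit, while v0..v7 keep abs(v) for the size (bit-length) computation. */ |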
| 3270 cmeq v16.8h, v0.8h, #0 |
| 3271 cmeq v17.8h, v1.8h, #0 |
| 3272 cmeq v18.8h, v2.8h, #0 |
| 3273 cmeq v19.8h, v3.8h, #0 |
| 3274 cmeq v20.8h, v4.8h, #0 |
| 3275 cmeq v21.8h, v5.8h, #0 |
| 3276 cmeq v22.8h, v6.8h, #0 |
| 3277 xtn v16.8b, v16.8h |
| 3278 xtn v18.8b, v18.8h |
| 3279 xtn v20.8b, v20.8h |
| 3280 xtn v22.8b, v22.8h |
| 3281 umov w14, v0.h[0] |
| 3282 xtn2 v16.16b, v17.8h |
| 3283 umov w13, v24.h[0] |
| 3284 xtn2 v18.16b, v19.8h |
| 3285 clz w14, w14 |
| 3286 xtn2 v20.16b, v21.8h |
| 3287 lsl w13, w13, w14 |
| 3288 cmeq v17.8h, v7.8h, #0 |
| 3289 sub w12, w14, #32 |
| 3290 xtn2 v22.16b, v17.8h |
| 3291 lsr w13, w13, w14 |
| 3292 and v16.16b, v16.16b, v23.16b |
| 3293 neg w12, w12 |
| 3294 and v18.16b, v18.16b, v23.16b |
| 3295 add x3, x4, #0x400 /* x3 = dctbl->ehufsi */ |
| 3296 and v20.16b, v20.16b, v23.16b |
| 3297 add x15, sp, #0x80 /* x15 = t2 */ |
| 3298 and v22.16b, v22.16b, v23.16b |
| 3299 ldr w10, [x4, x12, lsl #2] |
| 3300 addp v16.16b, v16.16b, v18.16b |
| 3301 ldrb w11, [x3, x12] |
| 3302 addp v20.16b, v20.16b, v22.16b |
| 3303 checkbuf47 |
| 3304 addp v16.16b, v16.16b, v20.16b |
| 3305 put_bits x10, x11 |
| 3306 addp v16.16b, v16.16b, v18.16b |
| 3307 checkbuf47 |
| 3308 umov x9, v16.D[0] |
| 3309 put_bits x13, x12 |
| 3310 cnt v17.8b, v16.8b |
| 3311 mvn x9, x9 |
| 3312 addv B18, v17.8b |
| 3313 add x4, x5, #0x400 /* x4 = actbl->ehufsi */ |
| 3314 umov w12, v18.b[0] |
| 3315 lsr x9, x9, #0x1 /* drop DC bit, keep AC coeffs only */ |
| 3316 ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */ |
| 3317 rbit x9, x9 /* x9 = index0 */ |
| 3318 ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ |
| 3319 cmp w12, #(64-8) |
| 3320 mov x11, sp |
| 3321 b.lt 4f |
| 3322 cbz x9, 6f |
| 3323 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 |
| 3324 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 |
| 3325 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 |
| 3326 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 |
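| /* Encode loop: x9 is a bitmap of the nonzero AC coefficients in zigzag |
|  * order, so clz yields each zero-run length directly; every full run of |
|  * 16 zeros emits the ZRL symbol (0xf0), then the run/size symbol from |
|  * actbl is emitted, followed by the significant bits of the coefficient. */ |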
| 3327 1: |
| 3328 clz x2, x9 |
| 3329 add x15, x15, x2, lsl #1 |
| 3330 lsl x9, x9, x2 |
| 3331 ldrh w20, [x15, #-126] |
| 3332 2: |
| 3333 cmp x2, #0x10 |
| 3334 b.lt 3f |
| 3335 sub x2, x2, #0x10 |
| 3336 checkbuf47 |
| 3337 put_bits x13, x14 |
| 3338 b 2b |
| 3339 3: |
| 3340 clz w20, w20 |
| 3341 ldrh w3, [x15, #2]! |
| 3342 sub w11, w20, #32 |
| 3343 lsl w3, w3, w20 |
| 3344 neg w11, w11 |
| 3345 lsr w3, w3, w20 |
| 3346 add x2, x11, x2, lsl #4 |
| 3347 lsl x9, x9, #0x1 |
| 3348 ldr w12, [x5, x2, lsl #2] |
| 3349 ldrb w10, [x4, x2] |
| 3350 checkbuf31 |
| 3351 put_bits x12, x10 |
| 3352 put_bits x3, x11 |
| 3353 cbnz x9, 1b |
| 3354 b 6f |
| 3355 4: |
| 3356 movi v21.8h, #0x0010 |
| 3357 clz v0.8h, v0.8h |
| 3358 clz v1.8h, v1.8h |
| 3359 clz v2.8h, v2.8h |
| 3360 clz v3.8h, v3.8h |
| 3361 clz v4.8h, v4.8h |
| 3362 clz v5.8h, v5.8h |
| 3363 clz v6.8h, v6.8h |
| 3364 clz v7.8h, v7.8h |
| 3365 ushl v24.8h, v24.8h, v0.8h |
| 3366 ushl v25.8h, v25.8h, v1.8h |
| 3367 ushl v26.8h, v26.8h, v2.8h |
| 3368 ushl v27.8h, v27.8h, v3.8h |
| 3369 ushl v28.8h, v28.8h, v4.8h |
| 3370 ushl v29.8h, v29.8h, v5.8h |
| 3371 ushl v30.8h, v30.8h, v6.8h |
| 3372 ushl v31.8h, v31.8h, v7.8h |
| 3373 neg v0.8h, v0.8h |
| 3374 neg v1.8h, v1.8h |
| 3375 neg v2.8h, v2.8h |
| 3376 neg v3.8h, v3.8h |
| 3377 neg v4.8h, v4.8h |
| 3378 neg v5.8h, v5.8h |
| 3379 neg v6.8h, v6.8h |
| 3380 neg v7.8h, v7.8h |
| 3381 ushl v24.8h, v24.8h, v0.8h |
| 3382 ushl v25.8h, v25.8h, v1.8h |
| 3383 ushl v26.8h, v26.8h, v2.8h |
| 3384 ushl v27.8h, v27.8h, v3.8h |
| 3385 ushl v28.8h, v28.8h, v4.8h |
| 3386 ushl v29.8h, v29.8h, v5.8h |
| 3387 ushl v30.8h, v30.8h, v6.8h |
| 3388 ushl v31.8h, v31.8h, v7.8h |
| 3389 add v0.8h, v21.8h, v0.8h |
| 3390 add v1.8h, v21.8h, v1.8h |
| 3391 add v2.8h, v21.8h, v2.8h |
| 3392 add v3.8h, v21.8h, v3.8h |
| 3393 add v4.8h, v21.8h, v4.8h |
| 3394 add v5.8h, v21.8h, v5.8h |
| 3395 add v6.8h, v21.8h, v6.8h |
| 3396 add v7.8h, v21.8h, v7.8h |
| 3397 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 |
| 3398 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 |
| 3399 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 |
| 3400 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 |
| 3401 1: |
| 3402 clz x2, x9 |
| 3403 add x15, x15, x2, lsl #1 |
| 3404 lsl x9, x9, x2 |
| 3405 ldrh w11, [x15, #-126] |
| 3406 2: |
| 3407 cmp x2, #0x10 |
| 3408 b.lt 3f |
| 3409 sub x2, x2, #0x10 |
| 3410 checkbuf47 |
| 3411 put_bits x13, x14 |
| 3412 b 2b |
| 3413 3: |
| 3414 ldrh w3, [x15, #2]! |
| 3415 add x2, x11, x2, lsl #4 |
| 3416 lsl x9, x9, #0x1 |
| 3417 ldr w12, [x5, x2, lsl #2] |
| 3418 ldrb w10, [x4, x2] |
| 3419 checkbuf31 |
| 3420 put_bits x12, x10 |
| 3421 put_bits x3, x11 |
| 3422 cbnz x9, 1b |
| 3423 6: |
| 3424 add x13, sp, #0xfe |
| 3425 cmp x15, x13 |
| 3426 b.hs 1f |
| 3427 ldr w12, [x5] |
| 3428 ldrb w14, [x4] |
| 3429 checkbuf47 |
| 3430 put_bits x12, x14 |
| 3431 1: |
| 3432 sub sp, sp, 16 |
| 3433 str PUT_BUFFER, [x0, #0x10] |
| 3434 str PUT_BITSw, [x0, #0x18] |
| 3435 ldp x19, x20, [sp], 16 |
| 3436 add x0, BUFFER, #0x1 |
| 3437 add sp, sp, 256 |
| 3438 br x30 |
| 3439 |
| 3440 .endm |
| 3441 |
| 3442 generate_jsimd_huff_encode_one_block 1 |
| 3443 generate_jsimd_huff_encode_one_block 0 |
| 3444 |
| 3445 .unreq BUFFER |
| 3446 .unreq PUT_BUFFER |
| 3447 .unreq PUT_BITS |
| 3448 .unreq PUT_BITSw |
| 3449 |
| 3450 .purgem emit_byte |
| 3451 .purgem put_bits |
| 3452 .purgem checkbuf31 |
| 3453 .purgem checkbuf47 |