Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(472)

Side by Side Diff: google.patch

Issue 1271803002: Add jpeg_skip_scanlines() API to libjpeg-turbo (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master
Patch Set: Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « djpeg.c ('k') | jdapistd.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 Index: jdmarker.c
2 ===================================================================
3 --- jdmarker.c (revision 829)
4 +++ jdmarker.c (working copy)
5 @@ -910,7 +910,7 @@
6 }
7
8 if (cinfo->marker->discarded_bytes != 0) {
9 - WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
10 + TRACEMS2(cinfo, 1, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
11 cinfo->marker->discarded_bytes = 0;
12 }
13
14 @@ -944,7 +944,144 @@
15 return TRUE;
16 }
17
18 +#ifdef MOTION_JPEG_SUPPORTED
19
20 +/* The default Huffman tables used by motion JPEG frames. When a motion JPEG
21 + * frame does not have DHT tables, we should use the huffman tables suggested by
22 + * the JPEG standard. Each of these tables represents a member of the JHUFF_TBLS
23 + * struct so we can just copy it to the according JHUFF_TBLS member.
24 + */
25 +/* DC table 0 */
26 +LOCAL(const unsigned char) mjpg_dc0_bits[] = {
27 + 0x00, 0x01, 0x05, 0x01, 0x01, 0x01, 0x01, 0x01,
28 + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
29 +};
30 +
31 +LOCAL(const unsigned char) mjpg_dc0_huffval[] = {
32 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
33 + 0x08, 0x09, 0x0A, 0x0B
34 +};
35 +
36 +/* DC table 1 */
37 +LOCAL(const unsigned char) mjpg_dc1_bits[] = {
38 + 0x00, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
39 + 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00
40 +};
41 +
42 +LOCAL(const unsigned char) mjpg_dc1_huffval[] = {
43 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
44 + 0x08, 0x09, 0x0A, 0x0B
45 +};
46 +
47 +/* AC table 0 */
48 +LOCAL(const unsigned char) mjpg_ac0_bits[] = {
49 + 0x00, 0x02, 0x01, 0x03, 0x03, 0x02, 0x04, 0x03,
50 + 0x05, 0x05, 0x04, 0x04, 0x00, 0x00, 0x01, 0x7D
51 +};
52 +
53 +LOCAL(const unsigned char) mjpg_ac0_huffval[] = {
54 + 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
55 + 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
56 + 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xA1, 0x08,
57 + 0x23, 0x42, 0xB1, 0xC1, 0x15, 0x52, 0xD1, 0xF0,
58 + 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0A, 0x16,
59 + 0x17, 0x18, 0x19, 0x1A, 0x25, 0x26, 0x27, 0x28,
60 + 0x29, 0x2A, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
61 + 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
62 + 0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
63 + 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
64 + 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
65 + 0x7A, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
66 + 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
67 + 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
68 + 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6,
69 + 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3, 0xC4, 0xC5,
70 + 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4,
71 + 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xE1, 0xE2,
72 + 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA,
73 + 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
74 + 0xF9, 0xFA
75 +};
76 +
77 +/* AC table 1 */
78 +LOCAL(const unsigned char) mjpg_ac1_bits[] = {
79 + 0x00, 0x02, 0x01, 0x02, 0x04, 0x04, 0x03, 0x04,
80 + 0x07, 0x05, 0x04, 0x04, 0x00, 0x01, 0x02, 0x77
81 +};
82 +
83 +LOCAL(const unsigned char) mjpg_ac1_huffval[] = {
84 + 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
85 + 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
86 + 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
87 + 0xA1, 0xB1, 0xC1, 0x09, 0x23, 0x33, 0x52, 0xF0,
88 + 0x15, 0x62, 0x72, 0xD1, 0x0A, 0x16, 0x24, 0x34,
89 + 0xE1, 0x25, 0xF1, 0x17, 0x18, 0x19, 0x1A, 0x26,
90 + 0x27, 0x28, 0x29, 0x2A, 0x35, 0x36, 0x37, 0x38,
91 + 0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
92 + 0x49, 0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
93 + 0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
94 + 0x69, 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
95 + 0x79, 0x7A, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
96 + 0x88, 0x89, 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96,
97 + 0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5,
98 + 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4,
99 + 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3,
100 + 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2,
101 + 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA,
102 + 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9,
103 + 0xEA, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
104 + 0xF9, 0xFA
105 +};
106 +
107 +/* Loads the default Huffman tables used by motion JPEG frames. This function
108 + * just copies the huffman tables suggested in the JPEG standard when we have
109 + * not loaded them.
110 + */
111 +LOCAL(void)
112 +mjpg_load_huff_tables (j_decompress_ptr cinfo)
113 +{
114 + JHUFF_TBL *htblptr;
115 +
116 + if (! cinfo->dc_huff_tbl_ptrs[0]) {
117 + htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
118 + MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
119 + MEMCOPY(&htblptr->bits[1], mjpg_dc0_bits, SIZEOF(mjpg_dc0_bits));
120 + MEMCOPY(&htblptr->huffval[0], mjpg_dc0_huffval, SIZEOF(mjpg_dc0_huffval));
121 + cinfo->dc_huff_tbl_ptrs[0] = htblptr;
122 + }
123 +
124 + if (! cinfo->dc_huff_tbl_ptrs[1]) {
125 + htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
126 + MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
127 + MEMCOPY(&htblptr->bits[1], mjpg_dc1_bits, SIZEOF(mjpg_dc1_bits));
128 + MEMCOPY(&htblptr->huffval[0], mjpg_dc1_huffval, SIZEOF(mjpg_dc1_huffval));
129 + cinfo->dc_huff_tbl_ptrs[1] = htblptr;
130 + }
131 +
132 + if (! cinfo->ac_huff_tbl_ptrs[0]) {
133 + htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
134 + MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
135 + MEMCOPY(&htblptr->bits[1], mjpg_ac0_bits, SIZEOF(mjpg_ac0_bits));
136 + MEMCOPY(&htblptr->huffval[0], mjpg_ac0_huffval, SIZEOF(mjpg_ac0_huffval));
137 + cinfo->ac_huff_tbl_ptrs[0] = htblptr;
138 + }
139 +
140 + if (! cinfo->ac_huff_tbl_ptrs[1]) {
141 + htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
142 + MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
143 + MEMCOPY(&htblptr->bits[1], mjpg_ac1_bits, SIZEOF(mjpg_ac1_bits));
144 + MEMCOPY(&htblptr->huffval[0], mjpg_ac1_huffval, SIZEOF(mjpg_ac1_huffval));
145 + cinfo->ac_huff_tbl_ptrs[1] = htblptr;
146 + }
147 +}
148 +
149 +#else
150 +
151 +#define mjpg_load_huff_tables(cinfo)
152 +
153 +#endif /* MOTION_JPEG_SUPPORTED */
154 +
155 +
156 /*
157 * Read markers until SOS or EOI.
158 *
159 @@ -1013,6 +1150,7 @@
160 break;
161
162 case M_SOS:
163 + mjpg_load_huff_tables(cinfo);
164 if (! get_sos(cinfo))
165 return JPEG_SUSPENDED;
166 cinfo->unread_marker = 0; /* processed the marker */
167 Index: jmorecfg.h
168 ===================================================================
169 --- jmorecfg.h (revision 829)
170 +++ jmorecfg.h (working copy)
171 @@ -153,14 +153,18 @@
172 /* INT16 must hold at least the values -32768..32767. */
173
174 #ifndef XMD_H /* X11/xmd.h correctly defines INT16 */
175 +#ifndef _BASETSD_H_ /* basetsd.h correctly defines INT16 */
176 typedef short INT16;
177 #endif
178 +#endif
179
180 /* INT32 must hold at least signed 32-bit values. */
181
182 #ifndef XMD_H /* X11/xmd.h correctly defines INT32 */
183 +#ifndef _BASETSD_H_ /* basetsd.h correctly defines INT32 */
184 typedef long INT32;
185 #endif
186 +#endif
187
188 /* Datatype used for image dimensions. The JPEG standard only supports
189 * images up to 64K*64K due to 16-bit fields in SOF markers. Therefore
190 @@ -210,11 +214,13 @@
191 * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
192 */
193
194 +#ifndef FAR
195 #ifdef NEED_FAR_POINTERS
196 #define FAR far
197 #else
198 #define FAR
199 #endif
200 +#endif
201
202
203 /*
204 Index: jpeglib.h
205 ===================================================================
206 --- jpeglib.h (revision 829)
207 +++ jpeglib.h (working copy)
208 @@ -15,6 +15,10 @@
209 #ifndef JPEGLIB_H
210 #define JPEGLIB_H
211
212 +/* Begin chromium edits */
213 +#include "jpeglibmangler.h"
214 +/* End chromium edits */
215 +
216 /*
217 * First we include the configuration files that record how this
218 * installation of the JPEG library is set up. jconfig.h can be
219 Index: jpeglibmangler.h
220 ===================================================================
221 --- jpeglibmangler.h (revision 0)
222 +++ jpeglibmangler.h (revision 0)
223 @@ -0,0 +1,113 @@
224 +// Copyright (c) 2009 The Chromium Authors. All rights reserved.
225 +// Use of this source code is governed by a BSD-style license that can be
226 +// found in the LICENSE file.
227 +
228 +#ifndef THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
229 +#define THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
230 +
231 +// Mangle all externally visible function names so we can build our own libjpeg
232 +// without system libraries trying to use it.
233 +
234 +#define jpeg_make_c_derived_tbl chromium_jpeg_make_c_derived_tbl
235 +#define jpeg_gen_optimal_table chromium_jpeg_gen_optimal_table
236 +#define jpeg_make_d_derived_tbl chromium_jpeg_make_d_derived_tbl
237 +#define jpeg_fill_bit_buffer chromium_jpeg_fill_bit_buffer
238 +#define jpeg_huff_decode chromium_jpeg_huff_decode
239 +#define jpeg_fdct_islow chromium_jpeg_fdct_islow
240 +#define jpeg_fdct_ifast chromium_jpeg_fdct_ifast
241 +#define jpeg_fdct_float chromium_jpeg_fdct_float
242 +#define jpeg_idct_islow chromium_jpeg_idct_islow
243 +#define jpeg_idct_ifast chromium_jpeg_idct_ifast
244 +#define jpeg_idct_float chromium_jpeg_idct_float
245 +#define jpeg_idct_4x4 chromium_jpeg_idct_4x4
246 +#define jpeg_idct_2x2 chromium_jpeg_idct_2x2
247 +#define jpeg_idct_1x1 chromium_jpeg_idct_1x1
248 +#define jinit_compress_master chromium_jinit_compress_master
249 +#define jinit_c_master_control chromium_jinit_c_master_control
250 +#define jinit_c_main_controller chromium_jinit_c_main_controller
251 +#define jinit_c_prep_controller chromium_jinit_c_prep_controller
252 +#define jinit_c_coef_controller chromium_jinit_c_coef_controller
253 +#define jinit_color_converter chromium_jinit_color_converter
254 +#define jinit_downsampler chromium_jinit_downsampler
255 +#define jinit_forward_dct chromium_jinit_forward_dct
256 +#define jinit_huff_encoder chromium_jinit_huff_encoder
257 +#define jinit_phuff_encoder chromium_jinit_phuff_encoder
258 +#define jinit_marker_writer chromium_jinit_marker_writer
259 +#define jinit_master_decompress chromium_jinit_master_decompress
260 +#define jinit_d_main_controller chromium_jinit_d_main_controller
261 +#define jinit_d_coef_controller chromium_jinit_d_coef_controller
262 +#define jinit_d_post_controller chromium_jinit_d_post_controller
263 +#define jinit_input_controller chromium_jinit_input_controller
264 +#define jinit_marker_reader chromium_jinit_marker_reader
265 +#define jinit_huff_decoder chromium_jinit_huff_decoder
266 +#define jinit_phuff_decoder chromium_jinit_phuff_decoder
267 +#define jinit_inverse_dct chromium_jinit_inverse_dct
268 +#define jinit_upsampler chromium_jinit_upsampler
269 +#define jinit_color_deconverter chromium_jinit_color_deconverter
270 +#define jinit_1pass_quantizer chromium_jinit_1pass_quantizer
271 +#define jinit_2pass_quantizer chromium_jinit_2pass_quantizer
272 +#define jinit_merged_upsampler chromium_jinit_merged_upsampler
273 +#define jinit_memory_mgr chromium_jinit_memory_mgr
274 +#define jdiv_round_up chromium_jdiv_round_up
275 +#define jround_up chromium_jround_up
276 +#define jcopy_sample_rows chromium_jcopy_sample_rows
277 +#define jcopy_block_row chromium_jcopy_block_row
278 +#define jzero_far chromium_jzero_far
279 +#define jpeg_std_error chromium_jpeg_std_error
280 +#define jpeg_CreateCompress chromium_jpeg_CreateCompress
281 +#define jpeg_CreateDecompress chromium_jpeg_CreateDecompress
282 +#define jpeg_destroy_compress chromium_jpeg_destroy_compress
283 +#define jpeg_destroy_decompress chromium_jpeg_destroy_decompress
284 +#define jpeg_stdio_dest chromium_jpeg_stdio_dest
285 +#define jpeg_stdio_src chromium_jpeg_stdio_src
286 +#define jpeg_set_defaults chromium_jpeg_set_defaults
287 +#define jpeg_set_colorspace chromium_jpeg_set_colorspace
288 +#define jpeg_default_colorspace chromium_jpeg_default_colorspace
289 +#define jpeg_set_quality chromium_jpeg_set_quality
290 +#define jpeg_set_linear_quality chromium_jpeg_set_linear_quality
291 +#define jpeg_add_quant_table chromium_jpeg_add_quant_table
292 +#define jpeg_quality_scaling chromium_jpeg_quality_scaling
293 +#define jpeg_simple_progression chromium_jpeg_simple_progression
294 +#define jpeg_suppress_tables chromium_jpeg_suppress_tables
295 +#define jpeg_alloc_quant_table chromium_jpeg_alloc_quant_table
296 +#define jpeg_alloc_huff_table chromium_jpeg_alloc_huff_table
297 +#define jpeg_start_compress chromium_jpeg_start_compress
298 +#define jpeg_write_scanlines chromium_jpeg_write_scanlines
299 +#define jpeg_finish_compress chromium_jpeg_finish_compress
300 +#define jpeg_write_raw_data chromium_jpeg_write_raw_data
301 +#define jpeg_write_marker chromium_jpeg_write_marker
302 +#define jpeg_write_m_header chromium_jpeg_write_m_header
303 +#define jpeg_write_m_byte chromium_jpeg_write_m_byte
304 +#define jpeg_write_tables chromium_jpeg_write_tables
305 +#define jpeg_read_header chromium_jpeg_read_header
306 +#define jpeg_start_decompress chromium_jpeg_start_decompress
307 +#define jpeg_read_scanlines chromium_jpeg_read_scanlines
308 +#define jpeg_finish_decompress chromium_jpeg_finish_decompress
309 +#define jpeg_read_raw_data chromium_jpeg_read_raw_data
310 +#define jpeg_has_multiple_scans chromium_jpeg_has_multiple_scans
311 +#define jpeg_start_output chromium_jpeg_start_output
312 +#define jpeg_finish_output chromium_jpeg_finish_output
313 +#define jpeg_input_complete chromium_jpeg_input_complete
314 +#define jpeg_new_colormap chromium_jpeg_new_colormap
315 +#define jpeg_consume_input chromium_jpeg_consume_input
316 +#define jpeg_calc_output_dimensions chromium_jpeg_calc_output_dimensions
317 +#define jpeg_save_markers chromium_jpeg_save_markers
318 +#define jpeg_set_marker_processor chromium_jpeg_set_marker_processor
319 +#define jpeg_read_coefficients chromium_jpeg_read_coefficients
320 +#define jpeg_write_coefficients chromium_jpeg_write_coefficients
321 +#define jpeg_copy_critical_parameters chromium_jpeg_copy_critical_parameters
322 +#define jpeg_abort_compress chromium_jpeg_abort_compress
323 +#define jpeg_abort_decompress chromium_jpeg_abort_decompress
324 +#define jpeg_abort chromium_jpeg_abort
325 +#define jpeg_destroy chromium_jpeg_destroy
326 +#define jpeg_resync_to_restart chromium_jpeg_resync_to_restart
327 +#define jpeg_get_small chromium_jpeg_get_small
328 +#define jpeg_free_small chromium_jpeg_free_small
329 +#define jpeg_get_large chromium_jpeg_get_large
330 +#define jpeg_free_large chromium_jpeg_free_large
331 +#define jpeg_mem_available chromium_jpeg_mem_available
332 +#define jpeg_open_backing_store chromium_jpeg_open_backing_store
333 +#define jpeg_mem_init chromium_jpeg_mem_init
334 +#define jpeg_mem_term chromium_jpeg_mem_term
335 +
336 +#endif // THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
337 Index: simd/jcgrass2-64.asm
338 ===================================================================
339 --- simd/jcgrass2-64.asm (revision 829)
340 +++ simd/jcgrass2-64.asm (working copy)
341 @@ -30,7 +30,7 @@
342 SECTION SEG_CONST
343
344 alignz 16
345 - global EXTN(jconst_rgb_gray_convert_sse2)
346 + global EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
347
348 EXTN(jconst_rgb_gray_convert_sse2):
349
350 Index: simd/jiss2fst.asm
351 ===================================================================
352 --- simd/jiss2fst.asm (revision 829)
353 +++ simd/jiss2fst.asm (working copy)
354 @@ -59,7 +59,7 @@
355 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
356
357 alignz 16
358 - global EXTN(jconst_idct_ifast_sse2)
359 + global EXTN(jconst_idct_ifast_sse2) PRIVATE
360
361 EXTN(jconst_idct_ifast_sse2):
362
363 @@ -92,7 +92,7 @@
364 %define WK_NUM 2
365
366 align 16
367 - global EXTN(jsimd_idct_ifast_sse2)
368 + global EXTN(jsimd_idct_ifast_sse2) PRIVATE
369
370 EXTN(jsimd_idct_ifast_sse2):
371 push ebp
372 Index: simd/jcclrss2-64.asm
373 ===================================================================
374 --- simd/jcclrss2-64.asm (revision 829)
375 +++ simd/jcclrss2-64.asm (working copy)
376 @@ -37,7 +37,7 @@
377
378 align 16
379
380 - global EXTN(jsimd_rgb_ycc_convert_sse2)
381 + global EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE
382
383 EXTN(jsimd_rgb_ycc_convert_sse2):
384 push rbp
385 Index: simd/jiss2red-64.asm
386 ===================================================================
387 --- simd/jiss2red-64.asm (revision 829)
388 +++ simd/jiss2red-64.asm (working copy)
389 @@ -73,7 +73,7 @@
390 SECTION SEG_CONST
391
392 alignz 16
393 - global EXTN(jconst_idct_red_sse2)
394 + global EXTN(jconst_idct_red_sse2) PRIVATE
395
396 EXTN(jconst_idct_red_sse2):
397
398 @@ -114,7 +114,7 @@
399 %define WK_NUM 2
400
401 align 16
402 - global EXTN(jsimd_idct_4x4_sse2)
403 + global EXTN(jsimd_idct_4x4_sse2) PRIVATE
404
405 EXTN(jsimd_idct_4x4_sse2):
406 push rbp
407 @@ -413,7 +413,7 @@
408 ; r13 = JDIMENSION output_col
409
410 align 16
411 - global EXTN(jsimd_idct_2x2_sse2)
412 + global EXTN(jsimd_idct_2x2_sse2) PRIVATE
413
414 EXTN(jsimd_idct_2x2_sse2):
415 push rbp
416 Index: simd/ji3dnflt.asm
417 ===================================================================
418 --- simd/ji3dnflt.asm (revision 829)
419 +++ simd/ji3dnflt.asm (working copy)
420 @@ -27,7 +27,7 @@
421 SECTION SEG_CONST
422
423 alignz 16
424 - global EXTN(jconst_idct_float_3dnow)
425 + global EXTN(jconst_idct_float_3dnow) PRIVATE
426
427 EXTN(jconst_idct_float_3dnow):
428
429 @@ -63,7 +63,7 @@
430 ; FAST_FLOAT workspace[DCTSIZE2]
431
432 align 16
433 - global EXTN(jsimd_idct_float_3dnow)
434 + global EXTN(jsimd_idct_float_3dnow) PRIVATE
435
436 EXTN(jsimd_idct_float_3dnow):
437 push ebp
438 Index: simd/jsimdcpu.asm
439 ===================================================================
440 --- simd/jsimdcpu.asm (revision 829)
441 +++ simd/jsimdcpu.asm (working copy)
442 @@ -29,7 +29,7 @@
443 ;
444
445 align 16
446 - global EXTN(jpeg_simd_cpu_support)
447 + global EXTN(jpeg_simd_cpu_support) PRIVATE
448
449 EXTN(jpeg_simd_cpu_support):
450 push ebx
451 Index: simd/jdmerss2-64.asm
452 ===================================================================
453 --- simd/jdmerss2-64.asm (revision 829)
454 +++ simd/jdmerss2-64.asm (working copy)
455 @@ -35,7 +35,7 @@
456 SECTION SEG_CONST
457
458 alignz 16
459 - global EXTN(jconst_merged_upsample_sse2)
460 + global EXTN(jconst_merged_upsample_sse2) PRIVATE
461
462 EXTN(jconst_merged_upsample_sse2):
463
464 Index: simd/jdsammmx.asm
465 ===================================================================
466 --- simd/jdsammmx.asm (revision 829)
467 +++ simd/jdsammmx.asm (working copy)
468 @@ -22,7 +22,7 @@
469 SECTION SEG_CONST
470
471 alignz 16
472 - global EXTN(jconst_fancy_upsample_mmx)
473 + global EXTN(jconst_fancy_upsample_mmx) PRIVATE
474
475 EXTN(jconst_fancy_upsample_mmx):
476
477 @@ -58,7 +58,7 @@
478 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
479
480 align 16
481 - global EXTN(jsimd_h2v1_fancy_upsample_mmx)
482 + global EXTN(jsimd_h2v1_fancy_upsample_mmx) PRIVATE
483
484 EXTN(jsimd_h2v1_fancy_upsample_mmx):
485 push ebp
486 @@ -216,7 +216,7 @@
487 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
488
489 align 16
490 - global EXTN(jsimd_h2v2_fancy_upsample_mmx)
491 + global EXTN(jsimd_h2v2_fancy_upsample_mmx) PRIVATE
492
493 EXTN(jsimd_h2v2_fancy_upsample_mmx):
494 push ebp
495 @@ -542,7 +542,7 @@
496 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
497
498 align 16
499 - global EXTN(jsimd_h2v1_upsample_mmx)
500 + global EXTN(jsimd_h2v1_upsample_mmx) PRIVATE
501
502 EXTN(jsimd_h2v1_upsample_mmx):
503 push ebp
504 @@ -643,7 +643,7 @@
505 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
506
507 align 16
508 - global EXTN(jsimd_h2v2_upsample_mmx)
509 + global EXTN(jsimd_h2v2_upsample_mmx) PRIVATE
510
511 EXTN(jsimd_h2v2_upsample_mmx):
512 push ebp
513 Index: simd/jdmrgmmx.asm
514 ===================================================================
515 --- simd/jdmrgmmx.asm (revision 829)
516 +++ simd/jdmrgmmx.asm (working copy)
517 @@ -40,7 +40,7 @@
518 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
519
520 align 16
521 - global EXTN(jsimd_h2v1_merged_upsample_mmx)
522 + global EXTN(jsimd_h2v1_merged_upsample_mmx) PRIVATE
523
524 EXTN(jsimd_h2v1_merged_upsample_mmx):
525 push ebp
526 @@ -409,7 +409,7 @@
527 %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
528
529 align 16
530 - global EXTN(jsimd_h2v2_merged_upsample_mmx)
531 + global EXTN(jsimd_h2v2_merged_upsample_mmx) PRIVATE
532
533 EXTN(jsimd_h2v2_merged_upsample_mmx):
534 push ebp
535 Index: simd/jdsamss2.asm
536 ===================================================================
537 --- simd/jdsamss2.asm (revision 829)
538 +++ simd/jdsamss2.asm (working copy)
539 @@ -22,7 +22,7 @@
540 SECTION SEG_CONST
541
542 alignz 16
543 - global EXTN(jconst_fancy_upsample_sse2)
544 + global EXTN(jconst_fancy_upsample_sse2) PRIVATE
545
546 EXTN(jconst_fancy_upsample_sse2):
547
548 @@ -58,7 +58,7 @@
549 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
550
551 align 16
552 - global EXTN(jsimd_h2v1_fancy_upsample_sse2)
553 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
554
555 EXTN(jsimd_h2v1_fancy_upsample_sse2):
556 push ebp
557 @@ -214,7 +214,7 @@
558 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
559
560 align 16
561 - global EXTN(jsimd_h2v2_fancy_upsample_sse2)
562 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
563
564 EXTN(jsimd_h2v2_fancy_upsample_sse2):
565 push ebp
566 @@ -538,7 +538,7 @@
567 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
568
569 align 16
570 - global EXTN(jsimd_h2v1_upsample_sse2)
571 + global EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
572
573 EXTN(jsimd_h2v1_upsample_sse2):
574 push ebp
575 @@ -637,7 +637,7 @@
576 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr
577
578 align 16
579 - global EXTN(jsimd_h2v2_upsample_sse2)
580 + global EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
581
582 EXTN(jsimd_h2v2_upsample_sse2):
583 push ebp
584 Index: simd/jiss2flt-64.asm
585 ===================================================================
586 --- simd/jiss2flt-64.asm (revision 829)
587 +++ simd/jiss2flt-64.asm (working copy)
588 @@ -38,7 +38,7 @@
589 SECTION SEG_CONST
590
591 alignz 16
592 - global EXTN(jconst_idct_float_sse2)
593 + global EXTN(jconst_idct_float_sse2) PRIVATE
594
595 EXTN(jconst_idct_float_sse2):
596
597 @@ -74,7 +74,7 @@
598 ; FAST_FLOAT workspace[DCTSIZE2]
599
600 align 16
601 - global EXTN(jsimd_idct_float_sse2)
602 + global EXTN(jsimd_idct_float_sse2) PRIVATE
603
604 EXTN(jsimd_idct_float_sse2):
605 push rbp
606 Index: simd/jfss2int-64.asm
607 ===================================================================
608 --- simd/jfss2int-64.asm (revision 829)
609 +++ simd/jfss2int-64.asm (working copy)
610 @@ -67,7 +67,7 @@
611 SECTION SEG_CONST
612
613 alignz 16
614 - global EXTN(jconst_fdct_islow_sse2)
615 + global EXTN(jconst_fdct_islow_sse2) PRIVATE
616
617 EXTN(jconst_fdct_islow_sse2):
618
619 @@ -101,7 +101,7 @@
620 %define WK_NUM 6
621
622 align 16
623 - global EXTN(jsimd_fdct_islow_sse2)
624 + global EXTN(jsimd_fdct_islow_sse2) PRIVATE
625
626 EXTN(jsimd_fdct_islow_sse2):
627 push rbp
628 Index: simd/jcqnts2f.asm
629 ===================================================================
630 --- simd/jcqnts2f.asm (revision 829)
631 +++ simd/jcqnts2f.asm (working copy)
632 @@ -35,7 +35,7 @@
633 %define workspace ebp+16 ; FAST_FLOAT * workspace
634
635 align 16
636 - global EXTN(jsimd_convsamp_float_sse2)
637 + global EXTN(jsimd_convsamp_float_sse2) PRIVATE
638
639 EXTN(jsimd_convsamp_float_sse2):
640 push ebp
641 @@ -115,7 +115,7 @@
642 %define workspace ebp+16 ; FAST_FLOAT * workspace
643
644 align 16
645 - global EXTN(jsimd_quantize_float_sse2)
646 + global EXTN(jsimd_quantize_float_sse2) PRIVATE
647
648 EXTN(jsimd_quantize_float_sse2):
649 push ebp
650 Index: simd/jdmrgss2.asm
651 ===================================================================
652 --- simd/jdmrgss2.asm (revision 829)
653 +++ simd/jdmrgss2.asm (working copy)
654 @@ -40,7 +40,7 @@
655 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
656
657 align 16
658 - global EXTN(jsimd_h2v1_merged_upsample_sse2)
659 + global EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE
660
661 EXTN(jsimd_h2v1_merged_upsample_sse2):
662 push ebp
663 @@ -560,7 +560,7 @@
664 %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
665
666 align 16
667 - global EXTN(jsimd_h2v2_merged_upsample_sse2)
668 + global EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE
669
670 EXTN(jsimd_h2v2_merged_upsample_sse2):
671 push ebp
672 Index: simd/jfmmxint.asm
673 ===================================================================
674 --- simd/jfmmxint.asm (revision 829)
675 +++ simd/jfmmxint.asm (working copy)
676 @@ -66,7 +66,7 @@
677 SECTION SEG_CONST
678
679 alignz 16
680 - global EXTN(jconst_fdct_islow_mmx)
681 + global EXTN(jconst_fdct_islow_mmx) PRIVATE
682
683 EXTN(jconst_fdct_islow_mmx):
684
685 @@ -101,7 +101,7 @@
686 %define WK_NUM 2
687
688 align 16
689 - global EXTN(jsimd_fdct_islow_mmx)
690 + global EXTN(jsimd_fdct_islow_mmx) PRIVATE
691
692 EXTN(jsimd_fdct_islow_mmx):
693 push ebp
694 Index: simd/jcgryss2-64.asm
695 ===================================================================
696 --- simd/jcgryss2-64.asm (revision 829)
697 +++ simd/jcgryss2-64.asm (working copy)
698 @@ -37,7 +37,7 @@
699
700 align 16
701
702 - global EXTN(jsimd_rgb_gray_convert_sse2)
703 + global EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE
704
705 EXTN(jsimd_rgb_gray_convert_sse2):
706 push rbp
707 Index: simd/jcqnts2i.asm
708 ===================================================================
709 --- simd/jcqnts2i.asm (revision 829)
710 +++ simd/jcqnts2i.asm (working copy)
711 @@ -35,7 +35,7 @@
712 %define workspace ebp+16 ; DCTELEM * workspace
713
714 align 16
715 - global EXTN(jsimd_convsamp_sse2)
716 + global EXTN(jsimd_convsamp_sse2) PRIVATE
717
718 EXTN(jsimd_convsamp_sse2):
719 push ebp
720 @@ -117,7 +117,7 @@
721 %define workspace ebp+16 ; DCTELEM * workspace
722
723 align 16
724 - global EXTN(jsimd_quantize_sse2)
725 + global EXTN(jsimd_quantize_sse2) PRIVATE
726
727 EXTN(jsimd_quantize_sse2):
728 push ebp
729 Index: simd/jiss2fst-64.asm
730 ===================================================================
731 --- simd/jiss2fst-64.asm (revision 829)
732 +++ simd/jiss2fst-64.asm (working copy)
733 @@ -60,7 +60,7 @@
734 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
735
736 alignz 16
737 - global EXTN(jconst_idct_ifast_sse2)
738 + global EXTN(jconst_idct_ifast_sse2) PRIVATE
739
740 EXTN(jconst_idct_ifast_sse2):
741
742 @@ -93,7 +93,7 @@
743 %define WK_NUM 2
744
745 align 16
746 - global EXTN(jsimd_idct_ifast_sse2)
747 + global EXTN(jsimd_idct_ifast_sse2) PRIVATE
748
749 EXTN(jsimd_idct_ifast_sse2):
750 push rbp
751 Index: simd/jiss2flt.asm
752 ===================================================================
753 --- simd/jiss2flt.asm (revision 829)
754 +++ simd/jiss2flt.asm (working copy)
755 @@ -37,7 +37,7 @@
756 SECTION SEG_CONST
757
758 alignz 16
759 - global EXTN(jconst_idct_float_sse2)
760 + global EXTN(jconst_idct_float_sse2) PRIVATE
761
762 EXTN(jconst_idct_float_sse2):
763
764 @@ -73,7 +73,7 @@
765 ; FAST_FLOAT workspace[DCTSIZE2]
766
767 align 16
768 - global EXTN(jsimd_idct_float_sse2)
769 + global EXTN(jsimd_idct_float_sse2) PRIVATE
770
771 EXTN(jsimd_idct_float_sse2):
772 push ebp
773 Index: simd/jiss2int.asm
774 ===================================================================
775 --- simd/jiss2int.asm (revision 829)
776 +++ simd/jiss2int.asm (working copy)
777 @@ -66,7 +66,7 @@
778 SECTION SEG_CONST
779
780 alignz 16
781 - global EXTN(jconst_idct_islow_sse2)
782 + global EXTN(jconst_idct_islow_sse2) PRIVATE
783
784 EXTN(jconst_idct_islow_sse2):
785
786 @@ -105,7 +105,7 @@
787 %define WK_NUM 12
788
789 align 16
790 - global EXTN(jsimd_idct_islow_sse2)
791 + global EXTN(jsimd_idct_islow_sse2) PRIVATE
792
793 EXTN(jsimd_idct_islow_sse2):
794 push ebp
795 Index: simd/jfsseflt-64.asm
796 ===================================================================
797 --- simd/jfsseflt-64.asm (revision 829)
798 +++ simd/jfsseflt-64.asm (working copy)
799 @@ -38,7 +38,7 @@
800 SECTION SEG_CONST
801
802 alignz 16
803 - global EXTN(jconst_fdct_float_sse)
804 + global EXTN(jconst_fdct_float_sse) PRIVATE
805
806 EXTN(jconst_fdct_float_sse):
807
808 @@ -65,7 +65,7 @@
809 %define WK_NUM 2
810
811 align 16
812 - global EXTN(jsimd_fdct_float_sse)
813 + global EXTN(jsimd_fdct_float_sse) PRIVATE
814
815 EXTN(jsimd_fdct_float_sse):
816 push rbp
817 Index: simd/jccolss2-64.asm
818 ===================================================================
819 --- simd/jccolss2-64.asm (revision 829)
820 +++ simd/jccolss2-64.asm (working copy)
821 @@ -34,7 +34,7 @@
822 SECTION SEG_CONST
823
824 alignz 16
825 - global EXTN(jconst_rgb_ycc_convert_sse2)
826 + global EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE
827
828 EXTN(jconst_rgb_ycc_convert_sse2):
829
830 Index: simd/jcsamss2-64.asm
831 ===================================================================
832 --- simd/jcsamss2-64.asm (revision 829)
833 +++ simd/jcsamss2-64.asm (working copy)
834 @@ -41,7 +41,7 @@
835 ; r15 = JSAMPARRAY output_data
836
837 align 16
838 - global EXTN(jsimd_h2v1_downsample_sse2)
839 + global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
840
841 EXTN(jsimd_h2v1_downsample_sse2):
842 push rbp
843 @@ -185,7 +185,7 @@
844 ; r15 = JSAMPARRAY output_data
845
846 align 16
847 - global EXTN(jsimd_h2v2_downsample_sse2)
848 + global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
849
850 EXTN(jsimd_h2v2_downsample_sse2):
851 push rbp
852 Index: simd/jdclrss2-64.asm
853 ===================================================================
854 --- simd/jdclrss2-64.asm (revision 829)
855 +++ simd/jdclrss2-64.asm (working copy)
856 @@ -39,7 +39,7 @@
857 %define WK_NUM 2
858
859 align 16
860 - global EXTN(jsimd_ycc_rgb_convert_sse2)
861 + global EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
862
863 EXTN(jsimd_ycc_rgb_convert_sse2):
864 push rbp
865 Index: simd/jdcolmmx.asm
866 ===================================================================
867 --- simd/jdcolmmx.asm (revision 829)
868 +++ simd/jdcolmmx.asm (working copy)
869 @@ -35,7 +35,7 @@
870 SECTION SEG_CONST
871
872 alignz 16
873 - global EXTN(jconst_ycc_rgb_convert_mmx)
874 + global EXTN(jconst_ycc_rgb_convert_mmx) PRIVATE
875
876 EXTN(jconst_ycc_rgb_convert_mmx):
877
878 Index: simd/jcclrmmx.asm
879 ===================================================================
880 --- simd/jcclrmmx.asm (revision 829)
881 +++ simd/jcclrmmx.asm (working copy)
882 @@ -40,7 +40,7 @@
883 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
884
885 align 16
886 - global EXTN(jsimd_rgb_ycc_convert_mmx)
887 + global EXTN(jsimd_rgb_ycc_convert_mmx) PRIVATE
888
889 EXTN(jsimd_rgb_ycc_convert_mmx):
890 push ebp
891 Index: simd/jfsseflt.asm
892 ===================================================================
893 --- simd/jfsseflt.asm (revision 829)
894 +++ simd/jfsseflt.asm (working copy)
895 @@ -37,7 +37,7 @@
896 SECTION SEG_CONST
897
898 alignz 16
899 - global EXTN(jconst_fdct_float_sse)
900 + global EXTN(jconst_fdct_float_sse) PRIVATE
901
902 EXTN(jconst_fdct_float_sse):
903
904 @@ -65,7 +65,7 @@
905 %define WK_NUM 2
906
907 align 16
908 - global EXTN(jsimd_fdct_float_sse)
909 + global EXTN(jsimd_fdct_float_sse) PRIVATE
910
911 EXTN(jsimd_fdct_float_sse):
912 push ebp
913 Index: simd/jdmrgss2-64.asm
914 ===================================================================
915 --- simd/jdmrgss2-64.asm (revision 829)
916 +++ simd/jdmrgss2-64.asm (working copy)
917 @@ -39,7 +39,7 @@
918 %define WK_NUM 3
919
920 align 16
921 - global EXTN(jsimd_h2v1_merged_upsample_sse2)
922 + global EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE
923
924 EXTN(jsimd_h2v1_merged_upsample_sse2):
925 push rbp
926 @@ -543,7 +543,7 @@
927 ; r13 = JSAMPARRAY output_buf
928
929 align 16
930 - global EXTN(jsimd_h2v2_merged_upsample_sse2)
931 + global EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE
932
933 EXTN(jsimd_h2v2_merged_upsample_sse2):
934 push rbp
935 Index: simd/jdcolss2.asm
936 ===================================================================
937 --- simd/jdcolss2.asm (revision 829)
938 +++ simd/jdcolss2.asm (working copy)
939 @@ -35,7 +35,7 @@
940 SECTION SEG_CONST
941
942 alignz 16
943 - global EXTN(jconst_ycc_rgb_convert_sse2)
944 + global EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
945
946 EXTN(jconst_ycc_rgb_convert_sse2):
947
948 Index: simd/jdmermmx.asm
949 ===================================================================
950 --- simd/jdmermmx.asm (revision 829)
951 +++ simd/jdmermmx.asm (working copy)
952 @@ -35,7 +35,7 @@
953 SECTION SEG_CONST
954
955 alignz 16
956 - global EXTN(jconst_merged_upsample_mmx)
957 + global EXTN(jconst_merged_upsample_mmx) PRIVATE
958
959 EXTN(jconst_merged_upsample_mmx):
960
961 Index: simd/jcclrss2.asm
962 ===================================================================
963 --- simd/jcclrss2.asm (revision 829)
964 +++ simd/jcclrss2.asm (working copy)
965 @@ -38,7 +38,7 @@
966
967 align 16
968
969 - global EXTN(jsimd_rgb_ycc_convert_sse2)
970 + global EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE
971
972 EXTN(jsimd_rgb_ycc_convert_sse2):
973 push ebp
974 Index: simd/jiss2red.asm
975 ===================================================================
976 --- simd/jiss2red.asm (revision 829)
977 +++ simd/jiss2red.asm (working copy)
978 @@ -72,7 +72,7 @@
979 SECTION SEG_CONST
980
981 alignz 16
982 - global EXTN(jconst_idct_red_sse2)
983 + global EXTN(jconst_idct_red_sse2) PRIVATE
984
985 EXTN(jconst_idct_red_sse2):
986
987 @@ -113,7 +113,7 @@
988 %define WK_NUM 2
989
990 align 16
991 - global EXTN(jsimd_idct_4x4_sse2)
992 + global EXTN(jsimd_idct_4x4_sse2) PRIVATE
993
994 EXTN(jsimd_idct_4x4_sse2):
995 push ebp
996 @@ -424,7 +424,7 @@
997 %define output_col(b) (b)+20 ; JDIMENSION output_col
998
999 align 16
1000 - global EXTN(jsimd_idct_2x2_sse2)
1001 + global EXTN(jsimd_idct_2x2_sse2) PRIVATE
1002
1003 EXTN(jsimd_idct_2x2_sse2):
1004 push ebp
1005 Index: simd/jdmerss2.asm
1006 ===================================================================
1007 --- simd/jdmerss2.asm (revision 829)
1008 +++ simd/jdmerss2.asm (working copy)
1009 @@ -35,7 +35,7 @@
1010 SECTION SEG_CONST
1011
1012 alignz 16
1013 - global EXTN(jconst_merged_upsample_sse2)
1014 + global EXTN(jconst_merged_upsample_sse2) PRIVATE
1015
1016 EXTN(jconst_merged_upsample_sse2):
1017
1018 Index: simd/jfss2fst-64.asm
1019 ===================================================================
1020 --- simd/jfss2fst-64.asm (revision 829)
1021 +++ simd/jfss2fst-64.asm (working copy)
1022 @@ -53,7 +53,7 @@
1023 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
1024
1025 alignz 16
1026 - global EXTN(jconst_fdct_ifast_sse2)
1027 + global EXTN(jconst_fdct_ifast_sse2) PRIVATE
1028
1029 EXTN(jconst_fdct_ifast_sse2):
1030
1031 @@ -80,7 +80,7 @@
1032 %define WK_NUM 2
1033
1034 align 16
1035 - global EXTN(jsimd_fdct_ifast_sse2)
1036 + global EXTN(jsimd_fdct_ifast_sse2) PRIVATE
1037
1038 EXTN(jsimd_fdct_ifast_sse2):
1039 push rbp
1040 Index: simd/jcqntmmx.asm
1041 ===================================================================
1042 --- simd/jcqntmmx.asm (revision 829)
1043 +++ simd/jcqntmmx.asm (working copy)
1044 @@ -35,7 +35,7 @@
1045 %define workspace ebp+16 ; DCTELEM * workspace
1046
1047 align 16
1048 - global EXTN(jsimd_convsamp_mmx)
1049 + global EXTN(jsimd_convsamp_mmx) PRIVATE
1050
1051 EXTN(jsimd_convsamp_mmx):
1052 push ebp
1053 @@ -140,7 +140,7 @@
1054 %define workspace ebp+16 ; DCTELEM * workspace
1055
1056 align 16
1057 - global EXTN(jsimd_quantize_mmx)
1058 + global EXTN(jsimd_quantize_mmx) PRIVATE
1059
1060 EXTN(jsimd_quantize_mmx):
1061 push ebp
1062 Index: simd/jimmxfst.asm
1063 ===================================================================
1064 --- simd/jimmxfst.asm (revision 829)
1065 +++ simd/jimmxfst.asm (working copy)
1066 @@ -59,7 +59,7 @@
1067 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
1068
1069 alignz 16
1070 - global EXTN(jconst_idct_ifast_mmx)
1071 + global EXTN(jconst_idct_ifast_mmx) PRIVATE
1072
1073 EXTN(jconst_idct_ifast_mmx):
1074
1075 @@ -94,7 +94,7 @@
1076 ; JCOEF workspace[DCTSIZE2]
1077
1078 align 16
1079 - global EXTN(jsimd_idct_ifast_mmx)
1080 + global EXTN(jsimd_idct_ifast_mmx) PRIVATE
1081
1082 EXTN(jsimd_idct_ifast_mmx):
1083 push ebp
1084 Index: simd/jfss2fst.asm
1085 ===================================================================
1086 --- simd/jfss2fst.asm (revision 829)
1087 +++ simd/jfss2fst.asm (working copy)
1088 @@ -52,7 +52,7 @@
1089 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
1090
1091 alignz 16
1092 - global EXTN(jconst_fdct_ifast_sse2)
1093 + global EXTN(jconst_fdct_ifast_sse2) PRIVATE
1094
1095 EXTN(jconst_fdct_ifast_sse2):
1096
1097 @@ -80,7 +80,7 @@
1098 %define WK_NUM 2
1099
1100 align 16
1101 - global EXTN(jsimd_fdct_ifast_sse2)
1102 + global EXTN(jsimd_fdct_ifast_sse2) PRIVATE
1103
1104 EXTN(jsimd_fdct_ifast_sse2):
1105 push ebp
1106 Index: simd/jcgrammx.asm
1107 ===================================================================
1108 --- simd/jcgrammx.asm (revision 829)
1109 +++ simd/jcgrammx.asm (working copy)
1110 @@ -33,7 +33,7 @@
1111 SECTION SEG_CONST
1112
1113 alignz 16
1114 - global EXTN(jconst_rgb_gray_convert_mmx)
1115 + global EXTN(jconst_rgb_gray_convert_mmx) PRIVATE
1116
1117 EXTN(jconst_rgb_gray_convert_mmx):
1118
1119 Index: simd/jdcolss2-64.asm
1120 ===================================================================
1121 --- simd/jdcolss2-64.asm (revision 829)
1122 +++ simd/jdcolss2-64.asm (working copy)
1123 @@ -35,7 +35,7 @@
1124 SECTION SEG_CONST
1125
1126 alignz 16
1127 - global EXTN(jconst_ycc_rgb_convert_sse2)
1128 + global EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
1129
1130 EXTN(jconst_ycc_rgb_convert_sse2):
1131
1132 Index: simd/jf3dnflt.asm
1133 ===================================================================
1134 --- simd/jf3dnflt.asm (revision 829)
1135 +++ simd/jf3dnflt.asm (working copy)
1136 @@ -27,7 +27,7 @@
1137 SECTION SEG_CONST
1138
1139 alignz 16
1140 - global EXTN(jconst_fdct_float_3dnow)
1141 + global EXTN(jconst_fdct_float_3dnow) PRIVATE
1142
1143 EXTN(jconst_fdct_float_3dnow):
1144
1145 @@ -55,7 +55,7 @@
1146 %define WK_NUM 2
1147
1148 align 16
1149 - global EXTN(jsimd_fdct_float_3dnow)
1150 + global EXTN(jsimd_fdct_float_3dnow) PRIVATE
1151
1152 EXTN(jsimd_fdct_float_3dnow):
1153 push ebp
1154 Index: simd/jdsamss2-64.asm
1155 ===================================================================
1156 --- simd/jdsamss2-64.asm (revision 829)
1157 +++ simd/jdsamss2-64.asm (working copy)
1158 @@ -23,7 +23,7 @@
1159 SECTION SEG_CONST
1160
1161 alignz 16
1162 - global EXTN(jconst_fancy_upsample_sse2)
1163 + global EXTN(jconst_fancy_upsample_sse2) PRIVATE
1164
1165 EXTN(jconst_fancy_upsample_sse2):
1166
1167 @@ -59,7 +59,7 @@
1168 ; r13 = JSAMPARRAY * output_data_ptr
1169
1170 align 16
1171 - global EXTN(jsimd_h2v1_fancy_upsample_sse2)
1172 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
1173
1174 EXTN(jsimd_h2v1_fancy_upsample_sse2):
1175 push rbp
1176 @@ -201,7 +201,7 @@
1177 %define WK_NUM 4
1178
1179 align 16
1180 - global EXTN(jsimd_h2v2_fancy_upsample_sse2)
1181 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
1182
1183 EXTN(jsimd_h2v2_fancy_upsample_sse2):
1184 push rbp
1185 @@ -498,7 +498,7 @@
1186 ; r13 = JSAMPARRAY * output_data_ptr
1187
1188 align 16
1189 - global EXTN(jsimd_h2v1_upsample_sse2)
1190 + global EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
1191
1192 EXTN(jsimd_h2v1_upsample_sse2):
1193 push rbp
1194 @@ -587,7 +587,7 @@
1195 ; r13 = JSAMPARRAY * output_data_ptr
1196
1197 align 16
1198 - global EXTN(jsimd_h2v2_upsample_sse2)
1199 + global EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
1200
1201 EXTN(jsimd_h2v2_upsample_sse2):
1202 push rbp
1203 Index: simd/jcgrass2.asm
1204 ===================================================================
1205 --- simd/jcgrass2.asm (revision 829)
1206 +++ simd/jcgrass2.asm (working copy)
1207 @@ -30,7 +30,7 @@
1208 SECTION SEG_CONST
1209
1210 alignz 16
1211 - global EXTN(jconst_rgb_gray_convert_sse2)
1212 + global EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
1213
1214 EXTN(jconst_rgb_gray_convert_sse2):
1215
1216 Index: simd/jcsammmx.asm
1217 ===================================================================
1218 --- simd/jcsammmx.asm (revision 829)
1219 +++ simd/jcsammmx.asm (working copy)
1220 @@ -40,7 +40,7 @@
1221 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
1222
1223 align 16
1224 - global EXTN(jsimd_h2v1_downsample_mmx)
1225 + global EXTN(jsimd_h2v1_downsample_mmx) PRIVATE
1226
1227 EXTN(jsimd_h2v1_downsample_mmx):
1228 push ebp
1229 @@ -182,7 +182,7 @@
1230 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
1231
1232 align 16
1233 - global EXTN(jsimd_h2v2_downsample_mmx)
1234 + global EXTN(jsimd_h2v2_downsample_mmx) PRIVATE
1235
1236 EXTN(jsimd_h2v2_downsample_mmx):
1237 push ebp
1238 +Index: simd/jsimd_arm.c
1239 +===================================================================
1240 +--- simd/jsimd_arm.c (revision 272637)
1241 ++++ simd/jsimd_arm.c (working copy)
1242 +@@ -29,0 +29,0 @@
1243 +
1244 + static unsigned int simd_support = ~0;
1245 +
1246 +-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
1247 ++#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
1248 +
1249 + #define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
1250 +
1251 +@@ -100,6 +100,6 @@
1252 + init_simd (void)
1253 + {
1254 + char *env = NULL;
1255 +-#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
1256 ++#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
1257 + int bufsize = 1024; /* an initial guess for the line buffer size limit */
1258 + #endif
1259 +
1260 Index: simd/jsimd_arm_neon.S
1261 ===================================================================
1262 --- simd/jsimd_arm_neon.S (revision 272637)
1263 +++ simd/jsimd_arm_neon.S (working copy)
1264 @@ -41,11 +41,9 @@
1265 /* Supplementary macro for setting function attributes */
1266 .macro asm_function fname
1267 #ifdef __APPLE__
1268 - .func _\fname
1269 .globl _\fname
1270 _\fname:
1271 #else
1272 - .func \fname
1273 .global \fname
1274 #ifdef __ELF__
1275 .hidden \fname
1276 @@ -670,7 +668,6 @@
1277 .unreq ROW6R
1278 .unreq ROW7L
1279 .unreq ROW7R
1280 -.endfunc
1281
1282
1283 /*****************************************************************************/
1284 @@ -895,7 +892,6 @@
1285 .unreq TMP2
1286 .unreq TMP3
1287 .unreq TMP4
1288 -.endfunc
1289
1290
1291 /*****************************************************************************/
1292 @@ -1108,7 +1104,6 @@
1293 .unreq TMP2
1294 .unreq TMP3
1295 .unreq TMP4
1296 -.endfunc
1297
1298 .purgem idct_helper
1299
1300 @@ -1263,7 +1258,6 @@
1301 .unreq OUTPUT_COL
1302 .unreq TMP1
1303 .unreq TMP2
1304 -.endfunc
1305
1306 .purgem idct_helper
1307
1308 @@ -1547,7 +1541,6 @@
1309 .unreq U
1310 .unreq V
1311 .unreq N
1312 -.endfunc
1313
1314 .purgem do_yuv_to_rgb
1315 .purgem do_yuv_to_rgb_stage1
1316 @@ -1858,7 +1851,6 @@
1317 .unreq U
1318 .unreq V
1319 .unreq N
1320 -.endfunc
1321
1322 .purgem do_rgb_to_yuv
1323 .purgem do_rgb_to_yuv_stage1
1324 @@ -1940,7 +1932,6 @@
1325 .unreq TMP2
1326 .unreq TMP3
1327 .unreq TMP4
1328 -.endfunc
1329
1330
1331 /*****************************************************************************/
1332 @@ -2064,7 +2055,6 @@
1333
1334 .unreq DATA
1335 .unreq TMP
1336 -.endfunc
1337
1338
1339 /*****************************************************************************/
1340 @@ -2166,7 +2156,6 @@
1341 .unreq CORRECTION
1342 .unreq SHIFT
1343 .unreq LOOP_COUNT
1344 -.endfunc
1345
1346
1347 /*****************************************************************************/
1348 @@ -2401,7 +2390,6 @@
1349 .unreq WIDTH
1350 .unreq TMP
1351
1352 -.endfunc
1353
1354 .purgem upsample16
1355 .purgem upsample32
1356 Index: simd/jsimd_i386.c
1357 ===================================================================
1358 --- simd/jsimd_i386.c (revision 829)
1359 +++ simd/jsimd_i386.c (working copy)
1360 @@ -61,6 +61,7 @@
1361 simd_support &= JSIMD_SSE2;
1362 }
1363
1364 +#ifndef JPEG_DECODE_ONLY
1365 GLOBAL(int)
1366 jsimd_can_rgb_ycc (void)
1367 {
1368 @@ -82,6 +83,7 @@
1369
1370 return 0;
1371 }
1372 +#endif
1373
1374 GLOBAL(int)
1375 jsimd_can_rgb_gray (void)
1376 @@ -127,6 +129,7 @@
1377 return 0;
1378 }
1379
1380 +#ifndef JPEG_DECODE_ONLY
1381 GLOBAL(void)
1382 jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
1383 JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
1384 @@ -179,6 +182,7 @@
1385 mmxfct(cinfo->image_width, input_buf,
1386 output_buf, output_row, num_rows);
1387 }
1388 +#endif
1389
1390 GLOBAL(void)
1391 jsimd_rgb_gray_convert (j_compress_ptr cinfo,
1392 @@ -286,6 +290,7 @@
1393 input_row, output_buf, num_rows);
1394 }
1395
1396 +#ifndef JPEG_DECODE_ONLY
1397 GLOBAL(int)
1398 jsimd_can_h2v2_downsample (void)
1399 {
1400 @@ -351,6 +356,7 @@
1401 compptr->v_samp_factor, compptr->width_in_blocks,
1402 input_data, output_data);
1403 }
1404 +#endif
1405
1406 GLOBAL(int)
1407 jsimd_can_h2v2_upsample (void)
1408 @@ -636,6 +642,7 @@
1409 in_row_group_ctr, output_buf);
1410 }
1411
1412 +#ifndef JPEG_DECODE_ONLY
1413 GLOBAL(int)
1414 jsimd_can_convsamp (void)
1415 {
1416 @@ -855,6 +862,7 @@
1417 else if (simd_support & JSIMD_3DNOW)
1418 jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
1419 }
1420 +#endif
1421
1422 GLOBAL(int)
1423 jsimd_can_idct_2x2 (void)
1424 @@ -1045,4 +1053,3 @@
1425 jsimd_idct_float_3dnow(compptr->dct_table, coef_block,
1426 output_buf, output_col);
1427 }
1428 -
1429 Index: simd/jcqnts2f-64.asm
1430 ===================================================================
1431 --- simd/jcqnts2f-64.asm (revision 829)
1432 +++ simd/jcqnts2f-64.asm (working copy)
1433 @@ -36,7 +36,7 @@
1434 ; r12 = FAST_FLOAT * workspace
1435
1436 align 16
1437 - global EXTN(jsimd_convsamp_float_sse2)
1438 + global EXTN(jsimd_convsamp_float_sse2) PRIVATE
1439
1440 EXTN(jsimd_convsamp_float_sse2):
1441 push rbp
1442 @@ -110,7 +110,7 @@
1443 ; r12 = FAST_FLOAT * workspace
1444
1445 align 16
1446 - global EXTN(jsimd_quantize_float_sse2)
1447 + global EXTN(jsimd_quantize_float_sse2) PRIVATE
1448
1449 EXTN(jsimd_quantize_float_sse2):
1450 push rbp
1451 Index: simd/jcqnt3dn.asm
1452 ===================================================================
1453 --- simd/jcqnt3dn.asm (revision 829)
1454 +++ simd/jcqnt3dn.asm (working copy)
1455 @@ -35,7 +35,7 @@
1456 %define workspace ebp+16 ; FAST_FLOAT * workspace
1457
1458 align 16
1459 - global EXTN(jsimd_convsamp_float_3dnow)
1460 + global EXTN(jsimd_convsamp_float_3dnow) PRIVATE
1461
1462 EXTN(jsimd_convsamp_float_3dnow):
1463 push ebp
1464 @@ -138,7 +138,7 @@
1465 %define workspace ebp+16 ; FAST_FLOAT * workspace
1466
1467 align 16
1468 - global EXTN(jsimd_quantize_float_3dnow)
1469 + global EXTN(jsimd_quantize_float_3dnow) PRIVATE
1470
1471 EXTN(jsimd_quantize_float_3dnow):
1472 push ebp
1473 Index: simd/jcsamss2.asm
1474 ===================================================================
1475 --- simd/jcsamss2.asm (revision 829)
1476 +++ simd/jcsamss2.asm (working copy)
1477 @@ -40,7 +40,7 @@
1478 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
1479
1480 align 16
1481 - global EXTN(jsimd_h2v1_downsample_sse2)
1482 + global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
1483
1484 EXTN(jsimd_h2v1_downsample_sse2):
1485 push ebp
1486 @@ -195,7 +195,7 @@
1487 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
1488
1489 align 16
1490 - global EXTN(jsimd_h2v2_downsample_sse2)
1491 + global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
1492
1493 EXTN(jsimd_h2v2_downsample_sse2):
1494 push ebp
1495 Index: simd/jsimd_x86_64.c
1496 ===================================================================
1497 --- simd/jsimd_x86_64.c (revision 829)
1498 +++ simd/jsimd_x86_64.c (working copy)
1499 @@ -29,6 +29,7 @@
1500
1501 #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
1502
1503 +#ifndef JPEG_DECODE_ONLY
1504 GLOBAL(int)
1505 jsimd_can_rgb_ycc (void)
1506 {
1507 @@ -45,6 +46,7 @@
1508
1509 return 1;
1510 }
1511 +#endif
1512
1513 GLOBAL(int)
1514 jsimd_can_rgb_gray (void)
1515 @@ -80,6 +82,7 @@
1516 return 1;
1517 }
1518
1519 +#ifndef JPEG_DECODE_ONLY
1520 GLOBAL(void)
1521 jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
1522 JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
1523 @@ -118,6 +121,7 @@
1524
1525 sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
1526 }
1527 +#endif
1528
1529 GLOBAL(void)
1530 jsimd_rgb_gray_convert (j_compress_ptr cinfo,
1531 @@ -197,6 +201,7 @@
1532 sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
1533 }
1534
1535 +#ifndef JPEG_DECODE_ONLY
1536 GLOBAL(int)
1537 jsimd_can_h2v2_downsample (void)
1538 {
1539 @@ -242,6 +247,7 @@
1540 compptr->width_in_blocks,
1541 input_data, output_data);
1542 }
1543 +#endif
1544
1545 GLOBAL(int)
1546 jsimd_can_h2v2_upsample (void)
1547 @@ -451,6 +457,7 @@
1548 sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
1549 }
1550
1551 +#ifndef JPEG_DECODE_ONLY
1552 GLOBAL(int)
1553 jsimd_can_convsamp (void)
1554 {
1555 @@ -601,6 +608,7 @@
1556 {
1557 jsimd_quantize_float_sse2(coef_block, divisors, workspace);
1558 }
1559 +#endif
1560
1561 GLOBAL(int)
1562 jsimd_can_idct_2x2 (void)
1563 @@ -750,4 +758,3 @@
1564 jsimd_idct_float_sse2(compptr->dct_table, coef_block,
1565 output_buf, output_col);
1566 }
1567 -
1568 Index: simd/jimmxint.asm
1569 ===================================================================
1570 --- simd/jimmxint.asm (revision 829)
1571 +++ simd/jimmxint.asm (working copy)
1572 @@ -66,7 +66,7 @@
1573 SECTION SEG_CONST
1574
1575 alignz 16
1576 - global EXTN(jconst_idct_islow_mmx)
1577 + global EXTN(jconst_idct_islow_mmx) PRIVATE
1578
1579 EXTN(jconst_idct_islow_mmx):
1580
1581 @@ -107,7 +107,7 @@
1582 ; JCOEF workspace[DCTSIZE2]
1583
1584 align 16
1585 - global EXTN(jsimd_idct_islow_mmx)
1586 + global EXTN(jsimd_idct_islow_mmx) PRIVATE
1587
1588 EXTN(jsimd_idct_islow_mmx):
1589 push ebp
1590 Index: simd/jcgrymmx.asm
1591 ===================================================================
1592 --- simd/jcgrymmx.asm (revision 829)
1593 +++ simd/jcgrymmx.asm (working copy)
1594 @@ -41,7 +41,7 @@
1595 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
1596
1597 align 16
1598 - global EXTN(jsimd_rgb_gray_convert_mmx)
1599 + global EXTN(jsimd_rgb_gray_convert_mmx) PRIVATE
1600
1601 EXTN(jsimd_rgb_gray_convert_mmx):
1602 push ebp
1603 Index: simd/jfss2int.asm
1604 ===================================================================
1605 --- simd/jfss2int.asm (revision 829)
1606 +++ simd/jfss2int.asm (working copy)
1607 @@ -66,7 +66,7 @@
1608 SECTION SEG_CONST
1609
1610 alignz 16
1611 - global EXTN(jconst_fdct_islow_sse2)
1612 + global EXTN(jconst_fdct_islow_sse2) PRIVATE
1613
1614 EXTN(jconst_fdct_islow_sse2):
1615
1616 @@ -101,7 +101,7 @@
1617 %define WK_NUM 6
1618
1619 align 16
1620 - global EXTN(jsimd_fdct_islow_sse2)
1621 + global EXTN(jsimd_fdct_islow_sse2) PRIVATE
1622
1623 EXTN(jsimd_fdct_islow_sse2):
1624 push ebp
1625 Index: simd/jcgryss2.asm
1626 ===================================================================
1627 --- simd/jcgryss2.asm (revision 829)
1628 +++ simd/jcgryss2.asm (working copy)
1629 @@ -39,7 +39,7 @@
1630
1631 align 16
1632
1633 - global EXTN(jsimd_rgb_gray_convert_sse2)
1634 + global EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE
1635
1636 EXTN(jsimd_rgb_gray_convert_sse2):
1637 push ebp
1638 Index: simd/jccolmmx.asm
1639 ===================================================================
1640 --- simd/jccolmmx.asm (revision 829)
1641 +++ simd/jccolmmx.asm (working copy)
1642 @@ -37,7 +37,7 @@
1643 SECTION SEG_CONST
1644
1645 alignz 16
1646 - global EXTN(jconst_rgb_ycc_convert_mmx)
1647 + global EXTN(jconst_rgb_ycc_convert_mmx) PRIVATE
1648
1649 EXTN(jconst_rgb_ycc_convert_mmx):
1650
1651 Index: simd/jimmxred.asm
1652 ===================================================================
1653 --- simd/jimmxred.asm (revision 829)
1654 +++ simd/jimmxred.asm (working copy)
1655 @@ -72,7 +72,7 @@
1656 SECTION SEG_CONST
1657
1658 alignz 16
1659 - global EXTN(jconst_idct_red_mmx)
1660 + global EXTN(jconst_idct_red_mmx) PRIVATE
1661
1662 EXTN(jconst_idct_red_mmx):
1663
1664 @@ -115,7 +115,7 @@
1665 ; JCOEF workspace[DCTSIZE2]
1666
1667 align 16
1668 - global EXTN(jsimd_idct_4x4_mmx)
1669 + global EXTN(jsimd_idct_4x4_mmx) PRIVATE
1670
1671 EXTN(jsimd_idct_4x4_mmx):
1672 push ebp
1673 @@ -503,7 +503,7 @@
1674 %define output_col(b) (b)+20 ; JDIMENSION output_col
1675
1676 align 16
1677 - global EXTN(jsimd_idct_2x2_mmx)
1678 + global EXTN(jsimd_idct_2x2_mmx) PRIVATE
1679
1680 EXTN(jsimd_idct_2x2_mmx):
1681 push ebp
1682 Index: simd/jsimdext.inc
1683 ===================================================================
1684 --- simd/jsimdext.inc (revision 829)
1685 +++ simd/jsimdext.inc (working copy)
1686 @@ -73,6 +73,9 @@
1687 ; * *BSD family Unix using elf format
1688 ; * Unix System V, including Solaris x86, UnixWare and SCO Unix
1689
1690 +; PIC is the default on Linux
1691 +%define PIC
1692 +
1693 ; mark stack as non-executable
1694 section .note.GNU-stack noalloc noexec nowrite progbits
1695
1696 @@ -375,4 +378,14 @@
1697 ;
1698 %include "jsimdcfg.inc"
1699
1700 +; Begin chromium edits
1701 +%ifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
1702 +%define PRIVATE :private_extern
1703 +%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
1704 +%define PRIVATE :hidden
1705 +%else
1706 +%define PRIVATE
1707 +%endif
1708 +; End chromium edits
1709 +
1710 ; --------------------------------------------------------------------------
1711 Index: simd/jdclrmmx.asm
1712 ===================================================================
1713 --- simd/jdclrmmx.asm (revision 829)
1714 +++ simd/jdclrmmx.asm (working copy)
1715 @@ -40,7 +40,7 @@
1716 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
1717
1718 align 16
1719 - global EXTN(jsimd_ycc_rgb_convert_mmx)
1720 + global EXTN(jsimd_ycc_rgb_convert_mmx) PRIVATE
1721
1722 EXTN(jsimd_ycc_rgb_convert_mmx):
1723 push ebp
1724 Index: simd/jccolss2.asm
1725 ===================================================================
1726 --- simd/jccolss2.asm (revision 829)
1727 +++ simd/jccolss2.asm (working copy)
1728 @@ -34,7 +34,7 @@
1729 SECTION SEG_CONST
1730
1731 alignz 16
1732 - global EXTN(jconst_rgb_ycc_convert_sse2)
1733 + global EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE
1734
1735 EXTN(jconst_rgb_ycc_convert_sse2):
1736
1737 Index: simd/jisseflt.asm
1738 ===================================================================
1739 --- simd/jisseflt.asm (revision 829)
1740 +++ simd/jisseflt.asm (working copy)
1741 @@ -37,7 +37,7 @@
1742 SECTION SEG_CONST
1743
1744 alignz 16
1745 - global EXTN(jconst_idct_float_sse)
1746 + global EXTN(jconst_idct_float_sse) PRIVATE
1747
1748 EXTN(jconst_idct_float_sse):
1749
1750 @@ -73,7 +73,7 @@
1751 ; FAST_FLOAT workspace[DCTSIZE2]
1752
1753 align 16
1754 - global EXTN(jsimd_idct_float_sse)
1755 + global EXTN(jsimd_idct_float_sse) PRIVATE
1756
1757 EXTN(jsimd_idct_float_sse):
1758 push ebp
1759 Index: simd/jcqnts2i-64.asm
1760 ===================================================================
1761 --- simd/jcqnts2i-64.asm (revision 829)
1762 +++ simd/jcqnts2i-64.asm (working copy)
1763 @@ -36,7 +36,7 @@
1764 ; r12 = DCTELEM * workspace
1765
1766 align 16
1767 - global EXTN(jsimd_convsamp_sse2)
1768 + global EXTN(jsimd_convsamp_sse2) PRIVATE
1769
1770 EXTN(jsimd_convsamp_sse2):
1771 push rbp
1772 @@ -112,7 +112,7 @@
1773 ; r12 = DCTELEM * workspace
1774
1775 align 16
1776 - global EXTN(jsimd_quantize_sse2)
1777 + global EXTN(jsimd_quantize_sse2) PRIVATE
1778
1779 EXTN(jsimd_quantize_sse2):
1780 push rbp
1781 Index: simd/jdclrss2.asm
1782 ===================================================================
1783 --- simd/jdclrss2.asm (revision 829)
1784 +++ simd/jdclrss2.asm (working copy)
1785 @@ -40,7 +40,7 @@
1786 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
1787
1788 align 16
1789 - global EXTN(jsimd_ycc_rgb_convert_sse2)
1790 + global EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
1791
1792 EXTN(jsimd_ycc_rgb_convert_sse2):
1793 push ebp
1794 Index: simd/jcqntsse.asm
1795 ===================================================================
1796 --- simd/jcqntsse.asm (revision 829)
1797 +++ simd/jcqntsse.asm (working copy)
1798 @@ -35,7 +35,7 @@
1799 %define workspace ebp+16 ; FAST_FLOAT * workspace
1800
1801 align 16
1802 - global EXTN(jsimd_convsamp_float_sse)
1803 + global EXTN(jsimd_convsamp_float_sse) PRIVATE
1804
1805 EXTN(jsimd_convsamp_float_sse):
1806 push ebp
1807 @@ -138,7 +138,7 @@
1808 %define workspace ebp+16 ; FAST_FLOAT * workspace
1809
1810 align 16
1811 - global EXTN(jsimd_quantize_float_sse)
1812 + global EXTN(jsimd_quantize_float_sse) PRIVATE
1813
1814 EXTN(jsimd_quantize_float_sse):
1815 push ebp
1816 Index: simd/jiss2int-64.asm
1817 ===================================================================
1818 --- simd/jiss2int-64.asm (revision 829)
1819 +++ simd/jiss2int-64.asm (working copy)
1820 @@ -67,7 +67,7 @@
1821 SECTION SEG_CONST
1822
1823 alignz 16
1824 - global EXTN(jconst_idct_islow_sse2)
1825 + global EXTN(jconst_idct_islow_sse2) PRIVATE
1826
1827 EXTN(jconst_idct_islow_sse2):
1828
1829 @@ -106,7 +106,7 @@
1830 %define WK_NUM 12
1831
1832 align 16
1833 - global EXTN(jsimd_idct_islow_sse2)
1834 + global EXTN(jsimd_idct_islow_sse2) PRIVATE
1835
1836 EXTN(jsimd_idct_islow_sse2):
1837 push rbp
1838 Index: simd/jfmmxfst.asm
1839 ===================================================================
1840 --- simd/jfmmxfst.asm (revision 829)
1841 +++ simd/jfmmxfst.asm (working copy)
1842 @@ -52,7 +52,7 @@
1843 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
1844
1845 alignz 16
1846 - global EXTN(jconst_fdct_ifast_mmx)
1847 + global EXTN(jconst_fdct_ifast_mmx) PRIVATE
1848
1849 EXTN(jconst_fdct_ifast_mmx):
1850
1851 @@ -80,7 +80,7 @@
1852 %define WK_NUM 2
1853
1854 align 16
1855 - global EXTN(jsimd_fdct_ifast_mmx)
1856 + global EXTN(jsimd_fdct_ifast_mmx) PRIVATE
1857
1858 EXTN(jsimd_fdct_ifast_mmx):
1859 push ebp
1860 Index: jdarith.c
1861 ===================================================================
1862 --- jdarith.c (revision 829)
1863 +++ jdarith.c (working copy)
1864 @@ -150,8 +150,8 @@
1865 */
1866 sv = *st;
1867 qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */
1868 - nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
1869 - nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
1870 + nl = (unsigned char) qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
1871 + nm = (unsigned char) qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
1872
1873 /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
1874 temp = e->a - qe;
1875 Index: jdhuff.c
1876 ===================================================================
1877 --- jdhuff.c (revision 1541)
1878 +++ jdhuff.c (working copy)
1879 @@ -662,7 +662,7 @@
1880 d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn];
1881 register int s, k, r, l;
1882
1883 - HUFF_DECODE_FAST(s, l, dctbl);
1884 + HUFF_DECODE_FAST(s, l, dctbl, slow_decode_mcu);
1885 if (s) {
1886 FILL_BIT_BUFFER_FAST
1887 r = GET_BITS(s);
1888 @@ -679,7 +679,7 @@
1889 if (entropy->ac_needed[blkn]) {
1890
1891 for (k = 1; k < DCTSIZE2; k++) {
1892 - HUFF_DECODE_FAST(s, l, actbl);
1893 + HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
1894 r = s >> 4;
1895 s &= 15;
1896
1897 @@ -698,7 +698,7 @@
1898 } else {
1899
1900 for (k = 1; k < DCTSIZE2; k++) {
1901 - HUFF_DECODE_FAST(s, l, actbl);
1902 + HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
1903 r = s >> 4;
1904 s &= 15;
1905
1906 @@ -715,6 +715,7 @@
1907 }
1908
1909 if (cinfo->unread_marker != 0) {
1910 +slow_decode_mcu:
1911 cinfo->unread_marker = 0;
1912 return FALSE;
1913 }
1914 @@ -742,7 +743,7 @@
1915 * this module, since we'll just re-assign them on the next call.)
1916 */
1917
1918 -#define BUFSIZE (DCTSIZE2 * 2)
1919 +#define BUFSIZE (DCTSIZE2 * 2u)
1920
1921 METHODDEF(boolean)
1922 decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
1923 Index: jdhuff.h
1924 ===================================================================
1925 --- jdhuff.h (revision 1541)
1926 +++ jdhuff.h (working copy)
1927 @@ -208,7 +208,7 @@
1928 } \
1929 }
1930
1931 -#define HUFF_DECODE_FAST(s,nb,htbl) \
1932 +#define HUFF_DECODE_FAST(s,nb,htbl,slowlabel) \
1933 FILL_BIT_BUFFER_FAST; \
1934 s = PEEK_BITS(HUFF_LOOKAHEAD); \
1935 s = htbl->lookup[s]; \
1936 @@ -225,7 +225,9 @@
1937 s |= GET_BITS(1); \
1938 nb++; \
1939 } \
1940 - s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) & 0xFF ]; \
1941 + if (nb > 16) \
1942 + goto slowlabel; \
1943 + s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) ]; \
1944 }
1945
1946 /* Out-of-line case for Huffman code fetching */
1947
1948 Index: jchuff.c
1949 ===================================================================
1950 --- jchuff.c (revision 1219)
1951 +++ jchuff.c (revision 1220)
1952 @@ -22,8 +22,36 @@
1953 #include "jchuff.h" /* Declarations shared with jcphuff.c */
1954 #include <limits.h>
1955
1956 +/*
1957 + * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
1958 + * used for bit counting rather than the lookup table. This will reduce the
1959 + * memory footprint by 64k, which is important for some mobile applications
1960 + * that create many isolated instances of libjpeg-turbo (web browsers, for
1961 + * instance.) This may improve performance on some mobile platforms as well.
1962 + * This feature is enabled by default only on ARM processors, because some x86
1963 + * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
1964 + * shown to have a significant performance impact even on the x86 chips that
1965 + * have a fast implementation of it. When building for ARMv6, you can
1966 + * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
1967 + * flags (this defines __thumb__).
1968 + */
1969 +
1970 +/* NOTE: Both GCC and Clang define __GNUC__ */
1971 +#if defined __GNUC__ && defined __arm__
1972 +#if !defined __thumb__ || defined __thumb2__
1973 +#define USE_CLZ_INTRINSIC
1974 +#endif
1975 +#endif
1976 +
1977 +#ifdef USE_CLZ_INTRINSIC
1978 +#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
1979 +#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
1980 +#else
1981 static unsigned char jpeg_nbits_table[65536];
1982 static int jpeg_nbits_table_init = 0;
1983 +#define JPEG_NBITS(x) (jpeg_nbits_table[x])
1984 +#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
1985 +#endif
1986
1987 #ifndef min
1988 #define min(a,b) ((a)<(b)?(a):(b))
1989 @@ -272,6 +300,7 @@
1990 dtbl->ehufsi[i] = huffsize[p];
1991 }
1992
1993 +#ifndef USE_CLZ_INTRINSIC
1994 if(!jpeg_nbits_table_init) {
1995 for(i = 0; i < 65536; i++) {
1996 int nbits = 0, temp = i;
1997 @@ -280,6 +309,7 @@
1998 }
1999 jpeg_nbits_table_init = 1;
2000 }
2001 +#endif
2002 }
2003
2004
2005 @@ -482,7 +512,7 @@
2006 temp2 += temp3;
2007
2008 /* Find the number of bits needed for the magnitude of the coefficient */
2009 - nbits = jpeg_nbits_table[temp];
2010 + nbits = JPEG_NBITS(temp);
2011
2012 /* Emit the Huffman-coded symbol for the number of bits */
2013 code = dctbl->ehufco[nbits];
2014 @@ -516,7 +546,7 @@
2015 temp ^= temp3; \
2016 temp -= temp3; \
2017 temp2 += temp3; \
2018 - nbits = jpeg_nbits_table[temp]; \
2019 + nbits = JPEG_NBITS_NONZERO(temp); \
2020 /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
2021 while (r > 15) { \
2022 EMIT_BITS(code_0xf0, size_0xf0) \
2023 Index: simd/jsimd_arm64.c
2024 ===================================================================
2025 --- /dev/null
2026 +++ simd/jsimd_arm64.c
2027 @@ -0,0 +1,544 @@
2028 +/*
2029 + * jsimd_arm64.c
2030 + *
2031 + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
2032 + * Copyright 2009-2011, 2013-2014 D. R. Commander
2033 + *
2034 + * Based on the x86 SIMD extension for IJG JPEG library,
2035 + * Copyright (C) 1999-2006, MIYASAKA Masaru.
2036 + * For conditions of distribution and use, see copyright notice in jsimdext.inc
2037 + *
2038 + * This file contains the interface between the "normal" portions
2039 + * of the library and the SIMD implementations when running on a
2040 + * 64-bit ARM architecture.
2041 + */
2042 +
2043 +#define JPEG_INTERNALS
2044 +#include "../jinclude.h"
2045 +#include "../jpeglib.h"
2046 +#include "../jsimd.h"
2047 +#include "../jdct.h"
2048 +#include "../jsimddct.h"
2049 +#include "jsimd.h"
2050 +
2051 +#include <stdio.h>
2052 +#include <string.h>
2053 +#include <ctype.h>
2054 +
2055 +static unsigned int simd_support = ~0;
2056 +
2057 +/*
2058 + * Check what SIMD accelerations are supported.
2059 + *
2060 + * FIXME: This code is racy under a multi-threaded environment.
2061 + */
2062 +
2063 +/*
2064 + * ARMv8 architectures support NEON extensions by default.
2065 + * It is no longer optional as it was with ARMv7.
2066 + */
2067 +
2068 +
2069 +LOCAL(void)
2070 +init_simd (void)
2071 +{
2072 + char *env = NULL;
2073 +
2074 + if (simd_support != ~0U)
2075 + return;
2076 +
2077 + simd_support = 0;
2078 +
2079 + simd_support |= JSIMD_ARM_NEON;
2080 +
2081 + /* Force different settings through environment variables */
2082 + env = getenv("JSIMD_FORCENEON");
2083 + if ((env != NULL) && (strcmp(env, "1") == 0))
2084 + simd_support &= JSIMD_ARM_NEON;
2085 + env = getenv("JSIMD_FORCENONE");
2086 + if ((env != NULL) && (strcmp(env, "1") == 0))
2087 + simd_support = 0;
2088 +}
2089 +
2090 +GLOBAL(int)
2091 +jsimd_can_rgb_ycc (void)
2092 +{
2093 + init_simd();
2094 +
2095 + return 0;
2096 +}
2097 +
2098 +GLOBAL(int)
2099 +jsimd_can_rgb_gray (void)
2100 +{
2101 + init_simd();
2102 +
2103 + return 0;
2104 +}
2105 +
2106 +GLOBAL(int)
2107 +jsimd_can_ycc_rgb (void)
2108 +{
2109 + init_simd();
2110 +
2111 + /* The code is optimised for these values only */
2112 + if (BITS_IN_JSAMPLE != 8)
2113 + return 0;
2114 + if (sizeof(JDIMENSION) != 4)
2115 + return 0;
2116 + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
2117 + return 0;
2118 +
2119 + if (simd_support & JSIMD_ARM_NEON)
2120 + return 1;
2121 +
2122 + return 0;
2123 +}
2124 +
2125 +GLOBAL(int)
2126 +jsimd_can_ycc_rgb565 (void)
2127 +{
2128 + init_simd();
2129 +
2130 + /* The code is optimised for these values only */
2131 + if (BITS_IN_JSAMPLE != 8)
2132 + return 0;
2133 + if (sizeof(JDIMENSION) != 4)
2134 + return 0;
2135 +
2136 + if (simd_support & JSIMD_ARM_NEON)
2137 + return 1;
2138 +
2139 + return 0;
2140 +}
2141 +
2142 +GLOBAL(void)
2143 +jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
2144 + JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
2145 + JDIMENSION output_row, int num_rows)
2146 +{
2147 +}
2148 +
2149 +GLOBAL(void)
2150 +jsimd_rgb_gray_convert (j_compress_ptr cinfo,
2151 + JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
2152 + JDIMENSION output_row, int num_rows)
2153 +{
2154 +}
2155 +
2156 +GLOBAL(void)
2157 +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
2158 + JSAMPIMAGE input_buf, JDIMENSION input_row,
2159 + JSAMPARRAY output_buf, int num_rows)
2160 +{
2161 + void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
2162 +
2163 + switch(cinfo->out_color_space) {
2164 + case JCS_EXT_RGB:
2165 + neonfct=jsimd_ycc_extrgb_convert_neon;
2166 + break;
2167 + case JCS_EXT_RGBX:
2168 + case JCS_EXT_RGBA:
2169 + neonfct=jsimd_ycc_extrgbx_convert_neon;
2170 + break;
2171 + case JCS_EXT_BGR:
2172 + neonfct=jsimd_ycc_extbgr_convert_neon;
2173 + break;
2174 + case JCS_EXT_BGRX:
2175 + case JCS_EXT_BGRA:
2176 + neonfct=jsimd_ycc_extbgrx_convert_neon;
2177 + break;
2178 + case JCS_EXT_XBGR:
2179 + case JCS_EXT_ABGR:
2180 + neonfct=jsimd_ycc_extxbgr_convert_neon;
2181 + break;
2182 + case JCS_EXT_XRGB:
2183 + case JCS_EXT_ARGB:
2184 + neonfct=jsimd_ycc_extxrgb_convert_neon;
2185 + break;
2186 + default:
2187 + neonfct=jsimd_ycc_extrgb_convert_neon;
2188 + break;
2189 + }
2190 +
2191 + if (simd_support & JSIMD_ARM_NEON)
2192 + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
2193 +}
2194 +
2195 +GLOBAL(void)
2196 +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
2197 + JSAMPIMAGE input_buf, JDIMENSION input_row,
2198 + JSAMPARRAY output_buf, int num_rows)
2199 +{
2200 + if (simd_support & JSIMD_ARM_NEON)
2201 + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
2202 + output_buf, num_rows);
2203 +}
2204 +
2205 +GLOBAL(int)
2206 +jsimd_can_h2v2_downsample (void)
2207 +{
2208 + init_simd();
2209 +
2210 + return 0;
2211 +}
2212 +
2213 +GLOBAL(int)
2214 +jsimd_can_h2v1_downsample (void)
2215 +{
2216 + init_simd();
2217 +
2218 + return 0;
2219 +}
2220 +
2221 +GLOBAL(void)
2222 +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
2223 + JSAMPARRAY input_data, JSAMPARRAY output_data)
2224 +{
2225 +}
2226 +
2227 +GLOBAL(void)
2228 +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
2229 + JSAMPARRAY input_data, JSAMPARRAY output_data)
2230 +{
2231 +}
2232 +
2233 +GLOBAL(int)
2234 +jsimd_can_h2v2_upsample (void)
2235 +{
2236 + init_simd();
2237 +
2238 + return 0;
2239 +}
2240 +
2241 +GLOBAL(int)
2242 +jsimd_can_h2v1_upsample (void)
2243 +{
2244 + init_simd();
2245 +
2246 + return 0;
2247 +}
2248 +
2249 +GLOBAL(void)
2250 +jsimd_h2v2_upsample (j_decompress_ptr cinfo,
2251 + jpeg_component_info * compptr,
2252 + JSAMPARRAY input_data,
2253 + JSAMPARRAY * output_data_ptr)
2254 +{
2255 +}
2256 +
2257 +GLOBAL(void)
2258 +jsimd_h2v1_upsample (j_decompress_ptr cinfo,
2259 + jpeg_component_info * compptr,
2260 + JSAMPARRAY input_data,
2261 + JSAMPARRAY * output_data_ptr)
2262 +{
2263 +}
2264 +
2265 +GLOBAL(int)
2266 +jsimd_can_h2v2_fancy_upsample (void)
2267 +{
2268 + init_simd();
2269 +
2270 + return 0;
2271 +}
2272 +
2273 +GLOBAL(int)
2274 +jsimd_can_h2v1_fancy_upsample (void)
2275 +{
2276 + init_simd();
2277 +
2278 + return 0;
2279 +}
2280 +
2281 +GLOBAL(void)
2282 +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
2283 + jpeg_component_info * compptr,
2284 + JSAMPARRAY input_data,
2285 + JSAMPARRAY * output_data_ptr)
2286 +{
2287 +}
2288 +
2289 +GLOBAL(void)
2290 +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
2291 + jpeg_component_info * compptr,
2292 + JSAMPARRAY input_data,
2293 + JSAMPARRAY * output_data_ptr)
2294 +{
2295 +}
2296 +
2297 +GLOBAL(int)
2298 +jsimd_can_h2v2_merged_upsample (void)
2299 +{
2300 + init_simd();
2301 +
2302 + return 0;
2303 +}
2304 +
2305 +GLOBAL(int)
2306 +jsimd_can_h2v1_merged_upsample (void)
2307 +{
2308 + init_simd();
2309 +
2310 + return 0;
2311 +}
2312 +
2313 +GLOBAL(void)
2314 +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
2315 + JSAMPIMAGE input_buf,
2316 + JDIMENSION in_row_group_ctr,
2317 + JSAMPARRAY output_buf)
2318 +{
2319 +}
2320 +
2321 +GLOBAL(void)
2322 +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
2323 + JSAMPIMAGE input_buf,
2324 + JDIMENSION in_row_group_ctr,
2325 + JSAMPARRAY output_buf)
2326 +{
2327 +}
2328 +
2329 +GLOBAL(int)
2330 +jsimd_can_convsamp (void)
2331 +{
2332 + init_simd();
2333 +
2334 + return 0;
2335 +}
2336 +
2337 +GLOBAL(int)
2338 +jsimd_can_convsamp_float (void)
2339 +{
2340 + init_simd();
2341 +
2342 + return 0;
2343 +}
2344 +
2345 +GLOBAL(void)
2346 +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
2347 + DCTELEM * workspace)
2348 +{
2349 +}
2350 +
2351 +GLOBAL(void)
2352 +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
2353 + FAST_FLOAT * workspace)
2354 +{
2355 +}
2356 +
2357 +GLOBAL(int)
2358 +jsimd_can_fdct_islow (void)
2359 +{
2360 + init_simd();
2361 +
2362 + return 0;
2363 +}
2364 +
2365 +GLOBAL(int)
2366 +jsimd_can_fdct_ifast (void)
2367 +{
2368 + init_simd();
2369 +
2370 + return 0;
2371 +}
2372 +
2373 +GLOBAL(int)
2374 +jsimd_can_fdct_float (void)
2375 +{
2376 + init_simd();
2377 +
2378 + return 0;
2379 +}
2380 +
2381 +GLOBAL(void)
2382 +jsimd_fdct_islow (DCTELEM * data)
2383 +{
2384 +}
2385 +
2386 +GLOBAL(void)
2387 +jsimd_fdct_ifast (DCTELEM * data)
2388 +{
2389 +}
2390 +
2391 +GLOBAL(void)
2392 +jsimd_fdct_float (FAST_FLOAT * data)
2393 +{
2394 +}
2395 +
2396 +GLOBAL(int)
2397 +jsimd_can_quantize (void)
2398 +{
2399 + init_simd();
2400 +
2401 + return 0;
2402 +}
2403 +
2404 +GLOBAL(int)
2405 +jsimd_can_quantize_float (void)
2406 +{
2407 + init_simd();
2408 +
2409 + return 0;
2410 +}
2411 +
2412 +GLOBAL(void)
2413 +jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
2414 + DCTELEM * workspace)
2415 +{
2416 +}
2417 +
2418 +GLOBAL(void)
2419 +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
2420 + FAST_FLOAT * workspace)
2421 +{
2422 +}
2423 +
2424 +GLOBAL(int)
2425 +jsimd_can_idct_2x2 (void)
2426 +{
2427 + init_simd();
2428 +
2429 + /* The code is optimised for these values only */
2430 + if (DCTSIZE != 8)
2431 + return 0;
2432 + if (sizeof(JCOEF) != 2)
2433 + return 0;
2434 + if (BITS_IN_JSAMPLE != 8)
2435 + return 0;
2436 + if (sizeof(JDIMENSION) != 4)
2437 + return 0;
2438 + if (sizeof(ISLOW_MULT_TYPE) != 2)
2439 + return 0;
2440 +
2441 + if (simd_support & JSIMD_ARM_NEON)
2442 + return 1;
2443 +
2444 + return 0;
2445 +}
2446 +
2447 +GLOBAL(int)
2448 +jsimd_can_idct_4x4 (void)
2449 +{
2450 + init_simd();
2451 +
2452 + /* The code is optimised for these values only */
2453 + if (DCTSIZE != 8)
2454 + return 0;
2455 + if (sizeof(JCOEF) != 2)
2456 + return 0;
2457 + if (BITS_IN_JSAMPLE != 8)
2458 + return 0;
2459 + if (sizeof(JDIMENSION) != 4)
2460 + return 0;
2461 + if (sizeof(ISLOW_MULT_TYPE) != 2)
2462 + return 0;
2463 +
2464 + if (simd_support & JSIMD_ARM_NEON)
2465 + return 1;
2466 +
2467 + return 0;
2468 +}
2469 +
2470 +GLOBAL(void)
2471 +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2472 + JCOEFPTR coef_block, JSAMPARRAY output_buf,
2473 + JDIMENSION output_col)
2474 +{
2475 + if (simd_support & JSIMD_ARM_NEON)
2476 + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
2477 + output_col);
2478 +}
2479 +
2480 +GLOBAL(void)
2481 +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2482 + JCOEFPTR coef_block, JSAMPARRAY output_buf,
2483 + JDIMENSION output_col)
2484 +{
2485 + if (simd_support & JSIMD_ARM_NEON)
2486 + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
2487 + output_col);
2488 +}
2489 +
2490 +GLOBAL(int)
2491 +jsimd_can_idct_islow (void)
2492 +{
2493 + init_simd();
2494 +
2495 + /* The code is optimised for these values only */
2496 + if (DCTSIZE != 8)
2497 + return 0;
2498 + if (sizeof(JCOEF) != 2)
2499 + return 0;
2500 + if (BITS_IN_JSAMPLE != 8)
2501 + return 0;
2502 + if (sizeof(JDIMENSION) != 4)
2503 + return 0;
2504 + if (sizeof(ISLOW_MULT_TYPE) != 2)
2505 + return 0;
2506 +
2507 + if (simd_support & JSIMD_ARM_NEON)
2508 + return 1;
2509 +
2510 + return 0;
2511 +}
2512 +
2513 +GLOBAL(int)
2514 +jsimd_can_idct_ifast (void)
2515 +{
2516 + init_simd();
2517 +
2518 + /* The code is optimised for these values only */
2519 + if (DCTSIZE != 8)
2520 + return 0;
2521 + if (sizeof(JCOEF) != 2)
2522 + return 0;
2523 + if (BITS_IN_JSAMPLE != 8)
2524 + return 0;
2525 + if (sizeof(JDIMENSION) != 4)
2526 + return 0;
2527 + if (sizeof(IFAST_MULT_TYPE) != 2)
2528 + return 0;
2529 + if (IFAST_SCALE_BITS != 2)
2530 + return 0;
2531 +
2532 + if (simd_support & JSIMD_ARM_NEON)
2533 + return 1;
2534 +
2535 + return 0;
2536 +}
2537 +
2538 +GLOBAL(int)
2539 +jsimd_can_idct_float (void)
2540 +{
2541 + init_simd();
2542 +
2543 + return 0;
2544 +}
2545 +
2546 +GLOBAL(void)
2547 +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2548 + JCOEFPTR coef_block, JSAMPARRAY output_buf,
2549 + JDIMENSION output_col)
2550 +{
2551 + if (simd_support & JSIMD_ARM_NEON)
2552 + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
2553 + output_col);
2554 +}
2555 +
2556 +GLOBAL(void)
2557 +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2558 + JCOEFPTR coef_block, JSAMPARRAY output_buf,
2559 + JDIMENSION output_col)
2560 +{
2561 + if (simd_support & JSIMD_ARM_NEON)
2562 + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
2563 + output_col);
2564 +}
2565 +
2566 +GLOBAL(void)
2567 +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2568 + JCOEFPTR coef_block, JSAMPARRAY output_buf,
2569 + JDIMENSION output_col)
2570 +{
2571 +}
2572 Index: simd/jsimd_arm64_neon.S
2573 new file mode 100644
2574 ===================================================================
2575 --- /dev/null
2576 +++ simd/jsimd_arm64_neon.S
2577 @@ -0,0 +1,1861 @@
2578 +/*
2579 + * ARMv8 NEON optimizations for libjpeg-turbo
2580 + *
2581 + * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
2582 + * All rights reserved.
2583 + * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
2584 + * Copyright (C) 2013-2014, Linaro Limited
2585 + * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
2586 + *
2587 + * This software is provided 'as-is', without any express or implied
2588 + * warranty. In no event will the authors be held liable for any damages
2589 + * arising from the use of this software.
2590 + *
2591 + * Permission is granted to anyone to use this software for any purpose,
2592 + * including commercial applications, and to alter it and redistribute it
2593 + * freely, subject to the following restrictions:
2594 + *
2595 + * 1. The origin of this software must not be misrepresented; you must not
2596 + * claim that you wrote the original software. If you use this software
2597 + * in a product, an acknowledgment in the product documentation would be
2598 + * appreciated but is not required.
2599 + * 2. Altered source versions must be plainly marked as such, and must not be
2600 + * misrepresented as being the original software.
2601 + * 3. This notice may not be removed or altered from any source distribution.
2602 + */
2603 +
2604 +#if defined(__linux__) && defined(__ELF__)
2605 +.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
2606 +#endif
2607 +
2608 +.text
2609 +.arch armv8-a+fp+simd
2610 +
2611 +
2612 +#define RESPECT_STRICT_ALIGNMENT 1
2613 +
2614 +
2615 +/*****************************************************************************/
2616 +
2617 +/* Supplementary macro for setting function attributes */
2618 +.macro asm_function fname
2619 +#ifdef __APPLE__
2620 + .globl _\fname
2621 +_\fname:
2622 +#else
2623 + .global \fname
2624 +#ifdef __ELF__
2625 + .hidden \fname
2626 + .type \fname, %function
2627 +#endif
2628 +\fname:
2629 +#endif
2630 +.endm
2631 +
2632 +/* Transpose elements of single 128 bit registers */
2633 +.macro transpose_single x0,x1,xi,xilen,literal
2634 + ins \xi\xilen[0], \x0\xilen[0]
2635 + ins \x1\xilen[0], \x0\xilen[1]
2636 + trn1 \x0\literal, \x0\literal, \x1\literal
2637 + trn2 \x1\literal, \xi\literal, \x1\literal
2638 +.endm
2639 +
2640 +/* Transpose elements of 2 different registers */
2641 +.macro transpose x0,x1,xi,xilen,literal
2642 + mov \xi\xilen, \x0\xilen
2643 + trn1 \x0\literal, \x0\literal, \x1\literal
2644 + trn2 \x1\literal, \xi\literal, \x1\literal
2645 +.endm
2646 +
2647 +/* Transpose a block of 4x4 coefficients in four 64-bit registers */
2648 +.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
2649 + mov \xi\xilen, \x0\xilen
2650 + trn1 \x0\x0len, \x0\x0len, \x2\x2len
2651 + trn2 \x2\x2len, \xi\x0len, \x2\x2len
2652 + mov \xi\xilen, \x1\xilen
2653 + trn1 \x1\x1len, \x1\x1len, \x3\x3len
2654 + trn2 \x3\x3len, \xi\x1len, \x3\x3len
2655 +.endm
2656 +
2657 +.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen
2658 + mov \xi\xilen, \x0\xilen
2659 + trn1 \x0\x0len, \x0\x0len, \x1\x1len
2660 + trn2 \x1\x2len, \xi\x0len, \x1\x2len
2661 + mov \xi\xilen, \x2\xilen
2662 + trn1 \x2\x2len, \x2\x2len, \x3\x3len
2663 + trn2 \x3\x2len, \xi\x1len, \x3\x3len
2664 +.endm
2665 +
2666 +.macro transpose_4x4 x0, x1, x2, x3,x5
2667 + transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
2668 + transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
2669 +.endm
2670 +
2671 +
2672 +#define CENTERJSAMPLE 128
2673 +
2674 +/*****************************************************************************/
2675 +
2676 +/*
2677 + * Perform dequantization and inverse DCT on one block of coefficients.
2678 + *
2679 + * GLOBAL(void)
2680 + * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
2681 + * JSAMPARRAY output_buf, JDIMENSION output_col)
2682 + */
2683 +
2684 +#define FIX_0_298631336 (2446)
2685 +#define FIX_0_390180644 (3196)
2686 +#define FIX_0_541196100 (4433)
2687 +#define FIX_0_765366865 (6270)
2688 +#define FIX_0_899976223 (7373)
2689 +#define FIX_1_175875602 (9633)
2690 +#define FIX_1_501321110 (12299)
2691 +#define FIX_1_847759065 (15137)
2692 +#define FIX_1_961570560 (16069)
2693 +#define FIX_2_053119869 (16819)
2694 +#define FIX_2_562915447 (20995)
2695 +#define FIX_3_072711026 (25172)
2696 +
2697 +#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
2698 +#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
2699 +#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
2700 +#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
2701 +#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
2702 +#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
2703 +#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
2704 +#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
2705 +
2706 +/*
2707 + * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
2708 + * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
2709 + */
2710 +#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
2711 +{ \
2712 + DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
2713 + INT32 q1, q2, q3, q4, q5, q6, q7; \
2714 + INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
2715 + \
2716 + /* 1-D iDCT input data */ \
2717 + row0 = xrow0; \
2718 + row1 = xrow1; \
2719 + row2 = xrow2; \
2720 + row3 = xrow3; \
2721 + row4 = xrow4; \
2722 + row5 = xrow5; \
2723 + row6 = xrow6; \
2724 + row7 = xrow7; \
2725 + \
2726 + q5 = row7 + row3; \
2727 + q4 = row5 + row1; \
2728 + q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
2729 + MULTIPLY(q4, FIX_1_175875602); \
2730 + q7 = MULTIPLY(q5, FIX_1_175875602) + \
2731 + MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
2732 + q2 = MULTIPLY(row2, FIX_0_541196100) + \
2733 + MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
2734 + q4 = q6; \
2735 + q3 = ((INT32) row0 - (INT32) row4) << 13; \
2736 + q6 += MULTIPLY(row5, -FIX_2_562915447) + \
2737 + MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
2738 + /* now we can use q1 (reloadable constants have been used up) */ \
2739 + q1 = q3 + q2; \
2740 + q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
2741 + MULTIPLY(row1, -FIX_0_899976223); \
2742 + q5 = q7; \
2743 + q1 = q1 + q6; \
2744 + q7 += MULTIPLY(row7, -FIX_0_899976223) + \
2745 + MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
2746 + \
2747 + /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
2748 + tmp11_plus_tmp2 = q1; \
2749 + row1 = 0; \
2750 + \
2751 + q1 = q1 - q6; \
2752 + q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
2753 + MULTIPLY(row3, -FIX_2_562915447); \
2754 + q1 = q1 - q6; \
2755 + q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
2756 + MULTIPLY(row6, FIX_0_541196100); \
2757 + q3 = q3 - q2; \
2758 + \
2759 + /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
2760 + tmp11_minus_tmp2 = q1; \
2761 + \
2762 + q1 = ((INT32) row0 + (INT32) row4) << 13; \
2763 + q2 = q1 + q6; \
2764 + q1 = q1 - q6; \
2765 + \
2766 + /* pick up the results */ \
2767 + tmp0 = q4; \
2768 + tmp1 = q5; \
2769 + tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
2770 + tmp3 = q7; \
2771 + tmp10 = q2; \
2772 + tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
2773 + tmp12 = q3; \
2774 + tmp13 = q1; \
2775 +}
2776 +
2777 +#define XFIX_0_899976223 v0.4h[0]
2778 +#define XFIX_0_541196100 v0.4h[1]
2779 +#define XFIX_2_562915447 v0.4h[2]
2780 +#define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3]
2781 +#define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0]
2782 +#define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1]
2783 +#define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2]
2784 +#define XFIX_1_175875602 v1.4h[3]
2785 +#define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0]
2786 +#define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1]
2787 +#define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2]
2788 +#define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3]
2789 +
2790 +.balign 16
2791 +jsimd_idct_islow_neon_consts:
2792 + .short FIX_0_899976223 /* d0[0] */
2793 + .short FIX_0_541196100 /* d0[1] */
2794 + .short FIX_2_562915447 /* d0[2] */
2795 + .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
2796 + .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
2797 + .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
2798 + .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
2799 + .short FIX_1_175875602 /* d1[3] */
2800 + /* reloadable constants */
2801 + .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
2802 + .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
2803 + .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
2804 + .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
2805 +
2806 +asm_function jsimd_idct_islow_neon
2807 +
2808 + DCT_TABLE .req x0
2809 + COEF_BLOCK .req x1
2810 + OUTPUT_BUF .req x2
2811 + OUTPUT_COL .req x3
2812 + TMP1 .req x0
2813 + TMP2 .req x1
2814 + TMP3 .req x2
2815 + TMP4 .req x15
2816 +
2817 + ROW0L .req v16
2818 + ROW0R .req v17
2819 + ROW1L .req v18
2820 + ROW1R .req v19
2821 + ROW2L .req v20
2822 + ROW2R .req v21
2823 + ROW3L .req v22
2824 + ROW3R .req v23
2825 + ROW4L .req v24
2826 + ROW4R .req v25
2827 + ROW5L .req v26
2828 + ROW5R .req v27
2829 + ROW6L .req v28
2830 + ROW6R .req v29
2831 + ROW7L .req v30
2832 + ROW7R .req v31
2833 + /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
2834 + sub sp, sp, 272
2835 + str x15, [sp], 16
2836 + adr x15, jsimd_idct_islow_neon_consts
2837 + st1 {v0.8b - v3.8b}, [sp], 32
2838 + st1 {v4.8b - v7.8b}, [sp], 32
2839 + st1 {v8.8b - v11.8b}, [sp], 32
2840 + st1 {v12.8b - v15.8b}, [sp], 32
2841 + st1 {v16.8b - v19.8b}, [sp], 32
2842 + st1 {v20.8b - v23.8b}, [sp], 32
2843 + st1 {v24.8b - v27.8b}, [sp], 32
2844 + st1 {v28.8b - v31.8b}, [sp], 32
2845 + ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
2846 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
2847 + ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
2848 + mul v16.4h, v16.4h, v0.4h
2849 + mul v17.4h, v17.4h, v1.4h
2850 + ins v16.2d[1], v17.2d[0] /* 128 bit q8 */
2851 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
2852 + mul v18.4h, v18.4h, v2.4h
2853 + mul v19.4h, v19.4h, v3.4h
2854 + ins v18.2d[1], v19.2d[0] /* 128 bit q9 */
2855 + ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
2856 + mul v20.4h, v20.4h, v4.4h
2857 + mul v21.4h, v21.4h, v5.4h
2858 + ins v20.2d[1], v21.2d[0] /* 128 bit q10 */
2859 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
2860 + mul v22.4h, v22.4h, v6.4h
2861 + mul v23.4h, v23.4h, v7.4h
2862 + ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
2863 + ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
2864 + mul v24.4h, v24.4h, v0.4h
2865 + mul v25.4h, v25.4h, v1.4h
2866 + ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
2867 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
2868 + mul v28.4h, v28.4h, v4.4h
2869 + mul v29.4h, v29.4h, v5.4h
2870 + ins v28.2d[1], v29.2d[0] /* 128 bit q14 */
2871 + mul v26.4h, v26.4h, v2.4h
2872 + mul v27.4h, v27.4h, v3.4h
2873 + ins v26.2d[1], v27.2d[0] /* 128 bit q13 */
2874 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */
2875 + add x15, x15, #16
2876 + mul v30.4h, v30.4h, v6.4h
2877 + mul v31.4h, v31.4h, v7.4h
2878 + ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
2879 + /* Go to the bottom of the stack */
2880 + sub sp, sp, 352
2881 + stp x4, x5, [sp], 16
2882 + st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
2883 + st1 {v12.4h - v15.4h}, [sp], 32
2884 + /* 1-D IDCT, pass 1, left 4x8 half */
2885 + add v4.4h, ROW7L.4h, ROW3L.4h
2886 + add v5.4h, ROW5L.4h, ROW1L.4h
2887 + smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
2888 + smlal v12.4s, v5.4h, XFIX_1_175875602
2889 + smull v14.4s, v4.4h, XFIX_1_175875602
2890 + /* Check for the zero coefficients in the right 4x8 half */
2891 + smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
2892 + ssubl v6.4s, ROW0L.4h, ROW4L.4h
2893 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
2894 + smull v4.4s, ROW2L.4h, XFIX_0_541196100
2895 + smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
2896 + orr x0, x4, x5
2897 + mov v8.16b, v12.16b
2898 + smlsl v12.4s, ROW5L.4h, XFIX_2_562915447
2899 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
2900 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
2901 + shl v6.4s, v6.4s, #13
2902 + orr x0, x0, x4
2903 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
2904 + orr x0, x0 , x5
2905 + add v2.4s, v6.4s, v4.4s
2906 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
2907 + mov v10.16b, v14.16b
2908 + add v2.4s, v2.4s, v12.4s
2909 + orr x0, x0, x4
2910 + smlsl v14.4s, ROW7L.4h, XFIX_0_899976223
2911 + orr x0, x0, x5
2912 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
2913 + rshrn ROW1L.4h, v2.4s, #11
2914 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
2915 + sub v2.4s, v2.4s, v12.4s
2916 + smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
2917 + orr x0, x0, x4
2918 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
2919 + orr x0, x0, x5
2920 + sub v2.4s, v2.4s, v12.4s
2921 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
2922 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
2923 + smlal v12.4s, ROW6L.4h, XFIX_0_541196100
2924 + sub v6.4s, v6.4s, v4.4s
2925 + orr x0, x0, x4
2926 + rshrn ROW6L.4h, v2.4s, #11
2927 + orr x0, x0, x5
2928 + add v2.4s, v6.4s, v10.4s
2929 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
2930 + sub v6.4s, v6.4s, v10.4s
2931 + saddl v10.4s, ROW0L.4h, ROW4L.4h
2932 + orr x0, x0, x4
2933 + rshrn ROW2L.4h, v2.4s, #11
2934 + orr x0, x0, x5
2935 + rshrn ROW5L.4h, v6.4s, #11
2936 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
2937 + shl v10.4s, v10.4s, #13
2938 + smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
2939 + orr x0, x0, x4
2940 + add v4.4s, v10.4s, v12.4s
2941 + orr x0, x0, x5
2942 + cmp x0, #0 /* orrs instruction removed */
2943 + sub v2.4s, v10.4s, v12.4s
2944 + add v12.4s, v4.4s, v14.4s
2945 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
2946 + sub v4.4s, v4.4s, v14.4s
2947 + add v10.4s, v2.4s, v8.4s
2948 + orr x0, x4, x5
2949 + sub v6.4s, v2.4s, v8.4s
2950 + /* pop {x4, x5} */
2951 + sub sp, sp, 80
2952 + ldp x4, x5, [sp], 16
2953 + rshrn ROW7L.4h, v4.4s, #11
2954 + rshrn ROW3L.4h, v10.4s, #11
2955 + rshrn ROW0L.4h, v12.4s, #11
2956 + rshrn ROW4L.4h, v6.4s, #11
2957 +
2958 + beq 3f /* Go to do some special handling for the sparse right 4x8 half */
2959 +
2960 + /* 1-D IDCT, pass 1, right 4x8 half */
2961 + ld1 {v2.4h}, [x15] /* reload constants */
2962 + add v10.4h, ROW7R.4h, ROW3R.4h
2963 + add v8.4h, ROW5R.4h, ROW1R.4h
2964 + /* Transpose ROW6L <-> ROW7L (v3 available free register) */
2965 + transpose ROW6L, ROW7L, v3, .16b, .4h
2966 + smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560
2967 + smlal v12.4s, v8.4h, XFIX_1_175875602
2968 + /* Transpose ROW2L <-> ROW3L (v3 available free register) */
2969 + transpose ROW2L, ROW3L, v3, .16b, .4h
2970 + smull v14.4s, v10.4h, XFIX_1_175875602
2971 + smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644
2972 + /* Transpose ROW0L <-> ROW1L (v3 available free register) */
2973 + transpose ROW0L, ROW1L, v3, .16b, .4h
2974 + ssubl v6.4s, ROW0R.4h, ROW4R.4h
2975 + smull v4.4s, ROW2R.4h, XFIX_0_541196100
2976 + smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
2977 + /* Transpose ROW4L <-> ROW5L (v3 available free register) */
2978 + transpose ROW4L, ROW5L, v3, .16b, .4h
2979 + mov v8.16b, v12.16b
2980 + smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
2981 + smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
2982 + /* Transpose ROW1L <-> ROW3L (v3 available free register) */
2983 + transpose ROW1L, ROW3L, v3, .16b, .2s
2984 + shl v6.4s, v6.4s, #13
2985 + smlsl v8.4s, ROW1R.4h, XFIX_0_899976223
2986 + /* Transpose ROW4L <-> ROW6L (v3 available free register) */
2987 + transpose ROW4L, ROW6L, v3, .16b, .2s
2988 + add v2.4s, v6.4s, v4.4s
2989 + mov v10.16b, v14.16b
2990 + add v2.4s, v2.4s, v12.4s
2991 + /* Transpose ROW0L <-> ROW2L (v3 available free register) */
2992 + transpose ROW0L, ROW2L, v3, .16b, .2s
2993 + smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
2994 + smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
2995 + rshrn ROW1R.4h, v2.4s, #11
2996 + /* Transpose ROW5L <-> ROW7L (v3 available free register) */
2997 + transpose ROW5L, ROW7L, v3, .16b, .2s
2998 + sub v2.4s, v2.4s, v12.4s
2999 + smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
3000 + smlsl v10.4s, ROW3R.4h, XFIX_2_562915447
3001 + sub v2.4s, v2.4s, v12.4s
3002 + smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
3003 + smlal v12.4s, ROW6R.4h, XFIX_0_541196100
3004 + sub v6.4s, v6.4s, v4.4s
3005 + rshrn ROW6R.4h, v2.4s, #11
3006 + add v2.4s, v6.4s, v10.4s
3007 + sub v6.4s, v6.4s, v10.4s
3008 + saddl v10.4s, ROW0R.4h, ROW4R.4h
3009 + rshrn ROW2R.4h, v2.4s, #11
3010 + rshrn ROW5R.4h, v6.4s, #11
3011 + shl v10.4s, v10.4s, #13
3012 + smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
3013 + add v4.4s, v10.4s, v12.4s
3014 + sub v2.4s, v10.4s, v12.4s
3015 + add v12.4s, v4.4s, v14.4s
3016 + sub v4.4s, v4.4s, v14.4s
3017 + add v10.4s, v2.4s, v8.4s
3018 + sub v6.4s, v2.4s, v8.4s
3019 + rshrn ROW7R.4h, v4.4s, #11
3020 + rshrn ROW3R.4h, v10.4s, #11
3021 + rshrn ROW0R.4h, v12.4s, #11
3022 + rshrn ROW4R.4h, v6.4s, #11
3023 + /* Transpose right 4x8 half */
3024 + transpose ROW6R, ROW7R, v3, .16b, .4h
3025 + transpose ROW2R, ROW3R, v3, .16b, .4h
3026 + transpose ROW0R, ROW1R, v3, .16b, .4h
3027 + transpose ROW4R, ROW5R, v3, .16b, .4h
3028 + transpose ROW1R, ROW3R, v3, .16b, .2s
3029 + transpose ROW4R, ROW6R, v3, .16b, .2s
3030 + transpose ROW0R, ROW2R, v3, .16b, .2s
3031 + transpose ROW5R, ROW7R, v3, .16b, .2s
3032 +
3033 +1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
3034 + ld1 {v2.4h}, [x15] /* reload constants */
3035 + smull v12.4S, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
3036 + smlal v12.4s, ROW1L.4h, XFIX_1_175875602
3037 + smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
3038 + smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
3039 + smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
3040 + smlal v14.4s, ROW3L.4h, XFIX_1_175875602
3041 + smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
3042 + smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
3043 + ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
3044 + smull v4.4s, ROW2L.4h, XFIX_0_541196100
3045 + smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* R OW6L.4h <-> ROW2R.4h */
3046 + mov v8.16b, v12.16b
3047 + smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R. 4h */
3048 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
3049 + shl v6.4s, v6.4s, #13
3050 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
3051 + add v2.4s, v6.4s, v4.4s
3052 + mov v10.16b, v14.16b
3053 + add v2.4s, v2.4s, v12.4s
3054 + smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R. 4h */
3055 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
3056 + shrn ROW1L.4h, v2.4s, #16
3057 + sub v2.4s, v2.4s, v12.4s
3058 + smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* R OW5L.4h <-> ROW1R.4h */
3059 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
3060 + sub v2.4s, v2.4s, v12.4s
3061 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
3062 + smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R. 4h */
3063 + sub v6.4s, v6.4s, v4.4s
3064 + shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
3065 + add v2.4s, v6.4s, v10.4s
3066 + sub v6.4s, v6.4s, v10.4s
3067 + saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
3068 + shrn ROW2L.4h, v2.4s, #16
3069 + shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
3070 + shl v10.4s, v10.4s, #13
3071 + smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* R OW7L.4h <-> ROW3R.4h */
3072 + add v4.4s, v10.4s, v12.4s
3073 + sub v2.4s, v10.4s, v12.4s
3074 + add v12.4s, v4.4s, v14.4s
3075 + sub v4.4s, v4.4s, v14.4s
3076 + add v10.4s, v2.4s, v8.4s
3077 + sub v6.4s, v2.4s, v8.4s
3078 + shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
3079 + shrn ROW3L.4h, v10.4s, #16
3080 + shrn ROW0L.4h, v12.4s, #16
3081 + shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
3082 + /* 1-D IDCT, pass 2, right 4x8 half */
3083 + ld1 {v2.4h}, [x15] /* reload constants */
3084 + smull v12.4s, ROW5R.4h, XFIX_1_175875602
3085 + smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R. 4h */
3086 + smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
3087 + smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* R OW7L.4h <-> ROW3R.4h */
3088 + smull v14.4s, ROW7R.4h, XFIX_1_175875602
3089 + smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R. 4h */
3090 + smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
3091 + smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* R OW5L.4h <-> ROW1R.4h */
3092 + ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
3093 + smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R. 4h */
3094 + smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
3095 + mov v8.16b, v12.16b
3096 + smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
3097 + smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* R OW7L.4h <-> ROW3R.4h */
3098 + shl v6.4s, v6.4s, #13
3099 + smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R. 4h */
3100 + add v2.4s, v6.4s, v4.4s
3101 + mov v10.16b, v14.16b
3102 + add v2.4s, v2.4s, v12.4s
3103 + smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
3104 + smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* R OW5L.4h <-> ROW1R.4h */
3105 + shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
3106 + sub v2.4s, v2.4s, v12.4s
3107 + smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
3108 + smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R. 4h */
3109 + sub v2.4s, v2.4s, v12.4s
3110 + smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* RO W6L.4h <-> ROW2R.4h */
3111 + smlal v12.4s, ROW6R.4h, XFIX_0_541196100
3112 + sub v6.4s, v6.4s, v4.4s
3113 + shrn ROW6R.4h, v2.4s, #16
3114 + add v2.4s, v6.4s, v10.4s
3115 + sub v6.4s, v6.4s, v10.4s
3116 + saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
3117 + shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
3118 + shrn ROW5R.4h, v6.4s, #16
3119 + shl v10.4s, v10.4s, #13
3120 + smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
3121 + add v4.4s, v10.4s, v12.4s
3122 + sub v2.4s, v10.4s, v12.4s
3123 + add v12.4s, v4.4s, v14.4s
3124 + sub v4.4s, v4.4s, v14.4s
3125 + add v10.4s, v2.4s, v8.4s
3126 + sub v6.4s, v2.4s, v8.4s
3127 + shrn ROW7R.4h, v4.4s, #16
3128 + shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
3129 + shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
3130 + shrn ROW4R.4h, v6.4s, #16
3131 +
3132 +2: /* Descale to 8-bit and range limit */
3133 + ins v16.2d[1], v17.2d[0]
3134 + ins v18.2d[1], v19.2d[0]
3135 + ins v20.2d[1], v21.2d[0]
3136 + ins v22.2d[1], v23.2d[0]
3137 + sqrshrn v16.8b, v16.8h, #2
3138 + sqrshrn2 v16.16b, v18.8h, #2
3139 + sqrshrn v18.8b, v20.8h, #2
3140 + sqrshrn2 v18.16b, v22.8h, #2
3141 +
3142 + /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */
3143 + ld1 {v8.4h - v11.4h}, [sp], 32
3144 + ld1 {v12.4h - v15.4h}, [sp], 32
3145 + ins v24.2d[1], v25.2d[0]
3146 +
3147 + sqrshrn v20.8b, v24.8h, #2
3148 + /* Transpose the final 8-bit samples and do signed->unsigned conversion */
3149 + /* trn1 v16.8h, v16.8h, v18.8h */
3150 + transpose v16, v18, v3, .16b, .8h
3151 + ins v26.2d[1], v27.2d[0]
3152 + ins v28.2d[1], v29.2d[0]
3153 + ins v30.2d[1], v31.2d[0]
3154 + sqrshrn2 v20.16b, v26.8h, #2
3155 + sqrshrn v22.8b, v28.8h, #2
3156 + movi v0.16b, #(CENTERJSAMPLE)
3157 + sqrshrn2 v22.16b, v30.8h, #2
3158 + transpose_single v16, v17, v3, .2d, .8b
3159 + transpose_single v18, v19, v3, .2d, .8b
3160 + add v16.8b, v16.8b, v0.8b
3161 + add v17.8b, v17.8b, v0.8b
3162 + add v18.8b, v18.8b, v0.8b
3163 + add v19.8b, v19.8b, v0.8b
3164 + transpose v20, v22, v3, .16b, .8h
3165 + /* Store results to the output buffer */
3166 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3167 + add TMP1, TMP1, OUTPUT_COL
3168 + add TMP2, TMP2, OUTPUT_COL
3169 + st1 {v16.8b}, [TMP1]
3170 + transpose_single v20, v21, v3, .2d, .8b
3171 + st1 {v17.8b}, [TMP2]
3172 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3173 + add TMP1, TMP1, OUTPUT_COL
3174 + add TMP2, TMP2, OUTPUT_COL
3175 + st1 {v18.8b}, [TMP1]
3176 + add v20.8b, v20.8b, v0.8b
3177 + add v21.8b, v21.8b, v0.8b
3178 + st1 {v19.8b}, [TMP2]
3179 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3180 + ldp TMP3, TMP4, [OUTPUT_BUF]
3181 + add TMP1, TMP1, OUTPUT_COL
3182 + add TMP2, TMP2, OUTPUT_COL
3183 + add TMP3, TMP3, OUTPUT_COL
3184 + add TMP4, TMP4, OUTPUT_COL
3185 + transpose_single v22, v23, v3, .2d, .8b
3186 + st1 {v20.8b}, [TMP1]
3187 + add v22.8b, v22.8b, v0.8b
3188 + add v23.8b, v23.8b, v0.8b
3189 + st1 {v21.8b}, [TMP2]
3190 + st1 {v22.8b}, [TMP3]
3191 + st1 {v23.8b}, [TMP4]
3192 + ldr x15, [sp], 16
3193 + ld1 {v0.8b - v3.8b}, [sp], 32
3194 + ld1 {v4.8b - v7.8b}, [sp], 32
3195 + ld1 {v8.8b - v11.8b}, [sp], 32
3196 + ld1 {v12.8b - v15.8b}, [sp], 32
3197 + ld1 {v16.8b - v19.8b}, [sp], 32
3198 + ld1 {v20.8b - v23.8b}, [sp], 32
3199 + ld1 {v24.8b - v27.8b}, [sp], 32
3200 + ld1 {v28.8b - v31.8b}, [sp], 32
3201 + blr x30
3202 +
3203 +3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
3204 +
3205 + /* Transpose left 4x8 half */
3206 + transpose ROW6L, ROW7L, v3, .16b, .4h
3207 + transpose ROW2L, ROW3L, v3, .16b, .4h
3208 + transpose ROW0L, ROW1L, v3, .16b, .4h
3209 + transpose ROW4L, ROW5L, v3, .16b, .4h
3210 + shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */
3211 + transpose ROW1L, ROW3L, v3, .16b, .2s
3212 + transpose ROW4L, ROW6L, v3, .16b, .2s
3213 + transpose ROW0L, ROW2L, v3, .16b, .2s
3214 + transpose ROW5L, ROW7L, v3, .16b, .2s
3215 + cmp x0, #0
3216 + beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second p ass */
3217 +
3218 + /* Only row 0 is non-zero for the right 4x8 half */
3219 + dup ROW1R.4h, ROW0R.4h[1]
3220 + dup ROW2R.4h, ROW0R.4h[2]
3221 + dup ROW3R.4h, ROW0R.4h[3]
3222 + dup ROW4R.4h, ROW0R.4h[0]
3223 + dup ROW5R.4h, ROW0R.4h[1]
3224 + dup ROW6R.4h, ROW0R.4h[2]
3225 + dup ROW7R.4h, ROW0R.4h[3]
3226 + dup ROW0R.4h, ROW0R.4h[0]
3227 + b 1b /* Go to 'normal' second pass */
3228 +
3229 +4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
3230 + ld1 {v2.4h}, [x15] /* reload constants */
3231 + smull v12.4s, ROW1L.4h, XFIX_1_175875602
3232 + smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
3233 + smull v14.4s, ROW3L.4h, XFIX_1_175875602
3234 + smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
3235 + smull v4.4s, ROW2L.4h, XFIX_0_541196100
3236 + sshll v6.4s, ROW0L.4h, #13
3237 + mov v8.16b, v12.16b
3238 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
3239 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
3240 + add v2.4s, v6.4s, v4.4s
3241 + mov v10.16b, v14.16b
3242 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
3243 + add v2.4s, v2.4s, v12.4s
3244 + add v12.4s, v12.4s, v12.4s
3245 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
3246 + shrn ROW1L.4h, v2.4s, #16
3247 + sub v2.4s, v2.4s, v12.4s
3248 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
3249 + sub v6.4s, v6.4s, v4.4s
3250 + shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
3251 + add v2.4s, v6.4s, v10.4s
3252 + sub v6.4s, v6.4s, v10.4s
3253 + sshll v10.4s, ROW0L.4h, #13
3254 + shrn ROW2L.4h, v2.4s, #16
3255 + shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
3256 + add v4.4s, v10.4s, v12.4s
3257 + sub v2.4s, v10.4s, v12.4s
3258 + add v12.4s, v4.4s, v14.4s
3259 + sub v4.4s, v4.4s, v14.4s
3260 + add v10.4s, v2.4s, v8.4s
3261 + sub v6.4s, v2.4s, v8.4s
3262 + shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
3263 + shrn ROW3L.4h, v10.4s, #16
3264 + shrn ROW0L.4h, v12.4s, #16
3265 + shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
3266 + /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
3267 + ld1 {v2.4h}, [x15] /* reload constants */
3268 + smull v12.4s, ROW5L.4h, XFIX_1_175875602
3269 + smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
3270 + smull v14.4s, ROW7L.4h, XFIX_1_175875602
3271 + smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
3272 + smull v4.4s, ROW6L.4h, XFIX_0_541196100
3273 + sshll v6.4s, ROW4L.4h, #13
3274 + mov v8.16b, v12.16b
3275 + smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
3276 + smlsl v8.4s, ROW5L.4h, XFIX_0_899976223
3277 + add v2.4s, v6.4s, v4.4s
3278 + mov v10.16b, v14.16b
3279 + smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
3280 + add v2.4s, v2.4s, v12.4s
3281 + add v12.4s, v12.4s, v12.4s
3282 + smlsl v10.4s, ROW7L.4h, XFIX_2_562915447
3283 + shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
3284 + sub v2.4s, v2.4s, v12.4s
3285 + smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
3286 + sub v6.4s, v6.4s, v4.4s
3287 + shrn ROW6R.4h, v2.4s, #16
3288 + add v2.4s, v6.4s, v10.4s
3289 + sub v6.4s, v6.4s, v10.4s
3290 + sshll v10.4s, ROW4L.4h, #13
3291 + shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
3292 + shrn ROW5R.4h, v6.4s, #16
3293 + add v4.4s, v10.4s, v12.4s
3294 + sub v2.4s, v10.4s, v12.4s
3295 + add v12.4s, v4.4s, v14.4s
3296 + sub v4.4s, v4.4s, v14.4s
3297 + add v10.4s, v2.4s, v8.4s
3298 + sub v6.4s, v2.4s, v8.4s
3299 + shrn ROW7R.4h, v4.4s, #16
3300 + shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
3301 + shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
3302 + shrn ROW4R.4h, v6.4s, #16
3303 + b 2b /* Go to epilogue */
3304 +
3305 + .unreq DCT_TABLE
3306 + .unreq COEF_BLOCK
3307 + .unreq OUTPUT_BUF
3308 + .unreq OUTPUT_COL
3309 + .unreq TMP1
3310 + .unreq TMP2
3311 + .unreq TMP3
3312 + .unreq TMP4
3313 +
3314 + .unreq ROW0L
3315 + .unreq ROW0R
3316 + .unreq ROW1L
3317 + .unreq ROW1R
3318 + .unreq ROW2L
3319 + .unreq ROW2R
3320 + .unreq ROW3L
3321 + .unreq ROW3R
3322 + .unreq ROW4L
3323 + .unreq ROW4R
3324 + .unreq ROW5L
3325 + .unreq ROW5R
3326 + .unreq ROW6L
3327 + .unreq ROW6R
3328 + .unreq ROW7L
3329 + .unreq ROW7R
3330 +
3331 +
3332 +/*****************************************************************************/
3333 +
3334 +/*
3335 + * jsimd_idct_ifast_neon
3336 + *
3337 + * This function contains a fast, not so accurate integer implementation of
3338 + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
3339 + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
3340 + * function from jidctfst.c
3341 + *
3342 + * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
3343 + * But in ARM NEON case some extra additions are required because VQDMULH (SQDMULH on AArch64)
3344 + * instruction can't handle the constants larger than 1. So the expressions
3345 + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
3346 + * which introduces an extra addition. Overall, there are 6 extra additions
3347 + * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
3348 + */
3349 +
3350 +#define XFIX_1_082392200 v0.4h[0] /* lane aliases into v0, which is loaded from the table below */
3351 +#define XFIX_1_414213562 v0.4h[1]
3352 +#define XFIX_1_847759065 v0.4h[2]
3353 +#define XFIX_2_613125930 v0.4h[3]
3354 +
3355 +.balign 16
3356 +jsimd_idct_ifast_neon_consts:
3357 + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200: (277/256 - 1) * 2^15, i.e. only the fractional part */
3358 + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562: (362/256 - 1) * 2^15 */
3359 + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065: (473/256 - 1) * 2^15 */
3360 + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930: (669/256 - 2) * 2^15; the code adds x twice */
3361 +
3362 +asm_function jsimd_idct_ifast_neon
3363 + /* In: x0 = DCT_TABLE, x1 = COEF_BLOCK, x2 = OUTPUT_BUF (array of 8 row pointers), x3 = OUTPUT_COL */
3364 + DCT_TABLE .req x0
3365 + COEF_BLOCK .req x1
3366 + OUTPUT_BUF .req x2
3367 + OUTPUT_COL .req x3
3368 + TMP1 .req x0
3369 + TMP2 .req x1
3370 + TMP3 .req x2
3371 + TMP4 .req x22
3372 + TMP5 .req x23
3373 +
3374 + /* Load and dequantize coefficients into NEON registers
3375 + * with the following allocation:
3376 + * 0 1 2 3 | 4 5 6 7
3377 + * ---------+--------
3378 + * 0 | d16 | d17 ( v8.8h )
3379 + * 1 | d18 | d19 ( v9.8h )
3380 + * 2 | d20 | d21 ( v10.8h )
3381 + * 3 | d22 | d23 ( v11.8h )
3382 + * 4 | d24 | d25 ( v12.8h )
3383 + * 5 | d26 | d27 ( v13.8h )
3384 + * 6 | d28 | d29 ( v14.8h )
3385 + * 7 | d30 | d31 ( v15.8h )
3386 + */
3387 + /* Save registers used in fast IDCT; sp stays at the frame base so the save area is never below sp */
3388 + stp x22, x23, [sp, #-176]! /* allocate the 176-byte frame and save x22/x23 at its base */
3389 + add x22, sp, #16 /* x22 walks the vector save area (16 + 5 * 32 = 176 bytes) */
3390 + adr x23, jsimd_idct_ifast_neon_consts
3391 + st1 {v0.8b - v3.8b}, [x22], 32
3392 + st1 {v4.8b - v7.8b}, [x22], 32
3393 + st1 {v8.8b - v11.8b}, [x22], 32
3394 + st1 {v12.8b - v15.8b}, [x22], 32
3395 + st1 {v16.8b - v19.8b}, [x22], 32
3396 + ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
3397 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
3398 + ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
3399 + mul v8.8h, v8.8h, v0.8h
3400 + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
3401 + mul v9.8h, v9.8h, v1.8h
3402 + ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32
3403 + mul v10.8h, v10.8h, v2.8h
3404 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
3405 + mul v11.8h, v11.8h, v3.8h
3406 + ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32
3407 + mul v12.8h, v12.8h, v0.8h
3408 + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
3409 + mul v14.8h, v14.8h, v2.8h
3410 + mul v13.8h, v13.8h, v1.8h
3411 + ld1 {v0.4h}, [x23] /* load constants */
3412 + mul v15.8h, v15.8h, v3.8h
3413 +
3414 + /* 1-D IDCT, pass 1 */
3415 + sub v2.8h, v10.8h, v14.8h
3416 + add v14.8h, v10.8h, v14.8h
3417 + sub v1.8h, v11.8h, v13.8h
3418 + add v13.8h, v11.8h, v13.8h
3419 + sub v5.8h, v9.8h, v15.8h
3420 + add v15.8h, v9.8h, v15.8h
3421 + sqdmulh v4.8h, v2.8h, XFIX_1_414213562
3422 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930
3423 + add v3.8h, v1.8h, v1.8h
3424 + sub v1.8h, v5.8h, v1.8h
3425 + add v10.8h, v2.8h, v4.8h
3426 + sqdmulh v4.8h, v1.8h, XFIX_1_847759065
3427 + sub v2.8h, v15.8h, v13.8h
3428 + add v3.8h, v3.8h, v6.8h
3429 + sqdmulh v6.8h, v2.8h, XFIX_1_414213562
3430 + add v1.8h, v1.8h, v4.8h
3431 + sqdmulh v4.8h, v5.8h, XFIX_1_082392200
3432 + sub v10.8h, v10.8h, v14.8h
3433 + add v2.8h, v2.8h, v6.8h
3434 + sub v6.8h, v8.8h, v12.8h
3435 + add v12.8h, v8.8h, v12.8h
3436 + add v9.8h, v5.8h, v4.8h
3437 + add v5.8h, v6.8h, v10.8h
3438 + sub v10.8h, v6.8h, v10.8h
3439 + add v6.8h, v15.8h, v13.8h
3440 + add v8.8h, v12.8h, v14.8h
3441 + sub v3.8h, v6.8h, v3.8h
3442 + sub v12.8h, v12.8h, v14.8h
3443 + sub v3.8h, v3.8h, v1.8h
3444 + sub v1.8h, v9.8h, v1.8h
3445 + add v2.8h, v3.8h, v2.8h
3446 + sub v15.8h, v8.8h, v6.8h
3447 + add v1.8h, v1.8h, v2.8h
3448 + add v8.8h, v8.8h, v6.8h
3449 + add v14.8h, v5.8h, v3.8h
3450 + sub v9.8h, v5.8h, v3.8h
3451 + sub v13.8h, v10.8h, v2.8h
3452 + add v10.8h, v10.8h, v2.8h
3453 + /* Transpose v8-v9 (16-bit lanes, v18 is scratch) */
3454 + mov v18.16b, v8.16b
3455 + trn1 v8.8h, v8.8h, v9.8h
3456 + trn2 v9.8h, v18.8h, v9.8h
3457 + sub v11.8h, v12.8h, v1.8h
3458 + /* Transpose v14-v15 (16-bit lanes) */
3459 + mov v18.16b, v14.16b
3460 + trn1 v14.8h, v14.8h, v15.8h
3461 + trn2 v15.8h, v18.8h, v15.8h
3462 + add v12.8h, v12.8h, v1.8h
3463 + /* Transpose v10-v11 (16-bit lanes) */
3464 + mov v18.16b, v10.16b
3465 + trn1 v10.8h, v10.8h, v11.8h
3466 + trn2 v11.8h, v18.8h, v11.8h
3467 + /* Transpose v12-v13 (16-bit lanes) */
3468 + mov v18.16b, v12.16b
3469 + trn1 v12.8h, v12.8h, v13.8h
3470 + trn2 v13.8h, v18.8h, v13.8h
3471 + /* Transpose v9-v11 (32-bit lanes) */
3472 + mov v18.16b, v9.16b
3473 + trn1 v9.4s, v9.4s, v11.4s
3474 + trn2 v11.4s, v18.4s, v11.4s
3475 + /* Transpose v12-v14 (32-bit lanes) */
3476 + mov v18.16b, v12.16b
3477 + trn1 v12.4s, v12.4s, v14.4s
3478 + trn2 v14.4s, v18.4s, v14.4s
3479 + /* Transpose v8-v10 (32-bit lanes) */
3480 + mov v18.16b, v8.16b
3481 + trn1 v8.4s, v8.4s, v10.4s
3482 + trn2 v10.4s, v18.4s, v10.4s
3483 + /* Transpose v13-v15 (32-bit lanes) */
3484 + mov v18.16b, v13.16b
3485 + trn1 v13.4s, v13.4s, v15.4s
3486 + trn2 v15.4s, v18.4s, v15.4s
3487 + /* Swap low half of v14 with high half of v10 (x22 is scratch) */
3488 + umov x22, v14.d[0]
3489 + ins v14.2d[0], v10.2d[1]
3490 + ins v10.2d[1], x22
3491 + /* Swap low half of v13 with high half of v9 */
3492 +
3493 + umov x22, v13.d[0]
3494 + ins v13.2d[0], v9.2d[1]
3495 + ins v9.2d[1], x22
3496 + /* 1-D IDCT, pass 2 */
3497 + sub v2.8h, v10.8h, v14.8h
3498 + /* Swap low half of v15 with high half of v11 */
3499 + umov x22, v15.d[0]
3500 + ins v15.2d[0], v11.2d[1]
3501 + ins v11.2d[1], x22
3502 + add v14.8h, v10.8h, v14.8h
3503 + /* Swap low half of v12 with high half of v8 */
3504 + umov x22, v12.d[0]
3505 + ins v12.2d[0], v8.2d[1]
3506 + ins v8.2d[1], x22
3507 + sub v1.8h, v11.8h, v13.8h
3508 + add v13.8h, v11.8h, v13.8h
3509 + sub v5.8h, v9.8h, v15.8h
3510 + add v15.8h, v9.8h, v15.8h
3511 + sqdmulh v4.8h, v2.8h, XFIX_1_414213562
3512 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930
3513 + add v3.8h, v1.8h, v1.8h
3514 + sub v1.8h, v5.8h, v1.8h
3515 + add v10.8h, v2.8h, v4.8h
3516 + sqdmulh v4.8h, v1.8h, XFIX_1_847759065
3517 + sub v2.8h, v15.8h, v13.8h
3518 + add v3.8h, v3.8h, v6.8h
3519 + sqdmulh v6.8h, v2.8h, XFIX_1_414213562
3520 + add v1.8h, v1.8h, v4.8h
3521 + sqdmulh v4.8h, v5.8h, XFIX_1_082392200
3522 + sub v10.8h, v10.8h, v14.8h
3523 + add v2.8h, v2.8h, v6.8h
3524 + sub v6.8h, v8.8h, v12.8h
3525 + add v12.8h, v8.8h, v12.8h
3526 + add v9.8h, v5.8h, v4.8h
3527 + add v5.8h, v6.8h, v10.8h
3528 + sub v10.8h, v6.8h, v10.8h
3529 + add v6.8h, v15.8h, v13.8h
3530 + add v8.8h, v12.8h, v14.8h
3531 + sub v3.8h, v6.8h, v3.8h
3532 + sub v12.8h, v12.8h, v14.8h
3533 + sub v3.8h, v3.8h, v1.8h
3534 + sub v1.8h, v9.8h, v1.8h
3535 + add v2.8h, v3.8h, v2.8h
3536 + sub v15.8h, v8.8h, v6.8h
3537 + add v1.8h, v1.8h, v2.8h
3538 + add v8.8h, v8.8h, v6.8h
3539 + add v14.8h, v5.8h, v3.8h
3540 + sub v9.8h, v5.8h, v3.8h
3541 + sub v13.8h, v10.8h, v2.8h
3542 + add v10.8h, v10.8h, v2.8h
3543 + sub v11.8h, v12.8h, v1.8h
3544 + add v12.8h, v12.8h, v1.8h
3545 + /* Descale to 8-bit and range limit */
3546 + movi v0.16b, #0x80
3547 + sqshrn v8.8b, v8.8h, #5
3548 + sqshrn2 v8.16b, v9.8h, #5
3549 + sqshrn v9.8b, v10.8h, #5
3550 + sqshrn2 v9.16b, v11.8h, #5
3551 + sqshrn v10.8b, v12.8h, #5
3552 + sqshrn2 v10.16b, v13.8h, #5
3553 + sqshrn v11.8b, v14.8h, #5
3554 + sqshrn2 v11.16b, v15.8h, #5
3555 + add v8.16b, v8.16b, v0.16b
3556 + add v9.16b, v9.16b, v0.16b
3557 + add v10.16b, v10.16b, v0.16b
3558 + add v11.16b, v11.16b, v0.16b
3559 + /* Transpose the final 8-bit samples */
3560 + /* Transpose v8-v9 (16-bit lanes) */
3561 + mov v18.16b, v8.16b
3562 + trn1 v8.8h, v8.8h, v9.8h
3563 + trn2 v9.8h, v18.8h, v9.8h
3564 + /* Transpose v10-v11 (16-bit lanes) */
3565 + mov v18.16b, v10.16b
3566 + trn1 v10.8h, v10.8h, v11.8h
3567 + trn2 v11.8h, v18.8h, v11.8h
3568 + /* Transpose v8-v10 (32-bit lanes) */
3569 + mov v18.16b, v8.16b
3570 + trn1 v8.4s, v8.4s, v10.4s
3571 + trn2 v10.4s, v18.4s, v10.4s
3572 + /* Transpose v9-v11 (32-bit lanes) */
3573 + mov v18.16b, v9.16b
3574 + trn1 v9.4s, v9.4s, v11.4s
3575 + trn2 v11.4s, v18.4s, v11.4s
3576 + /* copy the high half of v8 into v17 */
3577 + ins v17.2d[0], v8.2d[1]
3578 + /* Byte-transpose the low half of v8 against its former high half (now in v17) */
3579 + mov v18.16b, v8.16b
3580 + trn1 v8.8b, v8.8b, v17.8b
3581 + trn2 v17.8b, v18.8b, v17.8b
3582 + /* same for v9, using v19 for its high half */
3583 + ins v19.2d[0], v9.2d[1]
3584 + mov v18.16b, v9.16b
3585 + trn1 v9.8b, v9.8b, v19.8b
3586 + trn2 v19.8b, v18.8b, v19.8b
3587 + /* Store results to the output buffer */
3588 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3589 + add TMP1, TMP1, OUTPUT_COL
3590 + add TMP2, TMP2, OUTPUT_COL
3591 + st1 {v8.8b}, [TMP1]
3592 + st1 {v17.8b}, [TMP2]
3593 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3594 + add TMP1, TMP1, OUTPUT_COL
3595 + add TMP2, TMP2, OUTPUT_COL
3596 + st1 {v9.8b}, [TMP1]
3597 + /* same for v10, using v7 for its high half */
3598 + ins v7.2d[0], v10.2d[1]
3599 + mov v18.16b, v10.16b
3600 + trn1 v10.8b, v10.8b, v7.8b
3601 + trn2 v7.8b, v18.8b, v7.8b
3602 + st1 {v19.8b}, [TMP2]
3603 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3604 + ldp TMP4, TMP5, [OUTPUT_BUF], 16
3605 + add TMP1, TMP1, OUTPUT_COL
3606 + add TMP2, TMP2, OUTPUT_COL
3607 + add TMP4, TMP4, OUTPUT_COL
3608 + add TMP5, TMP5, OUTPUT_COL
3609 + st1 {v10.8b}, [TMP1]
3610 + /* same for v11, using v16 for its high half */
3611 + ins v16.2d[0], v11.2d[1]
3612 + mov v18.16b, v11.16b
3613 + trn1 v11.8b, v11.8b, v16.8b
3614 + trn2 v16.8b, v18.8b, v16.8b
3615 + st1 {v7.8b}, [TMP2]
3616 + st1 {v11.8b}, [TMP4]
3617 + st1 {v16.8b}, [TMP5]
3618 + /* Restore saved registers; sp still points at the frame base, the post-incremented loads release it */
3619 + ldp x22, x23, [sp], 16
3620 + ld1 {v0.8b - v3.8b}, [sp], 32
3621 + ld1 {v4.8b - v7.8b}, [sp], 32
3622 + ld1 {v8.8b - v11.8b}, [sp], 32
3623 + ld1 {v12.8b - v15.8b}, [sp], 32
3624 + ld1 {v16.8b - v19.8b}, [sp], 32
3625 + ret /* return via x30; the old blr x30 was a call that overwrote the link register */
3626 +
3627 + .unreq DCT_TABLE
3628 + .unreq COEF_BLOCK
3629 + .unreq OUTPUT_BUF
3630 + .unreq OUTPUT_COL
3631 + .unreq TMP1
3632 + .unreq TMP2
3633 + .unreq TMP3
3634 + .unreq TMP4
3635 +
3636 +
3637 +/*****************************************************************************/
3638 +
3639 +/*
3640 + * jsimd_idct_4x4_neon
3641 + *
3642 + * This function contains inverse-DCT code for getting reduced-size
3643 + * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
3644 + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
3645 + * function from jpeg-6b (jidctred.c).
3646 + *
3647 + * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
3648 + * requires far fewer arithmetic operations and hence should be faster.
3649 + * The primary purpose of this particular NEON optimized function is
3650 + * bit exact compatibility with jpeg-6b.
3651 + *
3652 + * TODO: slightly better instruction scheduling can be achieved by expanding
3653 + * idct_helper/transpose_4x4 macros and reordering instructions,
3654 + * but readability will suffer somewhat.
3655 + */
3656 +
3657 +#define CONST_BITS 13 /* the FIX_* values below are the named constants scaled by 2^13 */
3658 +
3659 +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
3660 +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
3661 +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
3662 +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
3663 +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
3664 +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
3665 +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
3666 +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
3667 +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
3668 +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
3669 +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
3670 +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
3671 +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
3672 +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
3673 +
3674 +.balign 16
3675 +jsimd_idct_4x4_neon_consts:
3676 + .short FIX_1_847759065 /* v0.4h[0] */
3677 + .short -FIX_0_765366865 /* v0.4h[1] */
3678 + .short -FIX_0_211164243 /* v0.4h[2] */
3679 + .short FIX_1_451774981 /* v0.4h[3] */
3680 + .short -FIX_2_172734803 /* v1.4h[0] */
3681 + .short FIX_1_061594337 /* v1.4h[1] */
3682 + .short -FIX_0_509795579 /* v1.4h[2] */
3683 + .short -FIX_0_601344887 /* v1.4h[3] */
3684 + .short FIX_0_899976223 /* v2.4h[0] */
3685 + .short FIX_2_562915447 /* v2.4h[1] */
3686 + .short 1 << (CONST_BITS+1) /* v2.4h[2]: multiplier applied to the row-0 input */
3687 + .short 0 /* v2.4h[3] */
3688 +
3689 +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 /* one 1-D pass on four 16-bit lanes: reads \x4..\x16, writes \y26..\y29; expects v0-v2 to hold jsimd_idct_4x4_neon_consts; clobbers v20, v24, v26, v28, v30 */
3690 + smull v28.4s, \x4, v2.4h[2] /* v28 = x4 * (1 << (CONST_BITS+1)) */
3691 + smlal v28.4s, \x8, v0.4h[0] /* + x8 * FIX_1_847759065 */
3692 + smlal v28.4s, \x14, v0.4h[1] /* - x14 * FIX_0_765366865 */
3693 +
3694 + smull v26.4s, \x16, v1.4h[2] /* v26 = -x16 * FIX_0_509795579 */
3695 + smlal v26.4s, \x12, v1.4h[3] /* - x12 * FIX_0_601344887 */
3696 + smlal v26.4s, \x10, v2.4h[0] /* + x10 * FIX_0_899976223 */
3697 + smlal v26.4s, \x6, v2.4h[1] /* + x6 * FIX_2_562915447 */
3698 +
3699 + smull v30.4s, \x4, v2.4h[2] /* v30 = same x4 term as v28 ... */
3700 + smlsl v30.4s, \x8, v0.4h[0] /* ... with the x8 term subtracted */
3701 + smlsl v30.4s, \x14, v0.4h[1] /* ... and the x14 term subtracted */
3702 +
3703 + smull v24.4s, \x16, v0.4h[2] /* v24 = -x16 * FIX_0_211164243 */
3704 + smlal v24.4s, \x12, v0.4h[3] /* + x12 * FIX_1_451774981 */
3705 + smlal v24.4s, \x10, v1.4h[0] /* - x10 * FIX_2_172734803 */
3706 + smlal v24.4s, \x6, v1.4h[1] /* + x6 * FIX_1_061594337 */
3707 +
3708 + add v20.4s, v28.4s, v26.4s /* sum -> \y26 */
3709 + sub v28.4s, v28.4s, v26.4s /* difference -> \y29 */
3710 +
3711 +.if \shift > 16 /* shifts above 16 are done as a 32-bit rounding shift followed by a narrow */
3712 + srshr v20.4s, v20.4s, #\shift
3713 + srshr v28.4s, v28.4s, #\shift
3714 + xtn \y26, v20.4s
3715 + xtn \y29, v28.4s
3716 +.else
3717 + rshrn \y26, v20.4s, #\shift /* rounding shift and narrow in one instruction */
3718 + rshrn \y29, v28.4s, #\shift
3719 +.endif
3720 +
3721 + add v20.4s, v30.4s, v24.4s /* sum -> \y27 */
3722 + sub v30.4s, v30.4s, v24.4s /* difference -> \y28 */
3723 +
3724 +.if \shift > 16
3725 + srshr v20.4s, v20.4s, #\shift
3726 + srshr v30.4s, v30.4s, #\shift
3727 + xtn \y27, v20.4s
3728 + xtn \y28, v30.4s
3729 +.else
3730 + rshrn \y27, v20.4s, #\shift
3731 + rshrn \y28, v30.4s, #\shift
3732 +.endif
3733 +
3734 +.endm
3735 +
3736 +asm_function jsimd_idct_4x4_neon
3737 + /* In: x0 = DCT_TABLE, x1 = COEF_BLOCK, x2 = OUTPUT_BUF (array of 4 row pointers), x3 = OUTPUT_COL */
3738 + DCT_TABLE .req x0
3739 + COEF_BLOCK .req x1
3740 + OUTPUT_BUF .req x2
3741 + OUTPUT_COL .req x3
3742 + TMP1 .req x0
3743 + TMP2 .req x1
3744 + TMP3 .req x2
3745 + TMP4 .req x15
3746 +
3747 + /* Save all used NEON registers; sp stays at the frame base so the save area is never below sp */
3748 + sub sp, sp, 272
3749 + str x15, [sp]
3750 + add TMP4, sp, #16 /* TMP4 walks the vector save area (16 + 8 * 32 = 272 bytes) */
3751 + st1 {v0.8b - v3.8b}, [TMP4], 32
3752 + st1 {v4.8b - v7.8b}, [TMP4], 32
3753 + st1 {v8.8b - v11.8b}, [TMP4], 32
3754 + st1 {v12.8b - v15.8b}, [TMP4], 32
3755 + st1 {v16.8b - v19.8b}, [TMP4], 32
3756 + st1 {v20.8b - v23.8b}, [TMP4], 32
3757 + st1 {v24.8b - v27.8b}, [TMP4], 32
3758 + st1 {v28.8b - v31.8b}, [TMP4], 32
3759 + adr TMP4, jsimd_idct_4x4_neon_consts /* Load constants (v3.4h is just used for padding) */
3760 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
3761 +
3762 + /* Load all COEF_BLOCK into NEON registers with the following allocation:
3763 + * 0 1 2 3 | 4 5 6 7
3764 + * ---------+--------
3765 + * 0 | v4.4h | v5.4h
3766 + * 1 | v6.4h | v7.4h
3767 + * 2 | v8.4h | v9.4h
3768 + * 3 | v10.4h | v11.4h
3769 + * 4 | - | -
3770 + * 5 | v12.4h | v13.4h
3771 + * 6 | v14.4h | v15.4h
3772 + * 7 | v16.4h | v17.4h
3773 + */
3774 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
3775 + ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
3776 + add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4, which this reduced IDCT does not read */
3777 + ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
3778 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
3779 + /* dequantize */
3780 + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
3781 + mul v4.4h, v4.4h, v18.4h
3782 + mul v5.4h, v5.4h, v19.4h
3783 + ins v4.2d[1], v5.2d[0] /* 128 bit q4 */
3784 + ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
3785 + mul v6.4h, v6.4h, v20.4h
3786 + mul v7.4h, v7.4h, v21.4h
3787 + ins v6.2d[1], v7.2d[0] /* 128 bit q6 */
3788 + mul v8.4h, v8.4h, v22.4h
3789 + mul v9.4h, v9.4h, v23.4h
3790 + ins v8.2d[1], v9.2d[0] /* 128 bit q8 */
3791 + add DCT_TABLE, DCT_TABLE, #16 /* skip the row-4 quantization entries as well */
3792 + ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
3793 + mul v10.4h, v10.4h, v24.4h
3794 + mul v11.4h, v11.4h, v25.4h
3795 + ins v10.2d[1], v11.2d[0] /* 128 bit q10 */
3796 + mul v12.4h, v12.4h, v26.4h
3797 + mul v13.4h, v13.4h, v27.4h
3798 + ins v12.2d[1], v13.2d[0] /* 128 bit q12 */
3799 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
3800 + mul v14.4h, v14.4h, v28.4h
3801 + mul v15.4h, v15.4h, v29.4h
3802 + ins v14.2d[1], v15.2d[0] /* 128 bit q14 */
3803 + mul v16.4h, v16.4h, v30.4h
3804 + mul v17.4h, v17.4h, v31.4h
3805 + ins v16.2d[1], v17.2d[0] /* 128 bit q16 */
3806 +
3807 + /* Pass 1 */
3808 + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
3809 + transpose_4x4 v4, v6, v8, v10, v3
3810 + ins v10.2d[1], v11.2d[0]
3811 + idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
3812 + transpose_4x4 v5, v7, v9, v11, v3
3813 + ins v10.2d[1], v11.2d[0]
3814 + /* Pass 2 */
3815 + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
3816 + transpose_4x4 v26, v27, v28, v29, v3
3817 +
3818 + /* Range limit */
3819 + movi v30.8h, #0x80
3820 + ins v26.2d[1], v27.2d[0]
3821 + ins v28.2d[1], v29.2d[0]
3822 + add v26.8h, v26.8h, v30.8h
3823 + add v28.8h, v28.8h, v30.8h
3824 + sqxtun v26.8b, v26.8h
3825 + sqxtun v27.8b, v28.8h
3826 +
3827 + /* Store results to the output buffer */
3828 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3829 + ldp TMP3, TMP4, [OUTPUT_BUF]
3830 + add TMP1, TMP1, OUTPUT_COL
3831 + add TMP2, TMP2, OUTPUT_COL
3832 + add TMP3, TMP3, OUTPUT_COL
3833 + add TMP4, TMP4, OUTPUT_COL
3834 +
3835 +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
3836 + /* We can use much less instructions on little endian systems if the
3837 + * OS kernel is not configured to trap unaligned memory accesses
3838 + */
3839 + st1 {v26.s}[0], [TMP1], 4
3840 + st1 {v27.s}[0], [TMP3], 4
3841 + st1 {v26.s}[1], [TMP2], 4
3842 + st1 {v27.s}[1], [TMP4], 4
3843 +#else
3844 + st1 {v26.b}[0], [TMP1], 1
3845 + st1 {v27.b}[0], [TMP3], 1
3846 + st1 {v26.b}[1], [TMP1], 1
3847 + st1 {v27.b}[1], [TMP3], 1
3848 + st1 {v26.b}[2], [TMP1], 1
3849 + st1 {v27.b}[2], [TMP3], 1
3850 + st1 {v26.b}[3], [TMP1], 1
3851 + st1 {v27.b}[3], [TMP3], 1
3852 +
3853 + st1 {v26.b}[4], [TMP2], 1
3854 + st1 {v27.b}[4], [TMP4], 1
3855 + st1 {v26.b}[5], [TMP2], 1
3856 + st1 {v27.b}[5], [TMP4], 1
3857 + st1 {v26.b}[6], [TMP2], 1
3858 + st1 {v27.b}[6], [TMP4], 1
3859 + st1 {v26.b}[7], [TMP2], 1
3860 + st1 {v27.b}[7], [TMP4], 1
3861 +#endif
3862 +
3863 + /* Restore the saved registers. sp still points at the frame base, */
3864 + /* so the post-incremented loads below also release the 272-byte frame. */
3865 + ldr x15, [sp], 16
3866 + ld1 {v0.8b - v3.8b}, [sp], 32
3867 + ld1 {v4.8b - v7.8b}, [sp], 32
3868 + ld1 {v8.8b - v11.8b}, [sp], 32
3869 + ld1 {v12.8b - v15.8b}, [sp], 32
3870 + ld1 {v16.8b - v19.8b}, [sp], 32
3871 + ld1 {v20.8b - v23.8b}, [sp], 32
3872 + ld1 {v24.8b - v27.8b}, [sp], 32
3873 + ld1 {v28.8b - v31.8b}, [sp], 32
3874 + ret /* return via x30; the old blr x30 was a call that overwrote the link register */
3875 +
3876 + .unreq DCT_TABLE
3877 + .unreq COEF_BLOCK
3878 + .unreq OUTPUT_BUF
3879 + .unreq OUTPUT_COL
3880 + .unreq TMP1
3881 + .unreq TMP2
3882 + .unreq TMP3
3883 + .unreq TMP4
3884 +
3885 +.purgem idct_helper
3886 +
3887 +
3888 +/*****************************************************************************/
3889 +
3890 +/*
3891 + * jsimd_idct_2x2_neon
3892 + *
3893 + * This function contains inverse-DCT code for getting reduced-size
3894 + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
3895 + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
3896 + * function from jpeg-6b (jidctred.c).
3897 + *
3898 + * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
3899 + * requires far fewer arithmetic operations and hence should be faster.
3900 + * The primary purpose of this particular NEON optimized function is
3901 + * bit exact compatibility with jpeg-6b.
3902 + */
3903 +
3904 +.balign 8
3905 +jsimd_idct_2x2_neon_consts:
3906 + .short -FIX_0_720959822 /* v14.4h[0] */
3907 + .short FIX_0_850430095 /* v14.4h[1] */
3908 + .short -FIX_1_272758580 /* v14.4h[2] */
3909 + .short FIX_3_624509785 /* v14.4h[3] */
3910 +
3911 +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 /* one 1-D pass on four 16-bit lanes: reads \x4..\x16, writes \y26 and \y27; expects v14.4h to hold jsimd_idct_2x2_neon_consts; clobbers v15, v20, v26 */
3912 + sshll v15.4s, \x4, #15 /* v15 = x4 widened to 32 bits and shifted left by 15 */
3913 + smull v26.4s, \x6, v14.4h[3] /* v26 = x6 * FIX_3_624509785 */
3914 + smlal v26.4s, \x10, v14.4h[2] /* - x10 * FIX_1_272758580 */
3915 + smlal v26.4s, \x12, v14.4h[1] /* + x12 * FIX_0_850430095 */
3916 + smlal v26.4s, \x16, v14.4h[0] /* - x16 * FIX_0_720959822 */
3917 +
3918 + add v20.4s, v15.4s, v26.4s /* sum -> \y26 */
3919 + sub v15.4s, v15.4s, v26.4s /* difference -> \y27 */
3920 +
3921 +.if \shift > 16 /* shifts above 16 are done as a 32-bit rounding shift followed by a narrow */
3922 + srshr v20.4s, v20.4s, #\shift
3923 + srshr v15.4s, v15.4s, #\shift
3924 + xtn \y26, v20.4s
3925 + xtn \y27, v15.4s
3926 +.else
3927 + rshrn \y26, v20.4s, #\shift /* rounding shift and narrow in one instruction */
3928 + rshrn \y27, v15.4s, #\shift
3929 +.endif
3930 +
3931 +.endm
3932 +
3933 +asm_function jsimd_idct_2x2_neon
3934 + /* In: x0 = dequantization table, x1 = coefficient block, x2 = address of two output row pointers, x3 = offset added to each row pointer. Writes two bytes to each of the two rows. */
3935 + DCT_TABLE .req x0
3936 + COEF_BLOCK .req x1
3937 + OUTPUT_BUF .req x2
3938 + OUTPUT_COL .req x3
3939 + TMP1 .req x0 /* shares x0 with DCT_TABLE; only used after the table has been read */
3940 + TMP2 .req x15
3941 +
3942 + /* Save x15 and the low 64 bits of the NEON registers listed below (no vpush on AArch64). The post-indexed stores advance sp by 16 + 4*32 + 16 + 32 + 16 = 208 bytes, so sp is back at its entry value once the saves are done. NOTE(review): the save area therefore lies below sp while the body runs - confirm the target ABI tolerates that. */
3943 + sub sp, sp, 208
3944 + str x15, [sp], 16
3945 +
3946 + /* Load constants (TMP2 is x15, whose old value was stored just above) */
3947 + adr TMP2, jsimd_idct_2x2_neon_consts
3948 + st1 {v4.8b - v7.8b}, [sp], 32
3949 + st1 {v8.8b - v11.8b}, [sp], 32
3950 + st1 {v12.8b - v15.8b}, [sp], 32
3951 + st1 {v16.8b - v19.8b}, [sp], 32
3952 + st1 {v21.8b - v22.8b}, [sp], 16
3953 + st1 {v24.8b - v27.8b}, [sp], 32
3954 + st1 {v30.8b - v31.8b}, [sp], 16
3955 + ld1 {v14.4h}, [TMP2] /* v14.4h = the four multipliers from jsimd_idct_2x2_neon_consts */
3956 +
3957 + /* Load all COEF_BLOCK into NEON registers with the following allocation:
3958 + * 0 1 2 3 | 4 5 6 7
3959 + * ---------+--------
3960 + * 0 | v4.4h | v5.4h
3961 + * 1 | v6.4h | v7.4h
3962 + * 2 | - | -
3963 + * 3 | v10.4h | v11.4h
3964 + * 4 | - | -
3965 + * 5 | v12.4h | v13.4h
3966 + * 6 | - | -
3967 + * 7 | v16.4h | v17.4h
3968 + */
3969 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 /* rows 0 and 1 */
3970 + add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 2 (16 bytes per row) */
3971 + ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 /* row 3 */
3972 + add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 */
3973 + ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 /* row 5 */
3974 + add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 6 */
3975 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 /* row 7 */
3976 + /* Dequantize: multiply each loaded row by the matching row of DCT_TABLE, skipping the same rows */
3977 + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
3978 + mul v4.4h, v4.4h, v18.4h
3979 + mul v5.4h, v5.4h, v19.4h
3980 + ins v4.2d[1], v5.2d[0]
3981 + mul v6.4h, v6.4h, v20.4h
3982 + mul v7.4h, v7.4h, v21.4h
3983 + ins v6.2d[1], v7.2d[0]
3984 + add DCT_TABLE, DCT_TABLE, #16
3985 + ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
3986 + mul v10.4h, v10.4h, v24.4h
3987 + mul v11.4h, v11.4h, v25.4h
3988 + ins v10.2d[1], v11.2d[0]
3989 + add DCT_TABLE, DCT_TABLE, #16
3990 + ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
3991 + mul v12.4h, v12.4h, v26.4h
3992 + mul v13.4h, v13.4h, v27.4h
3993 + ins v12.2d[1], v13.2d[0]
3994 + add DCT_TABLE, DCT_TABLE, #16
3995 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
3996 + mul v16.4h, v16.4h, v30.4h
3997 + mul v17.4h, v17.4h, v31.4h
3998 + ins v16.2d[1], v17.2d[0]
3999 +
4000 + /* Pass 1: same arithmetic as idct_helper with shift 13, written out for columns 0-3 (v26/v15) and columns 4-7 (v24/v30) */
4001 +#if 0 /* disabled variant built on idct_helper; the #else branch below is the one assembled */
4002 + idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
4003 + transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
4004 + idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
4005 + transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
4006 +#else
4007 + smull v26.4s, v6.4h, v14.4h[3]
4008 + smlal v26.4s, v10.4h, v14.4h[2]
4009 + smlal v26.4s, v12.4h, v14.4h[1]
4010 + smlal v26.4s, v16.4h, v14.4h[0]
4011 + smull v24.4s, v7.4h, v14.4h[3]
4012 + smlal v24.4s, v11.4h, v14.4h[2]
4013 + smlal v24.4s, v13.4h, v14.4h[1]
4014 + smlal v24.4s, v17.4h, v14.4h[0]
4015 + sshll v15.4s, v4.4h, #15 /* row 0, columns 0-3, shifted left by 15 */
4016 + sshll v30.4s, v5.4h, #15 /* row 0, columns 4-7, shifted left by 15 */
4017 + add v20.4s, v15.4s, v26.4s
4018 + sub v15.4s, v15.4s, v26.4s
4019 + rshrn v4.4h, v20.4s, #13 /* sums, columns 0-3 */
4020 + rshrn v6.4h, v15.4s, #13 /* differences, columns 0-3 */
4021 + add v20.4s, v30.4s, v24.4s
4022 + sub v15.4s, v30.4s, v24.4s
4023 + rshrn v5.4h, v20.4s, #13 /* sums, columns 4-7 */
4024 + rshrn v7.4h, v15.4s, #13 /* differences, columns 4-7 */
4025 + ins v4.2d[1], v5.2d[0] /* v4.8h = sums for all eight columns */
4026 + ins v6.2d[1], v7.2d[0] /* v6.8h = differences for all eight columns */
4027 + transpose v4, v6, v3, .16b, .8h /* transpose macro is defined outside this view; v3 is passed as its third operand */
4028 + transpose v6, v10, v3, .16b, .4s
4029 + ins v11.2d[0], v10.2d[1] /* copy the upper halves of v10 and v6 into v11 and v7 for use as pass 2 inputs */
4030 + ins v7.2d[0], v6.2d[1]
4031 +#endif
4032 +
4033 + /* Pass 2: results land in v26.4h (sums) and v27.4h (differences) */
4034 + idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
4035 +
4036 + /* Range limit: add 128 and saturate to unsigned 8 bits */
4037 + movi v30.8h, #0x80
4038 + ins v26.2d[1], v27.2d[0] /* v26.8h = {sums, differences} */
4039 + add v26.8h, v26.8h, v30.8h
4040 + sqxtun v30.8b, v26.8h /* bytes 0-3 = saturated sums */
4041 + ins v26.2d[0], v30.2d[0] /* low half of v26 now holds those bytes; its high half still holds the biased differences */
4042 + sqxtun v27.8b, v26.8h /* bytes 4-7 = saturated differences */
4043 +
4044 + /* Store results to the output buffer: two bytes to each of the two rows */
4045 + ldp TMP1, TMP2, [OUTPUT_BUF]
4046 + add TMP1, TMP1, OUTPUT_COL
4047 + add TMP2, TMP2, OUTPUT_COL
4048 +
4049 + st1 {v26.b}[0], [TMP1], 1
4050 + st1 {v27.b}[4], [TMP1], 1
4051 + st1 {v26.b}[1], [TMP2], 1
4052 + st1 {v27.b}[5], [TMP2], 1
4053 +
4054 + sub sp, sp, #208 /* step back to the save area; the post-indexed loads below bring sp back to its entry value */
4055 + ldr x15, [sp], 16
4056 + ld1 {v4.8b - v7.8b}, [sp], 32
4057 + ld1 {v8.8b - v11.8b}, [sp], 32
4058 + ld1 {v12.8b - v15.8b}, [sp], 32
4059 + ld1 {v16.8b - v19.8b}, [sp], 32
4060 + ld1 {v21.8b - v22.8b}, [sp], 16
4061 + ld1 {v24.8b - v27.8b}, [sp], 32
4062 + ld1 {v30.8b - v31.8b}, [sp], 16
4063 + br x30 /* return to the caller; br rather than blr, because blr is a call that also rewrites x30 (same form as the jsimd_ycc_*_convert_neon return) */
4064 +
4065 + .unreq DCT_TABLE
4066 + .unreq COEF_BLOCK
4067 + .unreq OUTPUT_BUF
4068 + .unreq OUTPUT_COL
4069 + .unreq TMP1
4070 + .unreq TMP2
4071 +
4072 +.purgem idct_helper
4073 +
4074 +
4075 +/*****************************************************************************/
4076 +
4077 +/*
4078 + * jsimd_ycc_extrgb_convert_neon
4079 + * jsimd_ycc_extbgr_convert_neon
4080 + * jsimd_ycc_extrgbx_convert_neon
4081 + * jsimd_ycc_extbgrx_convert_neon
4082 + * jsimd_ycc_extxbgr_convert_neon
4083 + * jsimd_ycc_extxrgb_convert_neon
4084 + *
4085 + * Colorspace conversion YCbCr -> RGB
4086 + */
4087 +
4088 +
4089 +.macro do_load size /* load "size" bytes from each of U, V and Y into v4, v5 and v0, advancing the three pointers; U, V, Y are register aliases supplied by the code that uses this macro */
4090 + .if \size == 8 /* full group: all eight lanes, plus prefetch 64 bytes ahead */
4091 + ld1 {v4.8b}, [U], 8
4092 + ld1 {v5.8b}, [V], 8
4093 + ld1 {v0.8b}, [Y], 8
4094 + prfm PLDL1KEEP, [U, #64]
4095 + prfm PLDL1KEEP, [V, #64]
4096 + prfm PLDL1KEEP, [Y, #64]
4097 + .elseif \size == 4 /* partial group: fills lanes 0-3 */
4098 + ld1 {v4.b}[0], [U], 1
4099 + ld1 {v4.b}[1], [U], 1
4100 + ld1 {v4.b}[2], [U], 1
4101 + ld1 {v4.b}[3], [U], 1
4102 + ld1 {v5.b}[0], [V], 1
4103 + ld1 {v5.b}[1], [V], 1
4104 + ld1 {v5.b}[2], [V], 1
4105 + ld1 {v5.b}[3], [V], 1
4106 + ld1 {v0.b}[0], [Y], 1
4107 + ld1 {v0.b}[1], [Y], 1
4108 + ld1 {v0.b}[2], [Y], 1
4109 + ld1 {v0.b}[3], [Y], 1
4110 + .elseif \size == 2 /* partial group: fills lanes 4-5, so it can be combined with the size 4 and size 1 cases */
4111 + ld1 {v4.b}[4], [U], 1
4112 + ld1 {v4.b}[5], [U], 1
4113 + ld1 {v5.b}[4], [V], 1
4114 + ld1 {v5.b}[5], [V], 1
4115 + ld1 {v0.b}[4], [Y], 1
4116 + ld1 {v0.b}[5], [Y], 1
4117 + .elseif \size == 1 /* partial group: fills lane 6 */
4118 + ld1 {v4.b}[6], [U], 1
4119 + ld1 {v5.b}[6], [V], 1
4120 + ld1 {v0.b}[6], [Y], 1
4121 + .else
4122 + .error unsupported macroblock size
4123 + .endif
4124 +.endm
4125 +
4126 +.macro do_store bpp, size /* store "size" converted pixels through RGB, advancing it; lane selection mirrors do_load (8 = all lanes, 4 = lanes 0-3, 2 = lanes 4-5, 1 = lane 6) */
4127 + .if \bpp == 24 /* three bytes per pixel, interleaved from v10, v11, v12 */
4128 + .if \size == 8
4129 + st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
4130 + .elseif \size == 4
4131 + st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
4132 + st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
4133 + st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
4134 + st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
4135 + .elseif \size == 2
4136 + st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
4137 + st3 {v10.b, v11.b, v12.b}[5], [RGB], 3
4138 + .elseif \size == 1
4139 + st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
4140 + .else
4141 + .error unsupported macroblock size
4142 + .endif
4143 + .elseif \bpp == 32 /* four bytes per pixel, interleaved from v10, v11, v12, v13 */
4144 + .if \size == 8
4145 + st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
4146 + .elseif \size == 4
4147 + st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
4148 + st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
4149 + st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
4150 + st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
4151 + .elseif \size == 2
4152 + st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
4153 + st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
4154 + .elseif \size == 1
4155 + st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
4156 + .else
4157 + .error unsupported macroblock size
4158 + .endif
4159 + .elseif \bpp==16 /* two bytes per pixel, taken from the 16-bit lanes of v25 */
4160 + .if \size == 8
4161 + st1 {v25.8h}, [RGB],16
4162 + .elseif \size == 4
4163 + st1 {v25.4h}, [RGB],8
4164 + .elseif \size == 2
4165 + st1 {v25.h}[4], [RGB],2
4166 + st1 {v25.h}[5], [RGB],2
4167 + .elseif \size == 1
4168 + st1 {v25.h}[6], [RGB],2
4169 + .else
4170 + .error unsupported macroblock size
4171 + .endif
4172 + .else
4173 + .error unsupported bpp
4174 + .endif
4175 +.endm
4176 +
4177 +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
4178 + /* Expands to the constant table jsimd_ycc_<colorid>_neon_consts and the function jsimd_ycc_<colorid>_convert_neon. bpp selects the store path (24, 32 or 16). r_offs, g_offs and b_offs choose which of v10-v13 receives each channel, and defsize is the arrangement suffix appended to that register name. rsize, gsize and bsize are not referenced in this body. */
4179 +/*
4180 + * 2-stage pipelined YCbCr->RGB conversion
4181 + */
4182 +
4183 +.macro do_yuv_to_rgb_stage1 /* stage 1: bias U and V by -128 and form the 32-bit chroma products (v20/v22 green, v24/v26 red, v28/v30 blue) */
4184 + uaddw v6.8h, v2.8h, v4.8b /* v6 = u - 128 */
4185 + uaddw v8.8h, v2.8h, v5.8b /* v8 = v - 128 */
4186 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
4187 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
4188 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
4189 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
4190 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
4191 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
4192 + smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
4193 + smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
4194 +.endm
4195 +
4196 +.macro do_yuv_to_rgb_stage2 /* stage 2: round and narrow the products, add Y, then saturate into the output registers */
4197 + rshrn v20.4h, v20.4s, #15 /* green term, lanes 0-3 */
4198 + rshrn2 v20.8h, v22.4s, #15 /* green term, lanes 4-7 */
4199 + rshrn v24.4h, v24.4s, #14 /* red term, lanes 0-3 */
4200 + rshrn2 v24.8h, v26.4s, #14 /* red term, lanes 4-7 */
4201 + rshrn v28.4h, v28.4s, #14 /* blue term, lanes 0-3 */
4202 + rshrn2 v28.8h, v30.4s, #14 /* blue term, lanes 4-7 */
4203 + uaddw v20.8h, v20.8h, v0.8b /* add Y to each term */
4204 + uaddw v24.8h, v24.8h, v0.8b
4205 + uaddw v28.8h, v28.8h, v0.8b
4206 +.if \bpp != 16 /* 24/32 bpp: saturate each channel to 8 bits in the register chosen by its offset argument */
4207 + sqxtun v1\g_offs\defsize, v20.8h
4208 + sqxtun v1\r_offs\defsize, v24.8h
4209 + sqxtun v1\b_offs\defsize, v28.8h
4210 +.else /* 16 bpp: pack into v25 as red in bits 15-11, green in bits 10-5, blue in bits 4-0 */
4211 + sqshlu v21.8h, v20.8h, #8 /* saturate to unsigned and move the value to the top byte of each 16-bit lane */
4212 + sqshlu v25.8h, v24.8h, #8
4213 + sqshlu v29.8h, v28.8h, #8
4214 + sri v25.8h, v21.8h, #5 /* insert green below the top 5 bits */
4215 + sri v25.8h, v29.8h, #11 /* insert blue below the top 11 bits */
4216 +.endif
4217 +
4218 +.endm
4219 +
4220 +.macro do_yuv_to_rgb_stage2_store_load_stage1 /* loop body: stage 2 and store for the current 8 pixels, interleaved with the load and stage 1 for the next 8 */
4221 + rshrn v20.4h, v20.4s, #15
4222 + rshrn v24.4h, v24.4s, #14
4223 + rshrn v28.4h, v28.4s, #14
4224 + ld1 {v4.8b}, [U], 8 /* next U samples */
4225 + rshrn2 v20.8h, v22.4s, #15
4226 + rshrn2 v24.8h, v26.4s, #14
4227 + rshrn2 v28.8h, v30.4s, #14
4228 + ld1 {v5.8b}, [V], 8 /* next V samples */
4229 + uaddw v20.8h, v20.8h, v0.8b /* current Y is consumed here, before v0 is reloaded below */
4230 + uaddw v24.8h, v24.8h, v0.8b
4231 + uaddw v28.8h, v28.8h, v0.8b
4232 +.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
4233 + sqxtun v1\g_offs\defsize, v20.8h
4234 + ld1 {v0.8b}, [Y], 8
4235 + sqxtun v1\r_offs\defsize, v24.8h
4236 + prfm PLDL1KEEP, [U, #64]
4237 + prfm PLDL1KEEP, [V, #64]
4238 + prfm PLDL1KEEP, [Y, #64]
4239 + sqxtun v1\b_offs\defsize, v28.8h
4240 + uaddw v6.8h, v2.8h, v4.8b /* v6 = u - 128 */
4241 + uaddw v8.8h, v2.8h, v5.8b /* v8 = v - 128 */
4242 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
4243 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
4244 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
4245 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
4246 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
4247 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
4248 +.else /**************************** rgb565 ***********************************/
4249 + sqshlu v21.8h, v20.8h, #8
4250 + sqshlu v25.8h, v24.8h, #8
4251 + sqshlu v29.8h, v28.8h, #8
4252 + uaddw v6.8h, v2.8h, v4.8b /* v6 = u - 128 */
4253 + uaddw v8.8h, v2.8h, v5.8b /* v8 = v - 128 */
4254 + ld1 {v0.8b}, [Y], 8
4255 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
4256 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
4257 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
4258 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
4259 + sri v25.8h, v21.8h, #5
4260 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
4261 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
4262 + prfm PLDL1KEEP, [U, #64]
4263 + prfm PLDL1KEEP, [V, #64]
4264 + prfm PLDL1KEEP, [Y, #64]
4265 + sri v25.8h, v29.8h, #11
4266 +.endif
4267 + do_store \bpp, 8
4268 + smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
4269 + smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
4270 +.endm
4271 +
4272 +.macro do_yuv_to_rgb /* unpipelined form: stage 1 followed directly by stage 2 */
4273 + do_yuv_to_rgb_stage1
4274 + do_yuv_to_rgb_stage2
4275 +.endm
4276 +
4277 +/* Apple gas crashes on adrl, work around that by using adr.
4278 + * But this requires a copy of these constants for each function.
4279 + */
4280 +
4281 +.balign 16
4282 +jsimd_ycc_\colorid\()_neon_consts:
4283 + .short 0, 0, 0, 0 /* padding, loaded into v0.4h */
4284 + .short 22971, -11277, -23401, 29033 /* multipliers, loaded into v1.4h */
4285 + .short -128, -128, -128, -128 /* chroma bias, loaded into v2.8h together with the next row */
4286 + .short -128, -128, -128, -128
4287 +
4288 +asm_function jsimd_ycc_\colorid\()_convert_neon
4289 + OUTPUT_WIDTH .req x0
4290 + INPUT_BUF .req x1
4291 + INPUT_ROW .req x2
4292 + OUTPUT_BUF .req x3
4293 + NUM_ROWS .req x4
4294 +
4295 + INPUT_BUF0 .req x5
4296 + INPUT_BUF1 .req x6
4297 + INPUT_BUF2 .req INPUT_BUF /* reuses x1 once the three plane pointers have been read from it */
4298 +
4299 + RGB .req x7
4300 + Y .req x8
4301 + U .req x9
4302 + V .req x10
4303 + N .req x15
4304 +
4305 + sub sp, sp, 336 /* 16 (x15) + 8*32 (NEON) + 4*16 (general registers); the post-indexed stores below return sp to its entry value */
4306 + str x15, [sp], 16
4307 + /* Point x15 at the constants; they are loaded below into v1.4h (multipliers) and v2.8h (-128 bias), with v0.4h taking the leading padding words */
4308 + adr x15, jsimd_ycc_\colorid\()_neon_consts
4309 + /* Save NEON registers (low 64 bits of each) */
4310 + st1 {v0.8b - v3.8b}, [sp], 32
4311 + st1 {v4.8b - v7.8b}, [sp], 32
4312 + st1 {v8.8b - v11.8b}, [sp], 32
4313 + st1 {v12.8b - v15.8b}, [sp], 32
4314 + st1 {v16.8b - v19.8b}, [sp], 32
4315 + st1 {v20.8b - v23.8b}, [sp], 32
4316 + st1 {v24.8b - v27.8b}, [sp], 32
4317 + st1 {v28.8b - v31.8b}, [sp], 32
4318 + ld1 {v0.4h, v1.4h}, [x15], 16
4319 + ld1 {v2.8h}, [x15]
4320 +
4321 + /* Save ARM registers and handle input arguments */
4322 + /* push {x4, x5, x6, x7, x8, x9, x10, x30} */
4323 + stp x4, x5, [sp], 16
4324 + stp x6, x7, [sp], 16
4325 + stp x8, x9, [sp], 16
4326 + stp x10, x30, [sp], 16
4327 + ldr INPUT_BUF0, [INPUT_BUF] /* the three pointers at INPUT_BUF[0..2] are indexed by row below to obtain Y, U and V */
4328 + ldr INPUT_BUF1, [INPUT_BUF, 8]
4329 + ldr INPUT_BUF2, [INPUT_BUF, 16]
4330 + .unreq INPUT_BUF
4331 +
4332 + /* Preset every byte of v10 and v13 to 0xFF; in the 32 bpp layouts whichever of the two is not written as R, G or B supplies the fourth byte */
4333 + movi v10.16b, #255
4334 + movi v13.16b, #255
4335 +
4336 + /* Outer loop over scanlines */
4337 + cmp NUM_ROWS, #1
4338 + blt 9f
4339 +0:
4340 + lsl x16, INPUT_ROW, #3 /* byte offset of row INPUT_ROW in an array of 8-byte pointers */
4341 + ldr Y, [INPUT_BUF0, x16]
4342 + ldr U, [INPUT_BUF1, x16]
4343 + mov N, OUTPUT_WIDTH
4344 + ldr V, [INPUT_BUF2, x16]
4345 + add INPUT_ROW, INPUT_ROW, #1
4346 + ldr RGB, [OUTPUT_BUF], #8 /* next output row pointer */
4347 +
4348 + /* Inner loop over pixels */
4349 + subs N, N, #8
4350 + blt 3f /* fewer than 8 pixels in the row: tail handling only */
4351 + do_load 8
4352 + do_yuv_to_rgb_stage1
4353 + subs N, N, #8
4354 + blt 2f /* no second full group: finish the one in flight */
4355 +1:
4356 + do_yuv_to_rgb_stage2_store_load_stage1
4357 + subs N, N, #8
4358 + bge 1b
4359 +2:
4360 + do_yuv_to_rgb_stage2
4361 + do_store \bpp, 8
4362 + tst N, #7 /* N is now in -8..-1; its low three bits are the number of leftover pixels */
4363 + beq 8f
4364 +3:
4365 + tst N, #4
4366 + beq 3f
4367 + do_load 4
4368 +3:
4369 + tst N, #2
4370 + beq 4f
4371 + do_load 2
4372 +4:
4373 + tst N, #1
4374 + beq 5f
4375 + do_load 1
4376 +5:
4377 + do_yuv_to_rgb /* convert all eight lanes; only the lanes loaded above are stored below */
4378 + tst N, #4
4379 + beq 6f
4380 + do_store \bpp, 4
4381 +6:
4382 + tst N, #2
4383 + beq 7f
4384 + do_store \bpp, 2
4385 +7:
4386 + tst N, #1
4387 + beq 8f
4388 + do_store \bpp, 1
4389 +8:
4390 + subs NUM_ROWS, NUM_ROWS, #1
4391 + bgt 0b
4392 +9:
4393 + /* Restore all registers and return */
4394 + sub sp, sp, #336 /* step back to the save area; the post-indexed loads below bring sp back to its entry value */
4395 + ldr x15, [sp], 16
4396 + ld1 {v0.8b - v3.8b}, [sp], 32
4397 + ld1 {v4.8b - v7.8b}, [sp], 32
4398 + ld1 {v8.8b - v11.8b}, [sp], 32
4399 + ld1 {v12.8b - v15.8b}, [sp], 32
4400 + ld1 {v16.8b - v19.8b}, [sp], 32
4401 + ld1 {v20.8b - v23.8b}, [sp], 32
4402 + ld1 {v24.8b - v27.8b}, [sp], 32
4403 + ld1 {v28.8b - v31.8b}, [sp], 32
4404 + /* pop {x4, x5, x6, x7, x8, x9, x10, x30} */
4405 + ldp x4, x5, [sp], 16
4406 + ldp x6, x7, [sp], 16
4407 + ldp x8, x9, [sp], 16
4408 + ldp x10, x30, [sp], 16
4409 + br x30
4410 + .unreq OUTPUT_WIDTH
4411 + .unreq INPUT_ROW
4412 + .unreq OUTPUT_BUF
4413 + .unreq NUM_ROWS
4414 + .unreq INPUT_BUF0
4415 + .unreq INPUT_BUF1
4416 + .unreq INPUT_BUF2
4417 + .unreq RGB
4418 + .unreq Y
4419 + .unreq U
4420 + .unreq V
4421 + .unreq N
4422 +
4423 +.purgem do_yuv_to_rgb
4424 +.purgem do_yuv_to_rgb_stage1
4425 +.purgem do_yuv_to_rgb_stage2
4426 +.purgem do_yuv_to_rgb_stage2_store_load_stage1
4427 +.endm
4428 +
4429 +/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */
4430 +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b /* stored bytes: v10 = R, v11 = G, v12 = B */
4431 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b /* stored bytes: v10 = B, v11 = G, v12 = R */
4432 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b /* stored bytes: v10 = R, v11 = G, v12 = B, v13 = preset 0xFF */
4433 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b /* stored bytes: v10 = B, v11 = G, v12 = R, v13 = preset 0xFF */
4434 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b /* stored bytes: v10 = preset 0xFF, v11 = B, v12 = G, v13 = R */
4435 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b /* stored bytes: v10 = preset 0xFF, v11 = R, v12 = G, v13 = B */
4436 +generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b /* channel offsets are not used on the 16 bpp path, which packs pixels into v25 */
4437 +.purgem do_load /* do_load and do_store are only needed by the expansions above */
4438 +.purgem do_store
OLDNEW
« no previous file with comments | « djpeg.c ('k') | jdapistd.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698