OLD | NEW |
| (Empty) |
1 Index: jdmarker.c | |
2 =================================================================== | |
3 --- jdmarker.c (revision 829) | |
4 +++ jdmarker.c (working copy) | |
5 @@ -910,7 +910,7 @@ | |
6 } | |
7 | |
8 if (cinfo->marker->discarded_bytes != 0) { | |
9 - WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c); | |
10 + TRACEMS2(cinfo, 1, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c)
; | |
11 cinfo->marker->discarded_bytes = 0; | |
12 } | |
13 | |
14 @@ -944,7 +944,144 @@ | |
15 return TRUE; | |
16 } | |
17 | |
18 +#ifdef MOTION_JPEG_SUPPORTED | |
19 | |
20 +/* The default Huffman tables used by motion JPEG frames. When a motion JPEG | |
21 + * frame does not have DHT tables, we should use the Huffman tables suggested by | |
22 + * the JPEG standard. Each of these tables represents a member of the JHUFF_TBL | |
23 + * struct so we can just copy it to the corresponding JHUFF_TBL member. | |
24 + */ | |
25 +/* DC table 0 */ | |
26 +LOCAL(const unsigned char) mjpg_dc0_bits[] = { | |
27 + 0x00, 0x01, 0x05, 0x01, 0x01, 0x01, 0x01, 0x01, | |
28 + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 | |
29 +}; | |
30 + | |
31 +LOCAL(const unsigned char) mjpg_dc0_huffval[] = { | |
32 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |
33 + 0x08, 0x09, 0x0A, 0x0B | |
34 +}; | |
35 + | |
36 +/* DC table 1 */ | |
37 +LOCAL(const unsigned char) mjpg_dc1_bits[] = { | |
38 + 0x00, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, | |
39 + 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 | |
40 +}; | |
41 + | |
42 +LOCAL(const unsigned char) mjpg_dc1_huffval[] = { | |
43 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |
44 + 0x08, 0x09, 0x0A, 0x0B | |
45 +}; | |
46 + | |
47 +/* AC table 0 */ | |
48 +LOCAL(const unsigned char) mjpg_ac0_bits[] = { | |
49 + 0x00, 0x02, 0x01, 0x03, 0x03, 0x02, 0x04, 0x03, | |
50 + 0x05, 0x05, 0x04, 0x04, 0x00, 0x00, 0x01, 0x7D | |
51 +}; | |
52 + | |
53 +LOCAL(const unsigned char) mjpg_ac0_huffval[] = { | |
54 + 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, | |
55 + 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07, | |
56 + 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xA1, 0x08, | |
57 + 0x23, 0x42, 0xB1, 0xC1, 0x15, 0x52, 0xD1, 0xF0, | |
58 + 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0A, 0x16, | |
59 + 0x17, 0x18, 0x19, 0x1A, 0x25, 0x26, 0x27, 0x28, | |
60 + 0x29, 0x2A, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, | |
61 + 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, | |
62 + 0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, | |
63 + 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, | |
64 + 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, | |
65 + 0x7A, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, | |
66 + 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, | |
67 + 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, | |
68 + 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, | |
69 + 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3, 0xC4, 0xC5, | |
70 + 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4, | |
71 + 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xE1, 0xE2, | |
72 + 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, | |
73 + 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, | |
74 + 0xF9, 0xFA | |
75 +}; | |
76 + | |
77 +/* AC table 1 */ | |
78 +LOCAL(const unsigned char) mjpg_ac1_bits[] = { | |
79 + 0x00, 0x02, 0x01, 0x02, 0x04, 0x04, 0x03, 0x04, | |
80 + 0x07, 0x05, 0x04, 0x04, 0x00, 0x01, 0x02, 0x77 | |
81 +}; | |
82 + | |
83 +LOCAL(const unsigned char) mjpg_ac1_huffval[] = { | |
84 + 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, | |
85 + 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71, | |
86 + 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, | |
87 + 0xA1, 0xB1, 0xC1, 0x09, 0x23, 0x33, 0x52, 0xF0, | |
88 + 0x15, 0x62, 0x72, 0xD1, 0x0A, 0x16, 0x24, 0x34, | |
89 + 0xE1, 0x25, 0xF1, 0x17, 0x18, 0x19, 0x1A, 0x26, | |
90 + 0x27, 0x28, 0x29, 0x2A, 0x35, 0x36, 0x37, 0x38, | |
91 + 0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, | |
92 + 0x49, 0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, | |
93 + 0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, | |
94 + 0x69, 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, | |
95 + 0x79, 0x7A, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, | |
96 + 0x88, 0x89, 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, | |
97 + 0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5, | |
98 + 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, | |
99 + 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3, | |
100 + 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, | |
101 + 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, | |
102 + 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, | |
103 + 0xEA, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, | |
104 + 0xF9, 0xFA | |
105 +}; | |
106 + | |
107 +/* Loads the default Huffman tables used by motion JPEG frames. This function | |
108 + * just copies the Huffman tables suggested in the JPEG standard into any of | |
109 + * the table slots that have not been loaded yet. | |
110 + */ | |
111 +LOCAL(void) | |
112 +mjpg_load_huff_tables (j_decompress_ptr cinfo) | |
113 +{ | |
114 + JHUFF_TBL *htblptr; | |
115 + | |
116 + if (! cinfo->dc_huff_tbl_ptrs[0]) { | |
117 + htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); | |
118 + MEMZERO(htblptr, SIZEOF(JHUFF_TBL)); | |
119 + MEMCOPY(&htblptr->bits[1], mjpg_dc0_bits, SIZEOF(mjpg_dc0_bits)); | |
120 + MEMCOPY(&htblptr->huffval[0], mjpg_dc0_huffval, SIZEOF(mjpg_dc0_huffval)); | |
121 + cinfo->dc_huff_tbl_ptrs[0] = htblptr; | |
122 + } | |
123 + | |
124 + if (! cinfo->dc_huff_tbl_ptrs[1]) { | |
125 + htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); | |
126 + MEMZERO(htblptr, SIZEOF(JHUFF_TBL)); | |
127 + MEMCOPY(&htblptr->bits[1], mjpg_dc1_bits, SIZEOF(mjpg_dc1_bits)); | |
128 + MEMCOPY(&htblptr->huffval[0], mjpg_dc1_huffval, SIZEOF(mjpg_dc1_huffval)); | |
129 + cinfo->dc_huff_tbl_ptrs[1] = htblptr; | |
130 + } | |
131 + | |
132 + if (! cinfo->ac_huff_tbl_ptrs[0]) { | |
133 + htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); | |
134 + MEMZERO(htblptr, SIZEOF(JHUFF_TBL)); | |
135 + MEMCOPY(&htblptr->bits[1], mjpg_ac0_bits, SIZEOF(mjpg_ac0_bits)); | |
136 + MEMCOPY(&htblptr->huffval[0], mjpg_ac0_huffval, SIZEOF(mjpg_ac0_huffval)); | |
137 + cinfo->ac_huff_tbl_ptrs[0] = htblptr; | |
138 + } | |
139 + | |
140 + if (! cinfo->ac_huff_tbl_ptrs[1]) { | |
141 + htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); | |
142 + MEMZERO(htblptr, SIZEOF(JHUFF_TBL)); | |
143 + MEMCOPY(&htblptr->bits[1], mjpg_ac1_bits, SIZEOF(mjpg_ac1_bits)); | |
144 + MEMCOPY(&htblptr->huffval[0], mjpg_ac1_huffval, SIZEOF(mjpg_ac1_huffval)); | |
145 + cinfo->ac_huff_tbl_ptrs[1] = htblptr; | |
146 + } | |
147 +} | |
148 + | |
149 +#else | |
150 + | |
151 +#define mjpg_load_huff_tables(cinfo) | |
152 + | |
153 +#endif /* MOTION_JPEG_SUPPORTED */ | |
154 + | |
155 + | |
156 /* | |
157 * Read markers until SOS or EOI. | |
158 * | |
159 @@ -1013,6 +1150,7 @@ | |
160 break; | |
161 | |
162 case M_SOS: | |
163 + mjpg_load_huff_tables(cinfo); | |
164 if (! get_sos(cinfo)) | |
165 return JPEG_SUSPENDED; | |
166 cinfo->unread_marker = 0; /* processed the marker */ | |
167 Index: jmorecfg.h | |
168 =================================================================== | |
169 --- jmorecfg.h (revision 829) | |
170 +++ jmorecfg.h (working copy) | |
171 @@ -153,14 +153,18 @@ | |
172 /* INT16 must hold at least the values -32768..32767. */ | |
173 | |
174 #ifndef XMD_H /* X11/xmd.h correctly defines INT16 */ | |
175 +#ifndef _BASETSD_H_ /* basetsd.h correctly defines INT16 */ | |
176 typedef short INT16; | |
177 #endif | |
178 +#endif | |
179 | |
180 /* INT32 must hold at least signed 32-bit values. */ | |
181 | |
182 #ifndef XMD_H /* X11/xmd.h correctly defines INT32 */ | |
183 +#ifndef _BASETSD_H_ /* basetsd.h correctly defines INT32 */ | |
184 typedef long INT32; | |
185 #endif | |
186 +#endif | |
187 | |
188 /* Datatype used for image dimensions. The JPEG standard only supports | |
189 * images up to 64K*64K due to 16-bit fields in SOF markers. Therefore | |
190 @@ -210,11 +214,13 @@ | |
191 * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol. | |
192 */ | |
193 | |
194 +#ifndef FAR | |
195 #ifdef NEED_FAR_POINTERS | |
196 #define FAR far | |
197 #else | |
198 #define FAR | |
199 #endif | |
200 +#endif | |
201 | |
202 | |
203 /* | |
204 Index: jpeglib.h | |
205 =================================================================== | |
206 --- jpeglib.h (revision 829) | |
207 +++ jpeglib.h (working copy) | |
208 @@ -15,6 +15,10 @@ | |
209 #ifndef JPEGLIB_H | |
210 #define JPEGLIB_H | |
211 | |
212 +/* Begin chromium edits */ | |
213 +#include "jpeglibmangler.h" | |
214 +/* End chromium edits */ | |
215 + | |
216 /* | |
217 * First we include the configuration files that record how this | |
218 * installation of the JPEG library is set up. jconfig.h can be | |
219 Index: jpeglibmangler.h | |
220 =================================================================== | |
221 --- jpeglibmangler.h (revision 0) | |
222 +++ jpeglibmangler.h (revision 0) | |
223 @@ -0,0 +1,113 @@ | |
224 +// Copyright (c) 2009 The Chromium Authors. All rights reserved. | |
225 +// Use of this source code is governed by a BSD-style license that can be | |
226 +// found in the LICENSE file. | |
227 + | |
228 +#ifndef THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_ | |
229 +#define THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_ | |
230 + | |
231 +// Mangle all externally visible function names so we can build our own libjpeg | |
232 +// without system libraries trying to use it. | |
233 + | |
234 +#define jpeg_make_c_derived_tbl chromium_jpeg_make_c_derived_tbl | |
235 +#define jpeg_gen_optimal_table chromium_jpeg_gen_optimal_table | |
236 +#define jpeg_make_d_derived_tbl chromium_jpeg_make_d_derived_tbl | |
237 +#define jpeg_fill_bit_buffer chromium_jpeg_fill_bit_buffer | |
238 +#define jpeg_huff_decode chromium_jpeg_huff_decode | |
239 +#define jpeg_fdct_islow chromium_jpeg_fdct_islow | |
240 +#define jpeg_fdct_ifast chromium_jpeg_fdct_ifast | |
241 +#define jpeg_fdct_float chromium_jpeg_fdct_float | |
242 +#define jpeg_idct_islow chromium_jpeg_idct_islow | |
243 +#define jpeg_idct_ifast chromium_jpeg_idct_ifast | |
244 +#define jpeg_idct_float chromium_jpeg_idct_float | |
245 +#define jpeg_idct_4x4 chromium_jpeg_idct_4x4 | |
246 +#define jpeg_idct_2x2 chromium_jpeg_idct_2x2 | |
247 +#define jpeg_idct_1x1 chromium_jpeg_idct_1x1 | |
248 +#define jinit_compress_master chromium_jinit_compress_master | |
249 +#define jinit_c_master_control chromium_jinit_c_master_control | |
250 +#define jinit_c_main_controller chromium_jinit_c_main_controller | |
251 +#define jinit_c_prep_controller chromium_jinit_c_prep_controller | |
252 +#define jinit_c_coef_controller chromium_jinit_c_coef_controller | |
253 +#define jinit_color_converter chromium_jinit_color_converter | |
254 +#define jinit_downsampler chromium_jinit_downsampler | |
255 +#define jinit_forward_dct chromium_jinit_forward_dct | |
256 +#define jinit_huff_encoder chromium_jinit_huff_encoder | |
257 +#define jinit_phuff_encoder chromium_jinit_phuff_encoder | |
258 +#define jinit_marker_writer chromium_jinit_marker_writer | |
259 +#define jinit_master_decompress chromium_jinit_master_decompress | |
260 +#define jinit_d_main_controller chromium_jinit_d_main_controller | |
261 +#define jinit_d_coef_controller chromium_jinit_d_coef_controller | |
262 +#define jinit_d_post_controller chromium_jinit_d_post_controller | |
263 +#define jinit_input_controller chromium_jinit_input_controller | |
264 +#define jinit_marker_reader chromium_jinit_marker_reader | |
265 +#define jinit_huff_decoder chromium_jinit_huff_decoder | |
266 +#define jinit_phuff_decoder chromium_jinit_phuff_decoder | |
267 +#define jinit_inverse_dct chromium_jinit_inverse_dct | |
268 +#define jinit_upsampler chromium_jinit_upsampler | |
269 +#define jinit_color_deconverter chromium_jinit_color_deconverter | |
270 +#define jinit_1pass_quantizer chromium_jinit_1pass_quantizer | |
271 +#define jinit_2pass_quantizer chromium_jinit_2pass_quantizer | |
272 +#define jinit_merged_upsampler chromium_jinit_merged_upsampler | |
273 +#define jinit_memory_mgr chromium_jinit_memory_mgr | |
274 +#define jdiv_round_up chromium_jdiv_round_up | |
275 +#define jround_up chromium_jround_up | |
276 +#define jcopy_sample_rows chromium_jcopy_sample_rows | |
277 +#define jcopy_block_row chromium_jcopy_block_row | |
278 +#define jzero_far chromium_jzero_far | |
279 +#define jpeg_std_error chromium_jpeg_std_error | |
280 +#define jpeg_CreateCompress chromium_jpeg_CreateCompress | |
281 +#define jpeg_CreateDecompress chromium_jpeg_CreateDecompress | |
282 +#define jpeg_destroy_compress chromium_jpeg_destroy_compress | |
283 +#define jpeg_destroy_decompress chromium_jpeg_destroy_decompress | |
284 +#define jpeg_stdio_dest chromium_jpeg_stdio_dest | |
285 +#define jpeg_stdio_src chromium_jpeg_stdio_src | |
286 +#define jpeg_set_defaults chromium_jpeg_set_defaults | |
287 +#define jpeg_set_colorspace chromium_jpeg_set_colorspace | |
288 +#define jpeg_default_colorspace chromium_jpeg_default_colorspace | |
289 +#define jpeg_set_quality chromium_jpeg_set_quality | |
290 +#define jpeg_set_linear_quality chromium_jpeg_set_linear_quality | |
291 +#define jpeg_add_quant_table chromium_jpeg_add_quant_table | |
292 +#define jpeg_quality_scaling chromium_jpeg_quality_scaling | |
293 +#define jpeg_simple_progression chromium_jpeg_simple_progression | |
294 +#define jpeg_suppress_tables chromium_jpeg_suppress_tables | |
295 +#define jpeg_alloc_quant_table chromium_jpeg_alloc_quant_table | |
296 +#define jpeg_alloc_huff_table chromium_jpeg_alloc_huff_table | |
297 +#define jpeg_start_compress chromium_jpeg_start_compress | |
298 +#define jpeg_write_scanlines chromium_jpeg_write_scanlines | |
299 +#define jpeg_finish_compress chromium_jpeg_finish_compress | |
300 +#define jpeg_write_raw_data chromium_jpeg_write_raw_data | |
301 +#define jpeg_write_marker chromium_jpeg_write_marker | |
302 +#define jpeg_write_m_header chromium_jpeg_write_m_header | |
303 +#define jpeg_write_m_byte chromium_jpeg_write_m_byte | |
304 +#define jpeg_write_tables chromium_jpeg_write_tables | |
305 +#define jpeg_read_header chromium_jpeg_read_header | |
306 +#define jpeg_start_decompress chromium_jpeg_start_decompress | |
307 +#define jpeg_read_scanlines chromium_jpeg_read_scanlines | |
308 +#define jpeg_finish_decompress chromium_jpeg_finish_decompress | |
309 +#define jpeg_read_raw_data chromium_jpeg_read_raw_data | |
310 +#define jpeg_has_multiple_scans chromium_jpeg_has_multiple_scans | |
311 +#define jpeg_start_output chromium_jpeg_start_output | |
312 +#define jpeg_finish_output chromium_jpeg_finish_output | |
313 +#define jpeg_input_complete chromium_jpeg_input_complete | |
314 +#define jpeg_new_colormap chromium_jpeg_new_colormap | |
315 +#define jpeg_consume_input chromium_jpeg_consume_input | |
316 +#define jpeg_calc_output_dimensions chromium_jpeg_calc_output_dimensions | |
317 +#define jpeg_save_markers chromium_jpeg_save_markers | |
318 +#define jpeg_set_marker_processor chromium_jpeg_set_marker_processor | |
319 +#define jpeg_read_coefficients chromium_jpeg_read_coefficients | |
320 +#define jpeg_write_coefficients chromium_jpeg_write_coefficients | |
321 +#define jpeg_copy_critical_parameters chromium_jpeg_copy_critical_parameters | |
322 +#define jpeg_abort_compress chromium_jpeg_abort_compress | |
323 +#define jpeg_abort_decompress chromium_jpeg_abort_decompress | |
324 +#define jpeg_abort chromium_jpeg_abort | |
325 +#define jpeg_destroy chromium_jpeg_destroy | |
326 +#define jpeg_resync_to_restart chromium_jpeg_resync_to_restart | |
327 +#define jpeg_get_small chromium_jpeg_get_small | |
328 +#define jpeg_free_small chromium_jpeg_free_small | |
329 +#define jpeg_get_large chromium_jpeg_get_large | |
330 +#define jpeg_free_large chromium_jpeg_free_large | |
331 +#define jpeg_mem_available chromium_jpeg_mem_available | |
332 +#define jpeg_open_backing_store chromium_jpeg_open_backing_store | |
333 +#define jpeg_mem_init chromium_jpeg_mem_init | |
334 +#define jpeg_mem_term chromium_jpeg_mem_term | |
335 + | |
336 +#endif // THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_ | |
337 Index: simd/jcgrass2-64.asm | |
338 =================================================================== | |
339 --- simd/jcgrass2-64.asm (revision 829) | |
340 +++ simd/jcgrass2-64.asm (working copy) | |
341 @@ -30,7 +30,7 @@ | |
342 SECTION SEG_CONST | |
343 | |
344 alignz 16 | |
345 - global EXTN(jconst_rgb_gray_convert_sse2) | |
346 + global EXTN(jconst_rgb_gray_convert_sse2) PRIVATE | |
347 | |
348 EXTN(jconst_rgb_gray_convert_sse2): | |
349 | |
350 Index: simd/jiss2fst.asm | |
351 =================================================================== | |
352 --- simd/jiss2fst.asm (revision 829) | |
353 +++ simd/jiss2fst.asm (working copy) | |
354 @@ -59,7 +59,7 @@ | |
355 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
356 | |
357 alignz 16 | |
358 - global EXTN(jconst_idct_ifast_sse2) | |
359 + global EXTN(jconst_idct_ifast_sse2) PRIVATE | |
360 | |
361 EXTN(jconst_idct_ifast_sse2): | |
362 | |
363 @@ -92,7 +92,7 @@ | |
364 %define WK_NUM 2 | |
365 | |
366 align 16 | |
367 - global EXTN(jsimd_idct_ifast_sse2) | |
368 + global EXTN(jsimd_idct_ifast_sse2) PRIVATE | |
369 | |
370 EXTN(jsimd_idct_ifast_sse2): | |
371 push ebp | |
372 Index: simd/jcclrss2-64.asm | |
373 =================================================================== | |
374 --- simd/jcclrss2-64.asm (revision 829) | |
375 +++ simd/jcclrss2-64.asm (working copy) | |
376 @@ -37,7 +37,7 @@ | |
377 | |
378 align 16 | |
379 | |
380 - global EXTN(jsimd_rgb_ycc_convert_sse2) | |
381 + global EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE | |
382 | |
383 EXTN(jsimd_rgb_ycc_convert_sse2): | |
384 push rbp | |
385 Index: simd/jiss2red-64.asm | |
386 =================================================================== | |
387 --- simd/jiss2red-64.asm (revision 829) | |
388 +++ simd/jiss2red-64.asm (working copy) | |
389 @@ -73,7 +73,7 @@ | |
390 SECTION SEG_CONST | |
391 | |
392 alignz 16 | |
393 - global EXTN(jconst_idct_red_sse2) | |
394 + global EXTN(jconst_idct_red_sse2) PRIVATE | |
395 | |
396 EXTN(jconst_idct_red_sse2): | |
397 | |
398 @@ -114,7 +114,7 @@ | |
399 %define WK_NUM 2 | |
400 | |
401 align 16 | |
402 - global EXTN(jsimd_idct_4x4_sse2) | |
403 + global EXTN(jsimd_idct_4x4_sse2) PRIVATE | |
404 | |
405 EXTN(jsimd_idct_4x4_sse2): | |
406 push rbp | |
407 @@ -413,7 +413,7 @@ | |
408 ; r13 = JDIMENSION output_col | |
409 | |
410 align 16 | |
411 - global EXTN(jsimd_idct_2x2_sse2) | |
412 + global EXTN(jsimd_idct_2x2_sse2) PRIVATE | |
413 | |
414 EXTN(jsimd_idct_2x2_sse2): | |
415 push rbp | |
416 Index: simd/ji3dnflt.asm | |
417 =================================================================== | |
418 --- simd/ji3dnflt.asm (revision 829) | |
419 +++ simd/ji3dnflt.asm (working copy) | |
420 @@ -27,7 +27,7 @@ | |
421 SECTION SEG_CONST | |
422 | |
423 alignz 16 | |
424 - global EXTN(jconst_idct_float_3dnow) | |
425 + global EXTN(jconst_idct_float_3dnow) PRIVATE | |
426 | |
427 EXTN(jconst_idct_float_3dnow): | |
428 | |
429 @@ -63,7 +63,7 @@ | |
430 ; FAST_FLOAT workspace[DCTSIZE2] | |
431 | |
432 align 16 | |
433 - global EXTN(jsimd_idct_float_3dnow) | |
434 + global EXTN(jsimd_idct_float_3dnow) PRIVATE | |
435 | |
436 EXTN(jsimd_idct_float_3dnow): | |
437 push ebp | |
438 Index: simd/jsimdcpu.asm | |
439 =================================================================== | |
440 --- simd/jsimdcpu.asm (revision 829) | |
441 +++ simd/jsimdcpu.asm (working copy) | |
442 @@ -29,7 +29,7 @@ | |
443 ; | |
444 | |
445 align 16 | |
446 - global EXTN(jpeg_simd_cpu_support) | |
447 + global EXTN(jpeg_simd_cpu_support) PRIVATE | |
448 | |
449 EXTN(jpeg_simd_cpu_support): | |
450 push ebx | |
451 Index: simd/jdmerss2-64.asm | |
452 =================================================================== | |
453 --- simd/jdmerss2-64.asm (revision 829) | |
454 +++ simd/jdmerss2-64.asm (working copy) | |
455 @@ -35,7 +35,7 @@ | |
456 SECTION SEG_CONST | |
457 | |
458 alignz 16 | |
459 - global EXTN(jconst_merged_upsample_sse2) | |
460 + global EXTN(jconst_merged_upsample_sse2) PRIVATE | |
461 | |
462 EXTN(jconst_merged_upsample_sse2): | |
463 | |
464 Index: simd/jdsammmx.asm | |
465 =================================================================== | |
466 --- simd/jdsammmx.asm (revision 829) | |
467 +++ simd/jdsammmx.asm (working copy) | |
468 @@ -22,7 +22,7 @@ | |
469 SECTION SEG_CONST | |
470 | |
471 alignz 16 | |
472 - global EXTN(jconst_fancy_upsample_mmx) | |
473 + global EXTN(jconst_fancy_upsample_mmx) PRIVATE | |
474 | |
475 EXTN(jconst_fancy_upsample_mmx): | |
476 | |
477 @@ -58,7 +58,7 @@ | |
478 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
479 | |
480 align 16 | |
481 - global EXTN(jsimd_h2v1_fancy_upsample_mmx) | |
482 + global EXTN(jsimd_h2v1_fancy_upsample_mmx) PRIVATE | |
483 | |
484 EXTN(jsimd_h2v1_fancy_upsample_mmx): | |
485 push ebp | |
486 @@ -216,7 +216,7 @@ | |
487 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
488 | |
489 align 16 | |
490 - global EXTN(jsimd_h2v2_fancy_upsample_mmx) | |
491 + global EXTN(jsimd_h2v2_fancy_upsample_mmx) PRIVATE | |
492 | |
493 EXTN(jsimd_h2v2_fancy_upsample_mmx): | |
494 push ebp | |
495 @@ -542,7 +542,7 @@ | |
496 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
497 | |
498 align 16 | |
499 - global EXTN(jsimd_h2v1_upsample_mmx) | |
500 + global EXTN(jsimd_h2v1_upsample_mmx) PRIVATE | |
501 | |
502 EXTN(jsimd_h2v1_upsample_mmx): | |
503 push ebp | |
504 @@ -643,7 +643,7 @@ | |
505 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
506 | |
507 align 16 | |
508 - global EXTN(jsimd_h2v2_upsample_mmx) | |
509 + global EXTN(jsimd_h2v2_upsample_mmx) PRIVATE | |
510 | |
511 EXTN(jsimd_h2v2_upsample_mmx): | |
512 push ebp | |
513 Index: simd/jdmrgmmx.asm | |
514 =================================================================== | |
515 --- simd/jdmrgmmx.asm (revision 829) | |
516 +++ simd/jdmrgmmx.asm (working copy) | |
517 @@ -40,7 +40,7 @@ | |
518 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
519 | |
520 align 16 | |
521 - global EXTN(jsimd_h2v1_merged_upsample_mmx) | |
522 + global EXTN(jsimd_h2v1_merged_upsample_mmx) PRIVATE | |
523 | |
524 EXTN(jsimd_h2v1_merged_upsample_mmx): | |
525 push ebp | |
526 @@ -409,7 +409,7 @@ | |
527 %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf | |
528 | |
529 align 16 | |
530 - global EXTN(jsimd_h2v2_merged_upsample_mmx) | |
531 + global EXTN(jsimd_h2v2_merged_upsample_mmx) PRIVATE | |
532 | |
533 EXTN(jsimd_h2v2_merged_upsample_mmx): | |
534 push ebp | |
535 Index: simd/jdsamss2.asm | |
536 =================================================================== | |
537 --- simd/jdsamss2.asm (revision 829) | |
538 +++ simd/jdsamss2.asm (working copy) | |
539 @@ -22,7 +22,7 @@ | |
540 SECTION SEG_CONST | |
541 | |
542 alignz 16 | |
543 - global EXTN(jconst_fancy_upsample_sse2) | |
544 + global EXTN(jconst_fancy_upsample_sse2) PRIVATE | |
545 | |
546 EXTN(jconst_fancy_upsample_sse2): | |
547 | |
548 @@ -58,7 +58,7 @@ | |
549 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
550 | |
551 align 16 | |
552 - global EXTN(jsimd_h2v1_fancy_upsample_sse2) | |
553 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE | |
554 | |
555 EXTN(jsimd_h2v1_fancy_upsample_sse2): | |
556 push ebp | |
557 @@ -214,7 +214,7 @@ | |
558 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
559 | |
560 align 16 | |
561 - global EXTN(jsimd_h2v2_fancy_upsample_sse2) | |
562 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE | |
563 | |
564 EXTN(jsimd_h2v2_fancy_upsample_sse2): | |
565 push ebp | |
566 @@ -538,7 +538,7 @@ | |
567 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
568 | |
569 align 16 | |
570 - global EXTN(jsimd_h2v1_upsample_sse2) | |
571 + global EXTN(jsimd_h2v1_upsample_sse2) PRIVATE | |
572 | |
573 EXTN(jsimd_h2v1_upsample_sse2): | |
574 push ebp | |
575 @@ -637,7 +637,7 @@ | |
576 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
577 | |
578 align 16 | |
579 - global EXTN(jsimd_h2v2_upsample_sse2) | |
580 + global EXTN(jsimd_h2v2_upsample_sse2) PRIVATE | |
581 | |
582 EXTN(jsimd_h2v2_upsample_sse2): | |
583 push ebp | |
584 Index: simd/jiss2flt-64.asm | |
585 =================================================================== | |
586 --- simd/jiss2flt-64.asm (revision 829) | |
587 +++ simd/jiss2flt-64.asm (working copy) | |
588 @@ -38,7 +38,7 @@ | |
589 SECTION SEG_CONST | |
590 | |
591 alignz 16 | |
592 - global EXTN(jconst_idct_float_sse2) | |
593 + global EXTN(jconst_idct_float_sse2) PRIVATE | |
594 | |
595 EXTN(jconst_idct_float_sse2): | |
596 | |
597 @@ -74,7 +74,7 @@ | |
598 ; FAST_FLOAT workspace[DCTSIZE2] | |
599 | |
600 align 16 | |
601 - global EXTN(jsimd_idct_float_sse2) | |
602 + global EXTN(jsimd_idct_float_sse2) PRIVATE | |
603 | |
604 EXTN(jsimd_idct_float_sse2): | |
605 push rbp | |
606 Index: simd/jfss2int-64.asm | |
607 =================================================================== | |
608 --- simd/jfss2int-64.asm (revision 829) | |
609 +++ simd/jfss2int-64.asm (working copy) | |
610 @@ -67,7 +67,7 @@ | |
611 SECTION SEG_CONST | |
612 | |
613 alignz 16 | |
614 - global EXTN(jconst_fdct_islow_sse2) | |
615 + global EXTN(jconst_fdct_islow_sse2) PRIVATE | |
616 | |
617 EXTN(jconst_fdct_islow_sse2): | |
618 | |
619 @@ -101,7 +101,7 @@ | |
620 %define WK_NUM 6 | |
621 | |
622 align 16 | |
623 - global EXTN(jsimd_fdct_islow_sse2) | |
624 + global EXTN(jsimd_fdct_islow_sse2) PRIVATE | |
625 | |
626 EXTN(jsimd_fdct_islow_sse2): | |
627 push rbp | |
628 Index: simd/jcqnts2f.asm | |
629 =================================================================== | |
630 --- simd/jcqnts2f.asm (revision 829) | |
631 +++ simd/jcqnts2f.asm (working copy) | |
632 @@ -35,7 +35,7 @@ | |
633 %define workspace ebp+16 ; FAST_FLOAT * workspace | |
634 | |
635 align 16 | |
636 - global EXTN(jsimd_convsamp_float_sse2) | |
637 + global EXTN(jsimd_convsamp_float_sse2) PRIVATE | |
638 | |
639 EXTN(jsimd_convsamp_float_sse2): | |
640 push ebp | |
641 @@ -115,7 +115,7 @@ | |
642 %define workspace ebp+16 ; FAST_FLOAT * workspace | |
643 | |
644 align 16 | |
645 - global EXTN(jsimd_quantize_float_sse2) | |
646 + global EXTN(jsimd_quantize_float_sse2) PRIVATE | |
647 | |
648 EXTN(jsimd_quantize_float_sse2): | |
649 push ebp | |
650 Index: simd/jdmrgss2.asm | |
651 =================================================================== | |
652 --- simd/jdmrgss2.asm (revision 829) | |
653 +++ simd/jdmrgss2.asm (working copy) | |
654 @@ -40,7 +40,7 @@ | |
655 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
656 | |
657 align 16 | |
658 - global EXTN(jsimd_h2v1_merged_upsample_sse2) | |
659 + global EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE | |
660 | |
661 EXTN(jsimd_h2v1_merged_upsample_sse2): | |
662 push ebp | |
663 @@ -560,7 +560,7 @@ | |
664 %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf | |
665 | |
666 align 16 | |
667 - global EXTN(jsimd_h2v2_merged_upsample_sse2) | |
668 + global EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE | |
669 | |
670 EXTN(jsimd_h2v2_merged_upsample_sse2): | |
671 push ebp | |
672 Index: simd/jfmmxint.asm | |
673 =================================================================== | |
674 --- simd/jfmmxint.asm (revision 829) | |
675 +++ simd/jfmmxint.asm (working copy) | |
676 @@ -66,7 +66,7 @@ | |
677 SECTION SEG_CONST | |
678 | |
679 alignz 16 | |
680 - global EXTN(jconst_fdct_islow_mmx) | |
681 + global EXTN(jconst_fdct_islow_mmx) PRIVATE | |
682 | |
683 EXTN(jconst_fdct_islow_mmx): | |
684 | |
685 @@ -101,7 +101,7 @@ | |
686 %define WK_NUM 2 | |
687 | |
688 align 16 | |
689 - global EXTN(jsimd_fdct_islow_mmx) | |
690 + global EXTN(jsimd_fdct_islow_mmx) PRIVATE | |
691 | |
692 EXTN(jsimd_fdct_islow_mmx): | |
693 push ebp | |
694 Index: simd/jcgryss2-64.asm | |
695 =================================================================== | |
696 --- simd/jcgryss2-64.asm (revision 829) | |
697 +++ simd/jcgryss2-64.asm (working copy) | |
698 @@ -37,7 +37,7 @@ | |
699 | |
700 align 16 | |
701 | |
702 - global EXTN(jsimd_rgb_gray_convert_sse2) | |
703 + global EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE | |
704 | |
705 EXTN(jsimd_rgb_gray_convert_sse2): | |
706 push rbp | |
707 Index: simd/jcqnts2i.asm | |
708 =================================================================== | |
709 --- simd/jcqnts2i.asm (revision 829) | |
710 +++ simd/jcqnts2i.asm (working copy) | |
711 @@ -35,7 +35,7 @@ | |
712 %define workspace ebp+16 ; DCTELEM * workspace | |
713 | |
714 align 16 | |
715 - global EXTN(jsimd_convsamp_sse2) | |
716 + global EXTN(jsimd_convsamp_sse2) PRIVATE | |
717 | |
718 EXTN(jsimd_convsamp_sse2): | |
719 push ebp | |
720 @@ -117,7 +117,7 @@ | |
721 %define workspace ebp+16 ; DCTELEM * workspace | |
722 | |
723 align 16 | |
724 - global EXTN(jsimd_quantize_sse2) | |
725 + global EXTN(jsimd_quantize_sse2) PRIVATE | |
726 | |
727 EXTN(jsimd_quantize_sse2): | |
728 push ebp | |
729 Index: simd/jiss2fst-64.asm | |
730 =================================================================== | |
731 --- simd/jiss2fst-64.asm (revision 829) | |
732 +++ simd/jiss2fst-64.asm (working copy) | |
733 @@ -60,7 +60,7 @@ | |
734 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
735 | |
736 alignz 16 | |
737 - global EXTN(jconst_idct_ifast_sse2) | |
738 + global EXTN(jconst_idct_ifast_sse2) PRIVATE | |
739 | |
740 EXTN(jconst_idct_ifast_sse2): | |
741 | |
742 @@ -93,7 +93,7 @@ | |
743 %define WK_NUM 2 | |
744 | |
745 align 16 | |
746 - global EXTN(jsimd_idct_ifast_sse2) | |
747 + global EXTN(jsimd_idct_ifast_sse2) PRIVATE | |
748 | |
749 EXTN(jsimd_idct_ifast_sse2): | |
750 push rbp | |
751 Index: simd/jiss2flt.asm | |
752 =================================================================== | |
753 --- simd/jiss2flt.asm (revision 829) | |
754 +++ simd/jiss2flt.asm (working copy) | |
755 @@ -37,7 +37,7 @@ | |
756 SECTION SEG_CONST | |
757 | |
758 alignz 16 | |
759 - global EXTN(jconst_idct_float_sse2) | |
760 + global EXTN(jconst_idct_float_sse2) PRIVATE | |
761 | |
762 EXTN(jconst_idct_float_sse2): | |
763 | |
764 @@ -73,7 +73,7 @@ | |
765 ; FAST_FLOAT workspace[DCTSIZE2] | |
766 | |
767 align 16 | |
768 - global EXTN(jsimd_idct_float_sse2) | |
769 + global EXTN(jsimd_idct_float_sse2) PRIVATE | |
770 | |
771 EXTN(jsimd_idct_float_sse2): | |
772 push ebp | |
773 Index: simd/jiss2int.asm | |
774 =================================================================== | |
775 --- simd/jiss2int.asm (revision 829) | |
776 +++ simd/jiss2int.asm (working copy) | |
777 @@ -66,7 +66,7 @@ | |
778 SECTION SEG_CONST | |
779 | |
780 alignz 16 | |
781 - global EXTN(jconst_idct_islow_sse2) | |
782 + global EXTN(jconst_idct_islow_sse2) PRIVATE | |
783 | |
784 EXTN(jconst_idct_islow_sse2): | |
785 | |
786 @@ -105,7 +105,7 @@ | |
787 %define WK_NUM 12 | |
788 | |
789 align 16 | |
790 - global EXTN(jsimd_idct_islow_sse2) | |
791 + global EXTN(jsimd_idct_islow_sse2) PRIVATE | |
792 | |
793 EXTN(jsimd_idct_islow_sse2): | |
794 push ebp | |
795 Index: simd/jfsseflt-64.asm | |
796 =================================================================== | |
797 --- simd/jfsseflt-64.asm (revision 829) | |
798 +++ simd/jfsseflt-64.asm (working copy) | |
799 @@ -38,7 +38,7 @@ | |
800 SECTION SEG_CONST | |
801 | |
802 alignz 16 | |
803 - global EXTN(jconst_fdct_float_sse) | |
804 + global EXTN(jconst_fdct_float_sse) PRIVATE | |
805 | |
806 EXTN(jconst_fdct_float_sse): | |
807 | |
808 @@ -65,7 +65,7 @@ | |
809 %define WK_NUM 2 | |
810 | |
811 align 16 | |
812 - global EXTN(jsimd_fdct_float_sse) | |
813 + global EXTN(jsimd_fdct_float_sse) PRIVATE | |
814 | |
815 EXTN(jsimd_fdct_float_sse): | |
816 push rbp | |
817 Index: simd/jccolss2-64.asm | |
818 =================================================================== | |
819 --- simd/jccolss2-64.asm (revision 829) | |
820 +++ simd/jccolss2-64.asm (working copy) | |
821 @@ -34,7 +34,7 @@ | |
822 SECTION SEG_CONST | |
823 | |
824 alignz 16 | |
825 - global EXTN(jconst_rgb_ycc_convert_sse2) | |
826 + global EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE | |
827 | |
828 EXTN(jconst_rgb_ycc_convert_sse2): | |
829 | |
830 Index: simd/jcsamss2-64.asm | |
831 =================================================================== | |
832 --- simd/jcsamss2-64.asm (revision 829) | |
833 +++ simd/jcsamss2-64.asm (working copy) | |
834 @@ -41,7 +41,7 @@ | |
835 ; r15 = JSAMPARRAY output_data | |
836 | |
837 align 16 | |
838 - global EXTN(jsimd_h2v1_downsample_sse2) | |
839 + global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE | |
840 | |
841 EXTN(jsimd_h2v1_downsample_sse2): | |
842 push rbp | |
843 @@ -185,7 +185,7 @@ | |
844 ; r15 = JSAMPARRAY output_data | |
845 | |
846 align 16 | |
847 - global EXTN(jsimd_h2v2_downsample_sse2) | |
848 + global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE | |
849 | |
850 EXTN(jsimd_h2v2_downsample_sse2): | |
851 push rbp | |
852 Index: simd/jdclrss2-64.asm | |
853 =================================================================== | |
854 --- simd/jdclrss2-64.asm (revision 829) | |
855 +++ simd/jdclrss2-64.asm (working copy) | |
856 @@ -39,7 +39,7 @@ | |
857 %define WK_NUM 2 | |
858 | |
859 align 16 | |
860 - global EXTN(jsimd_ycc_rgb_convert_sse2) | |
861 + global EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE | |
862 | |
863 EXTN(jsimd_ycc_rgb_convert_sse2): | |
864 push rbp | |
865 Index: simd/jdcolmmx.asm | |
866 =================================================================== | |
867 --- simd/jdcolmmx.asm (revision 829) | |
868 +++ simd/jdcolmmx.asm (working copy) | |
869 @@ -35,7 +35,7 @@ | |
870 SECTION SEG_CONST | |
871 | |
872 alignz 16 | |
873 - global EXTN(jconst_ycc_rgb_convert_mmx) | |
874 + global EXTN(jconst_ycc_rgb_convert_mmx) PRIVATE | |
875 | |
876 EXTN(jconst_ycc_rgb_convert_mmx): | |
877 | |
878 Index: simd/jcclrmmx.asm | |
879 =================================================================== | |
880 --- simd/jcclrmmx.asm (revision 829) | |
881 +++ simd/jcclrmmx.asm (working copy) | |
882 @@ -40,7 +40,7 @@ | |
883 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
884 | |
885 align 16 | |
886 - global EXTN(jsimd_rgb_ycc_convert_mmx) | |
887 + global EXTN(jsimd_rgb_ycc_convert_mmx) PRIVATE | |
888 | |
889 EXTN(jsimd_rgb_ycc_convert_mmx): | |
890 push ebp | |
891 Index: simd/jfsseflt.asm | |
892 =================================================================== | |
893 --- simd/jfsseflt.asm (revision 829) | |
894 +++ simd/jfsseflt.asm (working copy) | |
895 @@ -37,7 +37,7 @@ | |
896 SECTION SEG_CONST | |
897 | |
898 alignz 16 | |
899 - global EXTN(jconst_fdct_float_sse) | |
900 + global EXTN(jconst_fdct_float_sse) PRIVATE | |
901 | |
902 EXTN(jconst_fdct_float_sse): | |
903 | |
904 @@ -65,7 +65,7 @@ | |
905 %define WK_NUM 2 | |
906 | |
907 align 16 | |
908 - global EXTN(jsimd_fdct_float_sse) | |
909 + global EXTN(jsimd_fdct_float_sse) PRIVATE | |
910 | |
911 EXTN(jsimd_fdct_float_sse): | |
912 push ebp | |
913 Index: simd/jdmrgss2-64.asm | |
914 =================================================================== | |
915 --- simd/jdmrgss2-64.asm (revision 829) | |
916 +++ simd/jdmrgss2-64.asm (working copy) | |
917 @@ -39,7 +39,7 @@ | |
918 %define WK_NUM 3 | |
919 | |
920 align 16 | |
921 - global EXTN(jsimd_h2v1_merged_upsample_sse2) | |
922 + global EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE | |
923 | |
924 EXTN(jsimd_h2v1_merged_upsample_sse2): | |
925 push rbp | |
926 @@ -543,7 +543,7 @@ | |
927 ; r13 = JSAMPARRAY output_buf | |
928 | |
929 align 16 | |
930 - global EXTN(jsimd_h2v2_merged_upsample_sse2) | |
931 + global EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE | |
932 | |
933 EXTN(jsimd_h2v2_merged_upsample_sse2): | |
934 push rbp | |
935 Index: simd/jdcolss2.asm | |
936 =================================================================== | |
937 --- simd/jdcolss2.asm (revision 829) | |
938 +++ simd/jdcolss2.asm (working copy) | |
939 @@ -35,7 +35,7 @@ | |
940 SECTION SEG_CONST | |
941 | |
942 alignz 16 | |
943 - global EXTN(jconst_ycc_rgb_convert_sse2) | |
944 + global EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE | |
945 | |
946 EXTN(jconst_ycc_rgb_convert_sse2): | |
947 | |
948 Index: simd/jdmermmx.asm | |
949 =================================================================== | |
950 --- simd/jdmermmx.asm (revision 829) | |
951 +++ simd/jdmermmx.asm (working copy) | |
952 @@ -35,7 +35,7 @@ | |
953 SECTION SEG_CONST | |
954 | |
955 alignz 16 | |
956 - global EXTN(jconst_merged_upsample_mmx) | |
957 + global EXTN(jconst_merged_upsample_mmx) PRIVATE | |
958 | |
959 EXTN(jconst_merged_upsample_mmx): | |
960 | |
961 Index: simd/jcclrss2.asm | |
962 =================================================================== | |
963 --- simd/jcclrss2.asm (revision 829) | |
964 +++ simd/jcclrss2.asm (working copy) | |
965 @@ -38,7 +38,7 @@ | |
966 | |
967 align 16 | |
968 | |
969 - global EXTN(jsimd_rgb_ycc_convert_sse2) | |
970 + global EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE | |
971 | |
972 EXTN(jsimd_rgb_ycc_convert_sse2): | |
973 push ebp | |
974 Index: simd/jiss2red.asm | |
975 =================================================================== | |
976 --- simd/jiss2red.asm (revision 829) | |
977 +++ simd/jiss2red.asm (working copy) | |
978 @@ -72,7 +72,7 @@ | |
979 SECTION SEG_CONST | |
980 | |
981 alignz 16 | |
982 - global EXTN(jconst_idct_red_sse2) | |
983 + global EXTN(jconst_idct_red_sse2) PRIVATE | |
984 | |
985 EXTN(jconst_idct_red_sse2): | |
986 | |
987 @@ -113,7 +113,7 @@ | |
988 %define WK_NUM 2 | |
989 | |
990 align 16 | |
991 - global EXTN(jsimd_idct_4x4_sse2) | |
992 + global EXTN(jsimd_idct_4x4_sse2) PRIVATE | |
993 | |
994 EXTN(jsimd_idct_4x4_sse2): | |
995 push ebp | |
996 @@ -424,7 +424,7 @@ | |
997 %define output_col(b) (b)+20 ; JDIMENSION output_col | |
998 | |
999 align 16 | |
1000 - global EXTN(jsimd_idct_2x2_sse2) | |
1001 + global EXTN(jsimd_idct_2x2_sse2) PRIVATE | |
1002 | |
1003 EXTN(jsimd_idct_2x2_sse2): | |
1004 push ebp | |
1005 Index: simd/jdmerss2.asm | |
1006 =================================================================== | |
1007 --- simd/jdmerss2.asm (revision 829) | |
1008 +++ simd/jdmerss2.asm (working copy) | |
1009 @@ -35,7 +35,7 @@ | |
1010 SECTION SEG_CONST | |
1011 | |
1012 alignz 16 | |
1013 - global EXTN(jconst_merged_upsample_sse2) | |
1014 + global EXTN(jconst_merged_upsample_sse2) PRIVATE | |
1015 | |
1016 EXTN(jconst_merged_upsample_sse2): | |
1017 | |
1018 Index: simd/jfss2fst-64.asm | |
1019 =================================================================== | |
1020 --- simd/jfss2fst-64.asm (revision 829) | |
1021 +++ simd/jfss2fst-64.asm (working copy) | |
1022 @@ -53,7 +53,7 @@ | |
1023 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
1024 | |
1025 alignz 16 | |
1026 - global EXTN(jconst_fdct_ifast_sse2) | |
1027 + global EXTN(jconst_fdct_ifast_sse2) PRIVATE | |
1028 | |
1029 EXTN(jconst_fdct_ifast_sse2): | |
1030 | |
1031 @@ -80,7 +80,7 @@ | |
1032 %define WK_NUM 2 | |
1033 | |
1034 align 16 | |
1035 - global EXTN(jsimd_fdct_ifast_sse2) | |
1036 + global EXTN(jsimd_fdct_ifast_sse2) PRIVATE | |
1037 | |
1038 EXTN(jsimd_fdct_ifast_sse2): | |
1039 push rbp | |
1040 Index: simd/jcqntmmx.asm | |
1041 =================================================================== | |
1042 --- simd/jcqntmmx.asm (revision 829) | |
1043 +++ simd/jcqntmmx.asm (working copy) | |
1044 @@ -35,7 +35,7 @@ | |
1045 %define workspace ebp+16 ; DCTELEM * workspace | |
1046 | |
1047 align 16 | |
1048 - global EXTN(jsimd_convsamp_mmx) | |
1049 + global EXTN(jsimd_convsamp_mmx) PRIVATE | |
1050 | |
1051 EXTN(jsimd_convsamp_mmx): | |
1052 push ebp | |
1053 @@ -140,7 +140,7 @@ | |
1054 %define workspace ebp+16 ; DCTELEM * workspace | |
1055 | |
1056 align 16 | |
1057 - global EXTN(jsimd_quantize_mmx) | |
1058 + global EXTN(jsimd_quantize_mmx) PRIVATE | |
1059 | |
1060 EXTN(jsimd_quantize_mmx): | |
1061 push ebp | |
1062 Index: simd/jimmxfst.asm | |
1063 =================================================================== | |
1064 --- simd/jimmxfst.asm (revision 829) | |
1065 +++ simd/jimmxfst.asm (working copy) | |
1066 @@ -59,7 +59,7 @@ | |
1067 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
1068 | |
1069 alignz 16 | |
1070 - global EXTN(jconst_idct_ifast_mmx) | |
1071 + global EXTN(jconst_idct_ifast_mmx) PRIVATE | |
1072 | |
1073 EXTN(jconst_idct_ifast_mmx): | |
1074 | |
1075 @@ -94,7 +94,7 @@ | |
1076 ; JCOEF workspace[DCTSIZE2] | |
1077 | |
1078 align 16 | |
1079 - global EXTN(jsimd_idct_ifast_mmx) | |
1080 + global EXTN(jsimd_idct_ifast_mmx) PRIVATE | |
1081 | |
1082 EXTN(jsimd_idct_ifast_mmx): | |
1083 push ebp | |
1084 Index: simd/jfss2fst.asm | |
1085 =================================================================== | |
1086 --- simd/jfss2fst.asm (revision 829) | |
1087 +++ simd/jfss2fst.asm (working copy) | |
1088 @@ -52,7 +52,7 @@ | |
1089 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
1090 | |
1091 alignz 16 | |
1092 - global EXTN(jconst_fdct_ifast_sse2) | |
1093 + global EXTN(jconst_fdct_ifast_sse2) PRIVATE | |
1094 | |
1095 EXTN(jconst_fdct_ifast_sse2): | |
1096 | |
1097 @@ -80,7 +80,7 @@ | |
1098 %define WK_NUM 2 | |
1099 | |
1100 align 16 | |
1101 - global EXTN(jsimd_fdct_ifast_sse2) | |
1102 + global EXTN(jsimd_fdct_ifast_sse2) PRIVATE | |
1103 | |
1104 EXTN(jsimd_fdct_ifast_sse2): | |
1105 push ebp | |
1106 Index: simd/jcgrammx.asm | |
1107 =================================================================== | |
1108 --- simd/jcgrammx.asm (revision 829) | |
1109 +++ simd/jcgrammx.asm (working copy) | |
1110 @@ -33,7 +33,7 @@ | |
1111 SECTION SEG_CONST | |
1112 | |
1113 alignz 16 | |
1114 - global EXTN(jconst_rgb_gray_convert_mmx) | |
1115 + global EXTN(jconst_rgb_gray_convert_mmx) PRIVATE | |
1116 | |
1117 EXTN(jconst_rgb_gray_convert_mmx): | |
1118 | |
1119 Index: simd/jdcolss2-64.asm | |
1120 =================================================================== | |
1121 --- simd/jdcolss2-64.asm (revision 829) | |
1122 +++ simd/jdcolss2-64.asm (working copy) | |
1123 @@ -35,7 +35,7 @@ | |
1124 SECTION SEG_CONST | |
1125 | |
1126 alignz 16 | |
1127 - global EXTN(jconst_ycc_rgb_convert_sse2) | |
1128 + global EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE | |
1129 | |
1130 EXTN(jconst_ycc_rgb_convert_sse2): | |
1131 | |
1132 Index: simd/jf3dnflt.asm | |
1133 =================================================================== | |
1134 --- simd/jf3dnflt.asm (revision 829) | |
1135 +++ simd/jf3dnflt.asm (working copy) | |
1136 @@ -27,7 +27,7 @@ | |
1137 SECTION SEG_CONST | |
1138 | |
1139 alignz 16 | |
1140 - global EXTN(jconst_fdct_float_3dnow) | |
1141 + global EXTN(jconst_fdct_float_3dnow) PRIVATE | |
1142 | |
1143 EXTN(jconst_fdct_float_3dnow): | |
1144 | |
1145 @@ -55,7 +55,7 @@ | |
1146 %define WK_NUM 2 | |
1147 | |
1148 align 16 | |
1149 - global EXTN(jsimd_fdct_float_3dnow) | |
1150 + global EXTN(jsimd_fdct_float_3dnow) PRIVATE | |
1151 | |
1152 EXTN(jsimd_fdct_float_3dnow): | |
1153 push ebp | |
1154 Index: simd/jdsamss2-64.asm | |
1155 =================================================================== | |
1156 --- simd/jdsamss2-64.asm (revision 829) | |
1157 +++ simd/jdsamss2-64.asm (working copy) | |
1158 @@ -23,7 +23,7 @@ | |
1159 SECTION SEG_CONST | |
1160 | |
1161 alignz 16 | |
1162 - global EXTN(jconst_fancy_upsample_sse2) | |
1163 + global EXTN(jconst_fancy_upsample_sse2) PRIVATE | |
1164 | |
1165 EXTN(jconst_fancy_upsample_sse2): | |
1166 | |
1167 @@ -59,7 +59,7 @@ | |
1168 ; r13 = JSAMPARRAY * output_data_ptr | |
1169 | |
1170 align 16 | |
1171 - global EXTN(jsimd_h2v1_fancy_upsample_sse2) | |
1172 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE | |
1173 | |
1174 EXTN(jsimd_h2v1_fancy_upsample_sse2): | |
1175 push rbp | |
1176 @@ -201,7 +201,7 @@ | |
1177 %define WK_NUM 4 | |
1178 | |
1179 align 16 | |
1180 - global EXTN(jsimd_h2v2_fancy_upsample_sse2) | |
1181 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE | |
1182 | |
1183 EXTN(jsimd_h2v2_fancy_upsample_sse2): | |
1184 push rbp | |
1185 @@ -498,7 +498,7 @@ | |
1186 ; r13 = JSAMPARRAY * output_data_ptr | |
1187 | |
1188 align 16 | |
1189 - global EXTN(jsimd_h2v1_upsample_sse2) | |
1190 + global EXTN(jsimd_h2v1_upsample_sse2) PRIVATE | |
1191 | |
1192 EXTN(jsimd_h2v1_upsample_sse2): | |
1193 push rbp | |
1194 @@ -587,7 +587,7 @@ | |
1195 ; r13 = JSAMPARRAY * output_data_ptr | |
1196 | |
1197 align 16 | |
1198 - global EXTN(jsimd_h2v2_upsample_sse2) | |
1199 + global EXTN(jsimd_h2v2_upsample_sse2) PRIVATE | |
1200 | |
1201 EXTN(jsimd_h2v2_upsample_sse2): | |
1202 push rbp | |
1203 Index: simd/jcgrass2.asm | |
1204 =================================================================== | |
1205 --- simd/jcgrass2.asm (revision 829) | |
1206 +++ simd/jcgrass2.asm (working copy) | |
1207 @@ -30,7 +30,7 @@ | |
1208 SECTION SEG_CONST | |
1209 | |
1210 alignz 16 | |
1211 - global EXTN(jconst_rgb_gray_convert_sse2) | |
1212 + global EXTN(jconst_rgb_gray_convert_sse2) PRIVATE | |
1213 | |
1214 EXTN(jconst_rgb_gray_convert_sse2): | |
1215 | |
1216 Index: simd/jcsammmx.asm | |
1217 =================================================================== | |
1218 --- simd/jcsammmx.asm (revision 829) | |
1219 +++ simd/jcsammmx.asm (working copy) | |
1220 @@ -40,7 +40,7 @@ | |
1221 %define output_data(b) (b)+28 ; JSAMPARRAY output_data | |
1222 | |
1223 align 16 | |
1224 - global EXTN(jsimd_h2v1_downsample_mmx) | |
1225 + global EXTN(jsimd_h2v1_downsample_mmx) PRIVATE | |
1226 | |
1227 EXTN(jsimd_h2v1_downsample_mmx): | |
1228 push ebp | |
1229 @@ -182,7 +182,7 @@ | |
1230 %define output_data(b) (b)+28 ; JSAMPARRAY output_data | |
1231 | |
1232 align 16 | |
1233 - global EXTN(jsimd_h2v2_downsample_mmx) | |
1234 + global EXTN(jsimd_h2v2_downsample_mmx) PRIVATE | |
1235 | |
1236 EXTN(jsimd_h2v2_downsample_mmx): | |
1237 push ebp | |
1238 +Index: simd/jsimd_arm.c | |
1239 +=================================================================== | |
1240 +--- simd/jsimd_arm.c (revision 272637) | |
1241 ++++ simd/jsimd_arm.c (working copy) | |
1242 +@@ -29,0 +29,0 @@ | |
1243 + | |
1244 + static unsigned int simd_support = ~0; | |
1245 + | |
1246 +-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) | |
1247 ++#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defin
ed(__ANDROID__)) | |
1248 + | |
1249 + #define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) | |
1250 + | |
1251 +@@ -100,6 +100,6 @@ | |
1252 + init_simd (void) | |
1253 + { | |
1254 + char *env = NULL; | |
1255 +-#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || define
d(__ANDROID__) | |
1256 ++#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defin
ed(__ANDROID__)) | |
1257 + int bufsize = 1024; /* an initial guess for the line buffer size limit */ | |
1258 + #endif | |
1259 + | |
1260 Index: simd/jsimd_arm_neon.S | |
1261 =================================================================== | |
1262 --- simd/jsimd_arm_neon.S (revision 272637) | |
1263 +++ simd/jsimd_arm_neon.S (working copy) | |
1264 @@ -41,11 +41,9 @@ | |
1265 /* Supplementary macro for setting function attributes */ | |
1266 .macro asm_function fname | |
1267 #ifdef __APPLE__ | |
1268 - .func _\fname | |
1269 .globl _\fname | |
1270 _\fname: | |
1271 #else | |
1272 - .func \fname | |
1273 .global \fname | |
1274 #ifdef __ELF__ | |
1275 .hidden \fname | |
1276 @@ -670,7 +668,6 @@ | |
1277 .unreq ROW6R | |
1278 .unreq ROW7L | |
1279 .unreq ROW7R | |
1280 -.endfunc | |
1281 | |
1282 | |
1283 /*****************************************************************************/ | |
1284 @@ -895,7 +892,6 @@ | |
1285 .unreq TMP2 | |
1286 .unreq TMP3 | |
1287 .unreq TMP4 | |
1288 -.endfunc | |
1289 | |
1290 | |
1291 /*****************************************************************************/ | |
1292 @@ -1108,7 +1104,6 @@ | |
1293 .unreq TMP2 | |
1294 .unreq TMP3 | |
1295 .unreq TMP4 | |
1296 -.endfunc | |
1297 | |
1298 .purgem idct_helper | |
1299 | |
1300 @@ -1263,7 +1258,6 @@ | |
1301 .unreq OUTPUT_COL | |
1302 .unreq TMP1 | |
1303 .unreq TMP2 | |
1304 -.endfunc | |
1305 | |
1306 .purgem idct_helper | |
1307 | |
1308 @@ -1547,7 +1541,6 @@ | |
1309 .unreq U | |
1310 .unreq V | |
1311 .unreq N | |
1312 -.endfunc | |
1313 | |
1314 .purgem do_yuv_to_rgb | |
1315 .purgem do_yuv_to_rgb_stage1 | |
1316 @@ -1858,7 +1851,6 @@ | |
1317 .unreq U | |
1318 .unreq V | |
1319 .unreq N | |
1320 -.endfunc | |
1321 | |
1322 .purgem do_rgb_to_yuv | |
1323 .purgem do_rgb_to_yuv_stage1 | |
1324 @@ -1940,7 +1932,6 @@ | |
1325 .unreq TMP2 | |
1326 .unreq TMP3 | |
1327 .unreq TMP4 | |
1328 -.endfunc | |
1329 | |
1330 | |
1331 /*****************************************************************************/ | |
1332 @@ -2064,7 +2055,6 @@ | |
1333 | |
1334 .unreq DATA | |
1335 .unreq TMP | |
1336 -.endfunc | |
1337 | |
1338 | |
1339 /*****************************************************************************/ | |
1340 @@ -2166,7 +2156,6 @@ | |
1341 .unreq CORRECTION | |
1342 .unreq SHIFT | |
1343 .unreq LOOP_COUNT | |
1344 -.endfunc | |
1345 | |
1346 | |
1347 /*****************************************************************************/ | |
1348 @@ -2401,7 +2390,6 @@ | |
1349 .unreq WIDTH | |
1350 .unreq TMP | |
1351 | |
1352 -.endfunc | |
1353 | |
1354 .purgem upsample16 | |
1355 .purgem upsample32 | |
1356 Index: simd/jsimd_i386.c | |
1357 =================================================================== | |
1358 --- simd/jsimd_i386.c (revision 829) | |
1359 +++ simd/jsimd_i386.c (working copy) | |
1360 @@ -61,6 +61,7 @@ | |
1361 simd_support &= JSIMD_SSE2; | |
1362 } | |
1363 | |
1364 +#ifndef JPEG_DECODE_ONLY | |
1365 GLOBAL(int) | |
1366 jsimd_can_rgb_ycc (void) | |
1367 { | |
1368 @@ -82,6 +83,7 @@ | |
1369 | |
1370 return 0; | |
1371 } | |
1372 +#endif | |
1373 | |
1374 GLOBAL(int) | |
1375 jsimd_can_rgb_gray (void) | |
1376 @@ -127,6 +129,7 @@ | |
1377 return 0; | |
1378 } | |
1379 | |
1380 +#ifndef JPEG_DECODE_ONLY | |
1381 GLOBAL(void) | |
1382 jsimd_rgb_ycc_convert (j_compress_ptr cinfo, | |
1383 JSAMPARRAY input_buf, JSAMPIMAGE output_buf, | |
1384 @@ -179,6 +182,7 @@ | |
1385 mmxfct(cinfo->image_width, input_buf, | |
1386 output_buf, output_row, num_rows); | |
1387 } | |
1388 +#endif | |
1389 | |
1390 GLOBAL(void) | |
1391 jsimd_rgb_gray_convert (j_compress_ptr cinfo, | |
1392 @@ -286,6 +290,7 @@ | |
1393 input_row, output_buf, num_rows); | |
1394 } | |
1395 | |
1396 +#ifndef JPEG_DECODE_ONLY | |
1397 GLOBAL(int) | |
1398 jsimd_can_h2v2_downsample (void) | |
1399 { | |
1400 @@ -351,6 +356,7 @@ | |
1401 compptr->v_samp_factor, compptr->width_in_blocks, | |
1402 input_data, output_data); | |
1403 } | |
1404 +#endif | |
1405 | |
1406 GLOBAL(int) | |
1407 jsimd_can_h2v2_upsample (void) | |
1408 @@ -636,6 +642,7 @@ | |
1409 in_row_group_ctr, output_buf); | |
1410 } | |
1411 | |
1412 +#ifndef JPEG_DECODE_ONLY | |
1413 GLOBAL(int) | |
1414 jsimd_can_convsamp (void) | |
1415 { | |
1416 @@ -855,6 +862,7 @@ | |
1417 else if (simd_support & JSIMD_3DNOW) | |
1418 jsimd_quantize_float_3dnow(coef_block, divisors, workspace); | |
1419 } | |
1420 +#endif | |
1421 | |
1422 GLOBAL(int) | |
1423 jsimd_can_idct_2x2 (void) | |
1424 @@ -1045,4 +1053,3 @@ | |
1425 jsimd_idct_float_3dnow(compptr->dct_table, coef_block, | |
1426 output_buf, output_col); | |
1427 } | |
1428 - | |
1429 Index: simd/jcqnts2f-64.asm | |
1430 =================================================================== | |
1431 --- simd/jcqnts2f-64.asm (revision 829) | |
1432 +++ simd/jcqnts2f-64.asm (working copy) | |
1433 @@ -36,7 +36,7 @@ | |
1434 ; r12 = FAST_FLOAT * workspace | |
1435 | |
1436 align 16 | |
1437 - global EXTN(jsimd_convsamp_float_sse2) | |
1438 + global EXTN(jsimd_convsamp_float_sse2) PRIVATE | |
1439 | |
1440 EXTN(jsimd_convsamp_float_sse2): | |
1441 push rbp | |
1442 @@ -110,7 +110,7 @@ | |
1443 ; r12 = FAST_FLOAT * workspace | |
1444 | |
1445 align 16 | |
1446 - global EXTN(jsimd_quantize_float_sse2) | |
1447 + global EXTN(jsimd_quantize_float_sse2) PRIVATE | |
1448 | |
1449 EXTN(jsimd_quantize_float_sse2): | |
1450 push rbp | |
1451 Index: simd/jcqnt3dn.asm | |
1452 =================================================================== | |
1453 --- simd/jcqnt3dn.asm (revision 829) | |
1454 +++ simd/jcqnt3dn.asm (working copy) | |
1455 @@ -35,7 +35,7 @@ | |
1456 %define workspace ebp+16 ; FAST_FLOAT * workspace | |
1457 | |
1458 align 16 | |
1459 - global EXTN(jsimd_convsamp_float_3dnow) | |
1460 + global EXTN(jsimd_convsamp_float_3dnow) PRIVATE | |
1461 | |
1462 EXTN(jsimd_convsamp_float_3dnow): | |
1463 push ebp | |
1464 @@ -138,7 +138,7 @@ | |
1465 %define workspace ebp+16 ; FAST_FLOAT * workspace | |
1466 | |
1467 align 16 | |
1468 - global EXTN(jsimd_quantize_float_3dnow) | |
1469 + global EXTN(jsimd_quantize_float_3dnow) PRIVATE | |
1470 | |
1471 EXTN(jsimd_quantize_float_3dnow): | |
1472 push ebp | |
1473 Index: simd/jcsamss2.asm | |
1474 =================================================================== | |
1475 --- simd/jcsamss2.asm (revision 829) | |
1476 +++ simd/jcsamss2.asm (working copy) | |
1477 @@ -40,7 +40,7 @@ | |
1478 %define output_data(b) (b)+28 ; JSAMPARRAY output_data | |
1479 | |
1480 align 16 | |
1481 - global EXTN(jsimd_h2v1_downsample_sse2) | |
1482 + global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE | |
1483 | |
1484 EXTN(jsimd_h2v1_downsample_sse2): | |
1485 push ebp | |
1486 @@ -195,7 +195,7 @@ | |
1487 %define output_data(b) (b)+28 ; JSAMPARRAY output_data | |
1488 | |
1489 align 16 | |
1490 - global EXTN(jsimd_h2v2_downsample_sse2) | |
1491 + global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE | |
1492 | |
1493 EXTN(jsimd_h2v2_downsample_sse2): | |
1494 push ebp | |
1495 Index: simd/jsimd_x86_64.c | |
1496 =================================================================== | |
1497 --- simd/jsimd_x86_64.c (revision 829) | |
1498 +++ simd/jsimd_x86_64.c (working copy) | |
1499 @@ -29,6 +29,7 @@ | |
1500 | |
1501 #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ | |
1502 | |
1503 +#ifndef JPEG_DECODE_ONLY | |
1504 GLOBAL(int) | |
1505 jsimd_can_rgb_ycc (void) | |
1506 { | |
1507 @@ -45,6 +46,7 @@ | |
1508 | |
1509 return 1; | |
1510 } | |
1511 +#endif | |
1512 | |
1513 GLOBAL(int) | |
1514 jsimd_can_rgb_gray (void) | |
1515 @@ -80,6 +82,7 @@ | |
1516 return 1; | |
1517 } | |
1518 | |
1519 +#ifndef JPEG_DECODE_ONLY | |
1520 GLOBAL(void) | |
1521 jsimd_rgb_ycc_convert (j_compress_ptr cinfo, | |
1522 JSAMPARRAY input_buf, JSAMPIMAGE output_buf, | |
1523 @@ -118,6 +121,7 @@ | |
1524 | |
1525 sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); | |
1526 } | |
1527 +#endif | |
1528 | |
1529 GLOBAL(void) | |
1530 jsimd_rgb_gray_convert (j_compress_ptr cinfo, | |
1531 @@ -197,6 +201,7 @@ | |
1532 sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); | |
1533 } | |
1534 | |
1535 +#ifndef JPEG_DECODE_ONLY | |
1536 GLOBAL(int) | |
1537 jsimd_can_h2v2_downsample (void) | |
1538 { | |
1539 @@ -242,6 +247,7 @@ | |
1540 compptr->width_in_blocks, | |
1541 input_data, output_data); | |
1542 } | |
1543 +#endif | |
1544 | |
1545 GLOBAL(int) | |
1546 jsimd_can_h2v2_upsample (void) | |
1547 @@ -451,6 +457,7 @@ | |
1548 sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); | |
1549 } | |
1550 | |
1551 +#ifndef JPEG_DECODE_ONLY | |
1552 GLOBAL(int) | |
1553 jsimd_can_convsamp (void) | |
1554 { | |
1555 @@ -601,6 +608,7 @@ | |
1556 { | |
1557 jsimd_quantize_float_sse2(coef_block, divisors, workspace); | |
1558 } | |
1559 +#endif | |
1560 | |
1561 GLOBAL(int) | |
1562 jsimd_can_idct_2x2 (void) | |
1563 @@ -750,4 +758,3 @@ | |
1564 jsimd_idct_float_sse2(compptr->dct_table, coef_block, | |
1565 output_buf, output_col); | |
1566 } | |
1567 - | |
1568 Index: simd/jimmxint.asm | |
1569 =================================================================== | |
1570 --- simd/jimmxint.asm (revision 829) | |
1571 +++ simd/jimmxint.asm (working copy) | |
1572 @@ -66,7 +66,7 @@ | |
1573 SECTION SEG_CONST | |
1574 | |
1575 alignz 16 | |
1576 - global EXTN(jconst_idct_islow_mmx) | |
1577 + global EXTN(jconst_idct_islow_mmx) PRIVATE | |
1578 | |
1579 EXTN(jconst_idct_islow_mmx): | |
1580 | |
1581 @@ -107,7 +107,7 @@ | |
1582 ; JCOEF workspace[DCTSIZE2] | |
1583 | |
1584 align 16 | |
1585 - global EXTN(jsimd_idct_islow_mmx) | |
1586 + global EXTN(jsimd_idct_islow_mmx) PRIVATE | |
1587 | |
1588 EXTN(jsimd_idct_islow_mmx): | |
1589 push ebp | |
1590 Index: simd/jcgrymmx.asm | |
1591 =================================================================== | |
1592 --- simd/jcgrymmx.asm (revision 829) | |
1593 +++ simd/jcgrymmx.asm (working copy) | |
1594 @@ -41,7 +41,7 @@ | |
1595 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
1596 | |
1597 align 16 | |
1598 - global EXTN(jsimd_rgb_gray_convert_mmx) | |
1599 + global EXTN(jsimd_rgb_gray_convert_mmx) PRIVATE | |
1600 | |
1601 EXTN(jsimd_rgb_gray_convert_mmx): | |
1602 push ebp | |
1603 Index: simd/jfss2int.asm | |
1604 =================================================================== | |
1605 --- simd/jfss2int.asm (revision 829) | |
1606 +++ simd/jfss2int.asm (working copy) | |
1607 @@ -66,7 +66,7 @@ | |
1608 SECTION SEG_CONST | |
1609 | |
1610 alignz 16 | |
1611 - global EXTN(jconst_fdct_islow_sse2) | |
1612 + global EXTN(jconst_fdct_islow_sse2) PRIVATE | |
1613 | |
1614 EXTN(jconst_fdct_islow_sse2): | |
1615 | |
1616 @@ -101,7 +101,7 @@ | |
1617 %define WK_NUM 6 | |
1618 | |
1619 align 16 | |
1620 - global EXTN(jsimd_fdct_islow_sse2) | |
1621 + global EXTN(jsimd_fdct_islow_sse2) PRIVATE | |
1622 | |
1623 EXTN(jsimd_fdct_islow_sse2): | |
1624 push ebp | |
1625 Index: simd/jcgryss2.asm | |
1626 =================================================================== | |
1627 --- simd/jcgryss2.asm (revision 829) | |
1628 +++ simd/jcgryss2.asm (working copy) | |
1629 @@ -39,7 +39,7 @@ | |
1630 | |
1631 align 16 | |
1632 | |
1633 - global EXTN(jsimd_rgb_gray_convert_sse2) | |
1634 + global EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE | |
1635 | |
1636 EXTN(jsimd_rgb_gray_convert_sse2): | |
1637 push ebp | |
1638 Index: simd/jccolmmx.asm | |
1639 =================================================================== | |
1640 --- simd/jccolmmx.asm (revision 829) | |
1641 +++ simd/jccolmmx.asm (working copy) | |
1642 @@ -37,7 +37,7 @@ | |
1643 SECTION SEG_CONST | |
1644 | |
1645 alignz 16 | |
1646 - global EXTN(jconst_rgb_ycc_convert_mmx) | |
1647 + global EXTN(jconst_rgb_ycc_convert_mmx) PRIVATE | |
1648 | |
1649 EXTN(jconst_rgb_ycc_convert_mmx): | |
1650 | |
1651 Index: simd/jimmxred.asm | |
1652 =================================================================== | |
1653 --- simd/jimmxred.asm (revision 829) | |
1654 +++ simd/jimmxred.asm (working copy) | |
1655 @@ -72,7 +72,7 @@ | |
1656 SECTION SEG_CONST | |
1657 | |
1658 alignz 16 | |
1659 - global EXTN(jconst_idct_red_mmx) | |
1660 + global EXTN(jconst_idct_red_mmx) PRIVATE | |
1661 | |
1662 EXTN(jconst_idct_red_mmx): | |
1663 | |
1664 @@ -115,7 +115,7 @@ | |
1665 ; JCOEF workspace[DCTSIZE2] | |
1666 | |
1667 align 16 | |
1668 - global EXTN(jsimd_idct_4x4_mmx) | |
1669 + global EXTN(jsimd_idct_4x4_mmx) PRIVATE | |
1670 | |
1671 EXTN(jsimd_idct_4x4_mmx): | |
1672 push ebp | |
1673 @@ -503,7 +503,7 @@ | |
1674 %define output_col(b) (b)+20 ; JDIMENSION output_col | |
1675 | |
1676 align 16 | |
1677 - global EXTN(jsimd_idct_2x2_mmx) | |
1678 + global EXTN(jsimd_idct_2x2_mmx) PRIVATE | |
1679 | |
1680 EXTN(jsimd_idct_2x2_mmx): | |
1681 push ebp | |
1682 Index: simd/jsimdext.inc | |
1683 =================================================================== | |
1684 --- simd/jsimdext.inc (revision 829) | |
1685 +++ simd/jsimdext.inc (working copy) | |
1686 @@ -73,6 +73,9 @@ | |
1687 ; * *BSD family Unix using elf format | |
1688 ; * Unix System V, including Solaris x86, UnixWare and SCO Unix | |
1689 | |
1690 +; PIC is the default on Linux | |
1691 +%define PIC | |
1692 + | |
1693 ; mark stack as non-executable | |
1694 section .note.GNU-stack noalloc noexec nowrite progbits | |
1695 | |
1696 @@ -375,4 +378,14 @@ | |
1697 ; | |
1698 %include "jsimdcfg.inc" | |
1699 | |
1700 +; Begin chromium edits | |
1701 +%ifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- | |
1702 +%define PRIVATE :private_extern | |
1703 +%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ | |
1704 +%define PRIVATE :hidden | |
1705 +%else | |
1706 +%define PRIVATE | |
1707 +%endif | |
1708 +; End chromium edits | |
1709 + | |
1710 ; -------------------------------------------------------------------------- | |
1711 Index: simd/jdclrmmx.asm | |
1712 =================================================================== | |
1713 --- simd/jdclrmmx.asm (revision 829) | |
1714 +++ simd/jdclrmmx.asm (working copy) | |
1715 @@ -40,7 +40,7 @@ | |
1716 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
1717 | |
1718 align 16 | |
1719 - global EXTN(jsimd_ycc_rgb_convert_mmx) | |
1720 + global EXTN(jsimd_ycc_rgb_convert_mmx) PRIVATE | |
1721 | |
1722 EXTN(jsimd_ycc_rgb_convert_mmx): | |
1723 push ebp | |
1724 Index: simd/jccolss2.asm | |
1725 =================================================================== | |
1726 --- simd/jccolss2.asm (revision 829) | |
1727 +++ simd/jccolss2.asm (working copy) | |
1728 @@ -34,7 +34,7 @@ | |
1729 SECTION SEG_CONST | |
1730 | |
1731 alignz 16 | |
1732 - global EXTN(jconst_rgb_ycc_convert_sse2) | |
1733 + global EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE | |
1734 | |
1735 EXTN(jconst_rgb_ycc_convert_sse2): | |
1736 | |
1737 Index: simd/jisseflt.asm | |
1738 =================================================================== | |
1739 --- simd/jisseflt.asm (revision 829) | |
1740 +++ simd/jisseflt.asm (working copy) | |
1741 @@ -37,7 +37,7 @@ | |
1742 SECTION SEG_CONST | |
1743 | |
1744 alignz 16 | |
1745 - global EXTN(jconst_idct_float_sse) | |
1746 + global EXTN(jconst_idct_float_sse) PRIVATE | |
1747 | |
1748 EXTN(jconst_idct_float_sse): | |
1749 | |
1750 @@ -73,7 +73,7 @@ | |
1751 ; FAST_FLOAT workspace[DCTSIZE2] | |
1752 | |
1753 align 16 | |
1754 - global EXTN(jsimd_idct_float_sse) | |
1755 + global EXTN(jsimd_idct_float_sse) PRIVATE | |
1756 | |
1757 EXTN(jsimd_idct_float_sse): | |
1758 push ebp | |
1759 Index: simd/jcqnts2i-64.asm | |
1760 =================================================================== | |
1761 --- simd/jcqnts2i-64.asm (revision 829) | |
1762 +++ simd/jcqnts2i-64.asm (working copy) | |
1763 @@ -36,7 +36,7 @@ | |
1764 ; r12 = DCTELEM * workspace | |
1765 | |
1766 align 16 | |
1767 - global EXTN(jsimd_convsamp_sse2) | |
1768 + global EXTN(jsimd_convsamp_sse2) PRIVATE | |
1769 | |
1770 EXTN(jsimd_convsamp_sse2): | |
1771 push rbp | |
1772 @@ -112,7 +112,7 @@ | |
1773 ; r12 = DCTELEM * workspace | |
1774 | |
1775 align 16 | |
1776 - global EXTN(jsimd_quantize_sse2) | |
1777 + global EXTN(jsimd_quantize_sse2) PRIVATE | |
1778 | |
1779 EXTN(jsimd_quantize_sse2): | |
1780 push rbp | |
1781 Index: simd/jdclrss2.asm | |
1782 =================================================================== | |
1783 --- simd/jdclrss2.asm (revision 829) | |
1784 +++ simd/jdclrss2.asm (working copy) | |
1785 @@ -40,7 +40,7 @@ | |
1786 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
1787 | |
1788 align 16 | |
1789 - global EXTN(jsimd_ycc_rgb_convert_sse2) | |
1790 + global EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE | |
1791 | |
1792 EXTN(jsimd_ycc_rgb_convert_sse2): | |
1793 push ebp | |
1794 Index: simd/jcqntsse.asm | |
1795 =================================================================== | |
1796 --- simd/jcqntsse.asm (revision 829) | |
1797 +++ simd/jcqntsse.asm (working copy) | |
1798 @@ -35,7 +35,7 @@ | |
1799 %define workspace ebp+16 ; FAST_FLOAT * workspace | |
1800 | |
1801 align 16 | |
1802 - global EXTN(jsimd_convsamp_float_sse) | |
1803 + global EXTN(jsimd_convsamp_float_sse) PRIVATE | |
1804 | |
1805 EXTN(jsimd_convsamp_float_sse): | |
1806 push ebp | |
1807 @@ -138,7 +138,7 @@ | |
1808 %define workspace ebp+16 ; FAST_FLOAT * workspace | |
1809 | |
1810 align 16 | |
1811 - global EXTN(jsimd_quantize_float_sse) | |
1812 + global EXTN(jsimd_quantize_float_sse) PRIVATE | |
1813 | |
1814 EXTN(jsimd_quantize_float_sse): | |
1815 push ebp | |
1816 Index: simd/jiss2int-64.asm | |
1817 =================================================================== | |
1818 --- simd/jiss2int-64.asm (revision 829) | |
1819 +++ simd/jiss2int-64.asm (working copy) | |
1820 @@ -67,7 +67,7 @@ | |
1821 SECTION SEG_CONST | |
1822 | |
1823 alignz 16 | |
1824 - global EXTN(jconst_idct_islow_sse2) | |
1825 + global EXTN(jconst_idct_islow_sse2) PRIVATE | |
1826 | |
1827 EXTN(jconst_idct_islow_sse2): | |
1828 | |
1829 @@ -106,7 +106,7 @@ | |
1830 %define WK_NUM 12 | |
1831 | |
1832 align 16 | |
1833 - global EXTN(jsimd_idct_islow_sse2) | |
1834 + global EXTN(jsimd_idct_islow_sse2) PRIVATE | |
1835 | |
1836 EXTN(jsimd_idct_islow_sse2): | |
1837 push rbp | |
1838 Index: simd/jfmmxfst.asm | |
1839 =================================================================== | |
1840 --- simd/jfmmxfst.asm (revision 829) | |
1841 +++ simd/jfmmxfst.asm (working copy) | |
1842 @@ -52,7 +52,7 @@ | |
1843 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
1844 | |
1845 alignz 16 | |
1846 - global EXTN(jconst_fdct_ifast_mmx) | |
1847 + global EXTN(jconst_fdct_ifast_mmx) PRIVATE | |
1848 | |
1849 EXTN(jconst_fdct_ifast_mmx): | |
1850 | |
1851 @@ -80,7 +80,7 @@ | |
1852 %define WK_NUM 2 | |
1853 | |
1854 align 16 | |
1855 - global EXTN(jsimd_fdct_ifast_mmx) | |
1856 + global EXTN(jsimd_fdct_ifast_mmx) PRIVATE | |
1857 | |
1858 EXTN(jsimd_fdct_ifast_mmx): | |
1859 push ebp | |
1860 Index: jdarith.c | |
1861 =================================================================== | |
1862 --- jdarith.c (revision 829) | |
1863 +++ jdarith.c (working copy) | |
1864 @@ -150,8 +150,8 @@ | |
1865 */ | |
1866 sv = *st; | |
1867 qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */ | |
1868 - nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */ | |
1869 - nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */ | |
1870 + nl = (unsigned char) qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */ | |
1871 + nm = (unsigned char) qe & 0xFF; qe >>= 8; /* Next_Index_MPS */ | |
1872 | |
1873 /* Decode & estimation procedures per sections D.2.4 & D.2.5 */ | |
1874 temp = e->a - qe; | |
1875 Index: jdhuff.c | |
1876 =================================================================== | |
1877 --- jdhuff.c (revision 1541) | |
1878 +++ jdhuff.c (working copy) | |
1879 @@ -662,7 +662,7 @@ | |
1880 d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn]; | |
1881 register int s, k, r, l; | |
1882 | |
1883 - HUFF_DECODE_FAST(s, l, dctbl); | |
1884 + HUFF_DECODE_FAST(s, l, dctbl, slow_decode_mcu); | |
1885 if (s) { | |
1886 FILL_BIT_BUFFER_FAST | |
1887 r = GET_BITS(s); | |
1888 @@ -679,7 +679,7 @@ | |
1889 if (entropy->ac_needed[blkn]) { | |
1890 | |
1891 for (k = 1; k < DCTSIZE2; k++) { | |
1892 - HUFF_DECODE_FAST(s, l, actbl); | |
1893 + HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu); | |
1894 r = s >> 4; | |
1895 s &= 15; | |
1896 | |
1897 @@ -698,7 +698,7 @@ | |
1898 } else { | |
1899 | |
1900 for (k = 1; k < DCTSIZE2; k++) { | |
1901 - HUFF_DECODE_FAST(s, l, actbl); | |
1902 + HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu); | |
1903 r = s >> 4; | |
1904 s &= 15; | |
1905 | |
1906 @@ -715,6 +715,7 @@ | |
1907 } | |
1908 | |
1909 if (cinfo->unread_marker != 0) { | |
1910 +slow_decode_mcu: | |
1911 cinfo->unread_marker = 0; | |
1912 return FALSE; | |
1913 } | |
1914 @@ -742,7 +743,7 @@ | |
1915 * this module, since we'll just re-assign them on the next call.) | |
1916 */ | |
1917 | |
1918 -#define BUFSIZE (DCTSIZE2 * 2) | |
1919 +#define BUFSIZE (DCTSIZE2 * 2u) | |
1920 | |
1921 METHODDEF(boolean) | |
1922 decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) | |
1923 Index: jdhuff.h | |
1924 =================================================================== | |
1925 --- jdhuff.h (revision 1541) | |
1926 +++ jdhuff.h (working copy) | |
1927 @@ -208,7 +208,7 @@ | |
1928 } \ | |
1929 } | |
1930 | |
1931 -#define HUFF_DECODE_FAST(s,nb,htbl) \ | |
1932 +#define HUFF_DECODE_FAST(s,nb,htbl,slowlabel) \ | |
1933 FILL_BIT_BUFFER_FAST; \ | |
1934 s = PEEK_BITS(HUFF_LOOKAHEAD); \ | |
1935 s = htbl->lookup[s]; \ | |
1936 @@ -225,7 +225,9 @@ | |
1937 s |= GET_BITS(1); \ | |
1938 nb++; \ | |
1939 } \ | |
1940 - s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) & 0xFF ]; \ | |
1941 + if (nb > 16) \ | |
1942 + goto slowlabel; \ | |
1943 + s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) ]; \ | |
1944 } | |
1945 | |
1946 /* Out-of-line case for Huffman code fetching */ | |
1947 | |
1948 Index: jchuff.c | |
1949 =================================================================== | |
1950 --- jchuff.c (revision 1219) | |
1951 +++ jchuff.c (revision 1220) | |
1952 @@ -22,8 +22,36 @@ | |
1953 #include "jchuff.h" /* Declarations shared with jcphuff.c */ | |
1954 #include <limits.h> | |
1955 | |
1956 +/* | |
1957 + * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be | |
1958 + * used for bit counting rather than the lookup table. This will reduce the | |
1959 + * memory footprint by 64k, which is important for some mobile applications | |
1960 + * that create many isolated instances of libjpeg-turbo (web browsers, for | |
1961 + * instance.) This may improve performance on some mobile platforms as well. | |
1962 + * This feature is enabled by default only on ARM processors, because some x86 | |
1963 + * chips have a slow implementation of bsr, and the use of clz/bsr cannot be | |
1964 + * shown to have a significant performance impact even on the x86 chips that | |
1965 + * have a fast implementation of it. When building for ARMv6, you can | |
1966 + * explicitly disable the use of clz/bsr by adding -mthumb to the compiler | |
1967 + * flags (this defines __thumb__). | |
1968 + */ | |
1969 + | |
1970 +/* NOTE: Both GCC and Clang define __GNUC__ */ | |
1971 +#if defined __GNUC__ && defined __arm__ | |
1972 +#if !defined __thumb__ || defined __thumb2__ | |
1973 +#define USE_CLZ_INTRINSIC | |
1974 +#endif | |
1975 +#endif | |
1976 + | |
1977 +#ifdef USE_CLZ_INTRINSIC | |
1978 +#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x)) | |
1979 +#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0) | |
1980 +#else | |
1981 static unsigned char jpeg_nbits_table[65536]; | |
1982 static int jpeg_nbits_table_init = 0; | |
1983 +#define JPEG_NBITS(x) (jpeg_nbits_table[x]) | |
1984 +#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x) | |
1985 +#endif | |
1986 | |
1987 #ifndef min | |
1988 #define min(a,b) ((a)<(b)?(a):(b)) | |
1989 @@ -272,6 +300,7 @@ | |
1990 dtbl->ehufsi[i] = huffsize[p]; | |
1991 } | |
1992 | |
1993 +#ifndef USE_CLZ_INTRINSIC | |
1994 if(!jpeg_nbits_table_init) { | |
1995 for(i = 0; i < 65536; i++) { | |
1996 int nbits = 0, temp = i; | |
1997 @@ -280,6 +309,7 @@ | |
1998 } | |
1999 jpeg_nbits_table_init = 1; | |
2000 } | |
2001 +#endif | |
2002 } | |
2003 | |
2004 | |
2005 @@ -482,7 +512,7 @@ | |
2006 temp2 += temp3; | |
2007 | |
2008 /* Find the number of bits needed for the magnitude of the coefficient */ | |
2009 - nbits = jpeg_nbits_table[temp]; | |
2010 + nbits = JPEG_NBITS(temp); | |
2011 | |
2012 /* Emit the Huffman-coded symbol for the number of bits */ | |
2013 code = dctbl->ehufco[nbits]; | |
2014 @@ -516,7 +546,7 @@ | |
2015 temp ^= temp3; \ | |
2016 temp -= temp3; \ | |
2017 temp2 += temp3; \ | |
2018 - nbits = jpeg_nbits_table[temp]; \ | |
2019 + nbits = JPEG_NBITS_NONZERO(temp); \ | |
2020 /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \ | |
2021 while (r > 15) { \ | |
2022 EMIT_BITS(code_0xf0, size_0xf0) \ | |
2023 Index: simd/jsimd_arm64.c | |
2024 =================================================================== | |
2025 --- /dev/null | |
2026 +++ simd/jsimd_arm64.c | |
2027 @@ -0,0 +1,544 @@ | |
2028 +/* | |
2029 + * jsimd_arm64.c | |
2030 + * | |
2031 + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | |
2032 + * Copyright 2009-2011, 2013-2014 D. R. Commander | |
2033 + * | |
2034 + * Based on the x86 SIMD extension for IJG JPEG library, | |
2035 + * Copyright (C) 1999-2006, MIYASAKA Masaru. | |
2036 + * For conditions of distribution and use, see copyright notice in jsimdext.inc | |
2037 + * | |
2038 + * This file contains the interface between the "normal" portions | |
2039 + * of the library and the SIMD implementations when running on a | |
2040 + * 64-bit ARM architecture. | |
2041 + */ | |
2042 + | |
2043 +#define JPEG_INTERNALS | |
2044 +#include "../jinclude.h" | |
2045 +#include "../jpeglib.h" | |
2046 +#include "../jsimd.h" | |
2047 +#include "../jdct.h" | |
2048 +#include "../jsimddct.h" | |
2049 +#include "jsimd.h" | |
2050 + | |
2051 +#include <stdio.h> | |
2052 +#include <string.h> | |
2053 +#include <ctype.h> | |
2054 + | |
2055 +static unsigned int simd_support = ~0; | |
2056 + | |
2057 +/* | |
2058 + * Check what SIMD accelerations are supported. | |
2059 + * | |
2060 + * FIXME: This code is racy under a multi-threaded environment. | |
2061 + */ | |
2062 + | |
2063 +/* | |
2064 + * ARMv8 architectures support NEON extensions by default. | |
2065 + * It is no longer optional as it was with ARMv7. | |
2066 + */ | |
2067 + | |
2068 + | |
2069 +LOCAL(void) | |
2070 +init_simd (void) | |
2071 +{ | |
2072 + char *env = NULL; | |
2073 + | |
2074 + if (simd_support != ~0U) | |
2075 + return; | |
2076 + | |
2077 + simd_support = 0; | |
2078 + | |
2079 + simd_support |= JSIMD_ARM_NEON; | |
2080 + | |
2081 + /* Force different settings through environment variables */ | |
2082 + env = getenv("JSIMD_FORCENEON"); | |
2083 + if ((env != NULL) && (strcmp(env, "1") == 0)) | |
2084 + simd_support &= JSIMD_ARM_NEON; | |
2085 + env = getenv("JSIMD_FORCENONE"); | |
2086 + if ((env != NULL) && (strcmp(env, "1") == 0)) | |
2087 + simd_support = 0; | |
2088 +} | |
2089 + | |
2090 +GLOBAL(int) | |
2091 +jsimd_can_rgb_ycc (void) | |
2092 +{ | |
2093 + init_simd(); | |
2094 + | |
2095 + return 0; | |
2096 +} | |
2097 + | |
2098 +GLOBAL(int) | |
2099 +jsimd_can_rgb_gray (void) | |
2100 +{ | |
2101 + init_simd(); | |
2102 + | |
2103 + return 0; | |
2104 +} | |
2105 + | |
2106 +GLOBAL(int) | |
2107 +jsimd_can_ycc_rgb (void) | |
2108 +{ | |
2109 + init_simd(); | |
2110 + | |
2111 + /* The code is optimised for these values only */ | |
2112 + if (BITS_IN_JSAMPLE != 8) | |
2113 + return 0; | |
2114 + if (sizeof(JDIMENSION) != 4) | |
2115 + return 0; | |
2116 + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) | |
2117 + return 0; | |
2118 + | |
2119 + if (simd_support & JSIMD_ARM_NEON) | |
2120 + return 1; | |
2121 + | |
2122 + return 0; | |
2123 +} | |
2124 + | |
2125 +GLOBAL(int) | |
2126 +jsimd_can_ycc_rgb565 (void) | |
2127 +{ | |
2128 + init_simd(); | |
2129 + | |
2130 + /* The code is optimised for these values only */ | |
2131 + if (BITS_IN_JSAMPLE != 8) | |
2132 + return 0; | |
2133 + if (sizeof(JDIMENSION) != 4) | |
2134 + return 0; | |
2135 + | |
2136 + if (simd_support & JSIMD_ARM_NEON) | |
2137 + return 1; | |
2138 + | |
2139 + return 0; | |
2140 +} | |
2141 + | |
2142 +GLOBAL(void) | |
2143 +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, | |
2144 + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, | |
2145 + JDIMENSION output_row, int num_rows) | |
2146 +{ | |
2147 +} | |
2148 + | |
2149 +GLOBAL(void) | |
2150 +jsimd_rgb_gray_convert (j_compress_ptr cinfo, | |
2151 + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, | |
2152 + JDIMENSION output_row, int num_rows) | |
2153 +{ | |
2154 +} | |
2155 + | |
2156 +GLOBAL(void) | |
2157 +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, | |
2158 + JSAMPIMAGE input_buf, JDIMENSION input_row, | |
2159 + JSAMPARRAY output_buf, int num_rows) | |
2160 +{ | |
2161 + void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); | |
2162 + | |
2163 + switch(cinfo->out_color_space) { | |
2164 + case JCS_EXT_RGB: | |
2165 + neonfct=jsimd_ycc_extrgb_convert_neon; | |
2166 + break; | |
2167 + case JCS_EXT_RGBX: | |
2168 + case JCS_EXT_RGBA: | |
2169 + neonfct=jsimd_ycc_extrgbx_convert_neon; | |
2170 + break; | |
2171 + case JCS_EXT_BGR: | |
2172 + neonfct=jsimd_ycc_extbgr_convert_neon; | |
2173 + break; | |
2174 + case JCS_EXT_BGRX: | |
2175 + case JCS_EXT_BGRA: | |
2176 + neonfct=jsimd_ycc_extbgrx_convert_neon; | |
2177 + break; | |
2178 + case JCS_EXT_XBGR: | |
2179 + case JCS_EXT_ABGR: | |
2180 + neonfct=jsimd_ycc_extxbgr_convert_neon; | |
2181 + break; | |
2182 + case JCS_EXT_XRGB: | |
2183 + case JCS_EXT_ARGB: | |
2184 + neonfct=jsimd_ycc_extxrgb_convert_neon; | |
2185 + break; | |
2186 + default: | |
2187 + neonfct=jsimd_ycc_extrgb_convert_neon; | |
2188 + break; | |
2189 + } | |
2190 + | |
2191 + if (simd_support & JSIMD_ARM_NEON) | |
2192 + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); | |
2193 +} | |
2194 + | |
2195 +GLOBAL(void) | |
2196 +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, | |
2197 + JSAMPIMAGE input_buf, JDIMENSION input_row, | |
2198 + JSAMPARRAY output_buf, int num_rows) | |
2199 +{ | |
2200 + if (simd_support & JSIMD_ARM_NEON) | |
2201 + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, | |
2202 + output_buf, num_rows); | |
2203 +} | |
2204 + | |
2205 +GLOBAL(int) | |
2206 +jsimd_can_h2v2_downsample (void) | |
2207 +{ | |
2208 + init_simd(); | |
2209 + | |
2210 + return 0; | |
2211 +} | |
2212 + | |
2213 +GLOBAL(int) | |
2214 +jsimd_can_h2v1_downsample (void) | |
2215 +{ | |
2216 + init_simd(); | |
2217 + | |
2218 + return 0; | |
2219 +} | |
2220 + | |
2221 +GLOBAL(void) | |
2222 +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, | |
2223 + JSAMPARRAY input_data, JSAMPARRAY output_data) | |
2224 +{ | |
2225 +} | |
2226 + | |
2227 +GLOBAL(void) | |
2228 +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, | |
2229 + JSAMPARRAY input_data, JSAMPARRAY output_data) | |
2230 +{ | |
2231 +} | |
2232 + | |
2233 +GLOBAL(int) | |
2234 +jsimd_can_h2v2_upsample (void) | |
2235 +{ | |
2236 + init_simd(); | |
2237 + | |
2238 + return 0; | |
2239 +} | |
2240 + | |
2241 +GLOBAL(int) | |
2242 +jsimd_can_h2v1_upsample (void) | |
2243 +{ | |
2244 + init_simd(); | |
2245 + | |
2246 + return 0; | |
2247 +} | |
2248 + | |
2249 +GLOBAL(void) | |
2250 +jsimd_h2v2_upsample (j_decompress_ptr cinfo, | |
2251 + jpeg_component_info * compptr, | |
2252 + JSAMPARRAY input_data, | |
2253 + JSAMPARRAY * output_data_ptr) | |
2254 +{ | |
2255 +} | |
2256 + | |
2257 +GLOBAL(void) | |
2258 +jsimd_h2v1_upsample (j_decompress_ptr cinfo, | |
2259 + jpeg_component_info * compptr, | |
2260 + JSAMPARRAY input_data, | |
2261 + JSAMPARRAY * output_data_ptr) | |
2262 +{ | |
2263 +} | |
2264 + | |
2265 +GLOBAL(int) | |
2266 +jsimd_can_h2v2_fancy_upsample (void) | |
2267 +{ | |
2268 + init_simd(); | |
2269 + | |
2270 + return 0; | |
2271 +} | |
2272 + | |
2273 +GLOBAL(int) | |
2274 +jsimd_can_h2v1_fancy_upsample (void) | |
2275 +{ | |
2276 + init_simd(); | |
2277 + | |
2278 + return 0; | |
2279 +} | |
2280 + | |
2281 +GLOBAL(void) | |
2282 +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, | |
2283 + jpeg_component_info * compptr, | |
2284 + JSAMPARRAY input_data, | |
2285 + JSAMPARRAY * output_data_ptr) | |
2286 +{ | |
2287 +} | |
2288 + | |
2289 +GLOBAL(void) | |
2290 +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, | |
2291 + jpeg_component_info * compptr, | |
2292 + JSAMPARRAY input_data, | |
2293 + JSAMPARRAY * output_data_ptr) | |
2294 +{ | |
2295 +} | |
2296 + | |
2297 +GLOBAL(int) | |
2298 +jsimd_can_h2v2_merged_upsample (void) | |
2299 +{ | |
2300 + init_simd(); | |
2301 + | |
2302 + return 0; | |
2303 +} | |
2304 + | |
2305 +GLOBAL(int) | |
2306 +jsimd_can_h2v1_merged_upsample (void) | |
2307 +{ | |
2308 + init_simd(); | |
2309 + | |
2310 + return 0; | |
2311 +} | |
2312 + | |
2313 +GLOBAL(void) | |
2314 +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, | |
2315 + JSAMPIMAGE input_buf, | |
2316 + JDIMENSION in_row_group_ctr, | |
2317 + JSAMPARRAY output_buf) | |
2318 +{ | |
2319 +} | |
2320 + | |
2321 +GLOBAL(void) | |
2322 +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, | |
2323 + JSAMPIMAGE input_buf, | |
2324 + JDIMENSION in_row_group_ctr, | |
2325 + JSAMPARRAY output_buf) | |
2326 +{ | |
2327 +} | |
2328 + | |
2329 +GLOBAL(int) | |
2330 +jsimd_can_convsamp (void) | |
2331 +{ | |
2332 + init_simd(); | |
2333 + | |
2334 + return 0; | |
2335 +} | |
2336 + | |
2337 +GLOBAL(int) | |
2338 +jsimd_can_convsamp_float (void) | |
2339 +{ | |
2340 + init_simd(); | |
2341 + | |
2342 + return 0; | |
2343 +} | |
2344 + | |
2345 +GLOBAL(void) | |
2346 +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, | |
2347 + DCTELEM * workspace) | |
2348 +{ | |
2349 +} | |
2350 + | |
2351 +GLOBAL(void) | |
2352 +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, | |
2353 + FAST_FLOAT * workspace) | |
2354 +{ | |
2355 +} | |
2356 + | |
2357 +GLOBAL(int) | |
2358 +jsimd_can_fdct_islow (void) | |
2359 +{ | |
2360 + init_simd(); | |
2361 + | |
2362 + return 0; | |
2363 +} | |
2364 + | |
2365 +GLOBAL(int) | |
2366 +jsimd_can_fdct_ifast (void) | |
2367 +{ | |
2368 + init_simd(); | |
2369 + | |
2370 + return 0; | |
2371 +} | |
2372 + | |
2373 +GLOBAL(int) | |
2374 +jsimd_can_fdct_float (void) | |
2375 +{ | |
2376 + init_simd(); | |
2377 + | |
2378 + return 0; | |
2379 +} | |
2380 + | |
2381 +GLOBAL(void) | |
2382 +jsimd_fdct_islow (DCTELEM * data) | |
2383 +{ | |
2384 +} | |
2385 + | |
2386 +GLOBAL(void) | |
2387 +jsimd_fdct_ifast (DCTELEM * data) | |
2388 +{ | |
2389 +} | |
2390 + | |
2391 +GLOBAL(void) | |
2392 +jsimd_fdct_float (FAST_FLOAT * data) | |
2393 +{ | |
2394 +} | |
2395 + | |
2396 +GLOBAL(int) | |
2397 +jsimd_can_quantize (void) | |
2398 +{ | |
2399 + init_simd(); | |
2400 + | |
2401 + return 0; | |
2402 +} | |
2403 + | |
2404 +GLOBAL(int) | |
2405 +jsimd_can_quantize_float (void) | |
2406 +{ | |
2407 + init_simd(); | |
2408 + | |
2409 + return 0; | |
2410 +} | |
2411 + | |
2412 +GLOBAL(void) | |
2413 +jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, | |
2414 + DCTELEM * workspace) | |
2415 +{ | |
2416 +} | |
2417 + | |
2418 +GLOBAL(void) | |
2419 +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, | |
2420 + FAST_FLOAT * workspace) | |
2421 +{ | |
2422 +} | |
2423 + | |
2424 +GLOBAL(int) | |
2425 +jsimd_can_idct_2x2 (void) | |
2426 +{ | |
2427 + init_simd(); | |
2428 + | |
2429 + /* The code is optimised for these values only */ | |
2430 + if (DCTSIZE != 8) | |
2431 + return 0; | |
2432 + if (sizeof(JCOEF) != 2) | |
2433 + return 0; | |
2434 + if (BITS_IN_JSAMPLE != 8) | |
2435 + return 0; | |
2436 + if (sizeof(JDIMENSION) != 4) | |
2437 + return 0; | |
2438 + if (sizeof(ISLOW_MULT_TYPE) != 2) | |
2439 + return 0; | |
2440 + | |
2441 + if (simd_support & JSIMD_ARM_NEON) | |
2442 + return 1; | |
2443 + | |
2444 + return 0; | |
2445 +} | |
2446 + | |
2447 +GLOBAL(int) | |
2448 +jsimd_can_idct_4x4 (void) | |
2449 +{ | |
2450 + init_simd(); | |
2451 + | |
2452 + /* The code is optimised for these values only */ | |
2453 + if (DCTSIZE != 8) | |
2454 + return 0; | |
2455 + if (sizeof(JCOEF) != 2) | |
2456 + return 0; | |
2457 + if (BITS_IN_JSAMPLE != 8) | |
2458 + return 0; | |
2459 + if (sizeof(JDIMENSION) != 4) | |
2460 + return 0; | |
2461 + if (sizeof(ISLOW_MULT_TYPE) != 2) | |
2462 + return 0; | |
2463 + | |
2464 + if (simd_support & JSIMD_ARM_NEON) | |
2465 + return 1; | |
2466 + | |
2467 + return 0; | |
2468 +} | |
2469 + | |
2470 +GLOBAL(void) | |
2471 +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, | |
2472 + JCOEFPTR coef_block, JSAMPARRAY output_buf, | |
2473 + JDIMENSION output_col) | |
2474 +{ | |
2475 + if (simd_support & JSIMD_ARM_NEON) | |
2476 + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, | |
2477 + output_col); | |
2478 +} | |
2479 + | |
2480 +GLOBAL(void) | |
2481 +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, | |
2482 + JCOEFPTR coef_block, JSAMPARRAY output_buf, | |
2483 + JDIMENSION output_col) | |
2484 +{ | |
2485 + if (simd_support & JSIMD_ARM_NEON) | |
2486 + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, | |
2487 + output_col); | |
2488 +} | |
2489 + | |
2490 +GLOBAL(int) | |
2491 +jsimd_can_idct_islow (void) | |
2492 +{ | |
2493 + init_simd(); | |
2494 + | |
2495 + /* The code is optimised for these values only */ | |
2496 + if (DCTSIZE != 8) | |
2497 + return 0; | |
2498 + if (sizeof(JCOEF) != 2) | |
2499 + return 0; | |
2500 + if (BITS_IN_JSAMPLE != 8) | |
2501 + return 0; | |
2502 + if (sizeof(JDIMENSION) != 4) | |
2503 + return 0; | |
2504 + if (sizeof(ISLOW_MULT_TYPE) != 2) | |
2505 + return 0; | |
2506 + | |
2507 + if (simd_support & JSIMD_ARM_NEON) | |
2508 + return 1; | |
2509 + | |
2510 + return 0; | |
2511 +} | |
2512 + | |
2513 +GLOBAL(int) | |
2514 +jsimd_can_idct_ifast (void) | |
2515 +{ | |
2516 + init_simd(); | |
2517 + | |
2518 + /* The code is optimised for these values only */ | |
2519 + if (DCTSIZE != 8) | |
2520 + return 0; | |
2521 + if (sizeof(JCOEF) != 2) | |
2522 + return 0; | |
2523 + if (BITS_IN_JSAMPLE != 8) | |
2524 + return 0; | |
2525 + if (sizeof(JDIMENSION) != 4) | |
2526 + return 0; | |
2527 + if (sizeof(IFAST_MULT_TYPE) != 2) | |
2528 + return 0; | |
2529 + if (IFAST_SCALE_BITS != 2) | |
2530 + return 0; | |
2531 + | |
2532 + if (simd_support & JSIMD_ARM_NEON) | |
2533 + return 1; | |
2534 + | |
2535 + return 0; | |
2536 +} | |
2537 + | |
2538 +GLOBAL(int) | |
2539 +jsimd_can_idct_float (void) | |
2540 +{ | |
2541 + init_simd(); | |
2542 + | |
2543 + return 0; | |
2544 +} | |
2545 + | |
2546 +GLOBAL(void) | |
2547 +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, | |
2548 + JCOEFPTR coef_block, JSAMPARRAY output_buf, | |
2549 + JDIMENSION output_col) | |
2550 +{ | |
2551 + if (simd_support & JSIMD_ARM_NEON) | |
2552 + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, | |
2553 + output_col); | |
2554 +} | |
2555 + | |
2556 +GLOBAL(void) | |
2557 +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, | |
2558 + JCOEFPTR coef_block, JSAMPARRAY output_buf, | |
2559 + JDIMENSION output_col) | |
2560 +{ | |
2561 + if (simd_support & JSIMD_ARM_NEON) | |
2562 + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, | |
2563 + output_col); | |
2564 +} | |
2565 + | |
2566 +GLOBAL(void) | |
2567 +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, | |
2568 + JCOEFPTR coef_block, JSAMPARRAY output_buf, | |
2569 + JDIMENSION output_col) | |
2570 +{ | |
2571 +} | |
2572 Index: simd/jsimd_arm64_neon.S | |
2573 new file mode 100644 | |
2574 =================================================================== | |
2575 --- /dev/null | |
2576 +++ simd/jsimd_arm64_neon.S | |
2577 @@ -0,0 +1,1861 @@ | |
2578 +/* | |
2579 + * ARMv8 NEON optimizations for libjpeg-turbo | |
2580 + * | |
2581 + * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | |
2582 + * All rights reserved. | |
2583 + * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | |
2584 + * Copyright (C) 2013-2014, Linaro Limited | |
2585 + * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> | |
2586 + * | |
2587 + * This software is provided 'as-is', without any express or implied | |
2588 + * warranty. In no event will the authors be held liable for any damages | |
2589 + * arising from the use of this software. | |
2590 + * | |
2591 + * Permission is granted to anyone to use this software for any purpose, | |
2592 + * including commercial applications, and to alter it and redistribute it | |
2593 + * freely, subject to the following restrictions: | |
2594 + * | |
2595 + * 1. The origin of this software must not be misrepresented; you must not | |
2596 + * claim that you wrote the original software. If you use this software | |
2597 + * in a product, an acknowledgment in the product documentation would be | |
2598 + * appreciated but is not required. | |
2599 + * 2. Altered source versions must be plainly marked as such, and must not be | |
2600 + * misrepresented as being the original software. | |
2601 + * 3. This notice may not be removed or altered from any source distribution. | |
2602 + */ | |
2603 + | |
2604 +#if defined(__linux__) && defined(__ELF__) | |
2605 +.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ | |
2606 +#endif | |
2607 + | |
2608 +.text | |
2609 +.arch armv8-a+fp+simd | |
2610 + | |
2611 + | |
2612 +#define RESPECT_STRICT_ALIGNMENT 1 | |
2613 + | |
2614 + | |
2615 +/*****************************************************************************/ | |
2616 + | |
2617 +/* Supplementary macro for setting function attributes */ | |
2618 +.macro asm_function fname | |
2619 +#ifdef __APPLE__ | |
2620 + .globl _\fname | |
2621 +_\fname: | |
2622 +#else | |
2623 + .global \fname | |
2624 +#ifdef __ELF__ | |
2625 + .hidden \fname | |
2626 + .type \fname, %function | |
2627 +#endif | |
2628 +\fname: | |
2629 +#endif | |
2630 +.endm | |
2631 + | |
2632 +/* Transpose elements of single 128 bit registers */ | |
2633 +.macro transpose_single x0,x1,xi,xilen,literal | |
2634 + ins \xi\xilen[0], \x0\xilen[0] | |
2635 + ins \x1\xilen[0], \x0\xilen[1] | |
2636 + trn1 \x0\literal, \x0\literal, \x1\literal | |
2637 + trn2 \x1\literal, \xi\literal, \x1\literal | |
2638 +.endm | |
2639 + | |
2640 +/* Transpose elements of 2 different registers */ | |
2641 +.macro transpose x0,x1,xi,xilen,literal | |
2642 + mov \xi\xilen, \x0\xilen | |
2643 + trn1 \x0\literal, \x0\literal, \x1\literal | |
2644 + trn2 \x1\literal, \xi\literal, \x1\literal | |
2645 +.endm | |
2646 + | |
2647 +/* Transpose a block of 4x4 coefficients in four 64-bit registers */ | |
2648 +.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen | |
2649 + mov \xi\xilen, \x0\xilen | |
2650 + trn1 \x0\x0len, \x0\x0len, \x2\x2len | |
2651 + trn2 \x2\x2len, \xi\x0len, \x2\x2len | |
2652 + mov \xi\xilen, \x1\xilen | |
2653 + trn1 \x1\x1len, \x1\x1len, \x3\x3len | |
2654 + trn2 \x3\x3len, \xi\x1len, \x3\x3len | |
2655 +.endm | |
2656 + | |
2657 +.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen | |
2658 + mov \xi\xilen, \x0\xilen | |
2659 + trn1 \x0\x0len, \x0\x0len, \x1\x1len | |
2660 + trn2 \x1\x2len, \xi\x0len, \x1\x2len | |
2661 + mov \xi\xilen, \x2\xilen | |
2662 + trn1 \x2\x2len, \x2\x2len, \x3\x3len | |
2663 + trn2 \x3\x2len, \xi\x1len, \x3\x3len | |
2664 +.endm | |
2665 + | |
2666 +.macro transpose_4x4 x0, x1, x2, x3,x5 | |
2667 + transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b | |
2668 + transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b | |
2669 +.endm | |
2670 + | |
2671 + | |
2672 +#define CENTERJSAMPLE 128 | |
2673 + | |
2674 +/*****************************************************************************/ | |
2675 + | |
2676 +/* | |
2677 + * Perform dequantization and inverse DCT on one block of coefficients. | |
2678 + * | |
2679 + * GLOBAL(void) | |
2680 + * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, | |
2681 + * JSAMPARRAY output_buf, JDIMENSION output_col) | |
2682 + */ | |
2683 + | |
2684 +#define FIX_0_298631336 (2446) | |
2685 +#define FIX_0_390180644 (3196) | |
2686 +#define FIX_0_541196100 (4433) | |
2687 +#define FIX_0_765366865 (6270) | |
2688 +#define FIX_0_899976223 (7373) | |
2689 +#define FIX_1_175875602 (9633) | |
2690 +#define FIX_1_501321110 (12299) | |
2691 +#define FIX_1_847759065 (15137) | |
2692 +#define FIX_1_961570560 (16069) | |
2693 +#define FIX_2_053119869 (16819) | |
2694 +#define FIX_2_562915447 (20995) | |
2695 +#define FIX_3_072711026 (25172) | |
2696 + | |
2697 +#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) | |
2698 +#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) | |
2699 +#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) | |
2700 +#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) | |
2701 +#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) | |
2702 +#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) | |
2703 +#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) | |
2704 +#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) | |
2705 + | |
2706 +/* | |
2707 + * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. | |
2708 + * Uses some ideas from the comments in 'simd/jiss2int-64.asm' | |
2709 + */ | |
2710 +#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ | |
2711 +{ \ | |
2712 + DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ | |
2713 + INT32 q1, q2, q3, q4, q5, q6, q7; \ | |
2714 + INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ | |
2715 + \ | |
2716 + /* 1-D iDCT input data */ \ | |
2717 + row0 = xrow0; \ | |
2718 + row1 = xrow1; \ | |
2719 + row2 = xrow2; \ | |
2720 + row3 = xrow3; \ | |
2721 + row4 = xrow4; \ | |
2722 + row5 = xrow5; \ | |
2723 + row6 = xrow6; \ | |
2724 + row7 = xrow7; \ | |
2725 + \ | |
2726 + q5 = row7 + row3; \ | |
2727 + q4 = row5 + row1; \ | |
2728 + q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ | |
2729 + MULTIPLY(q4, FIX_1_175875602); \ | |
2730 + q7 = MULTIPLY(q5, FIX_1_175875602) + \ | |
2731 + MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ | |
2732 + q2 = MULTIPLY(row2, FIX_0_541196100) + \ | |
2733 + MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ | |
2734 + q4 = q6; \ | |
2735 + q3 = ((INT32) row0 - (INT32) row4) << 13; \ | |
2736 + q6 += MULTIPLY(row5, -FIX_2_562915447) + \ | |
2737 + MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ | |
2738 + /* now we can use q1 (reloadable constants have been used up) */ \ | |
2739 + q1 = q3 + q2; \ | |
2740 + q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ | |
2741 + MULTIPLY(row1, -FIX_0_899976223); \ | |
2742 + q5 = q7; \ | |
2743 + q1 = q1 + q6; \ | |
2744 + q7 += MULTIPLY(row7, -FIX_0_899976223) + \ | |
2745 + MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ | |
2746 + \ | |
2747 + /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ | |
2748 + tmp11_plus_tmp2 = q1; \ | |
2749 + row1 = 0; \ | |
2750 + \ | |
2751 + q1 = q1 - q6; \ | |
2752 + q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ | |
2753 + MULTIPLY(row3, -FIX_2_562915447); \ | |
2754 + q1 = q1 - q6; \ | |
2755 + q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ | |
2756 + MULTIPLY(row6, FIX_0_541196100); \ | |
2757 + q3 = q3 - q2; \ | |
2758 + \ | |
2759 + /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ | |
2760 + tmp11_minus_tmp2 = q1; \ | |
2761 + \ | |
2762 + q1 = ((INT32) row0 + (INT32) row4) << 13; \ | |
2763 + q2 = q1 + q6; \ | |
2764 + q1 = q1 - q6; \ | |
2765 + \ | |
2766 + /* pick up the results */ \ | |
2767 + tmp0 = q4; \ | |
2768 + tmp1 = q5; \ | |
2769 + tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ | |
2770 + tmp3 = q7; \ | |
2771 + tmp10 = q2; \ | |
2772 + tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ | |
2773 + tmp12 = q3; \ | |
2774 + tmp13 = q1; \ | |
2775 +} | |
2776 + | |
2777 +#define XFIX_0_899976223 v0.4h[0] | |
2778 +#define XFIX_0_541196100 v0.4h[1] | |
2779 +#define XFIX_2_562915447 v0.4h[2] | |
2780 +#define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3] | |
2781 +#define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0] | |
2782 +#define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1] | |
2783 +#define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2] | |
2784 +#define XFIX_1_175875602 v1.4h[3] | |
2785 +#define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0] | |
2786 +#define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1] | |
2787 +#define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2] | |
2788 +#define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3] | |
2789 + /* 16-bit constant table; entry order matches the XFIX_* lane aliases above (4 entries per vector register) */ | |
2790 +.balign 16 | |
2791 +jsimd_idct_islow_neon_consts: | |
2792 + .short FIX_0_899976223 /* v0.4h[0] */ | |
2793 + .short FIX_0_541196100 /* v0.4h[1] */ | |
2794 + .short FIX_2_562915447 /* v0.4h[2] */ | |
2795 + .short FIX_0_298631336_MINUS_0_899976223 /* v0.4h[3] */ | |
2796 + .short FIX_1_501321110_MINUS_0_899976223 /* v1.4h[0] */ | |
2797 + .short FIX_2_053119869_MINUS_2_562915447 /* v1.4h[1] */ | |
2798 + .short FIX_0_541196100_PLUS_0_765366865 /* v1.4h[2] */ | |
2799 + .short FIX_1_175875602 /* v1.4h[3] */ | |
2800 + /* reloadable constants: the four entries below start 16 bytes into the table and map to the v2 lanes */ | |
2801 + .short FIX_1_175875602_MINUS_0_390180644 /* v2.4h[0] */ | |
2802 + .short FIX_0_541196100_MINUS_1_847759065 /* v2.4h[1] */ | |
2803 + .short FIX_3_072711026_MINUS_2_562915447 /* v2.4h[2] */ | |
2804 + .short FIX_1_175875602_MINUS_1_961570560 /* v2.4h[3] */ | |
2805 + | |
2806 +asm_function jsimd_idct_islow_neon | |
2807 + /* Dequantizes one 8x8 coefficient block, runs two 1-D IDCT passes over it and stores 8 output rows of 8 bytes each. */ | |
2808 + DCT_TABLE .req x0 /* dequantization multipliers, one 16-bit entry per coefficient */ | |
2809 + COEF_BLOCK .req x1 /* 64 16-bit coefficients, read 32 bytes at a time */ | |
2810 + OUTPUT_BUF .req x2 /* array of 8 output row pointers */ | |
2811 + OUTPUT_COL .req x3 /* byte offset added to every row pointer */ | |
2812 + TMP1 .req x0 /* x0-x2 are recycled as row pointers in the store phase */ | |
2813 + TMP2 .req x1 | |
2814 + TMP3 .req x2 | |
2815 + TMP4 .req x15 | |
2816 + | |
2817 + ROW0L .req v16 | |
2818 + ROW0R .req v17 | |
2819 + ROW1L .req v18 | |
2820 + ROW1R .req v19 | |
2821 + ROW2L .req v20 | |
2822 + ROW2R .req v21 | |
2823 + ROW3L .req v22 | |
2824 + ROW3R .req v23 | |
2825 + ROW4L .req v24 | |
2826 + ROW4R .req v25 | |
2827 + ROW5L .req v26 | |
2828 + ROW5R .req v27 | |
2829 + ROW6L .req v28 | |
2830 + ROW6R .req v29 | |
2831 + ROW7L .req v30 | |
2832 + ROW7R .req v31 | |
2833 + /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16). NOTE(review): the post-indexed stores walk sp back up to its entry value, so the save area ends up below sp; confirm this is safe against asynchronous stack use (e.g. signal delivery) on the target ABI. */ | |
2834 + sub sp, sp, 272 | |
2835 + str x15, [sp], 16 | |
2836 + adr x15, jsimd_idct_islow_neon_consts /* x15 -> constant table */ | |
2837 + st1 {v0.8b - v3.8b}, [sp], 32 | |
2838 + st1 {v4.8b - v7.8b}, [sp], 32 | |
2839 + st1 {v8.8b - v11.8b}, [sp], 32 | |
2840 + st1 {v12.8b - v15.8b}, [sp], 32 | |
2841 + st1 {v16.8b - v19.8b}, [sp], 32 | |
2842 + st1 {v20.8b - v23.8b}, [sp], 32 | |
2843 + st1 {v24.8b - v27.8b}, [sp], 32 | |
2844 + st1 {v28.8b - v31.8b}, [sp], 32 | |
2845 + ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 /* dequantize: each coefficient half-row is multiplied by its DCT_TABLE entries */ | |
2846 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 | |
2847 + ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 | |
2848 + mul v16.4h, v16.4h, v0.4h | |
2849 + mul v17.4h, v17.4h, v1.4h | |
2850 + ins v16.2d[1], v17.2d[0] /* 128 bit q8 */ | |
2851 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 | |
2852 + mul v18.4h, v18.4h, v2.4h | |
2853 + mul v19.4h, v19.4h, v3.4h | |
2854 + ins v18.2d[1], v19.2d[0] /* 128 bit q9 */ | |
2855 + ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 | |
2856 + mul v20.4h, v20.4h, v4.4h | |
2857 + mul v21.4h, v21.4h, v5.4h | |
2858 + ins v20.2d[1], v21.2d[0] /* 128 bit q10 */ | |
2859 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 | |
2860 + mul v22.4h, v22.4h, v6.4h | |
2861 + mul v23.4h, v23.4h, v7.4h | |
2862 + ins v22.2d[1], v23.2d[0] /* 128 bit q11 */ | |
2863 + ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] /* no post-increment: COEF_BLOCK stays at base + 96 for the zero checks below */ | |
2864 + mul v24.4h, v24.4h, v0.4h | |
2865 + mul v25.4h, v25.4h, v1.4h | |
2866 + ins v24.2d[1], v25.2d[0] /* 128 bit q12 */ | |
2867 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 | |
2868 + mul v28.4h, v28.4h, v4.4h | |
2869 + mul v29.4h, v29.4h, v5.4h | |
2870 + ins v28.2d[1], v29.2d[0] /* 128 bit q14 */ | |
2871 + mul v26.4h, v26.4h, v2.4h | |
2872 + mul v27.4h, v27.4h, v3.4h | |
2873 + ins v26.2d[1], v27.2d[0] /* 128 bit q13 */ | |
2874 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants. NOTE(review): this reads 32 bytes although the table holds 24; v3 receives whatever follows the table and is only used as scratch afterwards */ | |
2875 + add x15, x15, #16 /* x15 -> reloadable constants (the v2 lanes) */ | |
2876 + mul v30.4h, v30.4h, v6.4h | |
2877 + mul v31.4h, v31.4h, v7.4h | |
2878 + ins v30.2d[1], v31.2d[0] /* 128 bit q15 */ | |
2879 + /* Go to the bottom of the stack: 80 bytes below the 272-byte save area, used for x4/x5 and v8-v15 */ | |
2880 + sub sp, sp, 352 | |
2881 + stp x4, x5, [sp], 16 | |
2882 + st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ | |
2883 + st1 {v12.4h - v15.4h}, [sp], 32 | |
2884 + /* 1-D IDCT, pass 1, left 4x8 half */ | |
2885 + add v4.4h, ROW7L.4h, ROW3L.4h | |
2886 + add v5.4h, ROW5L.4h, ROW1L.4h | |
2887 + smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 | |
2888 + smlal v12.4s, v5.4h, XFIX_1_175875602 | |
2889 + smull v14.4s, v4.4h, XFIX_1_175875602 | |
2890 + /* Check for the zero coefficients in the right 4x8 half: columns 4-7 of rows 1-7 are OR-ed into x0, interleaved with the vector code */ | |
2891 + smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644 | |
2892 + ssubl v6.4s, ROW0L.4h, ROW4L.4h | |
2893 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] | |
2894 + smull v4.4s, ROW2L.4h, XFIX_0_541196100 | |
2895 + smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065 | |
2896 + orr x0, x4, x5 | |
2897 + mov v8.16b, v12.16b | |
2898 + smlsl v12.4s, ROW5L.4h, XFIX_2_562915447 | |
2899 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] | |
2900 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 | |
2901 + shl v6.4s, v6.4s, #13 | |
2902 + orr x0, x0, x4 | |
2903 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 | |
2904 + orr x0, x0 , x5 | |
2905 + add v2.4s, v6.4s, v4.4s | |
2906 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] | |
2907 + mov v10.16b, v14.16b | |
2908 + add v2.4s, v2.4s, v12.4s | |
2909 + orr x0, x0, x4 | |
2910 + smlsl v14.4s, ROW7L.4h, XFIX_0_899976223 | |
2911 + orr x0, x0, x5 | |
2912 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 | |
2913 + rshrn ROW1L.4h, v2.4s, #11 | |
2914 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] | |
2915 + sub v2.4s, v2.4s, v12.4s | |
2916 + smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447 | |
2917 + orr x0, x0, x4 | |
2918 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 | |
2919 + orr x0, x0, x5 | |
2920 + sub v2.4s, v2.4s, v12.4s | |
2921 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 | |
2922 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] | |
2923 + smlal v12.4s, ROW6L.4h, XFIX_0_541196100 | |
2924 + sub v6.4s, v6.4s, v4.4s | |
2925 + orr x0, x0, x4 | |
2926 + rshrn ROW6L.4h, v2.4s, #11 | |
2927 + orr x0, x0, x5 | |
2928 + add v2.4s, v6.4s, v10.4s | |
2929 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] | |
2930 + sub v6.4s, v6.4s, v10.4s | |
2931 + saddl v10.4s, ROW0L.4h, ROW4L.4h | |
2932 + orr x0, x0, x4 | |
2933 + rshrn ROW2L.4h, v2.4s, #11 | |
2934 + orr x0, x0, x5 | |
2935 + rshrn ROW5L.4h, v6.4s, #11 | |
2936 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] | |
2937 + shl v10.4s, v10.4s, #13 | |
2938 + smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223 | |
2939 + orr x0, x0, x4 | |
2940 + add v4.4s, v10.4s, v12.4s | |
2941 + orr x0, x0, x5 | |
2942 + cmp x0, #0 /* orrs instruction removed; sets the flags consumed by 'beq 3f' below (nothing in between modifies them) */ | |
2943 + sub v2.4s, v10.4s, v12.4s | |
2944 + add v12.4s, v4.4s, v14.4s | |
2945 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] | |
2946 + sub v4.4s, v4.4s, v14.4s | |
2947 + add v10.4s, v2.4s, v8.4s | |
2948 + orr x0, x4, x5 /* x0 = columns 4-7 of row 0, tested after label 3 */ | |
2949 + sub v6.4s, v2.4s, v8.4s | |
2950 + /* pop {x4, x5}: step sp back to the slot written by the stp above */ | |
2951 + sub sp, sp, 80 | |
2952 + ldp x4, x5, [sp], 16 | |
2953 + rshrn ROW7L.4h, v4.4s, #11 | |
2954 + rshrn ROW3L.4h, v10.4s, #11 | |
2955 + rshrn ROW0L.4h, v12.4s, #11 | |
2956 + rshrn ROW4L.4h, v6.4s, #11 | |
2957 + | |
2958 + beq 3f /* Go to do some special handling for the sparse right 4x8 half */ | |
2959 + | |
2960 + /* 1-D IDCT, pass 1, right 4x8 half */ | |
2961 + ld1 {v2.4h}, [x15] /* reload constants */ | |
2962 + add v10.4h, ROW7R.4h, ROW3R.4h | |
2963 + add v8.4h, ROW5R.4h, ROW1R.4h | |
2964 + /* Transpose ROW6L <-> ROW7L (v3 available free register) */ | |
2965 + transpose ROW6L, ROW7L, v3, .16b, .4h | |
2966 + smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560 | |
2967 + smlal v12.4s, v8.4h, XFIX_1_175875602 | |
2968 + /* Transpose ROW2L <-> ROW3L (v3 available free register) */ | |
2969 + transpose ROW2L, ROW3L, v3, .16b, .4h | |
2970 + smull v14.4s, v10.4h, XFIX_1_175875602 | |
2971 + smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644 | |
2972 + /* Transpose ROW0L <-> ROW1L (v3 available free register) */ | |
2973 + transpose ROW0L, ROW1L, v3, .16b, .4h | |
2974 + ssubl v6.4s, ROW0R.4h, ROW4R.4h | |
2975 + smull v4.4s, ROW2R.4h, XFIX_0_541196100 | |
2976 + smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 | |
2977 + /* Transpose ROW4L <-> ROW5L (v3 available free register) */ | |
2978 + transpose ROW4L, ROW5L, v3, .16b, .4h | |
2979 + mov v8.16b, v12.16b | |
2980 + smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 | |
2981 + smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447 | |
2982 + /* Transpose ROW1L <-> ROW3L (v3 available free register) */ | |
2983 + transpose ROW1L, ROW3L, v3, .16b, .2s | |
2984 + shl v6.4s, v6.4s, #13 | |
2985 + smlsl v8.4s, ROW1R.4h, XFIX_0_899976223 | |
2986 + /* Transpose ROW4L <-> ROW6L (v3 available free register) */ | |
2987 + transpose ROW4L, ROW6L, v3, .16b, .2s | |
2988 + add v2.4s, v6.4s, v4.4s | |
2989 + mov v10.16b, v14.16b | |
2990 + add v2.4s, v2.4s, v12.4s | |
2991 + /* Transpose ROW0L <-> ROW2L (v3 available free register) */ | |
2992 + transpose ROW0L, ROW2L, v3, .16b, .2s | |
2993 + smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 | |
2994 + smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223 | |
2995 + rshrn ROW1R.4h, v2.4s, #11 | |
2996 + /* Transpose ROW5L <-> ROW7L (v3 available free register) */ | |
2997 + transpose ROW5L, ROW7L, v3, .16b, .2s | |
2998 + sub v2.4s, v2.4s, v12.4s | |
2999 + smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 | |
3000 + smlsl v10.4s, ROW3R.4h, XFIX_2_562915447 | |
3001 + sub v2.4s, v2.4s, v12.4s | |
3002 + smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865 | |
3003 + smlal v12.4s, ROW6R.4h, XFIX_0_541196100 | |
3004 + sub v6.4s, v6.4s, v4.4s | |
3005 + rshrn ROW6R.4h, v2.4s, #11 | |
3006 + add v2.4s, v6.4s, v10.4s | |
3007 + sub v6.4s, v6.4s, v10.4s | |
3008 + saddl v10.4s, ROW0R.4h, ROW4R.4h | |
3009 + rshrn ROW2R.4h, v2.4s, #11 | |
3010 + rshrn ROW5R.4h, v6.4s, #11 | |
3011 + shl v10.4s, v10.4s, #13 | |
3012 + smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 | |
3013 + add v4.4s, v10.4s, v12.4s | |
3014 + sub v2.4s, v10.4s, v12.4s | |
3015 + add v12.4s, v4.4s, v14.4s | |
3016 + sub v4.4s, v4.4s, v14.4s | |
3017 + add v10.4s, v2.4s, v8.4s | |
3018 + sub v6.4s, v2.4s, v8.4s | |
3019 + rshrn ROW7R.4h, v4.4s, #11 | |
3020 + rshrn ROW3R.4h, v10.4s, #11 | |
3021 + rshrn ROW0R.4h, v12.4s, #11 | |
3022 + rshrn ROW4R.4h, v6.4s, #11 | |
3023 + /* Transpose right 4x8 half */ | |
3024 + transpose ROW6R, ROW7R, v3, .16b, .4h | |
3025 + transpose ROW2R, ROW3R, v3, .16b, .4h | |
3026 + transpose ROW0R, ROW1R, v3, .16b, .4h | |
3027 + transpose ROW4R, ROW5R, v3, .16b, .4h | |
3028 + transpose ROW1R, ROW3R, v3, .16b, .2s | |
3029 + transpose ROW4R, ROW6R, v3, .16b, .2s | |
3030 + transpose ROW0R, ROW2R, v3, .16b, .2s | |
3031 + transpose ROW5R, ROW7R, v3, .16b, .2s | |
3032 + | |
3033 +1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ | |
3034 + ld1 {v2.4h}, [x15] /* reload constants */ | |
3035 + smull v12.4S, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ | |
3036 + smlal v12.4s, ROW1L.4h, XFIX_1_175875602 | |
3037 + smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ | |
3038 + smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 | |
3039 + smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ | |
3040 + smlal v14.4s, ROW3L.4h, XFIX_1_175875602 | |
3041 + smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */ | |
3042 + smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 | |
3043 + ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
3044 + smull v4.4s, ROW2L.4h, XFIX_0_541196100 | |
3045 + smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */ | |
3046 + mov v8.16b, v12.16b | |
3047 + smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */ | |
3048 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 | |
3049 + shl v6.4s, v6.4s, #13 | |
3050 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 | |
3051 + add v2.4s, v6.4s, v4.4s | |
3052 + mov v10.16b, v14.16b | |
3053 + add v2.4s, v2.4s, v12.4s | |
3054 + smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */ | |
3055 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 | |
3056 + shrn ROW1L.4h, v2.4s, #16 | |
3057 + sub v2.4s, v2.4s, v12.4s | |
3058 + smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */ | |
3059 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 | |
3060 + sub v2.4s, v2.4s, v12.4s | |
3061 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 | |
3062 + smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */ | |
3063 + sub v6.4s, v6.4s, v4.4s | |
3064 + shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | |
3065 + add v2.4s, v6.4s, v10.4s | |
3066 + sub v6.4s, v6.4s, v10.4s | |
3067 + saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
3068 + shrn ROW2L.4h, v2.4s, #16 | |
3069 + shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | |
3070 + shl v10.4s, v10.4s, #13 | |
3071 + smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */ | |
3072 + add v4.4s, v10.4s, v12.4s | |
3073 + sub v2.4s, v10.4s, v12.4s | |
3074 + add v12.4s, v4.4s, v14.4s | |
3075 + sub v4.4s, v4.4s, v14.4s | |
3076 + add v10.4s, v2.4s, v8.4s | |
3077 + sub v6.4s, v2.4s, v8.4s | |
3078 + shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | |
3079 + shrn ROW3L.4h, v10.4s, #16 | |
3080 + shrn ROW0L.4h, v12.4s, #16 | |
3081 + shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | |
3082 + /* 1-D IDCT, pass 2, right 4x8 half */ | |
3083 + ld1 {v2.4h}, [x15] /* reload constants */ | |
3084 + smull v12.4s, ROW5R.4h, XFIX_1_175875602 | |
3085 + smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ | |
3086 + smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560 | |
3087 + smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ | |
3088 + smull v14.4s, ROW7R.4h, XFIX_1_175875602 | |
3089 + smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ | |
3090 + smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644 | |
3091 + smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */ | |
3092 + ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
3093 + smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */ | |
3094 + smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 | |
3095 + mov v8.16b, v12.16b | |
3096 + smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 | |
3097 + smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */ | |
3098 + shl v6.4s, v6.4s, #13 | |
3099 + smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */ | |
3100 + add v2.4s, v6.4s, v4.4s | |
3101 + mov v10.16b, v14.16b | |
3102 + add v2.4s, v2.4s, v12.4s | |
3103 + smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 | |
3104 + smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */ | |
3105 + shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | |
3106 + sub v2.4s, v2.4s, v12.4s | |
3107 + smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 | |
3108 + smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */ | |
3109 + sub v2.4s, v2.4s, v12.4s | |
3110 + smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */ | |
3111 + smlal v12.4s, ROW6R.4h, XFIX_0_541196100 | |
3112 + sub v6.4s, v6.4s, v4.4s | |
3113 + shrn ROW6R.4h, v2.4s, #16 | |
3114 + add v2.4s, v6.4s, v10.4s | |
3115 + sub v6.4s, v6.4s, v10.4s | |
3116 + saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
3117 + shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | |
3118 + shrn ROW5R.4h, v6.4s, #16 | |
3119 + shl v10.4s, v10.4s, #13 | |
3120 + smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 | |
3121 + add v4.4s, v10.4s, v12.4s | |
3122 + sub v2.4s, v10.4s, v12.4s | |
3123 + add v12.4s, v4.4s, v14.4s | |
3124 + sub v4.4s, v4.4s, v14.4s | |
3125 + add v10.4s, v2.4s, v8.4s | |
3126 + sub v6.4s, v2.4s, v8.4s | |
3127 + shrn ROW7R.4h, v4.4s, #16 | |
3128 + shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | |
3129 + shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | |
3130 + shrn ROW4R.4h, v6.4s, #16 | |
3131 + | |
3132 +2: /* Descale to 8-bit and range limit */ | |
3133 + ins v16.2d[1], v17.2d[0] | |
3134 + ins v18.2d[1], v19.2d[0] | |
3135 + ins v20.2d[1], v21.2d[0] | |
3136 + ins v22.2d[1], v23.2d[0] | |
3137 + sqrshrn v16.8b, v16.8h, #2 | |
3138 + sqrshrn2 v16.16b, v18.8h, #2 | |
3139 + sqrshrn v18.8b, v20.8h, #2 | |
3140 + sqrshrn2 v18.16b, v22.8h, #2 | |
3141 + | |
3142 + /* vpop {v8.4h - v15.4h} */ /* restore NEON registers */ | |
3143 + ld1 {v8.4h - v11.4h}, [sp], 32 | |
3144 + ld1 {v12.4h - v15.4h}, [sp], 32 | |
3145 + ins v24.2d[1], v25.2d[0] | |
3146 + | |
3147 + sqrshrn v20.8b, v24.8h, #2 | |
3148 + /* Transpose the final 8-bit samples and do signed->unsigned conversion */ | |
3149 + /* trn1 v16.8h, v16.8h, v18.8h */ | |
3150 + transpose v16, v18, v3, .16b, .8h | |
3151 + ins v26.2d[1], v27.2d[0] | |
3152 + ins v28.2d[1], v29.2d[0] | |
3153 + ins v30.2d[1], v31.2d[0] | |
3154 + sqrshrn2 v20.16b, v26.8h, #2 | |
3155 + sqrshrn v22.8b, v28.8h, #2 | |
3156 + movi v0.16b, #(CENTERJSAMPLE) | |
3157 + sqrshrn2 v22.16b, v30.8h, #2 | |
3158 + transpose_single v16, v17, v3, .2d, .8b | |
3159 + transpose_single v18, v19, v3, .2d, .8b | |
3160 + add v16.8b, v16.8b, v0.8b | |
3161 + add v17.8b, v17.8b, v0.8b | |
3162 + add v18.8b, v18.8b, v0.8b | |
3163 + add v19.8b, v19.8b, v0.8b | |
3164 + transpose v20, v22, v3, .16b, .8h | |
3165 + /* Store results to the output buffer: each row pointer from OUTPUT_BUF is offset by OUTPUT_COL and receives 8 bytes */ | |
3166 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
3167 + add TMP1, TMP1, OUTPUT_COL | |
3168 + add TMP2, TMP2, OUTPUT_COL | |
3169 + st1 {v16.8b}, [TMP1] | |
3170 + transpose_single v20, v21, v3, .2d, .8b | |
3171 + st1 {v17.8b}, [TMP2] | |
3172 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
3173 + add TMP1, TMP1, OUTPUT_COL | |
3174 + add TMP2, TMP2, OUTPUT_COL | |
3175 + st1 {v18.8b}, [TMP1] | |
3176 + add v20.8b, v20.8b, v0.8b | |
3177 + add v21.8b, v21.8b, v0.8b | |
3178 + st1 {v19.8b}, [TMP2] | |
3179 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
3180 + ldp TMP3, TMP4, [OUTPUT_BUF] /* TMP3 aliases OUTPUT_BUF (x2); this is the last read through it */ | |
3181 + add TMP1, TMP1, OUTPUT_COL | |
3182 + add TMP2, TMP2, OUTPUT_COL | |
3183 + add TMP3, TMP3, OUTPUT_COL | |
3184 + add TMP4, TMP4, OUTPUT_COL | |
3185 + transpose_single v22, v23, v3, .2d, .8b | |
3186 + st1 {v20.8b}, [TMP1] | |
3187 + add v22.8b, v22.8b, v0.8b | |
3188 + add v23.8b, v23.8b, v0.8b | |
3189 + st1 {v21.8b}, [TMP2] | |
3190 + st1 {v22.8b}, [TMP3] | |
3191 + st1 {v23.8b}, [TMP4] | |
3192 + ldr x15, [sp], 16 /* restore x15 and the low 64 bits of v0-v31 saved on entry; sp ends at its entry value */ | |
3193 + ld1 {v0.8b - v3.8b}, [sp], 32 | |
3194 + ld1 {v4.8b - v7.8b}, [sp], 32 | |
3195 + ld1 {v8.8b - v11.8b}, [sp], 32 | |
3196 + ld1 {v12.8b - v15.8b}, [sp], 32 | |
3197 + ld1 {v16.8b - v19.8b}, [sp], 32 | |
3198 + ld1 {v20.8b - v23.8b}, [sp], 32 | |
3199 + ld1 {v24.8b - v27.8b}, [sp], 32 | |
3200 + ld1 {v28.8b - v31.8b}, [sp], 32 | |
3201 + ret /* return to the caller through x30 without writing the link register */ | |
3202 + | |
3203 +3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ | |
3204 + | |
3205 + /* Transpose left 4x8 half */ | |
3206 + transpose ROW6L, ROW7L, v3, .16b, .4h | |
3207 + transpose ROW2L, ROW3L, v3, .16b, .4h | |
3208 + transpose ROW0L, ROW1L, v3, .16b, .4h | |
3209 + transpose ROW4L, ROW5L, v3, .16b, .4h | |
3210 + shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ | |
3211 + transpose ROW1L, ROW3L, v3, .16b, .2s | |
3212 + transpose ROW4L, ROW6L, v3, .16b, .2s | |
3213 + transpose ROW0L, ROW2L, v3, .16b, .2s | |
3214 + transpose ROW5L, ROW7L, v3, .16b, .2s | |
3215 + cmp x0, #0 /* x0 holds columns 4-7 of row 0, loaded during pass 1 */ | |
3216 + beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ | |
3217 + | |
3218 + /* Only row 0 is non-zero for the right 4x8 half */ | |
3219 + dup ROW1R.4h, ROW0R.4h[1] | |
3220 + dup ROW2R.4h, ROW0R.4h[2] | |
3221 + dup ROW3R.4h, ROW0R.4h[3] | |
3222 + dup ROW4R.4h, ROW0R.4h[0] | |
3223 + dup ROW5R.4h, ROW0R.4h[1] | |
3224 + dup ROW6R.4h, ROW0R.4h[2] | |
3225 + dup ROW7R.4h, ROW0R.4h[3] | |
3226 + dup ROW0R.4h, ROW0R.4h[0] | |
3227 + b 1b /* Go to 'normal' second pass */ | |
3228 + | |
3229 +4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ | |
3230 + ld1 {v2.4h}, [x15] /* reload constants */ | |
3231 + smull v12.4s, ROW1L.4h, XFIX_1_175875602 | |
3232 + smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 | |
3233 + smull v14.4s, ROW3L.4h, XFIX_1_175875602 | |
3234 + smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 | |
3235 + smull v4.4s, ROW2L.4h, XFIX_0_541196100 | |
3236 + sshll v6.4s, ROW0L.4h, #13 | |
3237 + mov v8.16b, v12.16b | |
3238 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 | |
3239 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 | |
3240 + add v2.4s, v6.4s, v4.4s | |
3241 + mov v10.16b, v14.16b | |
3242 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 | |
3243 + add v2.4s, v2.4s, v12.4s | |
3244 + add v12.4s, v12.4s, v12.4s | |
3245 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 | |
3246 + shrn ROW1L.4h, v2.4s, #16 | |
3247 + sub v2.4s, v2.4s, v12.4s | |
3248 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 | |
3249 + sub v6.4s, v6.4s, v4.4s | |
3250 + shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | |
3251 + add v2.4s, v6.4s, v10.4s | |
3252 + sub v6.4s, v6.4s, v10.4s | |
3253 + sshll v10.4s, ROW0L.4h, #13 | |
3254 + shrn ROW2L.4h, v2.4s, #16 | |
3255 + shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | |
3256 + add v4.4s, v10.4s, v12.4s | |
3257 + sub v2.4s, v10.4s, v12.4s | |
3258 + add v12.4s, v4.4s, v14.4s | |
3259 + sub v4.4s, v4.4s, v14.4s | |
3260 + add v10.4s, v2.4s, v8.4s | |
3261 + sub v6.4s, v2.4s, v8.4s | |
3262 + shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | |
3263 + shrn ROW3L.4h, v10.4s, #16 | |
3264 + shrn ROW0L.4h, v12.4s, #16 | |
3265 + shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | |
3266 + /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ | |
3267 + ld1 {v2.4h}, [x15] /* reload constants */ | |
3268 + smull v12.4s, ROW5L.4h, XFIX_1_175875602 | |
3269 + smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 | |
3270 + smull v14.4s, ROW7L.4h, XFIX_1_175875602 | |
3271 + smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 | |
3272 + smull v4.4s, ROW6L.4h, XFIX_0_541196100 | |
3273 + sshll v6.4s, ROW4L.4h, #13 | |
3274 + mov v8.16b, v12.16b | |
3275 + smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 | |
3276 + smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 | |
3277 + add v2.4s, v6.4s, v4.4s | |
3278 + mov v10.16b, v14.16b | |
3279 + smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 | |
3280 + add v2.4s, v2.4s, v12.4s | |
3281 + add v12.4s, v12.4s, v12.4s | |
3282 + smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 | |
3283 + shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | |
3284 + sub v2.4s, v2.4s, v12.4s | |
3285 + smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 | |
3286 + sub v6.4s, v6.4s, v4.4s | |
3287 + shrn ROW6R.4h, v2.4s, #16 | |
3288 + add v2.4s, v6.4s, v10.4s | |
3289 + sub v6.4s, v6.4s, v10.4s | |
3290 + sshll v10.4s, ROW4L.4h, #13 | |
3291 + shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | |
3292 + shrn ROW5R.4h, v6.4s, #16 | |
3293 + add v4.4s, v10.4s, v12.4s | |
3294 + sub v2.4s, v10.4s, v12.4s | |
3295 + add v12.4s, v4.4s, v14.4s | |
3296 + sub v4.4s, v4.4s, v14.4s | |
3297 + add v10.4s, v2.4s, v8.4s | |
3298 + sub v6.4s, v2.4s, v8.4s | |
3299 + shrn ROW7R.4h, v4.4s, #16 | |
3300 + shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | |
3301 + shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | |
3302 + shrn ROW4R.4h, v6.4s, #16 | |
3303 + b 2b /* Go to epilogue */ | |
3304 + | |
3305 + .unreq DCT_TABLE | |
3306 + .unreq COEF_BLOCK | |
3307 + .unreq OUTPUT_BUF | |
3308 + .unreq OUTPUT_COL | |
3309 + .unreq TMP1 | |
3310 + .unreq TMP2 | |
3311 + .unreq TMP3 | |
3312 + .unreq TMP4 | |
3313 + | |
3314 + .unreq ROW0L | |
3315 + .unreq ROW0R | |
3316 + .unreq ROW1L | |
3317 + .unreq ROW1R | |
3318 + .unreq ROW2L | |
3319 + .unreq ROW2R | |
3320 + .unreq ROW3L | |
3321 + .unreq ROW3R | |
3322 + .unreq ROW4L | |
3323 + .unreq ROW4R | |
3324 + .unreq ROW5L | |
3325 + .unreq ROW5R | |
3326 + .unreq ROW6L | |
3327 + .unreq ROW6R | |
3328 + .unreq ROW7L | |
3329 + .unreq ROW7R | |
3330 + | |
3331 + | |
3332 +/*****************************************************************************/ | |
3333 + | |
3334 +/* | |
3335 + * jsimd_idct_ifast_neon | |
3336 + * | |
3337 + * This function contains a fast, not so accurate integer implementation of | |
3338 + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations | |
3339 + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' | |
3340 + * function from jidctfst.c | |
3341 + * | |
3342 + * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions. | |
3343 + * But in the ARM NEON case some extra additions are required because the | |
3344 + * SQDMULH instruction (VQDMULH on AArch32) can't handle constants larger than 1. | |
3345 + * So expressions like "x * 1.082392200" have to be converted to | |
3346 + * "x * 0.082392200 + x", which introduces an extra addition. Overall, there are 6 | |
3347 + * extra additions per 1-D IDCT pass, totalling 5 SQDMULH and 35 ADD/SUB instructions. | |
3348 + */ | |
3349 + | |
3350 +#define XFIX_1_082392200 v0.4h[0] | |
3351 +#define XFIX_1_414213562 v0.4h[1] | |
3352 +#define XFIX_1_847759065 v0.4h[2] | |
3353 +#define XFIX_2_613125930 v0.4h[3] | |
3354 + /* Each entry stores only the part of the multiplier above 1.0 (2.0 for the last), as k/256 scaled by 2^15; the code adds x (or 2x) back separately */ | |
3355 +.balign 16 | |
3356 +jsimd_idct_ifast_neon_consts: | |
3357 + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200: 277/256 - 1 */ | |
3358 + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562: 362/256 - 1 */ | |
3359 + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065: 473/256 - 1 */ | |
3360 + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930: 669/256 - 2 */ | |
3361 + | |
3362 +asm_function jsimd_idct_ifast_neon | |
3363 + | |
3364 + DCT_TABLE .req x0 | |
3365 + COEF_BLOCK .req x1 | |
3366 + OUTPUT_BUF .req x2 | |
3367 + OUTPUT_COL .req x3 | |
3368 + TMP1 .req x0 | |
3369 + TMP2 .req x1 | |
3370 + TMP3 .req x2 | |
3371 + TMP4 .req x22 | |
3372 + TMP5 .req x23 | |
3373 + | |
3374 + /* Load and dequantize coefficients into NEON registers | |
3375 + * with the following allocation: | |
3376 + * 0 1 2 3 | 4 5 6 7 | |
3377 + * ---------+-------- | |
3378 + * 0 | d16 | d17 ( v8.8h ) | |
3379 + * 1 | d18 | d19 ( v9.8h ) | |
3380 + * 2 | d20 | d21 ( v10.8h ) | |
3381 + * 3 | d22 | d23 ( v11.8h ) | |
3382 + * 4 | d24 | d25 ( v12.8h ) | |
3383 + * 5 | d26 | d27 ( v13.8h ) | |
3384 + * 6 | d28 | d29 ( v14.8h ) | |
3385 + * 7 | d30 | d31 ( v15.8h ) | |
3386 + */ | |
3387 + /* Save NEON registers used in fast IDCT */ | |
3388 + sub sp, sp, #176 | |
3389 + stp x22, x23, [sp], 16 | |
3390 + adr x23, jsimd_idct_ifast_neon_consts | |
3391 + st1 {v0.8b - v3.8b}, [sp], 32 | |
3392 + st1 {v4.8b - v7.8b}, [sp], 32 | |
3393 + st1 {v8.8b - v11.8b}, [sp], 32 | |
3394 + st1 {v12.8b - v15.8b}, [sp], 32 | |
3395 + st1 {v16.8b - v19.8b}, [sp], 32 | |
3396 + ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32 | |
3397 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 | |
3398 + ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32 | |
3399 + mul v8.8h, v8.8h, v0.8h | |
3400 + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 | |
3401 + mul v9.8h, v9.8h, v1.8h | |
3402 + ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32 | |
3403 + mul v10.8h, v10.8h, v2.8h | |
3404 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 | |
3405 + mul v11.8h, v11.8h, v3.8h | |
3406 + ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32 | |
3407 + mul v12.8h, v12.8h, v0.8h | |
3408 + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 | |
3409 + mul v14.8h, v14.8h, v2.8h | |
3410 + mul v13.8h, v13.8h, v1.8h | |
3411 + ld1 {v0.4h}, [x23] /* load constants */ | |
3412 + mul v15.8h, v15.8h, v3.8h | |
3413 + | |
3414 + /* 1-D IDCT, pass 1 */ | |
3415 + sub v2.8h, v10.8h, v14.8h | |
3416 + add v14.8h, v10.8h, v14.8h | |
3417 + sub v1.8h, v11.8h, v13.8h | |
3418 + add v13.8h, v11.8h, v13.8h | |
3419 + sub v5.8h, v9.8h, v15.8h | |
3420 + add v15.8h, v9.8h, v15.8h | |
3421 + sqdmulh v4.8h, v2.8h, XFIX_1_414213562 | |
3422 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930 | |
3423 + add v3.8h, v1.8h, v1.8h | |
3424 + sub v1.8h, v5.8h, v1.8h | |
3425 + add v10.8h, v2.8h, v4.8h | |
3426 + sqdmulh v4.8h, v1.8h, XFIX_1_847759065 | |
3427 + sub v2.8h, v15.8h, v13.8h | |
3428 + add v3.8h, v3.8h, v6.8h | |
3429 + sqdmulh v6.8h, v2.8h, XFIX_1_414213562 | |
3430 + add v1.8h, v1.8h, v4.8h | |
3431 + sqdmulh v4.8h, v5.8h, XFIX_1_082392200 | |
3432 + sub v10.8h, v10.8h, v14.8h | |
3433 + add v2.8h, v2.8h, v6.8h | |
3434 + sub v6.8h, v8.8h, v12.8h | |
3435 + add v12.8h, v8.8h, v12.8h | |
3436 + add v9.8h, v5.8h, v4.8h | |
3437 + add v5.8h, v6.8h, v10.8h | |
3438 + sub v10.8h, v6.8h, v10.8h | |
3439 + add v6.8h, v15.8h, v13.8h | |
3440 + add v8.8h, v12.8h, v14.8h | |
3441 + sub v3.8h, v6.8h, v3.8h | |
3442 + sub v12.8h, v12.8h, v14.8h | |
3443 + sub v3.8h, v3.8h, v1.8h | |
3444 + sub v1.8h, v9.8h, v1.8h | |
3445 + add v2.8h, v3.8h, v2.8h | |
3446 + sub v15.8h, v8.8h, v6.8h | |
3447 + add v1.8h, v1.8h, v2.8h | |
3448 + add v8.8h, v8.8h, v6.8h | |
3449 + add v14.8h, v5.8h, v3.8h | |
3450 + sub v9.8h, v5.8h, v3.8h | |
3451 + sub v13.8h, v10.8h, v2.8h | |
3452 + add v10.8h, v10.8h, v2.8h | |
3453 + /* Transpose q8-q9 */ | |
3454 + mov v18.16b, v8.16b | |
3455 + trn1 v8.8h, v8.8h, v9.8h | |
3456 + trn2 v9.8h, v18.8h, v9.8h | |
3457 + sub v11.8h, v12.8h, v1.8h | |
3458 + /* Transpose q14-q15 */ | |
3459 + mov v18.16b, v14.16b | |
3460 + trn1 v14.8h, v14.8h, v15.8h | |
3461 + trn2 v15.8h, v18.8h, v15.8h | |
3462 + add v12.8h, v12.8h, v1.8h | |
3463 + /* Transpose q10-q11 */ | |
3464 + mov v18.16b, v10.16b | |
3465 + trn1 v10.8h, v10.8h, v11.8h | |
3466 + trn2 v11.8h, v18.8h, v11.8h | |
3467 + /* Transpose q12-q13 */ | |
3468 + mov v18.16b, v12.16b | |
3469 + trn1 v12.8h, v12.8h, v13.8h | |
3470 + trn2 v13.8h, v18.8h, v13.8h | |
3471 + /* Transpose q9-q11 */ | |
3472 + mov v18.16b, v9.16b | |
3473 + trn1 v9.4s, v9.4s, v11.4s | |
3474 + trn2 v11.4s, v18.4s, v11.4s | |
3475 + /* Transpose q12-q14 */ | |
3476 + mov v18.16b, v12.16b | |
3477 + trn1 v12.4s, v12.4s, v14.4s | |
3478 + trn2 v14.4s, v18.4s, v14.4s | |
3479 + /* Transpose q8-q10 */ | |
3480 + mov v18.16b, v8.16b | |
3481 + trn1 v8.4s, v8.4s, v10.4s | |
3482 + trn2 v10.4s, v18.4s, v10.4s | |
3483 + /* Transpose q13-q15 */ | |
3484 + mov v18.16b, v13.16b | |
3485 + trn1 v13.4s, v13.4s, v15.4s | |
3486 + trn2 v15.4s, v18.4s, v15.4s | |
3487 + /* vswp v14.4h, v10-MSB.4h */ | |
3488 + umov x22, v14.d[0] | |
3489 + ins v14.2d[0], v10.2d[1] | |
3490 + ins v10.2d[1], x22 | |
3491 + /* vswp v13.4h, v9MSB.4h */ | |
3492 + | |
3493 + umov x22, v13.d[0] | |
3494 + ins v13.2d[0], v9.2d[1] | |
3495 + ins v9.2d[1], x22 | |
3496 + /* 1-D IDCT, pass 2 */ | |
3497 + sub v2.8h, v10.8h, v14.8h | |
3498 + /* vswp v15.4h, v11MSB.4h */ | |
3499 + umov x22, v15.d[0] | |
3500 + ins v15.2d[0], v11.2d[1] | |
3501 + ins v11.2d[1], x22 | |
3502 + add v14.8h, v10.8h, v14.8h | |
3503 + /* vswp v12.4h, v8-MSB.4h */ | |
3504 + umov x22, v12.d[0] | |
3505 + ins v12.2d[0], v8.2d[1] | |
3506 + ins v8.2d[1], x22 | |
3507 + sub v1.8h, v11.8h, v13.8h | |
3508 + add v13.8h, v11.8h, v13.8h | |
3509 + sub v5.8h, v9.8h, v15.8h | |
3510 + add v15.8h, v9.8h, v15.8h | |
3511 + sqdmulh v4.8h, v2.8h, XFIX_1_414213562 | |
3512 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930 | |
3513 + add v3.8h, v1.8h, v1.8h | |
3514 + sub v1.8h, v5.8h, v1.8h | |
3515 + add v10.8h, v2.8h, v4.8h | |
3516 + sqdmulh v4.8h, v1.8h, XFIX_1_847759065 | |
3517 + sub v2.8h, v15.8h, v13.8h | |
3518 + add v3.8h, v3.8h, v6.8h | |
3519 + sqdmulh v6.8h, v2.8h, XFIX_1_414213562 | |
3520 + add v1.8h, v1.8h, v4.8h | |
3521 + sqdmulh v4.8h, v5.8h, XFIX_1_082392200 | |
3522 + sub v10.8h, v10.8h, v14.8h | |
3523 + add v2.8h, v2.8h, v6.8h | |
3524 + sub v6.8h, v8.8h, v12.8h | |
3525 + add v12.8h, v8.8h, v12.8h | |
3526 + add v9.8h, v5.8h, v4.8h | |
3527 + add v5.8h, v6.8h, v10.8h | |
3528 + sub v10.8h, v6.8h, v10.8h | |
3529 + add v6.8h, v15.8h, v13.8h | |
3530 + add v8.8h, v12.8h, v14.8h | |
3531 + sub v3.8h, v6.8h, v3.8h | |
3532 + sub v12.8h, v12.8h, v14.8h | |
3533 + sub v3.8h, v3.8h, v1.8h | |
3534 + sub v1.8h, v9.8h, v1.8h | |
3535 + add v2.8h, v3.8h, v2.8h | |
3536 + sub v15.8h, v8.8h, v6.8h | |
3537 + add v1.8h, v1.8h, v2.8h | |
3538 + add v8.8h, v8.8h, v6.8h | |
3539 + add v14.8h, v5.8h, v3.8h | |
3540 + sub v9.8h, v5.8h, v3.8h | |
3541 + sub v13.8h, v10.8h, v2.8h | |
3542 + add v10.8h, v10.8h, v2.8h | |
3543 + sub v11.8h, v12.8h, v1.8h | |
3544 + add v12.8h, v12.8h, v1.8h | |
3545 + /* Descale to 8-bit and range limit */ | |
3546 + movi v0.16b, #0x80 | |
3547 + sqshrn v8.8b, v8.8h, #5 | |
3548 + sqshrn2 v8.16b, v9.8h, #5 | |
3549 + sqshrn v9.8b, v10.8h, #5 | |
3550 + sqshrn2 v9.16b, v11.8h, #5 | |
3551 + sqshrn v10.8b, v12.8h, #5 | |
3552 + sqshrn2 v10.16b, v13.8h, #5 | |
3553 + sqshrn v11.8b, v14.8h, #5 | |
3554 + sqshrn2 v11.16b, v15.8h, #5 | |
3555 + add v8.16b, v8.16b, v0.16b | |
3556 + add v9.16b, v9.16b, v0.16b | |
3557 + add v10.16b, v10.16b, v0.16b | |
3558 + add v11.16b, v11.16b, v0.16b | |
3559 + /* Transpose the final 8-bit samples */ | |
3560 + /* Transpose q8-q9 */ | |
3561 + mov v18.16b, v8.16b | |
3562 + trn1 v8.8h, v8.8h, v9.8h | |
3563 + trn2 v9.8h, v18.8h, v9.8h | |
3564 + /* Transpose q10-q11 */ | |
3565 + mov v18.16b, v10.16b | |
3566 + trn1 v10.8h, v10.8h, v11.8h | |
3567 + trn2 v11.8h, v18.8h, v11.8h | |
3568 + /* Transpose q8-q10 */ | |
3569 + mov v18.16b, v8.16b | |
3570 + trn1 v8.4s, v8.4s, v10.4s | |
3571 + trn2 v10.4s, v18.4s, v10.4s | |
3572 + /* Transpose q9-q11 */ | |
3573 + mov v18.16b, v9.16b | |
3574 + trn1 v9.4s, v9.4s, v11.4s | |
3575 + trn2 v11.4s, v18.4s, v11.4s | |
3576 + /* make copy */ | |
3577 + ins v17.2d[0], v8.2d[1] | |
3578 + /* Transpose d16-d17-msb */ | |
3579 + mov v18.16b, v8.16b | |
3580 + trn1 v8.8b, v8.8b, v17.8b | |
3581 + trn2 v17.8b, v18.8b, v17.8b | |
3582 + /* make copy */ | |
3583 + ins v19.2d[0], v9.2d[1] | |
3584 + mov v18.16b, v9.16b | |
3585 + trn1 v9.8b, v9.8b, v19.8b | |
3586 + trn2 v19.8b, v18.8b, v19.8b | |
3587 + /* Store results to the output buffer */ | |
3588 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
3589 + add TMP1, TMP1, OUTPUT_COL | |
3590 + add TMP2, TMP2, OUTPUT_COL | |
3591 + st1 {v8.8b}, [TMP1] | |
3592 + st1 {v17.8b}, [TMP2] | |
3593 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
3594 + add TMP1, TMP1, OUTPUT_COL | |
3595 + add TMP2, TMP2, OUTPUT_COL | |
3596 + st1 {v9.8b}, [TMP1] | |
3597 + /* make copy */ | |
3598 + ins v7.2d[0], v10.2d[1] | |
3599 + mov v18.16b, v10.16b | |
3600 + trn1 v10.8b, v10.8b, v7.8b | |
3601 + trn2 v7.8b, v18.8b, v7.8b | |
3602 + st1 {v19.8b}, [TMP2] | |
3603 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
3604 + ldp TMP4, TMP5, [OUTPUT_BUF], 16 | |
3605 + add TMP1, TMP1, OUTPUT_COL | |
3606 + add TMP2, TMP2, OUTPUT_COL | |
3607 + add TMP4, TMP4, OUTPUT_COL | |
3608 + add TMP5, TMP5, OUTPUT_COL | |
3609 + st1 {v10.8b}, [TMP1] | |
3610 + /* make copy */ | |
3611 + ins v16.2d[0], v11.2d[1] | |
3612 + mov v18.16b, v11.16b | |
3613 + trn1 v11.8b, v11.8b, v16.8b | |
3614 + trn2 v16.8b, v18.8b, v16.8b | |
3615 + st1 {v7.8b}, [TMP2] | |
3616 + st1 {v11.8b}, [TMP4] | |
3617 + st1 {v16.8b}, [TMP5] | |
3618 + sub sp, sp, #176 | |
3619 + ldp x22, x23, [sp], 16 | |
3620 + ld1 {v0.8b - v3.8b}, [sp], 32 | |
3621 + ld1 {v4.8b - v7.8b}, [sp], 32 | |
3622 + ld1 {v8.8b - v11.8b}, [sp], 32 | |
3623 + ld1 {v12.8b - v15.8b}, [sp], 32 | |
3624 + ld1 {v16.8b - v19.8b}, [sp], 32 | |
3625 + blr x30 | |
3626 + | |
3627 + .unreq DCT_TABLE | |
3628 + .unreq COEF_BLOCK | |
3629 + .unreq OUTPUT_BUF | |
3630 + .unreq OUTPUT_COL | |
3631 + .unreq TMP1 | |
3632 + .unreq TMP2 | |
3633 + .unreq TMP3 | |
3634 + .unreq TMP4 | |
3635 + | |
3636 + | |
3637 +/*****************************************************************************/ | |
3638 + | |
3639 +/* | |
3640 + * jsimd_idct_4x4_neon | |
3641 + * | |
3642 + * This function contains inverse-DCT code that produces a reduced-size | |
3643 + * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations | |
3644 + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' | |
3645 + * function from jpeg-6b (jidctred.c). | |
3646 + * | |
3647 + * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which | |
3648 + * requires far fewer arithmetic operations and hence should be faster. | |
3649 + * The primary purpose of this particular NEON-optimized function is | |
3650 + * bit-exact compatibility with jpeg-6b. | |
3651 + * | |
3652 + * TODO: slightly better instruction scheduling can be achieved by expanding | |
3653 + * the idct_helper/transpose_4x4 macros and reordering instructions, | |
3654 + * but readability will suffer somewhat. | |
3655 + */ | |
3656 + | |
3657 +#define CONST_BITS 13 | |
3658 +/* Each FIX_x_y constant below equals x.y * 2^CONST_BITS, rounded to the nearest integer */ | |
3659 +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ | |
3660 +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ | |
3661 +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ | |
3662 +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ | |
3663 +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ | |
3664 +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ | |
3665 +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ | |
3666 +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ | |
3667 +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ | |
3668 +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ | |
3669 +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ | |
3670 +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ | |
3671 +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ | |
3672 +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ | |
3673 +/* Coefficient table for jsimd_idct_4x4_neon; each entry is tagged with the register lane it is loaded into */ | |
3674 +.balign 16 | |
3675 +jsimd_idct_4x4_neon_consts: | |
3676 + .short FIX_1_847759065 /* v0.4h[0] */ | |
3677 + .short -FIX_0_765366865 /* v0.4h[1] */ | |
3678 + .short -FIX_0_211164243 /* v0.4h[2] */ | |
3679 + .short FIX_1_451774981 /* v0.4h[3] */ | |
3680 + .short -FIX_2_172734803 /* v1.4h[0] */ | |
3681 + .short FIX_1_061594337 /* v1.4h[1] */ | |
3682 + .short -FIX_0_509795579 /* v1.4h[2] */ | |
3683 + .short -FIX_0_601344887 /* v1.4h[3] */ | |
3684 + .short FIX_0_899976223 /* v2.4h[0] */ | |
3685 + .short FIX_2_562915447 /* v2.4h[1] */ | |
3686 + .short 1 << (CONST_BITS+1) /* v2.4h[2] */ | |
3687 + .short 0 /* v2.4h[3] */ | |
3688 +/* idct_helper (4x4 variant): one 1-D pass over four lanes. Inputs x4..x16 are .4h vectors; the four results are rounded, shifted right by 'shift' and narrowed into y26..y29. Reads the constants in v0-v2 and clobbers v20, v24, v26, v28 and v30. First sum: v28 = x4*v2.4h[2] + x8*v0.4h[0] + x14*v0.4h[1] */ | |
3689 +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 | |
3690 + smull v28.4s, \x4, v2.4h[2] | |
3691 + smlal v28.4s, \x8, v0.4h[0] | |
3692 + smlal v28.4s, \x14, v0.4h[1] | |
3693 +/* v26: four-term sum of x16, x12, x10 and x6 that is later combined with v28 */ | |
3694 + smull v26.4s, \x16, v1.4h[2] | |
3695 + smlal v26.4s, \x12, v1.4h[3] | |
3696 + smlal v26.4s, \x10, v2.4h[0] | |
3697 + smlal v26.4s, \x6, v2.4h[1] | |
3698 +/* v30: same x4 term as v28, but with the x8 and x14 products subtracted */ | |
3699 + smull v30.4s, \x4, v2.4h[2] | |
3700 + smlsl v30.4s, \x8, v0.4h[0] | |
3701 + smlsl v30.4s, \x14, v0.4h[1] | |
3702 +/* v24: four-term sum of x16, x12, x10 and x6 that is later combined with v30 */ | |
3703 + smull v24.4s, \x16, v0.4h[2] | |
3704 + smlal v24.4s, \x12, v0.4h[3] | |
3705 + smlal v24.4s, \x10, v1.4h[0] | |
3706 + smlal v24.4s, \x6, v1.4h[1] | |
3707 +/* y26 and y29 come from v28 + v26 and v28 - v26 */ | |
3708 + add v20.4s, v28.4s, v26.4s | |
3709 + sub v28.4s, v28.4s, v26.4s | |
3710 +/* a narrowing rshrn from .4s to .4h only encodes shifts up to 16, so larger shifts use srshr followed by xtn */ | |
3711 +.if \shift > 16 | |
3712 + srshr v20.4s, v20.4s, #\shift | |
3713 + srshr v28.4s, v28.4s, #\shift | |
3714 + xtn \y26, v20.4s | |
3715 + xtn \y29, v28.4s | |
3716 +.else | |
3717 + rshrn \y26, v20.4s, #\shift | |
3718 + rshrn \y29, v28.4s, #\shift | |
3719 +.endif | |
3720 +/* y27 and y28 come from v30 + v24 and v30 - v24 */ | |
3721 + add v20.4s, v30.4s, v24.4s | |
3722 + sub v30.4s, v30.4s, v24.4s | |
3723 + | |
3724 +.if \shift > 16 | |
3725 + srshr v20.4s, v20.4s, #\shift | |
3726 + srshr v30.4s, v30.4s, #\shift | |
3727 + xtn \y27, v20.4s | |
3728 + xtn \y28, v30.4s | |
3729 +.else | |
3730 + rshrn \y27, v20.4s, #\shift | |
3731 + rshrn \y28, v30.4s, #\shift | |
3732 +.endif | |
3733 + | |
3734 +.endm | |
3735 + | |
3736 +asm_function jsimd_idct_4x4_neon | |
3737 + /* In: x0 = DCT_TABLE, x1 = COEF_BLOCK, x2 = OUTPUT_BUF (array of row pointers), x3 = OUTPUT_COL (low 32 bits used). Writes a 4x4 block of bytes at OUTPUT_COL in the first four rows */ | |
3738 + DCT_TABLE .req x0 | |
3739 + COEF_BLOCK .req x1 | |
3740 + OUTPUT_BUF .req x2 | |
3741 + OUTPUT_COL .req x3 | |
3742 + TMP1 .req x0 | |
3743 + TMP2 .req x1 | |
3744 + TMP3 .req x2 | |
3745 + TMP4 .req x15 | |
3746 + /* Save all used NEON registers. sp is lowered once and stays there; scratch register x9 walks the frame, so no saved data ever lies below sp */ | |
3747 + sub sp, sp, 272 | |
3748 + mov x9, sp | |
3749 + str x15, [x9], 16 | |
3750 + /* Load constants (v3.4h is just used for padding) */ | |
3751 + adr TMP4, jsimd_idct_4x4_neon_consts | |
3752 + st1 {v0.8b - v3.8b}, [x9], 32 | |
3753 + st1 {v4.8b - v7.8b}, [x9], 32 | |
3754 + st1 {v8.8b - v11.8b}, [x9], 32 | |
3755 + st1 {v12.8b - v15.8b}, [x9], 32 | |
3756 + st1 {v16.8b - v19.8b}, [x9], 32 | |
3757 + st1 {v20.8b - v23.8b}, [x9], 32 | |
3758 + st1 {v24.8b - v27.8b}, [x9], 32 | |
3759 + st1 {v28.8b - v31.8b}, [x9], 32 | |
3760 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] | |
3761 + | |
3762 + /* Load all COEF_BLOCK into NEON registers with the following allocation: | |
3763 + * 0 1 2 3 | 4 5 6 7 | |
3764 + * ---------+-------- | |
3765 + * 0 | v4.4h | v5.4h | |
3766 + * 1 | v6.4h | v7.4h | |
3767 + * 2 | v8.4h | v9.4h | |
3768 + * 3 | v10.4h | v11.4h | |
3769 + * 4 | - | - | |
3770 + * 5 | v12.4h | v13.4h | |
3771 + * 6 | v14.4h | v15.4h | |
3772 + * 7 | v16.4h | v17.4h | |
3773 + */ | |
3774 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 | |
3775 + ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 | |
3776 + add COEF_BLOCK, COEF_BLOCK, #16 | |
3777 + ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 | |
3778 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 | |
3779 + /* dequantize: multiply each coefficient row by the matching DCT_TABLE row (row 4 is skipped in both) */ | |
3780 + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 | |
3781 + mul v4.4h, v4.4h, v18.4h | |
3782 + mul v5.4h, v5.4h, v19.4h | |
3783 + ins v4.2d[1], v5.2d[0] /* 128 bit q4 */ | |
3784 + ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 | |
3785 + mul v6.4h, v6.4h, v20.4h | |
3786 + mul v7.4h, v7.4h, v21.4h | |
3787 + ins v6.2d[1], v7.2d[0] /* 128 bit q6 */ | |
3788 + mul v8.4h, v8.4h, v22.4h | |
3789 + mul v9.4h, v9.4h, v23.4h | |
3790 + ins v8.2d[1], v9.2d[0] /* 128 bit q8 */ | |
3791 + add DCT_TABLE, DCT_TABLE, #16 | |
3792 + ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 | |
3793 + mul v10.4h, v10.4h, v24.4h | |
3794 + mul v11.4h, v11.4h, v25.4h | |
3795 + ins v10.2d[1], v11.2d[0] /* 128 bit q10 */ | |
3796 + mul v12.4h, v12.4h, v26.4h | |
3797 + mul v13.4h, v13.4h, v27.4h | |
3798 + ins v12.2d[1], v13.2d[0] /* 128 bit q12 */ | |
3799 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 | |
3800 + mul v14.4h, v14.4h, v28.4h | |
3801 + mul v15.4h, v15.4h, v29.4h | |
3802 + ins v14.2d[1], v15.2d[0] /* 128 bit q14 */ | |
3803 + mul v16.4h, v16.4h, v30.4h | |
3804 + mul v17.4h, v17.4h, v31.4h | |
3805 + ins v16.2d[1], v17.2d[0] /* 128 bit q16 */ | |
3806 + | |
3807 + /* Pass 1 */ | |
3808 + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4
.4h, v6.4h, v8.4h, v10.4h | |
3809 + transpose_4x4 v4, v6, v8, v10, v3 | |
3810 + ins v10.2d[1], v11.2d[0] | |
3811 + idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5
.4h, v7.4h, v9.4h, v11.4h | |
3812 + transpose_4x4 v5, v7, v9, v11, v3 | |
3813 + ins v10.2d[1], v11.2d[0] | |
3814 + /* Pass 2 */ | |
3815 + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.
4h, v27.4h, v28.4h, v29.4h | |
3816 + transpose_4x4 v26, v27, v28, v29, v3 | |
3817 + | |
3818 + /* Range limit: add 0x80 to every sample, then saturate to unsigned 8 bit */ | |
3819 + movi v30.8h, #0x80 | |
3820 + ins v26.2d[1], v27.2d[0] | |
3821 + ins v28.2d[1], v29.2d[0] | |
3822 + add v26.8h, v26.8h, v30.8h | |
3823 + add v28.8h, v28.8h, v30.8h | |
3824 + sqxtun v26.8b, v26.8h | |
3825 + sqxtun v27.8b, v28.8h | |
3826 + | |
3827 + /* Store results to the output buffer. Only the low 32 bits of OUTPUT_COL (w3) are added; NOTE(review): assumes the C prototype passes it as a 32-bit unsigned value, whose upper register bits AAPCS64 leaves unspecified - confirm */ | |
3828 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
3829 + ldp TMP3, TMP4, [OUTPUT_BUF] | |
3830 + add TMP1, TMP1, w3, uxtw | |
3831 + add TMP2, TMP2, w3, uxtw | |
3832 + add TMP3, TMP3, w3, uxtw | |
3833 + add TMP4, TMP4, w3, uxtw | |
3834 + | |
3835 +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT | |
3836 + /* We can use far fewer instructions on little-endian systems if the | |
3837 + * OS kernel is not configured to trap unaligned memory accesses. NOTE(review): __ARMEL__ is an AArch32 macro; confirm this branch is ever selected on AArch64 | |
3838 + */ | |
3839 + st1 {v26.s}[0], [TMP1], 4 | |
3840 + st1 {v27.s}[0], [TMP3], 4 | |
3841 + st1 {v26.s}[1], [TMP2], 4 | |
3842 + st1 {v27.s}[1], [TMP4], 4 | |
3843 +#else | |
3844 + st1 {v26.b}[0], [TMP1], 1 | |
3845 + st1 {v27.b}[0], [TMP3], 1 | |
3846 + st1 {v26.b}[1], [TMP1], 1 | |
3847 + st1 {v27.b}[1], [TMP3], 1 | |
3848 + st1 {v26.b}[2], [TMP1], 1 | |
3849 + st1 {v27.b}[2], [TMP3], 1 | |
3850 + st1 {v26.b}[3], [TMP1], 1 | |
3851 + st1 {v27.b}[3], [TMP3], 1 | |
3852 + | |
3853 + st1 {v26.b}[4], [TMP2], 1 | |
3854 + st1 {v27.b}[4], [TMP4], 1 | |
3855 + st1 {v26.b}[5], [TMP2], 1 | |
3856 + st1 {v27.b}[5], [TMP4], 1 | |
3857 + st1 {v26.b}[6], [TMP2], 1 | |
3858 + st1 {v27.b}[6], [TMP4], 1 | |
3859 + st1 {v26.b}[7], [TMP2], 1 | |
3860 + st1 {v27.b}[7], [TMP4], 1 | |
3861 +#endif | |
3862 + /* Restore the saved registers through x9, then release the frame and return with ret (blr x30 would overwrite the link register) */ | |
3863 + mov x9, sp | |
3864 + ldr x15, [x9], 16 | |
3865 + ld1 {v0.8b - v3.8b}, [x9], 32 | |
3866 + ld1 {v4.8b - v7.8b}, [x9], 32 | |
3867 + ld1 {v8.8b - v11.8b}, [x9], 32 | |
3868 + ld1 {v12.8b - v15.8b}, [x9], 32 | |
3869 + ld1 {v16.8b - v19.8b}, [x9], 32 | |
3870 + ld1 {v20.8b - v23.8b}, [x9], 32 | |
3871 + ld1 {v24.8b - v27.8b}, [x9], 32 | |
3872 + ld1 {v28.8b - v31.8b}, [x9], 32 | |
3873 + add sp, sp, 272 | |
3874 + ret | |
3875 + | |
3876 + .unreq DCT_TABLE | |
3877 + .unreq COEF_BLOCK | |
3878 + .unreq OUTPUT_BUF | |
3879 + .unreq OUTPUT_COL | |
3880 + .unreq TMP1 | |
3881 + .unreq TMP2 | |
3882 + .unreq TMP3 | |
3883 + .unreq TMP4 | |
3884 + | |
3885 +.purgem idct_helper | |
3886 + | |
3887 + | |
3888 +/*****************************************************************************/ | |
3889 + | |
3890 +/* | |
3891 + * jsimd_idct_2x2_neon | |
3892 + * | |
3893 + * This function contains inverse-DCT code that produces a reduced-size | |
3894 + * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations | |
3895 + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' | |
3896 + * function from jpeg-6b (jidctred.c). | |
3897 + * | |
3898 + * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which | |
3899 + * requires far fewer arithmetic operations and hence should be faster. | |
3900 + * The primary purpose of this particular NEON-optimized function is | |
3901 + * bit-exact compatibility with jpeg-6b. | |
3902 + */ | |
3903 +/* Coefficient table for jsimd_idct_2x2_neon; each entry is tagged with the v14 halfword lane it is loaded into */ | |
3904 +.balign 8 | |
3905 +jsimd_idct_2x2_neon_consts: | |
3906 + .short -FIX_0_720959822 /* v14.4h[0] */ | |
3907 + .short FIX_0_850430095 /* v14.4h[1] */ | |
3908 + .short -FIX_1_272758580 /* v14.4h[2] */ | |
3909 + .short FIX_3_624509785 /* v14.4h[3] */ | |
3910 +/* idct_helper (2x2 variant): y26 = (x4 << 15) + S and y27 = (x4 << 15) - S, each rounded, shifted right by 'shift' and narrowed, where S = x6*v14.4h[3] + x10*v14.4h[2] + x12*v14.4h[1] + x16*v14.4h[0]. Clobbers v15, v20 and v26 */ | |
3911 +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 | |
3912 + sshll v15.4s, \x4, #15 | |
3913 + smull v26.4s, \x6, v14.4h[3] | |
3914 + smlal v26.4s, \x10, v14.4h[2] | |
3915 + smlal v26.4s, \x12, v14.4h[1] | |
3916 + smlal v26.4s, \x16, v14.4h[0] | |
3917 +/* sum and difference of the widened x4 term (v15) and the four-term sum (v26) */ | |
3918 + add v20.4s, v15.4s, v26.4s | |
3919 + sub v15.4s, v15.4s, v26.4s | |
3920 +/* a narrowing rshrn from .4s to .4h only encodes shifts up to 16, so larger shifts use srshr followed by xtn */ | |
3921 +.if \shift > 16 | |
3922 + srshr v20.4s, v20.4s, #\shift | |
3923 + srshr v15.4s, v15.4s, #\shift | |
3924 + xtn \y26, v20.4s | |
3925 + xtn \y27, v15.4s | |
3926 +.else | |
3927 + rshrn \y26, v20.4s, #\shift | |
3928 + rshrn \y27, v15.4s, #\shift | |
3929 +.endif | |
3930 + | |
3931 +.endm | |
3932 + | |
3933 +asm_function jsimd_idct_2x2_neon | |
3934 + /* In: x0 = DCT_TABLE, x1 = COEF_BLOCK, x2 = OUTPUT_BUF (array of row pointers), x3 = OUTPUT_COL (low 32 bits used). Writes a 2x2 block of bytes at OUTPUT_COL in the first two rows */ | |
3935 + DCT_TABLE .req x0 | |
3936 + COEF_BLOCK .req x1 | |
3937 + OUTPUT_BUF .req x2 | |
3938 + OUTPUT_COL .req x3 | |
3939 + TMP1 .req x0 | |
3940 + TMP2 .req x15 | |
3941 + /* Save the NEON registers listed below. sp is lowered once and stays there; scratch register x9 walks the frame, so no saved data ever lies below sp */ | |
3942 + sub sp, sp, 208 | |
3943 + mov x9, sp | |
3944 + str x15, [x9], 16 | |
3945 + | |
3946 + /* Load constants */ | |
3947 + adr TMP2, jsimd_idct_2x2_neon_consts | |
3948 + st1 {v4.8b - v7.8b}, [x9], 32 | |
3949 + st1 {v8.8b - v11.8b}, [x9], 32 | |
3950 + st1 {v12.8b - v15.8b}, [x9], 32 | |
3951 + st1 {v16.8b - v19.8b}, [x9], 32 | |
3952 + st1 {v21.8b - v22.8b}, [x9], 16 | |
3953 + st1 {v24.8b - v27.8b}, [x9], 32 | |
3954 + st1 {v30.8b - v31.8b}, [x9], 16 | |
3955 + ld1 {v14.4h}, [TMP2] | |
3956 + | |
3957 + /* Load all COEF_BLOCK into NEON registers with the following allocation: | |
3958 + * 0 1 2 3 | 4 5 6 7 | |
3959 + * ---------+-------- | |
3960 + * 0 | v4.4h | v5.4h | |
3961 + * 1 | v6.4h | v7.4h | |
3962 + * 2 | - | - | |
3963 + * 3 | v10.4h | v11.4h | |
3964 + * 4 | - | - | |
3965 + * 5 | v12.4h | v13.4h | |
3966 + * 6 | - | - | |
3967 + * 7 | v16.4h | v17.4h | |
3968 + */ | |
3969 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 | |
3970 + add COEF_BLOCK, COEF_BLOCK, #16 | |
3971 + ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 | |
3972 + add COEF_BLOCK, COEF_BLOCK, #16 | |
3973 + ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 | |
3974 + add COEF_BLOCK, COEF_BLOCK, #16 | |
3975 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 | |
3976 + /* Dequantize: multiply each loaded coefficient row by the matching DCT_TABLE row (rows 2, 4 and 6 are skipped in both) */ | |
3977 + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 | |
3978 + mul v4.4h, v4.4h, v18.4h | |
3979 + mul v5.4h, v5.4h, v19.4h | |
3980 + ins v4.2d[1], v5.2d[0] | |
3981 + mul v6.4h, v6.4h, v20.4h | |
3982 + mul v7.4h, v7.4h, v21.4h | |
3983 + ins v6.2d[1], v7.2d[0] | |
3984 + add DCT_TABLE, DCT_TABLE, #16 | |
3985 + ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 | |
3986 + mul v10.4h, v10.4h, v24.4h | |
3987 + mul v11.4h, v11.4h, v25.4h | |
3988 + ins v10.2d[1], v11.2d[0] | |
3989 + add DCT_TABLE, DCT_TABLE, #16 | |
3990 + ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 | |
3991 + mul v12.4h, v12.4h, v26.4h | |
3992 + mul v13.4h, v13.4h, v27.4h | |
3993 + ins v12.2d[1], v13.2d[0] | |
3994 + add DCT_TABLE, DCT_TABLE, #16 | |
3995 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 | |
3996 + mul v16.4h, v16.4h, v30.4h | |
3997 + mul v17.4h, v17.4h, v31.4h | |
3998 + ins v16.2d[1], v17.2d[0] | |
3999 + | |
4000 + /* Pass 1 (the #else branch is the hand-expanded form that is actually assembled) */ | |
4001 +#if 0 | |
4002 + idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h | |
4003 + transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h | |
4004 + idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h | |
4005 + transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h | |
4006 +#else | |
4007 + smull v26.4s, v6.4h, v14.4h[3] | |
4008 + smlal v26.4s, v10.4h, v14.4h[2] | |
4009 + smlal v26.4s, v12.4h, v14.4h[1] | |
4010 + smlal v26.4s, v16.4h, v14.4h[0] | |
4011 + smull v24.4s, v7.4h, v14.4h[3] | |
4012 + smlal v24.4s, v11.4h, v14.4h[2] | |
4013 + smlal v24.4s, v13.4h, v14.4h[1] | |
4014 + smlal v24.4s, v17.4h, v14.4h[0] | |
4015 + sshll v15.4s, v4.4h, #15 | |
4016 + sshll v30.4s, v5.4h, #15 | |
4017 + add v20.4s, v15.4s, v26.4s | |
4018 + sub v15.4s, v15.4s, v26.4s | |
4019 + rshrn v4.4h, v20.4s, #13 | |
4020 + rshrn v6.4h, v15.4s, #13 | |
4021 + add v20.4s, v30.4s, v24.4s | |
4022 + sub v15.4s, v30.4s, v24.4s | |
4023 + rshrn v5.4h, v20.4s, #13 | |
4024 + rshrn v7.4h, v15.4s, #13 | |
4025 + ins v4.2d[1], v5.2d[0] | |
4026 + ins v6.2d[1], v7.2d[0] | |
4027 + transpose v4, v6, v3, .16b, .8h | |
4028 + transpose v6, v10, v3, .16b, .4s | |
4029 + ins v11.2d[0], v10.2d[1] | |
4030 + ins v7.2d[0], v6.2d[1] | |
4031 +#endif | |
4032 + | |
4033 + /* Pass 2 */ | |
4034 + idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h | |
4035 + | |
4036 + /* Range limit: add 0x80 to every sample, then saturate to unsigned 8 bit */ | |
4037 + movi v30.8h, #0x80 | |
4038 + ins v26.2d[1], v27.2d[0] | |
4039 + add v26.8h, v26.8h, v30.8h | |
4040 + sqxtun v30.8b, v26.8h | |
4041 + ins v26.2d[0], v30.2d[0] | |
4042 + sqxtun v27.8b, v26.8h | |
4043 + | |
4044 + /* Store results to the output buffer. Only the low 32 bits of OUTPUT_COL (w3) are added; NOTE(review): assumes the C prototype passes it as a 32-bit unsigned value, whose upper register bits AAPCS64 leaves unspecified - confirm */ | |
4045 + ldp TMP1, TMP2, [OUTPUT_BUF] | |
4046 + add TMP1, TMP1, w3, uxtw | |
4047 + add TMP2, TMP2, w3, uxtw | |
4048 + | |
4049 + st1 {v26.b}[0], [TMP1], 1 | |
4050 + st1 {v27.b}[4], [TMP1], 1 | |
4051 + st1 {v26.b}[1], [TMP2], 1 | |
4052 + st1 {v27.b}[5], [TMP2], 1 | |
4053 + mov x9, sp /* restore the saved registers through x9, then release the frame */ | |
4054 + ldr x15, [x9], 16 | |
4055 + ld1 {v4.8b - v7.8b}, [x9], 32 | |
4056 + ld1 {v8.8b - v11.8b}, [x9], 32 | |
4057 + ld1 {v12.8b - v15.8b}, [x9], 32 | |
4058 + ld1 {v16.8b - v19.8b}, [x9], 32 | |
4059 + ld1 {v21.8b - v22.8b}, [x9], 16 | |
4060 + ld1 {v24.8b - v27.8b}, [x9], 32 | |
4061 + ld1 {v30.8b - v31.8b}, [x9], 16 | |
4062 + add sp, sp, 208 | |
4063 + ret /* blr x30 would overwrite the link register */ | |
4064 + | |
4065 + .unreq DCT_TABLE | |
4066 + .unreq COEF_BLOCK | |
4067 + .unreq OUTPUT_BUF | |
4068 + .unreq OUTPUT_COL | |
4069 + .unreq TMP1 | |
4070 + .unreq TMP2 | |
4071 + | |
4072 +.purgem idct_helper | |
4073 + | |
4074 + | |
4075 +/*****************************************************************************/ | |
4076 + | |
4077 +/* | |
4078 + * jsimd_ycc_extrgb_convert_neon | |
4079 + * jsimd_ycc_extbgr_convert_neon | |
4080 + * jsimd_ycc_extrgbx_convert_neon | |
4081 + * jsimd_ycc_extbgrx_convert_neon | |
4082 + * jsimd_ycc_extxbgr_convert_neon | |
4083 + * jsimd_ycc_extxrgb_convert_neon | |
4084 + * | |
4085 + * Colorspace conversion YCbCr -> RGB (the generator below also emits jsimd_ycc_rgb565_convert_neon) | |
4086 + */ | |
4087 + | |
4088 +/* do_load size: load 'size' (8, 4, 2 or 1) bytes each from U, V and Y into v4, v5 and v0, post-incrementing the pointers. Sizes 4, 2 and 1 fill byte lanes 0-3, 4-5 and 6 respectively, so successive calls can assemble a partial group of up to 7 samples */ | |
4089 +.macro do_load size | |
4090 + .if \size == 8 | |
4091 + ld1 {v4.8b}, [U], 8 | |
4092 + ld1 {v5.8b}, [V], 8 | |
4093 + ld1 {v0.8b}, [Y], 8 | |
4094 + prfm PLDL1KEEP, [U, #64] | |
4095 + prfm PLDL1KEEP, [V, #64] | |
4096 + prfm PLDL1KEEP, [Y, #64] | |
4097 + .elseif \size == 4 | |
4098 + ld1 {v4.b}[0], [U], 1 | |
4099 + ld1 {v4.b}[1], [U], 1 | |
4100 + ld1 {v4.b}[2], [U], 1 | |
4101 + ld1 {v4.b}[3], [U], 1 | |
4102 + ld1 {v5.b}[0], [V], 1 | |
4103 + ld1 {v5.b}[1], [V], 1 | |
4104 + ld1 {v5.b}[2], [V], 1 | |
4105 + ld1 {v5.b}[3], [V], 1 | |
4106 + ld1 {v0.b}[0], [Y], 1 | |
4107 + ld1 {v0.b}[1], [Y], 1 | |
4108 + ld1 {v0.b}[2], [Y], 1 | |
4109 + ld1 {v0.b}[3], [Y], 1 | |
4110 + .elseif \size == 2 | |
4111 + ld1 {v4.b}[4], [U], 1 | |
4112 + ld1 {v4.b}[5], [U], 1 | |
4113 + ld1 {v5.b}[4], [V], 1 | |
4114 + ld1 {v5.b}[5], [V], 1 | |
4115 + ld1 {v0.b}[4], [Y], 1 | |
4116 + ld1 {v0.b}[5], [Y], 1 | |
4117 + .elseif \size == 1 | |
4118 + ld1 {v4.b}[6], [U], 1 | |
4119 + ld1 {v5.b}[6], [V], 1 | |
4120 + ld1 {v0.b}[6], [Y], 1 | |
4121 + .else | |
4122 + .error unsupported macroblock size | |
4123 + .endif | |
4124 +.endm | |
4125 +/* do_store bpp, size: store 'size' (8, 4, 2 or 1) pixels through RGB with post-increment. bpp 24 interleaves v10-v12, bpp 32 interleaves v10-v13 and bpp 16 stores halfwords from v25; sizes 4, 2 and 1 use lanes 0-3, 4-5 and 6, the same lanes do_load fills */ | |
4126 +.macro do_store bpp, size | |
4127 + .if \bpp == 24 | |
4128 + .if \size == 8 | |
4129 + st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 | |
4130 + .elseif \size == 4 | |
4131 + st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 | |
4132 + st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 | |
4133 + st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 | |
4134 + st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 | |
4135 + .elseif \size == 2 | |
4136 + st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 | |
4137 + st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 | |
4138 + .elseif \size == 1 | |
4139 + st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 | |
4140 + .else | |
4141 + .error unsupported macroblock size | |
4142 + .endif | |
4143 + .elseif \bpp == 32 | |
4144 + .if \size == 8 | |
4145 + st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 | |
4146 + .elseif \size == 4 | |
4147 + st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 | |
4148 + st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 | |
4149 + st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 | |
4150 + st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 | |
4151 + .elseif \size == 2 | |
4152 + st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 | |
4153 + st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 | |
4154 + .elseif \size == 1 | |
4155 + st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 | |
4156 + .else | |
4157 + .error unsupported macroblock size | |
4158 + .endif | |
4159 + .elseif \bpp==16 | |
4160 + .if \size == 8 | |
4161 + st1 {v25.8h}, [RGB],16 | |
4162 + .elseif \size == 4 | |
4163 + st1 {v25.4h}, [RGB],8 | |
4164 + .elseif \size == 2 | |
4165 + st1 {v25.h}[4], [RGB],2 | |
4166 + st1 {v25.h}[5], [RGB],2 | |
4167 + .elseif \size == 1 | |
4168 + st1 {v25.h}[6], [RGB],2 | |
4169 + .else | |
4170 + .error unsupported macroblock size | |
4171 + .endif | |
4172 + .else | |
4173 + .error unsupported bpp | |
4174 + .endif | |
4175 +.endm | |
4176 + | |
4177 +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs,
gsize, b_offs, bsize, defsize | |
4178 +/* Emits jsimd_ycc_<colorid>_convert_neon plus its private constant table. bpp selects 24-, 32- or 16-bit output; for bpp != 16, r_offs/g_offs/b_offs are the digit n choosing destination register v1n per channel and defsize is the arrangement suffix. rsize/gsize/bsize are not referenced in this body */ | |
4179 +/* | |
4180 + * 2-stage pipelined YCbCr->RGB conversion | |
4181 + */ | |
4182 + | |
4183 +.macro do_yuv_to_rgb_stage1 | |
4184 + uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ | |
4185 + uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ | |
4186 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | |
4187 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | |
4188 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | |
4189 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | |
4190 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | |
4191 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | |
4192 + smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ | |
4193 + smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ | |
4194 +.endm | |
4195 + | |
4196 +.macro do_yuv_to_rgb_stage2 | |
4197 + rshrn v20.4h, v20.4s, #15 | |
4198 + rshrn2 v20.8h, v22.4s, #15 | |
4199 + rshrn v24.4h, v24.4s, #14 | |
4200 + rshrn2 v24.8h, v26.4s, #14 | |
4201 + rshrn v28.4h, v28.4s, #14 | |
4202 + rshrn2 v28.8h, v30.4s, #14 | |
4203 + uaddw v20.8h, v20.8h, v0.8b | |
4204 + uaddw v24.8h, v24.8h, v0.8b | |
4205 + uaddw v28.8h, v28.8h, v0.8b | |
4206 +.if \bpp != 16 | |
4207 + sqxtun v1\g_offs\defsize, v20.8h | |
4208 + sqxtun v1\r_offs\defsize, v24.8h | |
4209 + sqxtun v1\b_offs\defsize, v28.8h | |
4210 +.else | |
4211 + sqshlu v21.8h, v20.8h, #8 | |
4212 + sqshlu v25.8h, v24.8h, #8 | |
4213 + sqshlu v29.8h, v28.8h, #8 | |
4214 + sri v25.8h, v21.8h, #5 | |
4215 + sri v25.8h, v29.8h, #11 | |
4216 +.endif | |
4217 + | |
4218 +.endm | |
4219 + | |
4220 +.macro do_yuv_to_rgb_stage2_store_load_stage1 | |
4221 + rshrn v20.4h, v20.4s, #15 | |
4222 + rshrn v24.4h, v24.4s, #14 | |
4223 + rshrn v28.4h, v28.4s, #14 | |
4224 + ld1 {v4.8b}, [U], 8 | |
4225 + rshrn2 v20.8h, v22.4s, #15 | |
4226 + rshrn2 v24.8h, v26.4s, #14 | |
4227 + rshrn2 v28.8h, v30.4s, #14 | |
4228 + ld1 {v5.8b}, [V], 8 | |
4229 + uaddw v20.8h, v20.8h, v0.8b | |
4230 + uaddw v24.8h, v24.8h, v0.8b | |
4231 + uaddw v28.8h, v28.8h, v0.8b | |
4232 +.if \bpp != 16 /**************** rgb24/rgb32 *********************************/ | |
4233 + sqxtun v1\g_offs\defsize, v20.8h | |
4234 + ld1 {v0.8b}, [Y], 8 | |
4235 + sqxtun v1\r_offs\defsize, v24.8h | |
4236 + prfm PLDL1KEEP, [U, #64] | |
4237 + prfm PLDL1KEEP, [V, #64] | |
4238 + prfm PLDL1KEEP, [Y, #64] | |
4239 + sqxtun v1\b_offs\defsize, v28.8h | |
4240 + uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ | |
4241 + uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ | |
4242 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | |
4243 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | |
4244 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | |
4245 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | |
4246 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | |
4247 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | |
4248 +.else /**************************** rgb565 ***********************************/ | |
4249 + sqshlu v21.8h, v20.8h, #8 | |
4250 + sqshlu v25.8h, v24.8h, #8 | |
4251 + sqshlu v29.8h, v28.8h, #8 | |
4252 + uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ | |
4253 + uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ | |
4254 + ld1 {v0.8b}, [Y], 8 | |
4255 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | |
4256 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | |
4257 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | |
4258 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | |
4259 + sri v25.8h, v21.8h, #5 | |
4260 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | |
4261 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | |
4262 + prfm PLDL1KEEP, [U, #64] | |
4263 + prfm PLDL1KEEP, [V, #64] | |
4264 + prfm PLDL1KEEP, [Y, #64] | |
4265 + sri v25.8h, v29.8h, #11 | |
4266 +.endif | |
4267 + do_store \bpp, 8 | |
4268 + smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ | |
4269 + smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ | |
4270 +.endm | |
4271 + | |
4272 +.macro do_yuv_to_rgb | |
4273 + do_yuv_to_rgb_stage1 | |
4274 + do_yuv_to_rgb_stage2 | |
4275 +.endm | |
4276 + | |
4277 +/* Apple gas crashes on adrl, work around that by using adr. | |
4278 + * But this requires a copy of these constants for each function. | |
4279 + */ | |
4280 + | |
4281 +.balign 16 | |
4282 +jsimd_ycc_\colorid\()_neon_consts: | |
4283 + .short 0, 0, 0, 0 | |
4284 + .short 22971, -11277, -23401, 29033 | |
4285 + .short -128, -128, -128, -128 | |
4286 + .short -128, -128, -128, -128 | |
4287 + | |
4288 +asm_function jsimd_ycc_\colorid\()_convert_neon | |
4289 + OUTPUT_WIDTH .req x0 | |
4290 + INPUT_BUF .req x1 | |
4291 + INPUT_ROW .req x2 | |
4292 + OUTPUT_BUF .req x3 | |
4293 + NUM_ROWS .req x4 | |
4294 + | |
4295 + INPUT_BUF0 .req x5 | |
4296 + INPUT_BUF1 .req x6 | |
4297 + INPUT_BUF2 .req INPUT_BUF | |
4298 + RGB .req x7 | |
4299 + Y .req x8 | |
4300 + U .req x9 | |
4301 + V .req x10 | |
4302 + N .req x15 | |
4303 + /* Build a 336-byte frame. sp is lowered once and stays there; scratch register x16 walks the frame, so no saved data ever lies below sp */ | |
4304 + sub sp, sp, 336 | |
4305 + mov x16, sp | |
4306 + str x15, [x16], 16 | |
4307 + /* Load constants into v1.4h and v2.8h (v0.4h is just used for padding) */ | |
4308 + adr x15, jsimd_ycc_\colorid\()_neon_consts | |
4309 + /* Save NEON registers */ | |
4310 + st1 {v0.8b - v3.8b}, [x16], 32 | |
4311 + st1 {v4.8b - v7.8b}, [x16], 32 | |
4312 + st1 {v8.8b - v11.8b}, [x16], 32 | |
4313 + st1 {v12.8b - v15.8b}, [x16], 32 | |
4314 + st1 {v16.8b - v19.8b}, [x16], 32 | |
4315 + st1 {v20.8b - v23.8b}, [x16], 32 | |
4316 + st1 {v24.8b - v27.8b}, [x16], 32 | |
4317 + st1 {v28.8b - v31.8b}, [x16], 32 | |
4318 + ld1 {v0.4h, v1.4h}, [x15], 16 | |
4319 + ld1 {v2.8h}, [x15] | |
4320 + | |
4321 + /* Save ARM registers and handle input arguments */ | |
4322 + /* saved set: x4, x5, x6, x7, x8, x9, x10, x30 */ | |
4323 + stp x4, x5, [x16], 16 | |
4324 + stp x6, x7, [x16], 16 | |
4325 + stp x8, x9, [x16], 16 | |
4326 + stp x10, x30, [x16], 16 | |
4327 + ldr INPUT_BUF0, [INPUT_BUF] | |
4328 + ldr INPUT_BUF1, [INPUT_BUF, 8] | |
4329 + ldr INPUT_BUF2, [INPUT_BUF, 16] | |
4330 + .unreq INPUT_BUF | |
4331 + | |
4332 + /* Preset v10 and v13 to 0xFF: for 32-bit formats the one that is not overwritten as a colour channel is stored as the filler byte */ | |
4333 + movi v10.16b, #255 | |
4334 + movi v13.16b, #255 | |
4335 + /* NOTE(review): OUTPUT_WIDTH, INPUT_ROW and NUM_ROWS are read through their w registers below, assuming they are 32-bit C arguments whose upper register bits AAPCS64 leaves unspecified - confirm against the C prototype */ | |
4336 + /* Outer loop over scanlines */ | |
4337 + cmp w4, #1 /* NUM_ROWS, compared as a signed 32-bit value */ | |
4338 + blt 9f | |
4339 +0: | |
4340 + ubfiz x16, x2, #3, #32 /* x16 = zero-extended INPUT_ROW * 8 (byte offset of the row pointer) */ | |
4341 + ldr Y, [INPUT_BUF0, x16] | |
4342 + ldr U, [INPUT_BUF1, x16] | |
4343 + mov w15, w0 /* N = zero-extended OUTPUT_WIDTH */ | |
4344 + ldr V, [INPUT_BUF2, x16] | |
4345 + add INPUT_ROW, INPUT_ROW, #1 | |
4346 + ldr RGB, [OUTPUT_BUF], #8 | |
4347 + | |
4348 + /* Inner loop over pixels */ | |
4349 + subs N, N, #8 | |
4350 + blt 3f | |
4351 + do_load 8 | |
4352 + do_yuv_to_rgb_stage1 | |
4353 + subs N, N, #8 | |
4354 + blt 2f | |
4355 +1: | |
4356 + do_yuv_to_rgb_stage2_store_load_stage1 | |
4357 + subs N, N, #8 | |
4358 + bge 1b | |
4359 +2: | |
4360 + do_yuv_to_rgb_stage2 | |
4361 + do_store \bpp, 8 | |
4362 + tst N, #7 | |
4363 + beq 8f | |
4364 +3: | |
4365 + tst N, #4 | |
4366 + beq 3f | |
4367 + do_load 4 | |
4368 +3: | |
4369 + tst N, #2 | |
4370 + beq 4f | |
4371 + do_load 2 | |
4372 +4: | |
4373 + tst N, #1 | |
4374 + beq 5f | |
4375 + do_load 1 | |
4376 +5: | |
4377 + do_yuv_to_rgb | |
4378 + tst N, #4 | |
4379 + beq 6f | |
4380 + do_store \bpp, 4 | |
4381 +6: | |
4382 + tst N, #2 | |
4383 + beq 7f | |
4384 + do_store \bpp, 2 | |
4385 +7: | |
4386 + tst N, #1 | |
4387 + beq 8f | |
4388 + do_store \bpp, 1 | |
4389 +8: | |
4390 + subs w4, w4, #1 /* NUM_ROWS, 32-bit */ | |
4391 + bgt 0b | |
4392 +9: | |
4393 + /* Restore all registers through x16, release the frame and return */ | |
4394 + mov x16, sp | |
4395 + ldr x15, [x16], 16 | |
4396 + ld1 {v0.8b - v3.8b}, [x16], 32 | |
4397 + ld1 {v4.8b - v7.8b}, [x16], 32 | |
4398 + ld1 {v8.8b - v11.8b}, [x16], 32 | |
4399 + ld1 {v12.8b - v15.8b}, [x16], 32 | |
4400 + ld1 {v16.8b - v19.8b}, [x16], 32 | |
4401 + ld1 {v20.8b - v23.8b}, [x16], 32 | |
4402 + ld1 {v24.8b - v27.8b}, [x16], 32 | |
4403 + ld1 {v28.8b - v31.8b}, [x16], 32 | |
4404 + ldp x4, x5, [x16], 16 | |
4405 + ldp x6, x7, [x16], 16 | |
4406 + ldp x8, x9, [x16], 16 | |
4407 + ldp x10, x30, [x16], 16 | |
4408 + add sp, sp, 336 | |
4409 + ret | |
4410 + .unreq OUTPUT_WIDTH | |
4411 + .unreq INPUT_ROW | |
4412 + .unreq OUTPUT_BUF | |
4413 + .unreq NUM_ROWS | |
4414 + .unreq INPUT_BUF0 | |
4415 + .unreq INPUT_BUF1 | |
4416 + .unreq INPUT_BUF2 | |
4417 + .unreq RGB | |
4418 + .unreq Y | |
4419 + .unreq U | |
4420 + .unreq V | |
4421 + .unreq N | |
4422 + | |
4423 +.purgem do_yuv_to_rgb | |
4424 +.purgem do_yuv_to_rgb_stage1 | |
4425 +.purgem do_yuv_to_rgb_stage2 | |
4426 +.purgem do_yuv_to_rgb_stage2_store_load_stage1 | |
4427 +.endm | |
4428 + | |
4429 +/*--------------------------------- id ----- bpp R rsize G gsize B bsize
defsize (for bpp != 16, R/G/B are the digit n selecting destination register v1n; the rsize/gsize/bsize columns are not referenced by the macro body) */ | |
4430 +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h,
.8b | |
4431 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h,
.8b | |
4432 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h,
.8b | |
4433 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h,
.8b | |
4434 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h,
.8b | |
4435 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h,
.8b | |
4436 +generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h,
.8b | |
4437 +.purgem do_load | |
4438 +.purgem do_store | |
OLD | NEW |