| OLD | NEW |
| (Empty) |
| 1 Index: jdmarker.c | |
| 2 =================================================================== | |
| 3 --- jdmarker.c (revision 829) | |
| 4 +++ jdmarker.c (working copy) | |
| 5 @@ -910,7 +910,7 @@ | |
| 6 } | |
| 7 | |
| 8 if (cinfo->marker->discarded_bytes != 0) { | |
| 9 - WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c); | |
| 10 + TRACEMS2(cinfo, 1, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c)
; | |
| 11 cinfo->marker->discarded_bytes = 0; | |
| 12 } | |
| 13 | |
| 14 @@ -944,7 +944,144 @@ | |
| 15 return TRUE; | |
| 16 } | |
| 17 | |
| 18 +#ifdef MOTION_JPEG_SUPPORTED | |
| 19 | |
| 20 +/* The default Huffman tables used by motion JPEG frames. When a motion JPEG | |
| 21 + * frame does not have DHT tables, we should use the Huffman tables suggested by | |
| 22 + * the JPEG standard. Each of these tables represents a member of the JHUFF_TBL | |
| 23 + * struct so we can just copy it to the corresponding JHUFF_TBL member. | |
| 24 + */ | |
| 25 +/* DC table 0 */ | |
| 26 +LOCAL(const unsigned char) mjpg_dc0_bits[] = { | |
| 27 + 0x00, 0x01, 0x05, 0x01, 0x01, 0x01, 0x01, 0x01, | |
| 28 + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 | |
| 29 +}; | |
| 30 + | |
| 31 +LOCAL(const unsigned char) mjpg_dc0_huffval[] = { | |
| 32 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |
| 33 + 0x08, 0x09, 0x0A, 0x0B | |
| 34 +}; | |
| 35 + | |
| 36 +/* DC table 1 */ | |
| 37 +LOCAL(const unsigned char) mjpg_dc1_bits[] = { | |
| 38 + 0x00, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, | |
| 39 + 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 | |
| 40 +}; | |
| 41 + | |
| 42 +LOCAL(const unsigned char) mjpg_dc1_huffval[] = { | |
| 43 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |
| 44 + 0x08, 0x09, 0x0A, 0x0B | |
| 45 +}; | |
| 46 + | |
| 47 +/* AC table 0 */ | |
| 48 +LOCAL(const unsigned char) mjpg_ac0_bits[] = { | |
| 49 + 0x00, 0x02, 0x01, 0x03, 0x03, 0x02, 0x04, 0x03, | |
| 50 + 0x05, 0x05, 0x04, 0x04, 0x00, 0x00, 0x01, 0x7D | |
| 51 +}; | |
| 52 + | |
| 53 +LOCAL(const unsigned char) mjpg_ac0_huffval[] = { | |
| 54 + 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, | |
| 55 + 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07, | |
| 56 + 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xA1, 0x08, | |
| 57 + 0x23, 0x42, 0xB1, 0xC1, 0x15, 0x52, 0xD1, 0xF0, | |
| 58 + 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0A, 0x16, | |
| 59 + 0x17, 0x18, 0x19, 0x1A, 0x25, 0x26, 0x27, 0x28, | |
| 60 + 0x29, 0x2A, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, | |
| 61 + 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, | |
| 62 + 0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, | |
| 63 + 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, | |
| 64 + 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, | |
| 65 + 0x7A, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, | |
| 66 + 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, | |
| 67 + 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, | |
| 68 + 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, | |
| 69 + 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3, 0xC4, 0xC5, | |
| 70 + 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4, | |
| 71 + 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xE1, 0xE2, | |
| 72 + 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, | |
| 73 + 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, | |
| 74 + 0xF9, 0xFA | |
| 75 +}; | |
| 76 + | |
| 77 +/* AC table 1 */ | |
| 78 +LOCAL(const unsigned char) mjpg_ac1_bits[] = { | |
| 79 + 0x00, 0x02, 0x01, 0x02, 0x04, 0x04, 0x03, 0x04, | |
| 80 + 0x07, 0x05, 0x04, 0x04, 0x00, 0x01, 0x02, 0x77 | |
| 81 +}; | |
| 82 + | |
| 83 +LOCAL(const unsigned char) mjpg_ac1_huffval[] = { | |
| 84 + 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, | |
| 85 + 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71, | |
| 86 + 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, | |
| 87 + 0xA1, 0xB1, 0xC1, 0x09, 0x23, 0x33, 0x52, 0xF0, | |
| 88 + 0x15, 0x62, 0x72, 0xD1, 0x0A, 0x16, 0x24, 0x34, | |
| 89 + 0xE1, 0x25, 0xF1, 0x17, 0x18, 0x19, 0x1A, 0x26, | |
| 90 + 0x27, 0x28, 0x29, 0x2A, 0x35, 0x36, 0x37, 0x38, | |
| 91 + 0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, | |
| 92 + 0x49, 0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, | |
| 93 + 0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, | |
| 94 + 0x69, 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, | |
| 95 + 0x79, 0x7A, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, | |
| 96 + 0x88, 0x89, 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, | |
| 97 + 0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5, | |
| 98 + 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, | |
| 99 + 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3, | |
| 100 + 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, | |
| 101 + 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, | |
| 102 + 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, | |
| 103 + 0xEA, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, | |
| 104 + 0xF9, 0xFA | |
| 105 +}; | |
| 106 + | |
| 107 +/* Loads the default Huffman tables used by motion JPEG frames. This function | |
| 108 + * just copies the Huffman tables suggested in the JPEG standard into each of | |
| 109 + * the DC/AC table slots 0 and 1 that has not been loaded yet. | |
| 110 + */ | |
| 111 +LOCAL(void) | |
| 112 +mjpg_load_huff_tables (j_decompress_ptr cinfo) | |
| 113 +{ | |
| 114 + JHUFF_TBL *htblptr; | |
| 115 + | |
| 116 + if (! cinfo->dc_huff_tbl_ptrs[0]) { | |
| 117 + htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); | |
| 118 + MEMZERO(htblptr, SIZEOF(JHUFF_TBL)); | |
| 119 + MEMCOPY(&htblptr->bits[1], mjpg_dc0_bits, SIZEOF(mjpg_dc0_bits)); | |
| 120 + MEMCOPY(&htblptr->huffval[0], mjpg_dc0_huffval, SIZEOF(mjpg_dc0_huffval)); | |
| 121 + cinfo->dc_huff_tbl_ptrs[0] = htblptr; | |
| 122 + } | |
| 123 + | |
| 124 + if (! cinfo->dc_huff_tbl_ptrs[1]) { | |
| 125 + htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); | |
| 126 + MEMZERO(htblptr, SIZEOF(JHUFF_TBL)); | |
| 127 + MEMCOPY(&htblptr->bits[1], mjpg_dc1_bits, SIZEOF(mjpg_dc1_bits)); | |
| 128 + MEMCOPY(&htblptr->huffval[0], mjpg_dc1_huffval, SIZEOF(mjpg_dc1_huffval)); | |
| 129 + cinfo->dc_huff_tbl_ptrs[1] = htblptr; | |
| 130 + } | |
| 131 + | |
| 132 + if (! cinfo->ac_huff_tbl_ptrs[0]) { | |
| 133 + htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); | |
| 134 + MEMZERO(htblptr, SIZEOF(JHUFF_TBL)); | |
| 135 + MEMCOPY(&htblptr->bits[1], mjpg_ac0_bits, SIZEOF(mjpg_ac0_bits)); | |
| 136 + MEMCOPY(&htblptr->huffval[0], mjpg_ac0_huffval, SIZEOF(mjpg_ac0_huffval)); | |
| 137 + cinfo->ac_huff_tbl_ptrs[0] = htblptr; | |
| 138 + } | |
| 139 + | |
| 140 + if (! cinfo->ac_huff_tbl_ptrs[1]) { | |
| 141 + htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); | |
| 142 + MEMZERO(htblptr, SIZEOF(JHUFF_TBL)); | |
| 143 + MEMCOPY(&htblptr->bits[1], mjpg_ac1_bits, SIZEOF(mjpg_ac1_bits)); | |
| 144 + MEMCOPY(&htblptr->huffval[0], mjpg_ac1_huffval, SIZEOF(mjpg_ac1_huffval)); | |
| 145 + cinfo->ac_huff_tbl_ptrs[1] = htblptr; | |
| 146 + } | |
| 147 +} | |
| 148 + | |
| 149 +#else | |
| 150 + | |
| 151 +#define mjpg_load_huff_tables(cinfo) | |
| 152 + | |
| 153 +#endif /* MOTION_JPEG_SUPPORTED */ | |
| 154 + | |
| 155 + | |
| 156 /* | |
| 157 * Read markers until SOS or EOI. | |
| 158 * | |
| 159 @@ -1013,6 +1150,7 @@ | |
| 160 break; | |
| 161 | |
| 162 case M_SOS: | |
| 163 + mjpg_load_huff_tables(cinfo); | |
| 164 if (! get_sos(cinfo)) | |
| 165 return JPEG_SUSPENDED; | |
| 166 cinfo->unread_marker = 0; /* processed the marker */ | |
| 167 Index: jmorecfg.h | |
| 168 =================================================================== | |
| 169 --- jmorecfg.h (revision 829) | |
| 170 +++ jmorecfg.h (working copy) | |
| 171 @@ -153,14 +153,18 @@ | |
| 172 /* INT16 must hold at least the values -32768..32767. */ | |
| 173 | |
| 174 #ifndef XMD_H /* X11/xmd.h correctly defines INT16 */ | |
| 175 +#ifndef _BASETSD_H_ /* basetsd.h correctly defines INT16 */ | |
| 176 typedef short INT16; | |
| 177 #endif | |
| 178 +#endif | |
| 179 | |
| 180 /* INT32 must hold at least signed 32-bit values. */ | |
| 181 | |
| 182 #ifndef XMD_H /* X11/xmd.h correctly defines INT32 */ | |
| 183 +#ifndef _BASETSD_H_ /* basetsd.h correctly defines INT32 */ | |
| 184 typedef long INT32; | |
| 185 #endif | |
| 186 +#endif | |
| 187 | |
| 188 /* Datatype used for image dimensions. The JPEG standard only supports | |
| 189 * images up to 64K*64K due to 16-bit fields in SOF markers. Therefore | |
| 190 @@ -210,11 +214,13 @@ | |
| 191 * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol. | |
| 192 */ | |
| 193 | |
| 194 +#ifndef FAR | |
| 195 #ifdef NEED_FAR_POINTERS | |
| 196 #define FAR far | |
| 197 #else | |
| 198 #define FAR | |
| 199 #endif | |
| 200 +#endif | |
| 201 | |
| 202 | |
| 203 /* | |
| 204 Index: jpeglib.h | |
| 205 =================================================================== | |
| 206 --- jpeglib.h (revision 829) | |
| 207 +++ jpeglib.h (working copy) | |
| 208 @@ -15,6 +15,10 @@ | |
| 209 #ifndef JPEGLIB_H | |
| 210 #define JPEGLIB_H | |
| 211 | |
| 212 +/* Begin chromium edits */ | |
| 213 +#include "jpeglibmangler.h" | |
| 214 +/* End chromium edits */ | |
| 215 + | |
| 216 /* | |
| 217 * First we include the configuration files that record how this | |
| 218 * installation of the JPEG library is set up. jconfig.h can be | |
| 219 Index: jpeglibmangler.h | |
| 220 =================================================================== | |
| 221 --- jpeglibmangler.h (revision 0) | |
| 222 +++ jpeglibmangler.h (revision 0) | |
| 223 @@ -0,0 +1,113 @@ | |
| 224 +// Copyright (c) 2009 The Chromium Authors. All rights reserved. | |
| 225 +// Use of this source code is governed by a BSD-style license that can be | |
| 226 +// found in the LICENSE file. | |
| 227 + | |
| 228 +#ifndef THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_ | |
| 229 +#define THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_ | |
| 230 + | |
| 231 +// Mangle all externally visible function names so we can build our own libjpeg | |
| 232 +// without system libraries trying to use it. | |
| 233 + | |
| 234 +#define jpeg_make_c_derived_tbl chromium_jpeg_make_c_derived_tbl | |
| 235 +#define jpeg_gen_optimal_table chromium_jpeg_gen_optimal_table | |
| 236 +#define jpeg_make_d_derived_tbl chromium_jpeg_make_d_derived_tbl | |
| 237 +#define jpeg_fill_bit_buffer chromium_jpeg_fill_bit_buffer | |
| 238 +#define jpeg_huff_decode chromium_jpeg_huff_decode | |
| 239 +#define jpeg_fdct_islow chromium_jpeg_fdct_islow | |
| 240 +#define jpeg_fdct_ifast chromium_jpeg_fdct_ifast | |
| 241 +#define jpeg_fdct_float chromium_jpeg_fdct_float | |
| 242 +#define jpeg_idct_islow chromium_jpeg_idct_islow | |
| 243 +#define jpeg_idct_ifast chromium_jpeg_idct_ifast | |
| 244 +#define jpeg_idct_float chromium_jpeg_idct_float | |
| 245 +#define jpeg_idct_4x4 chromium_jpeg_idct_4x4 | |
| 246 +#define jpeg_idct_2x2 chromium_jpeg_idct_2x2 | |
| 247 +#define jpeg_idct_1x1 chromium_jpeg_idct_1x1 | |
| 248 +#define jinit_compress_master chromium_jinit_compress_master | |
| 249 +#define jinit_c_master_control chromium_jinit_c_master_control | |
| 250 +#define jinit_c_main_controller chromium_jinit_c_main_controller | |
| 251 +#define jinit_c_prep_controller chromium_jinit_c_prep_controller | |
| 252 +#define jinit_c_coef_controller chromium_jinit_c_coef_controller | |
| 253 +#define jinit_color_converter chromium_jinit_color_converter | |
| 254 +#define jinit_downsampler chromium_jinit_downsampler | |
| 255 +#define jinit_forward_dct chromium_jinit_forward_dct | |
| 256 +#define jinit_huff_encoder chromium_jinit_huff_encoder | |
| 257 +#define jinit_phuff_encoder chromium_jinit_phuff_encoder | |
| 258 +#define jinit_marker_writer chromium_jinit_marker_writer | |
| 259 +#define jinit_master_decompress chromium_jinit_master_decompress | |
| 260 +#define jinit_d_main_controller chromium_jinit_d_main_controller | |
| 261 +#define jinit_d_coef_controller chromium_jinit_d_coef_controller | |
| 262 +#define jinit_d_post_controller chromium_jinit_d_post_controller | |
| 263 +#define jinit_input_controller chromium_jinit_input_controller | |
| 264 +#define jinit_marker_reader chromium_jinit_marker_reader | |
| 265 +#define jinit_huff_decoder chromium_jinit_huff_decoder | |
| 266 +#define jinit_phuff_decoder chromium_jinit_phuff_decoder | |
| 267 +#define jinit_inverse_dct chromium_jinit_inverse_dct | |
| 268 +#define jinit_upsampler chromium_jinit_upsampler | |
| 269 +#define jinit_color_deconverter chromium_jinit_color_deconverter | |
| 270 +#define jinit_1pass_quantizer chromium_jinit_1pass_quantizer | |
| 271 +#define jinit_2pass_quantizer chromium_jinit_2pass_quantizer | |
| 272 +#define jinit_merged_upsampler chromium_jinit_merged_upsampler | |
| 273 +#define jinit_memory_mgr chromium_jinit_memory_mgr | |
| 274 +#define jdiv_round_up chromium_jdiv_round_up | |
| 275 +#define jround_up chromium_jround_up | |
| 276 +#define jcopy_sample_rows chromium_jcopy_sample_rows | |
| 277 +#define jcopy_block_row chromium_jcopy_block_row | |
| 278 +#define jzero_far chromium_jzero_far | |
| 279 +#define jpeg_std_error chromium_jpeg_std_error | |
| 280 +#define jpeg_CreateCompress chromium_jpeg_CreateCompress | |
| 281 +#define jpeg_CreateDecompress chromium_jpeg_CreateDecompress | |
| 282 +#define jpeg_destroy_compress chromium_jpeg_destroy_compress | |
| 283 +#define jpeg_destroy_decompress chromium_jpeg_destroy_decompress | |
| 284 +#define jpeg_stdio_dest chromium_jpeg_stdio_dest | |
| 285 +#define jpeg_stdio_src chromium_jpeg_stdio_src | |
| 286 +#define jpeg_set_defaults chromium_jpeg_set_defaults | |
| 287 +#define jpeg_set_colorspace chromium_jpeg_set_colorspace | |
| 288 +#define jpeg_default_colorspace chromium_jpeg_default_colorspace | |
| 289 +#define jpeg_set_quality chromium_jpeg_set_quality | |
| 290 +#define jpeg_set_linear_quality chromium_jpeg_set_linear_quality | |
| 291 +#define jpeg_add_quant_table chromium_jpeg_add_quant_table | |
| 292 +#define jpeg_quality_scaling chromium_jpeg_quality_scaling | |
| 293 +#define jpeg_simple_progression chromium_jpeg_simple_progression | |
| 294 +#define jpeg_suppress_tables chromium_jpeg_suppress_tables | |
| 295 +#define jpeg_alloc_quant_table chromium_jpeg_alloc_quant_table | |
| 296 +#define jpeg_alloc_huff_table chromium_jpeg_alloc_huff_table | |
| 297 +#define jpeg_start_compress chromium_jpeg_start_compress | |
| 298 +#define jpeg_write_scanlines chromium_jpeg_write_scanlines | |
| 299 +#define jpeg_finish_compress chromium_jpeg_finish_compress | |
| 300 +#define jpeg_write_raw_data chromium_jpeg_write_raw_data | |
| 301 +#define jpeg_write_marker chromium_jpeg_write_marker | |
| 302 +#define jpeg_write_m_header chromium_jpeg_write_m_header | |
| 303 +#define jpeg_write_m_byte chromium_jpeg_write_m_byte | |
| 304 +#define jpeg_write_tables chromium_jpeg_write_tables | |
| 305 +#define jpeg_read_header chromium_jpeg_read_header | |
| 306 +#define jpeg_start_decompress chromium_jpeg_start_decompress | |
| 307 +#define jpeg_read_scanlines chromium_jpeg_read_scanlines | |
| 308 +#define jpeg_finish_decompress chromium_jpeg_finish_decompress | |
| 309 +#define jpeg_read_raw_data chromium_jpeg_read_raw_data | |
| 310 +#define jpeg_has_multiple_scans chromium_jpeg_has_multiple_scans | |
| 311 +#define jpeg_start_output chromium_jpeg_start_output | |
| 312 +#define jpeg_finish_output chromium_jpeg_finish_output | |
| 313 +#define jpeg_input_complete chromium_jpeg_input_complete | |
| 314 +#define jpeg_new_colormap chromium_jpeg_new_colormap | |
| 315 +#define jpeg_consume_input chromium_jpeg_consume_input | |
| 316 +#define jpeg_calc_output_dimensions chromium_jpeg_calc_output_dimensions | |
| 317 +#define jpeg_save_markers chromium_jpeg_save_markers | |
| 318 +#define jpeg_set_marker_processor chromium_jpeg_set_marker_processor | |
| 319 +#define jpeg_read_coefficients chromium_jpeg_read_coefficients | |
| 320 +#define jpeg_write_coefficients chromium_jpeg_write_coefficients | |
| 321 +#define jpeg_copy_critical_parameters chromium_jpeg_copy_critical_parameters | |
| 322 +#define jpeg_abort_compress chromium_jpeg_abort_compress | |
| 323 +#define jpeg_abort_decompress chromium_jpeg_abort_decompress | |
| 324 +#define jpeg_abort chromium_jpeg_abort | |
| 325 +#define jpeg_destroy chromium_jpeg_destroy | |
| 326 +#define jpeg_resync_to_restart chromium_jpeg_resync_to_restart | |
| 327 +#define jpeg_get_small chromium_jpeg_get_small | |
| 328 +#define jpeg_free_small chromium_jpeg_free_small | |
| 329 +#define jpeg_get_large chromium_jpeg_get_large | |
| 330 +#define jpeg_free_large chromium_jpeg_free_large | |
| 331 +#define jpeg_mem_available chromium_jpeg_mem_available | |
| 332 +#define jpeg_open_backing_store chromium_jpeg_open_backing_store | |
| 333 +#define jpeg_mem_init chromium_jpeg_mem_init | |
| 334 +#define jpeg_mem_term chromium_jpeg_mem_term | |
| 335 + | |
| 336 +#endif // THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_ | |
| 337 Index: simd/jcgrass2-64.asm | |
| 338 =================================================================== | |
| 339 --- simd/jcgrass2-64.asm (revision 829) | |
| 340 +++ simd/jcgrass2-64.asm (working copy) | |
| 341 @@ -30,7 +30,7 @@ | |
| 342 SECTION SEG_CONST | |
| 343 | |
| 344 alignz 16 | |
| 345 - global EXTN(jconst_rgb_gray_convert_sse2) | |
| 346 + global EXTN(jconst_rgb_gray_convert_sse2) PRIVATE | |
| 347 | |
| 348 EXTN(jconst_rgb_gray_convert_sse2): | |
| 349 | |
| 350 Index: simd/jiss2fst.asm | |
| 351 =================================================================== | |
| 352 --- simd/jiss2fst.asm (revision 829) | |
| 353 +++ simd/jiss2fst.asm (working copy) | |
| 354 @@ -59,7 +59,7 @@ | |
| 355 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
| 356 | |
| 357 alignz 16 | |
| 358 - global EXTN(jconst_idct_ifast_sse2) | |
| 359 + global EXTN(jconst_idct_ifast_sse2) PRIVATE | |
| 360 | |
| 361 EXTN(jconst_idct_ifast_sse2): | |
| 362 | |
| 363 @@ -92,7 +92,7 @@ | |
| 364 %define WK_NUM 2 | |
| 365 | |
| 366 align 16 | |
| 367 - global EXTN(jsimd_idct_ifast_sse2) | |
| 368 + global EXTN(jsimd_idct_ifast_sse2) PRIVATE | |
| 369 | |
| 370 EXTN(jsimd_idct_ifast_sse2): | |
| 371 push ebp | |
| 372 Index: simd/jcclrss2-64.asm | |
| 373 =================================================================== | |
| 374 --- simd/jcclrss2-64.asm (revision 829) | |
| 375 +++ simd/jcclrss2-64.asm (working copy) | |
| 376 @@ -37,7 +37,7 @@ | |
| 377 | |
| 378 align 16 | |
| 379 | |
| 380 - global EXTN(jsimd_rgb_ycc_convert_sse2) | |
| 381 + global EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE | |
| 382 | |
| 383 EXTN(jsimd_rgb_ycc_convert_sse2): | |
| 384 push rbp | |
| 385 Index: simd/jiss2red-64.asm | |
| 386 =================================================================== | |
| 387 --- simd/jiss2red-64.asm (revision 829) | |
| 388 +++ simd/jiss2red-64.asm (working copy) | |
| 389 @@ -73,7 +73,7 @@ | |
| 390 SECTION SEG_CONST | |
| 391 | |
| 392 alignz 16 | |
| 393 - global EXTN(jconst_idct_red_sse2) | |
| 394 + global EXTN(jconst_idct_red_sse2) PRIVATE | |
| 395 | |
| 396 EXTN(jconst_idct_red_sse2): | |
| 397 | |
| 398 @@ -114,7 +114,7 @@ | |
| 399 %define WK_NUM 2 | |
| 400 | |
| 401 align 16 | |
| 402 - global EXTN(jsimd_idct_4x4_sse2) | |
| 403 + global EXTN(jsimd_idct_4x4_sse2) PRIVATE | |
| 404 | |
| 405 EXTN(jsimd_idct_4x4_sse2): | |
| 406 push rbp | |
| 407 @@ -413,7 +413,7 @@ | |
| 408 ; r13 = JDIMENSION output_col | |
| 409 | |
| 410 align 16 | |
| 411 - global EXTN(jsimd_idct_2x2_sse2) | |
| 412 + global EXTN(jsimd_idct_2x2_sse2) PRIVATE | |
| 413 | |
| 414 EXTN(jsimd_idct_2x2_sse2): | |
| 415 push rbp | |
| 416 Index: simd/ji3dnflt.asm | |
| 417 =================================================================== | |
| 418 --- simd/ji3dnflt.asm (revision 829) | |
| 419 +++ simd/ji3dnflt.asm (working copy) | |
| 420 @@ -27,7 +27,7 @@ | |
| 421 SECTION SEG_CONST | |
| 422 | |
| 423 alignz 16 | |
| 424 - global EXTN(jconst_idct_float_3dnow) | |
| 425 + global EXTN(jconst_idct_float_3dnow) PRIVATE | |
| 426 | |
| 427 EXTN(jconst_idct_float_3dnow): | |
| 428 | |
| 429 @@ -63,7 +63,7 @@ | |
| 430 ; FAST_FLOAT workspace[DCTSIZE2] | |
| 431 | |
| 432 align 16 | |
| 433 - global EXTN(jsimd_idct_float_3dnow) | |
| 434 + global EXTN(jsimd_idct_float_3dnow) PRIVATE | |
| 435 | |
| 436 EXTN(jsimd_idct_float_3dnow): | |
| 437 push ebp | |
| 438 Index: simd/jsimdcpu.asm | |
| 439 =================================================================== | |
| 440 --- simd/jsimdcpu.asm (revision 829) | |
| 441 +++ simd/jsimdcpu.asm (working copy) | |
| 442 @@ -29,7 +29,7 @@ | |
| 443 ; | |
| 444 | |
| 445 align 16 | |
| 446 - global EXTN(jpeg_simd_cpu_support) | |
| 447 + global EXTN(jpeg_simd_cpu_support) PRIVATE | |
| 448 | |
| 449 EXTN(jpeg_simd_cpu_support): | |
| 450 push ebx | |
| 451 Index: simd/jdmerss2-64.asm | |
| 452 =================================================================== | |
| 453 --- simd/jdmerss2-64.asm (revision 829) | |
| 454 +++ simd/jdmerss2-64.asm (working copy) | |
| 455 @@ -35,7 +35,7 @@ | |
| 456 SECTION SEG_CONST | |
| 457 | |
| 458 alignz 16 | |
| 459 - global EXTN(jconst_merged_upsample_sse2) | |
| 460 + global EXTN(jconst_merged_upsample_sse2) PRIVATE | |
| 461 | |
| 462 EXTN(jconst_merged_upsample_sse2): | |
| 463 | |
| 464 Index: simd/jdsammmx.asm | |
| 465 =================================================================== | |
| 466 --- simd/jdsammmx.asm (revision 829) | |
| 467 +++ simd/jdsammmx.asm (working copy) | |
| 468 @@ -22,7 +22,7 @@ | |
| 469 SECTION SEG_CONST | |
| 470 | |
| 471 alignz 16 | |
| 472 - global EXTN(jconst_fancy_upsample_mmx) | |
| 473 + global EXTN(jconst_fancy_upsample_mmx) PRIVATE | |
| 474 | |
| 475 EXTN(jconst_fancy_upsample_mmx): | |
| 476 | |
| 477 @@ -58,7 +58,7 @@ | |
| 478 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
| 479 | |
| 480 align 16 | |
| 481 - global EXTN(jsimd_h2v1_fancy_upsample_mmx) | |
| 482 + global EXTN(jsimd_h2v1_fancy_upsample_mmx) PRIVATE | |
| 483 | |
| 484 EXTN(jsimd_h2v1_fancy_upsample_mmx): | |
| 485 push ebp | |
| 486 @@ -216,7 +216,7 @@ | |
| 487 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
| 488 | |
| 489 align 16 | |
| 490 - global EXTN(jsimd_h2v2_fancy_upsample_mmx) | |
| 491 + global EXTN(jsimd_h2v2_fancy_upsample_mmx) PRIVATE | |
| 492 | |
| 493 EXTN(jsimd_h2v2_fancy_upsample_mmx): | |
| 494 push ebp | |
| 495 @@ -542,7 +542,7 @@ | |
| 496 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
| 497 | |
| 498 align 16 | |
| 499 - global EXTN(jsimd_h2v1_upsample_mmx) | |
| 500 + global EXTN(jsimd_h2v1_upsample_mmx) PRIVATE | |
| 501 | |
| 502 EXTN(jsimd_h2v1_upsample_mmx): | |
| 503 push ebp | |
| 504 @@ -643,7 +643,7 @@ | |
| 505 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
| 506 | |
| 507 align 16 | |
| 508 - global EXTN(jsimd_h2v2_upsample_mmx) | |
| 509 + global EXTN(jsimd_h2v2_upsample_mmx) PRIVATE | |
| 510 | |
| 511 EXTN(jsimd_h2v2_upsample_mmx): | |
| 512 push ebp | |
| 513 Index: simd/jdmrgmmx.asm | |
| 514 =================================================================== | |
| 515 --- simd/jdmrgmmx.asm (revision 829) | |
| 516 +++ simd/jdmrgmmx.asm (working copy) | |
| 517 @@ -40,7 +40,7 @@ | |
| 518 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
| 519 | |
| 520 align 16 | |
| 521 - global EXTN(jsimd_h2v1_merged_upsample_mmx) | |
| 522 + global EXTN(jsimd_h2v1_merged_upsample_mmx) PRIVATE | |
| 523 | |
| 524 EXTN(jsimd_h2v1_merged_upsample_mmx): | |
| 525 push ebp | |
| 526 @@ -409,7 +409,7 @@ | |
| 527 %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf | |
| 528 | |
| 529 align 16 | |
| 530 - global EXTN(jsimd_h2v2_merged_upsample_mmx) | |
| 531 + global EXTN(jsimd_h2v2_merged_upsample_mmx) PRIVATE | |
| 532 | |
| 533 EXTN(jsimd_h2v2_merged_upsample_mmx): | |
| 534 push ebp | |
| 535 Index: simd/jdsamss2.asm | |
| 536 =================================================================== | |
| 537 --- simd/jdsamss2.asm (revision 829) | |
| 538 +++ simd/jdsamss2.asm (working copy) | |
| 539 @@ -22,7 +22,7 @@ | |
| 540 SECTION SEG_CONST | |
| 541 | |
| 542 alignz 16 | |
| 543 - global EXTN(jconst_fancy_upsample_sse2) | |
| 544 + global EXTN(jconst_fancy_upsample_sse2) PRIVATE | |
| 545 | |
| 546 EXTN(jconst_fancy_upsample_sse2): | |
| 547 | |
| 548 @@ -58,7 +58,7 @@ | |
| 549 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
| 550 | |
| 551 align 16 | |
| 552 - global EXTN(jsimd_h2v1_fancy_upsample_sse2) | |
| 553 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE | |
| 554 | |
| 555 EXTN(jsimd_h2v1_fancy_upsample_sse2): | |
| 556 push ebp | |
| 557 @@ -214,7 +214,7 @@ | |
| 558 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
| 559 | |
| 560 align 16 | |
| 561 - global EXTN(jsimd_h2v2_fancy_upsample_sse2) | |
| 562 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE | |
| 563 | |
| 564 EXTN(jsimd_h2v2_fancy_upsample_sse2): | |
| 565 push ebp | |
| 566 @@ -538,7 +538,7 @@ | |
| 567 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
| 568 | |
| 569 align 16 | |
| 570 - global EXTN(jsimd_h2v1_upsample_sse2) | |
| 571 + global EXTN(jsimd_h2v1_upsample_sse2) PRIVATE | |
| 572 | |
| 573 EXTN(jsimd_h2v1_upsample_sse2): | |
| 574 push ebp | |
| 575 @@ -637,7 +637,7 @@ | |
| 576 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
| 577 | |
| 578 align 16 | |
| 579 - global EXTN(jsimd_h2v2_upsample_sse2) | |
| 580 + global EXTN(jsimd_h2v2_upsample_sse2) PRIVATE | |
| 581 | |
| 582 EXTN(jsimd_h2v2_upsample_sse2): | |
| 583 push ebp | |
| 584 Index: simd/jiss2flt-64.asm | |
| 585 =================================================================== | |
| 586 --- simd/jiss2flt-64.asm (revision 829) | |
| 587 +++ simd/jiss2flt-64.asm (working copy) | |
| 588 @@ -38,7 +38,7 @@ | |
| 589 SECTION SEG_CONST | |
| 590 | |
| 591 alignz 16 | |
| 592 - global EXTN(jconst_idct_float_sse2) | |
| 593 + global EXTN(jconst_idct_float_sse2) PRIVATE | |
| 594 | |
| 595 EXTN(jconst_idct_float_sse2): | |
| 596 | |
| 597 @@ -74,7 +74,7 @@ | |
| 598 ; FAST_FLOAT workspace[DCTSIZE2] | |
| 599 | |
| 600 align 16 | |
| 601 - global EXTN(jsimd_idct_float_sse2) | |
| 602 + global EXTN(jsimd_idct_float_sse2) PRIVATE | |
| 603 | |
| 604 EXTN(jsimd_idct_float_sse2): | |
| 605 push rbp | |
| 606 Index: simd/jfss2int-64.asm | |
| 607 =================================================================== | |
| 608 --- simd/jfss2int-64.asm (revision 829) | |
| 609 +++ simd/jfss2int-64.asm (working copy) | |
| 610 @@ -67,7 +67,7 @@ | |
| 611 SECTION SEG_CONST | |
| 612 | |
| 613 alignz 16 | |
| 614 - global EXTN(jconst_fdct_islow_sse2) | |
| 615 + global EXTN(jconst_fdct_islow_sse2) PRIVATE | |
| 616 | |
| 617 EXTN(jconst_fdct_islow_sse2): | |
| 618 | |
| 619 @@ -101,7 +101,7 @@ | |
| 620 %define WK_NUM 6 | |
| 621 | |
| 622 align 16 | |
| 623 - global EXTN(jsimd_fdct_islow_sse2) | |
| 624 + global EXTN(jsimd_fdct_islow_sse2) PRIVATE | |
| 625 | |
| 626 EXTN(jsimd_fdct_islow_sse2): | |
| 627 push rbp | |
| 628 Index: simd/jcqnts2f.asm | |
| 629 =================================================================== | |
| 630 --- simd/jcqnts2f.asm (revision 829) | |
| 631 +++ simd/jcqnts2f.asm (working copy) | |
| 632 @@ -35,7 +35,7 @@ | |
| 633 %define workspace ebp+16 ; FAST_FLOAT * workspace | |
| 634 | |
| 635 align 16 | |
| 636 - global EXTN(jsimd_convsamp_float_sse2) | |
| 637 + global EXTN(jsimd_convsamp_float_sse2) PRIVATE | |
| 638 | |
| 639 EXTN(jsimd_convsamp_float_sse2): | |
| 640 push ebp | |
| 641 @@ -115,7 +115,7 @@ | |
| 642 %define workspace ebp+16 ; FAST_FLOAT * workspace | |
| 643 | |
| 644 align 16 | |
| 645 - global EXTN(jsimd_quantize_float_sse2) | |
| 646 + global EXTN(jsimd_quantize_float_sse2) PRIVATE | |
| 647 | |
| 648 EXTN(jsimd_quantize_float_sse2): | |
| 649 push ebp | |
| 650 Index: simd/jdmrgss2.asm | |
| 651 =================================================================== | |
| 652 --- simd/jdmrgss2.asm (revision 829) | |
| 653 +++ simd/jdmrgss2.asm (working copy) | |
| 654 @@ -40,7 +40,7 @@ | |
| 655 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
| 656 | |
| 657 align 16 | |
| 658 - global EXTN(jsimd_h2v1_merged_upsample_sse2) | |
| 659 + global EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE | |
| 660 | |
| 661 EXTN(jsimd_h2v1_merged_upsample_sse2): | |
| 662 push ebp | |
| 663 @@ -560,7 +560,7 @@ | |
| 664 %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf | |
| 665 | |
| 666 align 16 | |
| 667 - global EXTN(jsimd_h2v2_merged_upsample_sse2) | |
| 668 + global EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE | |
| 669 | |
| 670 EXTN(jsimd_h2v2_merged_upsample_sse2): | |
| 671 push ebp | |
| 672 Index: simd/jfmmxint.asm | |
| 673 =================================================================== | |
| 674 --- simd/jfmmxint.asm (revision 829) | |
| 675 +++ simd/jfmmxint.asm (working copy) | |
| 676 @@ -66,7 +66,7 @@ | |
| 677 SECTION SEG_CONST | |
| 678 | |
| 679 alignz 16 | |
| 680 - global EXTN(jconst_fdct_islow_mmx) | |
| 681 + global EXTN(jconst_fdct_islow_mmx) PRIVATE | |
| 682 | |
| 683 EXTN(jconst_fdct_islow_mmx): | |
| 684 | |
| 685 @@ -101,7 +101,7 @@ | |
| 686 %define WK_NUM 2 | |
| 687 | |
| 688 align 16 | |
| 689 - global EXTN(jsimd_fdct_islow_mmx) | |
| 690 + global EXTN(jsimd_fdct_islow_mmx) PRIVATE | |
| 691 | |
| 692 EXTN(jsimd_fdct_islow_mmx): | |
| 693 push ebp | |
| 694 Index: simd/jcgryss2-64.asm | |
| 695 =================================================================== | |
| 696 --- simd/jcgryss2-64.asm (revision 829) | |
| 697 +++ simd/jcgryss2-64.asm (working copy) | |
| 698 @@ -37,7 +37,7 @@ | |
| 699 | |
| 700 align 16 | |
| 701 | |
| 702 - global EXTN(jsimd_rgb_gray_convert_sse2) | |
| 703 + global EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE | |
| 704 | |
| 705 EXTN(jsimd_rgb_gray_convert_sse2): | |
| 706 push rbp | |
| 707 Index: simd/jcqnts2i.asm | |
| 708 =================================================================== | |
| 709 --- simd/jcqnts2i.asm (revision 829) | |
| 710 +++ simd/jcqnts2i.asm (working copy) | |
| 711 @@ -35,7 +35,7 @@ | |
| 712 %define workspace ebp+16 ; DCTELEM * workspace | |
| 713 | |
| 714 align 16 | |
| 715 - global EXTN(jsimd_convsamp_sse2) | |
| 716 + global EXTN(jsimd_convsamp_sse2) PRIVATE | |
| 717 | |
| 718 EXTN(jsimd_convsamp_sse2): | |
| 719 push ebp | |
| 720 @@ -117,7 +117,7 @@ | |
| 721 %define workspace ebp+16 ; DCTELEM * workspace | |
| 722 | |
| 723 align 16 | |
| 724 - global EXTN(jsimd_quantize_sse2) | |
| 725 + global EXTN(jsimd_quantize_sse2) PRIVATE | |
| 726 | |
| 727 EXTN(jsimd_quantize_sse2): | |
| 728 push ebp | |
| 729 Index: simd/jiss2fst-64.asm | |
| 730 =================================================================== | |
| 731 --- simd/jiss2fst-64.asm (revision 829) | |
| 732 +++ simd/jiss2fst-64.asm (working copy) | |
| 733 @@ -60,7 +60,7 @@ | |
| 734 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
| 735 | |
| 736 alignz 16 | |
| 737 - global EXTN(jconst_idct_ifast_sse2) | |
| 738 + global EXTN(jconst_idct_ifast_sse2) PRIVATE | |
| 739 | |
| 740 EXTN(jconst_idct_ifast_sse2): | |
| 741 | |
| 742 @@ -93,7 +93,7 @@ | |
| 743 %define WK_NUM 2 | |
| 744 | |
| 745 align 16 | |
| 746 - global EXTN(jsimd_idct_ifast_sse2) | |
| 747 + global EXTN(jsimd_idct_ifast_sse2) PRIVATE | |
| 748 | |
| 749 EXTN(jsimd_idct_ifast_sse2): | |
| 750 push rbp | |
| 751 Index: simd/jiss2flt.asm | |
| 752 =================================================================== | |
| 753 --- simd/jiss2flt.asm (revision 829) | |
| 754 +++ simd/jiss2flt.asm (working copy) | |
| 755 @@ -37,7 +37,7 @@ | |
| 756 SECTION SEG_CONST | |
| 757 | |
| 758 alignz 16 | |
| 759 - global EXTN(jconst_idct_float_sse2) | |
| 760 + global EXTN(jconst_idct_float_sse2) PRIVATE | |
| 761 | |
| 762 EXTN(jconst_idct_float_sse2): | |
| 763 | |
| 764 @@ -73,7 +73,7 @@ | |
| 765 ; FAST_FLOAT workspace[DCTSIZE2] | |
| 766 | |
| 767 align 16 | |
| 768 - global EXTN(jsimd_idct_float_sse2) | |
| 769 + global EXTN(jsimd_idct_float_sse2) PRIVATE | |
| 770 | |
| 771 EXTN(jsimd_idct_float_sse2): | |
| 772 push ebp | |
| 773 Index: simd/jiss2int.asm | |
| 774 =================================================================== | |
| 775 --- simd/jiss2int.asm (revision 829) | |
| 776 +++ simd/jiss2int.asm (working copy) | |
| 777 @@ -66,7 +66,7 @@ | |
| 778 SECTION SEG_CONST | |
| 779 | |
| 780 alignz 16 | |
| 781 - global EXTN(jconst_idct_islow_sse2) | |
| 782 + global EXTN(jconst_idct_islow_sse2) PRIVATE | |
| 783 | |
| 784 EXTN(jconst_idct_islow_sse2): | |
| 785 | |
| 786 @@ -105,7 +105,7 @@ | |
| 787 %define WK_NUM 12 | |
| 788 | |
| 789 align 16 | |
| 790 - global EXTN(jsimd_idct_islow_sse2) | |
| 791 + global EXTN(jsimd_idct_islow_sse2) PRIVATE | |
| 792 | |
| 793 EXTN(jsimd_idct_islow_sse2): | |
| 794 push ebp | |
| 795 Index: simd/jfsseflt-64.asm | |
| 796 =================================================================== | |
| 797 --- simd/jfsseflt-64.asm (revision 829) | |
| 798 +++ simd/jfsseflt-64.asm (working copy) | |
| 799 @@ -38,7 +38,7 @@ | |
| 800 SECTION SEG_CONST | |
| 801 | |
| 802 alignz 16 | |
| 803 - global EXTN(jconst_fdct_float_sse) | |
| 804 + global EXTN(jconst_fdct_float_sse) PRIVATE | |
| 805 | |
| 806 EXTN(jconst_fdct_float_sse): | |
| 807 | |
| 808 @@ -65,7 +65,7 @@ | |
| 809 %define WK_NUM 2 | |
| 810 | |
| 811 align 16 | |
| 812 - global EXTN(jsimd_fdct_float_sse) | |
| 813 + global EXTN(jsimd_fdct_float_sse) PRIVATE | |
| 814 | |
| 815 EXTN(jsimd_fdct_float_sse): | |
| 816 push rbp | |
| 817 Index: simd/jccolss2-64.asm | |
| 818 =================================================================== | |
| 819 --- simd/jccolss2-64.asm (revision 829) | |
| 820 +++ simd/jccolss2-64.asm (working copy) | |
| 821 @@ -34,7 +34,7 @@ | |
| 822 SECTION SEG_CONST | |
| 823 | |
| 824 alignz 16 | |
| 825 - global EXTN(jconst_rgb_ycc_convert_sse2) | |
| 826 + global EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE | |
| 827 | |
| 828 EXTN(jconst_rgb_ycc_convert_sse2): | |
| 829 | |
| 830 Index: simd/jcsamss2-64.asm | |
| 831 =================================================================== | |
| 832 --- simd/jcsamss2-64.asm (revision 829) | |
| 833 +++ simd/jcsamss2-64.asm (working copy) | |
| 834 @@ -41,7 +41,7 @@ | |
| 835 ; r15 = JSAMPARRAY output_data | |
| 836 | |
| 837 align 16 | |
| 838 - global EXTN(jsimd_h2v1_downsample_sse2) | |
| 839 + global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE | |
| 840 | |
| 841 EXTN(jsimd_h2v1_downsample_sse2): | |
| 842 push rbp | |
| 843 @@ -185,7 +185,7 @@ | |
| 844 ; r15 = JSAMPARRAY output_data | |
| 845 | |
| 846 align 16 | |
| 847 - global EXTN(jsimd_h2v2_downsample_sse2) | |
| 848 + global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE | |
| 849 | |
| 850 EXTN(jsimd_h2v2_downsample_sse2): | |
| 851 push rbp | |
| 852 Index: simd/jdclrss2-64.asm | |
| 853 =================================================================== | |
| 854 --- simd/jdclrss2-64.asm (revision 829) | |
| 855 +++ simd/jdclrss2-64.asm (working copy) | |
| 856 @@ -39,7 +39,7 @@ | |
| 857 %define WK_NUM 2 | |
| 858 | |
| 859 align 16 | |
| 860 - global EXTN(jsimd_ycc_rgb_convert_sse2) | |
| 861 + global EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE | |
| 862 | |
| 863 EXTN(jsimd_ycc_rgb_convert_sse2): | |
| 864 push rbp | |
| 865 Index: simd/jdcolmmx.asm | |
| 866 =================================================================== | |
| 867 --- simd/jdcolmmx.asm (revision 829) | |
| 868 +++ simd/jdcolmmx.asm (working copy) | |
| 869 @@ -35,7 +35,7 @@ | |
| 870 SECTION SEG_CONST | |
| 871 | |
| 872 alignz 16 | |
| 873 - global EXTN(jconst_ycc_rgb_convert_mmx) | |
| 874 + global EXTN(jconst_ycc_rgb_convert_mmx) PRIVATE | |
| 875 | |
| 876 EXTN(jconst_ycc_rgb_convert_mmx): | |
| 877 | |
| 878 Index: simd/jcclrmmx.asm | |
| 879 =================================================================== | |
| 880 --- simd/jcclrmmx.asm (revision 829) | |
| 881 +++ simd/jcclrmmx.asm (working copy) | |
| 882 @@ -40,7 +40,7 @@ | |
| 883 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
| 884 | |
| 885 align 16 | |
| 886 - global EXTN(jsimd_rgb_ycc_convert_mmx) | |
| 887 + global EXTN(jsimd_rgb_ycc_convert_mmx) PRIVATE | |
| 888 | |
| 889 EXTN(jsimd_rgb_ycc_convert_mmx): | |
| 890 push ebp | |
| 891 Index: simd/jfsseflt.asm | |
| 892 =================================================================== | |
| 893 --- simd/jfsseflt.asm (revision 829) | |
| 894 +++ simd/jfsseflt.asm (working copy) | |
| 895 @@ -37,7 +37,7 @@ | |
| 896 SECTION SEG_CONST | |
| 897 | |
| 898 alignz 16 | |
| 899 - global EXTN(jconst_fdct_float_sse) | |
| 900 + global EXTN(jconst_fdct_float_sse) PRIVATE | |
| 901 | |
| 902 EXTN(jconst_fdct_float_sse): | |
| 903 | |
| 904 @@ -65,7 +65,7 @@ | |
| 905 %define WK_NUM 2 | |
| 906 | |
| 907 align 16 | |
| 908 - global EXTN(jsimd_fdct_float_sse) | |
| 909 + global EXTN(jsimd_fdct_float_sse) PRIVATE | |
| 910 | |
| 911 EXTN(jsimd_fdct_float_sse): | |
| 912 push ebp | |
| 913 Index: simd/jdmrgss2-64.asm | |
| 914 =================================================================== | |
| 915 --- simd/jdmrgss2-64.asm (revision 829) | |
| 916 +++ simd/jdmrgss2-64.asm (working copy) | |
| 917 @@ -39,7 +39,7 @@ | |
| 918 %define WK_NUM 3 | |
| 919 | |
| 920 align 16 | |
| 921 - global EXTN(jsimd_h2v1_merged_upsample_sse2) | |
| 922 + global EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE | |
| 923 | |
| 924 EXTN(jsimd_h2v1_merged_upsample_sse2): | |
| 925 push rbp | |
| 926 @@ -543,7 +543,7 @@ | |
| 927 ; r13 = JSAMPARRAY output_buf | |
| 928 | |
| 929 align 16 | |
| 930 - global EXTN(jsimd_h2v2_merged_upsample_sse2) | |
| 931 + global EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE | |
| 932 | |
| 933 EXTN(jsimd_h2v2_merged_upsample_sse2): | |
| 934 push rbp | |
| 935 Index: simd/jdcolss2.asm | |
| 936 =================================================================== | |
| 937 --- simd/jdcolss2.asm (revision 829) | |
| 938 +++ simd/jdcolss2.asm (working copy) | |
| 939 @@ -35,7 +35,7 @@ | |
| 940 SECTION SEG_CONST | |
| 941 | |
| 942 alignz 16 | |
| 943 - global EXTN(jconst_ycc_rgb_convert_sse2) | |
| 944 + global EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE | |
| 945 | |
| 946 EXTN(jconst_ycc_rgb_convert_sse2): | |
| 947 | |
| 948 Index: simd/jdmermmx.asm | |
| 949 =================================================================== | |
| 950 --- simd/jdmermmx.asm (revision 829) | |
| 951 +++ simd/jdmermmx.asm (working copy) | |
| 952 @@ -35,7 +35,7 @@ | |
| 953 SECTION SEG_CONST | |
| 954 | |
| 955 alignz 16 | |
| 956 - global EXTN(jconst_merged_upsample_mmx) | |
| 957 + global EXTN(jconst_merged_upsample_mmx) PRIVATE | |
| 958 | |
| 959 EXTN(jconst_merged_upsample_mmx): | |
| 960 | |
| 961 Index: simd/jcclrss2.asm | |
| 962 =================================================================== | |
| 963 --- simd/jcclrss2.asm (revision 829) | |
| 964 +++ simd/jcclrss2.asm (working copy) | |
| 965 @@ -38,7 +38,7 @@ | |
| 966 | |
| 967 align 16 | |
| 968 | |
| 969 - global EXTN(jsimd_rgb_ycc_convert_sse2) | |
| 970 + global EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE | |
| 971 | |
| 972 EXTN(jsimd_rgb_ycc_convert_sse2): | |
| 973 push ebp | |
| 974 Index: simd/jiss2red.asm | |
| 975 =================================================================== | |
| 976 --- simd/jiss2red.asm (revision 829) | |
| 977 +++ simd/jiss2red.asm (working copy) | |
| 978 @@ -72,7 +72,7 @@ | |
| 979 SECTION SEG_CONST | |
| 980 | |
| 981 alignz 16 | |
| 982 - global EXTN(jconst_idct_red_sse2) | |
| 983 + global EXTN(jconst_idct_red_sse2) PRIVATE | |
| 984 | |
| 985 EXTN(jconst_idct_red_sse2): | |
| 986 | |
| 987 @@ -113,7 +113,7 @@ | |
| 988 %define WK_NUM 2 | |
| 989 | |
| 990 align 16 | |
| 991 - global EXTN(jsimd_idct_4x4_sse2) | |
| 992 + global EXTN(jsimd_idct_4x4_sse2) PRIVATE | |
| 993 | |
| 994 EXTN(jsimd_idct_4x4_sse2): | |
| 995 push ebp | |
| 996 @@ -424,7 +424,7 @@ | |
| 997 %define output_col(b) (b)+20 ; JDIMENSION output_col | |
| 998 | |
| 999 align 16 | |
| 1000 - global EXTN(jsimd_idct_2x2_sse2) | |
| 1001 + global EXTN(jsimd_idct_2x2_sse2) PRIVATE | |
| 1002 | |
| 1003 EXTN(jsimd_idct_2x2_sse2): | |
| 1004 push ebp | |
| 1005 Index: simd/jdmerss2.asm | |
| 1006 =================================================================== | |
| 1007 --- simd/jdmerss2.asm (revision 829) | |
| 1008 +++ simd/jdmerss2.asm (working copy) | |
| 1009 @@ -35,7 +35,7 @@ | |
| 1010 SECTION SEG_CONST | |
| 1011 | |
| 1012 alignz 16 | |
| 1013 - global EXTN(jconst_merged_upsample_sse2) | |
| 1014 + global EXTN(jconst_merged_upsample_sse2) PRIVATE | |
| 1015 | |
| 1016 EXTN(jconst_merged_upsample_sse2): | |
| 1017 | |
| 1018 Index: simd/jfss2fst-64.asm | |
| 1019 =================================================================== | |
| 1020 --- simd/jfss2fst-64.asm (revision 829) | |
| 1021 +++ simd/jfss2fst-64.asm (working copy) | |
| 1022 @@ -53,7 +53,7 @@ | |
| 1023 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
| 1024 | |
| 1025 alignz 16 | |
| 1026 - global EXTN(jconst_fdct_ifast_sse2) | |
| 1027 + global EXTN(jconst_fdct_ifast_sse2) PRIVATE | |
| 1028 | |
| 1029 EXTN(jconst_fdct_ifast_sse2): | |
| 1030 | |
| 1031 @@ -80,7 +80,7 @@ | |
| 1032 %define WK_NUM 2 | |
| 1033 | |
| 1034 align 16 | |
| 1035 - global EXTN(jsimd_fdct_ifast_sse2) | |
| 1036 + global EXTN(jsimd_fdct_ifast_sse2) PRIVATE | |
| 1037 | |
| 1038 EXTN(jsimd_fdct_ifast_sse2): | |
| 1039 push rbp | |
| 1040 Index: simd/jcqntmmx.asm | |
| 1041 =================================================================== | |
| 1042 --- simd/jcqntmmx.asm (revision 829) | |
| 1043 +++ simd/jcqntmmx.asm (working copy) | |
| 1044 @@ -35,7 +35,7 @@ | |
| 1045 %define workspace ebp+16 ; DCTELEM * workspace | |
| 1046 | |
| 1047 align 16 | |
| 1048 - global EXTN(jsimd_convsamp_mmx) | |
| 1049 + global EXTN(jsimd_convsamp_mmx) PRIVATE | |
| 1050 | |
| 1051 EXTN(jsimd_convsamp_mmx): | |
| 1052 push ebp | |
| 1053 @@ -140,7 +140,7 @@ | |
| 1054 %define workspace ebp+16 ; DCTELEM * workspace | |
| 1055 | |
| 1056 align 16 | |
| 1057 - global EXTN(jsimd_quantize_mmx) | |
| 1058 + global EXTN(jsimd_quantize_mmx) PRIVATE | |
| 1059 | |
| 1060 EXTN(jsimd_quantize_mmx): | |
| 1061 push ebp | |
| 1062 Index: simd/jimmxfst.asm | |
| 1063 =================================================================== | |
| 1064 --- simd/jimmxfst.asm (revision 829) | |
| 1065 +++ simd/jimmxfst.asm (working copy) | |
| 1066 @@ -59,7 +59,7 @@ | |
| 1067 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
| 1068 | |
| 1069 alignz 16 | |
| 1070 - global EXTN(jconst_idct_ifast_mmx) | |
| 1071 + global EXTN(jconst_idct_ifast_mmx) PRIVATE | |
| 1072 | |
| 1073 EXTN(jconst_idct_ifast_mmx): | |
| 1074 | |
| 1075 @@ -94,7 +94,7 @@ | |
| 1076 ; JCOEF workspace[DCTSIZE2] | |
| 1077 | |
| 1078 align 16 | |
| 1079 - global EXTN(jsimd_idct_ifast_mmx) | |
| 1080 + global EXTN(jsimd_idct_ifast_mmx) PRIVATE | |
| 1081 | |
| 1082 EXTN(jsimd_idct_ifast_mmx): | |
| 1083 push ebp | |
| 1084 Index: simd/jfss2fst.asm | |
| 1085 =================================================================== | |
| 1086 --- simd/jfss2fst.asm (revision 829) | |
| 1087 +++ simd/jfss2fst.asm (working copy) | |
| 1088 @@ -52,7 +52,7 @@ | |
| 1089 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
| 1090 | |
| 1091 alignz 16 | |
| 1092 - global EXTN(jconst_fdct_ifast_sse2) | |
| 1093 + global EXTN(jconst_fdct_ifast_sse2) PRIVATE | |
| 1094 | |
| 1095 EXTN(jconst_fdct_ifast_sse2): | |
| 1096 | |
| 1097 @@ -80,7 +80,7 @@ | |
| 1098 %define WK_NUM 2 | |
| 1099 | |
| 1100 align 16 | |
| 1101 - global EXTN(jsimd_fdct_ifast_sse2) | |
| 1102 + global EXTN(jsimd_fdct_ifast_sse2) PRIVATE | |
| 1103 | |
| 1104 EXTN(jsimd_fdct_ifast_sse2): | |
| 1105 push ebp | |
| 1106 Index: simd/jcgrammx.asm | |
| 1107 =================================================================== | |
| 1108 --- simd/jcgrammx.asm (revision 829) | |
| 1109 +++ simd/jcgrammx.asm (working copy) | |
| 1110 @@ -33,7 +33,7 @@ | |
| 1111 SECTION SEG_CONST | |
| 1112 | |
| 1113 alignz 16 | |
| 1114 - global EXTN(jconst_rgb_gray_convert_mmx) | |
| 1115 + global EXTN(jconst_rgb_gray_convert_mmx) PRIVATE | |
| 1116 | |
| 1117 EXTN(jconst_rgb_gray_convert_mmx): | |
| 1118 | |
| 1119 Index: simd/jdcolss2-64.asm | |
| 1120 =================================================================== | |
| 1121 --- simd/jdcolss2-64.asm (revision 829) | |
| 1122 +++ simd/jdcolss2-64.asm (working copy) | |
| 1123 @@ -35,7 +35,7 @@ | |
| 1124 SECTION SEG_CONST | |
| 1125 | |
| 1126 alignz 16 | |
| 1127 - global EXTN(jconst_ycc_rgb_convert_sse2) | |
| 1128 + global EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE | |
| 1129 | |
| 1130 EXTN(jconst_ycc_rgb_convert_sse2): | |
| 1131 | |
| 1132 Index: simd/jf3dnflt.asm | |
| 1133 =================================================================== | |
| 1134 --- simd/jf3dnflt.asm (revision 829) | |
| 1135 +++ simd/jf3dnflt.asm (working copy) | |
| 1136 @@ -27,7 +27,7 @@ | |
| 1137 SECTION SEG_CONST | |
| 1138 | |
| 1139 alignz 16 | |
| 1140 - global EXTN(jconst_fdct_float_3dnow) | |
| 1141 + global EXTN(jconst_fdct_float_3dnow) PRIVATE | |
| 1142 | |
| 1143 EXTN(jconst_fdct_float_3dnow): | |
| 1144 | |
| 1145 @@ -55,7 +55,7 @@ | |
| 1146 %define WK_NUM 2 | |
| 1147 | |
| 1148 align 16 | |
| 1149 - global EXTN(jsimd_fdct_float_3dnow) | |
| 1150 + global EXTN(jsimd_fdct_float_3dnow) PRIVATE | |
| 1151 | |
| 1152 EXTN(jsimd_fdct_float_3dnow): | |
| 1153 push ebp | |
| 1154 Index: simd/jdsamss2-64.asm | |
| 1155 =================================================================== | |
| 1156 --- simd/jdsamss2-64.asm (revision 829) | |
| 1157 +++ simd/jdsamss2-64.asm (working copy) | |
| 1158 @@ -23,7 +23,7 @@ | |
| 1159 SECTION SEG_CONST | |
| 1160 | |
| 1161 alignz 16 | |
| 1162 - global EXTN(jconst_fancy_upsample_sse2) | |
| 1163 + global EXTN(jconst_fancy_upsample_sse2) PRIVATE | |
| 1164 | |
| 1165 EXTN(jconst_fancy_upsample_sse2): | |
| 1166 | |
| 1167 @@ -59,7 +59,7 @@ | |
| 1168 ; r13 = JSAMPARRAY * output_data_ptr | |
| 1169 | |
| 1170 align 16 | |
| 1171 - global EXTN(jsimd_h2v1_fancy_upsample_sse2) | |
| 1172 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE | |
| 1173 | |
| 1174 EXTN(jsimd_h2v1_fancy_upsample_sse2): | |
| 1175 push rbp | |
| 1176 @@ -201,7 +201,7 @@ | |
| 1177 %define WK_NUM 4 | |
| 1178 | |
| 1179 align 16 | |
| 1180 - global EXTN(jsimd_h2v2_fancy_upsample_sse2) | |
| 1181 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE | |
| 1182 | |
| 1183 EXTN(jsimd_h2v2_fancy_upsample_sse2): | |
| 1184 push rbp | |
| 1185 @@ -498,7 +498,7 @@ | |
| 1186 ; r13 = JSAMPARRAY * output_data_ptr | |
| 1187 | |
| 1188 align 16 | |
| 1189 - global EXTN(jsimd_h2v1_upsample_sse2) | |
| 1190 + global EXTN(jsimd_h2v1_upsample_sse2) PRIVATE | |
| 1191 | |
| 1192 EXTN(jsimd_h2v1_upsample_sse2): | |
| 1193 push rbp | |
| 1194 @@ -587,7 +587,7 @@ | |
| 1195 ; r13 = JSAMPARRAY * output_data_ptr | |
| 1196 | |
| 1197 align 16 | |
| 1198 - global EXTN(jsimd_h2v2_upsample_sse2) | |
| 1199 + global EXTN(jsimd_h2v2_upsample_sse2) PRIVATE | |
| 1200 | |
| 1201 EXTN(jsimd_h2v2_upsample_sse2): | |
| 1202 push rbp | |
| 1203 Index: simd/jcgrass2.asm | |
| 1204 =================================================================== | |
| 1205 --- simd/jcgrass2.asm (revision 829) | |
| 1206 +++ simd/jcgrass2.asm (working copy) | |
| 1207 @@ -30,7 +30,7 @@ | |
| 1208 SECTION SEG_CONST | |
| 1209 | |
| 1210 alignz 16 | |
| 1211 - global EXTN(jconst_rgb_gray_convert_sse2) | |
| 1212 + global EXTN(jconst_rgb_gray_convert_sse2) PRIVATE | |
| 1213 | |
| 1214 EXTN(jconst_rgb_gray_convert_sse2): | |
| 1215 | |
| 1216 Index: simd/jcsammmx.asm | |
| 1217 =================================================================== | |
| 1218 --- simd/jcsammmx.asm (revision 829) | |
| 1219 +++ simd/jcsammmx.asm (working copy) | |
| 1220 @@ -40,7 +40,7 @@ | |
| 1221 %define output_data(b) (b)+28 ; JSAMPARRAY output_data | |
| 1222 | |
| 1223 align 16 | |
| 1224 - global EXTN(jsimd_h2v1_downsample_mmx) | |
| 1225 + global EXTN(jsimd_h2v1_downsample_mmx) PRIVATE | |
| 1226 | |
| 1227 EXTN(jsimd_h2v1_downsample_mmx): | |
| 1228 push ebp | |
| 1229 @@ -182,7 +182,7 @@ | |
| 1230 %define output_data(b) (b)+28 ; JSAMPARRAY output_data | |
| 1231 | |
| 1232 align 16 | |
| 1233 - global EXTN(jsimd_h2v2_downsample_mmx) | |
| 1234 + global EXTN(jsimd_h2v2_downsample_mmx) PRIVATE | |
| 1235 | |
| 1236 EXTN(jsimd_h2v2_downsample_mmx): | |
| 1237 push ebp | |
| 1238 +Index: simd/jsimd_arm.c | |
| 1239 +=================================================================== | |
| 1240 +--- simd/jsimd_arm.c (revision 272637) | |
| 1241 ++++ simd/jsimd_arm.c (working copy) | |
| 1242 +@@ -29,0 +29,0 @@ | |
| 1243 + | |
| 1244 + static unsigned int simd_support = ~0; | |
| 1245 + | |
| 1246 +-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) | |
| 1247 ++#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defin
ed(__ANDROID__)) | |
| 1248 + | |
| 1249 + #define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) | |
| 1250 + | |
| 1251 +@@ -100,6 +100,6 @@ | |
| 1252 + init_simd (void) | |
| 1253 + { | |
| 1254 + char *env = NULL; | |
| 1255 +-#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || define
d(__ANDROID__) | |
| 1256 ++#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defin
ed(__ANDROID__)) | |
| 1257 + int bufsize = 1024; /* an initial guess for the line buffer size limit */ | |
| 1258 + #endif | |
| 1259 + | |
| 1260 Index: simd/jsimd_arm_neon.S | |
| 1261 =================================================================== | |
| 1262 --- simd/jsimd_arm_neon.S (revision 272637) | |
| 1263 +++ simd/jsimd_arm_neon.S (working copy) | |
| 1264 @@ -41,11 +41,9 @@ | |
| 1265 /* Supplementary macro for setting function attributes */ | |
| 1266 .macro asm_function fname | |
| 1267 #ifdef __APPLE__ | |
| 1268 - .func _\fname | |
| 1269 .globl _\fname | |
| 1270 _\fname: | |
| 1271 #else | |
| 1272 - .func \fname | |
| 1273 .global \fname | |
| 1274 #ifdef __ELF__ | |
| 1275 .hidden \fname | |
| 1276 @@ -670,7 +668,6 @@ | |
| 1277 .unreq ROW6R | |
| 1278 .unreq ROW7L | |
| 1279 .unreq ROW7R | |
| 1280 -.endfunc | |
| 1281 | |
| 1282 | |
| 1283 /*****************************************************************************/ | |
| 1284 @@ -895,7 +892,6 @@ | |
| 1285 .unreq TMP2 | |
| 1286 .unreq TMP3 | |
| 1287 .unreq TMP4 | |
| 1288 -.endfunc | |
| 1289 | |
| 1290 | |
| 1291 /*****************************************************************************/ | |
| 1292 @@ -1108,7 +1104,6 @@ | |
| 1293 .unreq TMP2 | |
| 1294 .unreq TMP3 | |
| 1295 .unreq TMP4 | |
| 1296 -.endfunc | |
| 1297 | |
| 1298 .purgem idct_helper | |
| 1299 | |
| 1300 @@ -1263,7 +1258,6 @@ | |
| 1301 .unreq OUTPUT_COL | |
| 1302 .unreq TMP1 | |
| 1303 .unreq TMP2 | |
| 1304 -.endfunc | |
| 1305 | |
| 1306 .purgem idct_helper | |
| 1307 | |
| 1308 @@ -1547,7 +1541,6 @@ | |
| 1309 .unreq U | |
| 1310 .unreq V | |
| 1311 .unreq N | |
| 1312 -.endfunc | |
| 1313 | |
| 1314 .purgem do_yuv_to_rgb | |
| 1315 .purgem do_yuv_to_rgb_stage1 | |
| 1316 @@ -1858,7 +1851,6 @@ | |
| 1317 .unreq U | |
| 1318 .unreq V | |
| 1319 .unreq N | |
| 1320 -.endfunc | |
| 1321 | |
| 1322 .purgem do_rgb_to_yuv | |
| 1323 .purgem do_rgb_to_yuv_stage1 | |
| 1324 @@ -1940,7 +1932,6 @@ | |
| 1325 .unreq TMP2 | |
| 1326 .unreq TMP3 | |
| 1327 .unreq TMP4 | |
| 1328 -.endfunc | |
| 1329 | |
| 1330 | |
| 1331 /*****************************************************************************/ | |
| 1332 @@ -2064,7 +2055,6 @@ | |
| 1333 | |
| 1334 .unreq DATA | |
| 1335 .unreq TMP | |
| 1336 -.endfunc | |
| 1337 | |
| 1338 | |
| 1339 /*****************************************************************************/ | |
| 1340 @@ -2166,7 +2156,6 @@ | |
| 1341 .unreq CORRECTION | |
| 1342 .unreq SHIFT | |
| 1343 .unreq LOOP_COUNT | |
| 1344 -.endfunc | |
| 1345 | |
| 1346 | |
| 1347 /*****************************************************************************/ | |
| 1348 @@ -2401,7 +2390,6 @@ | |
| 1349 .unreq WIDTH | |
| 1350 .unreq TMP | |
| 1351 | |
| 1352 -.endfunc | |
| 1353 | |
| 1354 .purgem upsample16 | |
| 1355 .purgem upsample32 | |
| 1356 Index: simd/jsimd_i386.c | |
| 1357 =================================================================== | |
| 1358 --- simd/jsimd_i386.c (revision 829) | |
| 1359 +++ simd/jsimd_i386.c (working copy) | |
| 1360 @@ -61,6 +61,7 @@ | |
| 1361 simd_support &= JSIMD_SSE2; | |
| 1362 } | |
| 1363 | |
| 1364 +#ifndef JPEG_DECODE_ONLY | |
| 1365 GLOBAL(int) | |
| 1366 jsimd_can_rgb_ycc (void) | |
| 1367 { | |
| 1368 @@ -82,6 +83,7 @@ | |
| 1369 | |
| 1370 return 0; | |
| 1371 } | |
| 1372 +#endif | |
| 1373 | |
| 1374 GLOBAL(int) | |
| 1375 jsimd_can_rgb_gray (void) | |
| 1376 @@ -127,6 +129,7 @@ | |
| 1377 return 0; | |
| 1378 } | |
| 1379 | |
| 1380 +#ifndef JPEG_DECODE_ONLY | |
| 1381 GLOBAL(void) | |
| 1382 jsimd_rgb_ycc_convert (j_compress_ptr cinfo, | |
| 1383 JSAMPARRAY input_buf, JSAMPIMAGE output_buf, | |
| 1384 @@ -179,6 +182,7 @@ | |
| 1385 mmxfct(cinfo->image_width, input_buf, | |
| 1386 output_buf, output_row, num_rows); | |
| 1387 } | |
| 1388 +#endif | |
| 1389 | |
| 1390 GLOBAL(void) | |
| 1391 jsimd_rgb_gray_convert (j_compress_ptr cinfo, | |
| 1392 @@ -286,6 +290,7 @@ | |
| 1393 input_row, output_buf, num_rows); | |
| 1394 } | |
| 1395 | |
| 1396 +#ifndef JPEG_DECODE_ONLY | |
| 1397 GLOBAL(int) | |
| 1398 jsimd_can_h2v2_downsample (void) | |
| 1399 { | |
| 1400 @@ -351,6 +356,7 @@ | |
| 1401 compptr->v_samp_factor, compptr->width_in_blocks, | |
| 1402 input_data, output_data); | |
| 1403 } | |
| 1404 +#endif | |
| 1405 | |
| 1406 GLOBAL(int) | |
| 1407 jsimd_can_h2v2_upsample (void) | |
| 1408 @@ -636,6 +642,7 @@ | |
| 1409 in_row_group_ctr, output_buf); | |
| 1410 } | |
| 1411 | |
| 1412 +#ifndef JPEG_DECODE_ONLY | |
| 1413 GLOBAL(int) | |
| 1414 jsimd_can_convsamp (void) | |
| 1415 { | |
| 1416 @@ -855,6 +862,7 @@ | |
| 1417 else if (simd_support & JSIMD_3DNOW) | |
| 1418 jsimd_quantize_float_3dnow(coef_block, divisors, workspace); | |
| 1419 } | |
| 1420 +#endif | |
| 1421 | |
| 1422 GLOBAL(int) | |
| 1423 jsimd_can_idct_2x2 (void) | |
| 1424 @@ -1045,4 +1053,3 @@ | |
| 1425 jsimd_idct_float_3dnow(compptr->dct_table, coef_block, | |
| 1426 output_buf, output_col); | |
| 1427 } | |
| 1428 - | |
| 1429 Index: simd/jcqnts2f-64.asm | |
| 1430 =================================================================== | |
| 1431 --- simd/jcqnts2f-64.asm (revision 829) | |
| 1432 +++ simd/jcqnts2f-64.asm (working copy) | |
| 1433 @@ -36,7 +36,7 @@ | |
| 1434 ; r12 = FAST_FLOAT * workspace | |
| 1435 | |
| 1436 align 16 | |
| 1437 - global EXTN(jsimd_convsamp_float_sse2) | |
| 1438 + global EXTN(jsimd_convsamp_float_sse2) PRIVATE | |
| 1439 | |
| 1440 EXTN(jsimd_convsamp_float_sse2): | |
| 1441 push rbp | |
| 1442 @@ -110,7 +110,7 @@ | |
| 1443 ; r12 = FAST_FLOAT * workspace | |
| 1444 | |
| 1445 align 16 | |
| 1446 - global EXTN(jsimd_quantize_float_sse2) | |
| 1447 + global EXTN(jsimd_quantize_float_sse2) PRIVATE | |
| 1448 | |
| 1449 EXTN(jsimd_quantize_float_sse2): | |
| 1450 push rbp | |
| 1451 Index: simd/jcqnt3dn.asm | |
| 1452 =================================================================== | |
| 1453 --- simd/jcqnt3dn.asm (revision 829) | |
| 1454 +++ simd/jcqnt3dn.asm (working copy) | |
| 1455 @@ -35,7 +35,7 @@ | |
| 1456 %define workspace ebp+16 ; FAST_FLOAT * workspace | |
| 1457 | |
| 1458 align 16 | |
| 1459 - global EXTN(jsimd_convsamp_float_3dnow) | |
| 1460 + global EXTN(jsimd_convsamp_float_3dnow) PRIVATE | |
| 1461 | |
| 1462 EXTN(jsimd_convsamp_float_3dnow): | |
| 1463 push ebp | |
| 1464 @@ -138,7 +138,7 @@ | |
| 1465 %define workspace ebp+16 ; FAST_FLOAT * workspace | |
| 1466 | |
| 1467 align 16 | |
| 1468 - global EXTN(jsimd_quantize_float_3dnow) | |
| 1469 + global EXTN(jsimd_quantize_float_3dnow) PRIVATE | |
| 1470 | |
| 1471 EXTN(jsimd_quantize_float_3dnow): | |
| 1472 push ebp | |
| 1473 Index: simd/jcsamss2.asm | |
| 1474 =================================================================== | |
| 1475 --- simd/jcsamss2.asm (revision 829) | |
| 1476 +++ simd/jcsamss2.asm (working copy) | |
| 1477 @@ -40,7 +40,7 @@ | |
| 1478 %define output_data(b) (b)+28 ; JSAMPARRAY output_data | |
| 1479 | |
| 1480 align 16 | |
| 1481 - global EXTN(jsimd_h2v1_downsample_sse2) | |
| 1482 + global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE | |
| 1483 | |
| 1484 EXTN(jsimd_h2v1_downsample_sse2): | |
| 1485 push ebp | |
| 1486 @@ -195,7 +195,7 @@ | |
| 1487 %define output_data(b) (b)+28 ; JSAMPARRAY output_data | |
| 1488 | |
| 1489 align 16 | |
| 1490 - global EXTN(jsimd_h2v2_downsample_sse2) | |
| 1491 + global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE | |
| 1492 | |
| 1493 EXTN(jsimd_h2v2_downsample_sse2): | |
| 1494 push ebp | |
| 1495 Index: simd/jsimd_x86_64.c | |
| 1496 =================================================================== | |
| 1497 --- simd/jsimd_x86_64.c (revision 829) | |
| 1498 +++ simd/jsimd_x86_64.c (working copy) | |
| 1499 @@ -29,6 +29,7 @@ | |
| 1500 | |
| 1501 #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ | |
| 1502 | |
| 1503 +#ifndef JPEG_DECODE_ONLY | |
| 1504 GLOBAL(int) | |
| 1505 jsimd_can_rgb_ycc (void) | |
| 1506 { | |
| 1507 @@ -45,6 +46,7 @@ | |
| 1508 | |
| 1509 return 1; | |
| 1510 } | |
| 1511 +#endif | |
| 1512 | |
| 1513 GLOBAL(int) | |
| 1514 jsimd_can_rgb_gray (void) | |
| 1515 @@ -80,6 +82,7 @@ | |
| 1516 return 1; | |
| 1517 } | |
| 1518 | |
| 1519 +#ifndef JPEG_DECODE_ONLY | |
| 1520 GLOBAL(void) | |
| 1521 jsimd_rgb_ycc_convert (j_compress_ptr cinfo, | |
| 1522 JSAMPARRAY input_buf, JSAMPIMAGE output_buf, | |
| 1523 @@ -118,6 +121,7 @@ | |
| 1524 | |
| 1525 sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); | |
| 1526 } | |
| 1527 +#endif | |
| 1528 | |
| 1529 GLOBAL(void) | |
| 1530 jsimd_rgb_gray_convert (j_compress_ptr cinfo, | |
| 1531 @@ -197,6 +201,7 @@ | |
| 1532 sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); | |
| 1533 } | |
| 1534 | |
| 1535 +#ifndef JPEG_DECODE_ONLY | |
| 1536 GLOBAL(int) | |
| 1537 jsimd_can_h2v2_downsample (void) | |
| 1538 { | |
| 1539 @@ -242,6 +247,7 @@ | |
| 1540 compptr->width_in_blocks, | |
| 1541 input_data, output_data); | |
| 1542 } | |
| 1543 +#endif | |
| 1544 | |
| 1545 GLOBAL(int) | |
| 1546 jsimd_can_h2v2_upsample (void) | |
| 1547 @@ -451,6 +457,7 @@ | |
| 1548 sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); | |
| 1549 } | |
| 1550 | |
| 1551 +#ifndef JPEG_DECODE_ONLY | |
| 1552 GLOBAL(int) | |
| 1553 jsimd_can_convsamp (void) | |
| 1554 { | |
| 1555 @@ -601,6 +608,7 @@ | |
| 1556 { | |
| 1557 jsimd_quantize_float_sse2(coef_block, divisors, workspace); | |
| 1558 } | |
| 1559 +#endif | |
| 1560 | |
| 1561 GLOBAL(int) | |
| 1562 jsimd_can_idct_2x2 (void) | |
| 1563 @@ -750,4 +758,3 @@ | |
| 1564 jsimd_idct_float_sse2(compptr->dct_table, coef_block, | |
| 1565 output_buf, output_col); | |
| 1566 } | |
| 1567 - | |
| 1568 Index: simd/jimmxint.asm | |
| 1569 =================================================================== | |
| 1570 --- simd/jimmxint.asm (revision 829) | |
| 1571 +++ simd/jimmxint.asm (working copy) | |
| 1572 @@ -66,7 +66,7 @@ | |
| 1573 SECTION SEG_CONST | |
| 1574 | |
| 1575 alignz 16 | |
| 1576 - global EXTN(jconst_idct_islow_mmx) | |
| 1577 + global EXTN(jconst_idct_islow_mmx) PRIVATE | |
| 1578 | |
| 1579 EXTN(jconst_idct_islow_mmx): | |
| 1580 | |
| 1581 @@ -107,7 +107,7 @@ | |
| 1582 ; JCOEF workspace[DCTSIZE2] | |
| 1583 | |
| 1584 align 16 | |
| 1585 - global EXTN(jsimd_idct_islow_mmx) | |
| 1586 + global EXTN(jsimd_idct_islow_mmx) PRIVATE | |
| 1587 | |
| 1588 EXTN(jsimd_idct_islow_mmx): | |
| 1589 push ebp | |
| 1590 Index: simd/jcgrymmx.asm | |
| 1591 =================================================================== | |
| 1592 --- simd/jcgrymmx.asm (revision 829) | |
| 1593 +++ simd/jcgrymmx.asm (working copy) | |
| 1594 @@ -41,7 +41,7 @@ | |
| 1595 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
| 1596 | |
| 1597 align 16 | |
| 1598 - global EXTN(jsimd_rgb_gray_convert_mmx) | |
| 1599 + global EXTN(jsimd_rgb_gray_convert_mmx) PRIVATE | |
| 1600 | |
| 1601 EXTN(jsimd_rgb_gray_convert_mmx): | |
| 1602 push ebp | |
| 1603 Index: simd/jfss2int.asm | |
| 1604 =================================================================== | |
| 1605 --- simd/jfss2int.asm (revision 829) | |
| 1606 +++ simd/jfss2int.asm (working copy) | |
| 1607 @@ -66,7 +66,7 @@ | |
| 1608 SECTION SEG_CONST | |
| 1609 | |
| 1610 alignz 16 | |
| 1611 - global EXTN(jconst_fdct_islow_sse2) | |
| 1612 + global EXTN(jconst_fdct_islow_sse2) PRIVATE | |
| 1613 | |
| 1614 EXTN(jconst_fdct_islow_sse2): | |
| 1615 | |
| 1616 @@ -101,7 +101,7 @@ | |
| 1617 %define WK_NUM 6 | |
| 1618 | |
| 1619 align 16 | |
| 1620 - global EXTN(jsimd_fdct_islow_sse2) | |
| 1621 + global EXTN(jsimd_fdct_islow_sse2) PRIVATE | |
| 1622 | |
| 1623 EXTN(jsimd_fdct_islow_sse2): | |
| 1624 push ebp | |
| 1625 Index: simd/jcgryss2.asm | |
| 1626 =================================================================== | |
| 1627 --- simd/jcgryss2.asm (revision 829) | |
| 1628 +++ simd/jcgryss2.asm (working copy) | |
| 1629 @@ -39,7 +39,7 @@ | |
| 1630 | |
| 1631 align 16 | |
| 1632 | |
| 1633 - global EXTN(jsimd_rgb_gray_convert_sse2) | |
| 1634 + global EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE | |
| 1635 | |
| 1636 EXTN(jsimd_rgb_gray_convert_sse2): | |
| 1637 push ebp | |
| 1638 Index: simd/jccolmmx.asm | |
| 1639 =================================================================== | |
| 1640 --- simd/jccolmmx.asm (revision 829) | |
| 1641 +++ simd/jccolmmx.asm (working copy) | |
| 1642 @@ -37,7 +37,7 @@ | |
| 1643 SECTION SEG_CONST | |
| 1644 | |
| 1645 alignz 16 | |
| 1646 - global EXTN(jconst_rgb_ycc_convert_mmx) | |
| 1647 + global EXTN(jconst_rgb_ycc_convert_mmx) PRIVATE | |
| 1648 | |
| 1649 EXTN(jconst_rgb_ycc_convert_mmx): | |
| 1650 | |
| 1651 Index: simd/jimmxred.asm | |
| 1652 =================================================================== | |
| 1653 --- simd/jimmxred.asm (revision 829) | |
| 1654 +++ simd/jimmxred.asm (working copy) | |
| 1655 @@ -72,7 +72,7 @@ | |
| 1656 SECTION SEG_CONST | |
| 1657 | |
| 1658 alignz 16 | |
| 1659 - global EXTN(jconst_idct_red_mmx) | |
| 1660 + global EXTN(jconst_idct_red_mmx) PRIVATE | |
| 1661 | |
| 1662 EXTN(jconst_idct_red_mmx): | |
| 1663 | |
| 1664 @@ -115,7 +115,7 @@ | |
| 1665 ; JCOEF workspace[DCTSIZE2] | |
| 1666 | |
| 1667 align 16 | |
| 1668 - global EXTN(jsimd_idct_4x4_mmx) | |
| 1669 + global EXTN(jsimd_idct_4x4_mmx) PRIVATE | |
| 1670 | |
| 1671 EXTN(jsimd_idct_4x4_mmx): | |
| 1672 push ebp | |
| 1673 @@ -503,7 +503,7 @@ | |
| 1674 %define output_col(b) (b)+20 ; JDIMENSION output_col | |
| 1675 | |
| 1676 align 16 | |
| 1677 - global EXTN(jsimd_idct_2x2_mmx) | |
| 1678 + global EXTN(jsimd_idct_2x2_mmx) PRIVATE | |
| 1679 | |
| 1680 EXTN(jsimd_idct_2x2_mmx): | |
| 1681 push ebp | |
| 1682 Index: simd/jsimdext.inc | |
| 1683 =================================================================== | |
| 1684 --- simd/jsimdext.inc (revision 829) | |
| 1685 +++ simd/jsimdext.inc (working copy) | |
| 1686 @@ -73,6 +73,9 @@ | |
| 1687 ; * *BSD family Unix using elf format | |
| 1688 ; * Unix System V, including Solaris x86, UnixWare and SCO Unix | |
| 1689 | |
| 1690 +; PIC is the default on Linux | |
| 1691 +%define PIC | |
| 1692 + | |
| 1693 ; mark stack as non-executable | |
| 1694 section .note.GNU-stack noalloc noexec nowrite progbits | |
| 1695 | |
| 1696 @@ -375,4 +378,14 @@ | |
| 1697 ; | |
| 1698 %include "jsimdcfg.inc" | |
| 1699 | |
| 1700 +; Begin chromium edits | |
| 1701 +%ifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- | |
| 1702 +%define PRIVATE :private_extern | |
| 1703 +%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ | |
| 1704 +%define PRIVATE :hidden | |
| 1705 +%else | |
| 1706 +%define PRIVATE | |
| 1707 +%endif | |
| 1708 +; End chromium edits | |
| 1709 + | |
| 1710 ; -------------------------------------------------------------------------- | |
| 1711 Index: simd/jdclrmmx.asm | |
| 1712 =================================================================== | |
| 1713 --- simd/jdclrmmx.asm (revision 829) | |
| 1714 +++ simd/jdclrmmx.asm (working copy) | |
| 1715 @@ -40,7 +40,7 @@ | |
| 1716 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
| 1717 | |
| 1718 align 16 | |
| 1719 - global EXTN(jsimd_ycc_rgb_convert_mmx) | |
| 1720 + global EXTN(jsimd_ycc_rgb_convert_mmx) PRIVATE | |
| 1721 | |
| 1722 EXTN(jsimd_ycc_rgb_convert_mmx): | |
| 1723 push ebp | |
| 1724 Index: simd/jccolss2.asm | |
| 1725 =================================================================== | |
| 1726 --- simd/jccolss2.asm (revision 829) | |
| 1727 +++ simd/jccolss2.asm (working copy) | |
| 1728 @@ -34,7 +34,7 @@ | |
| 1729 SECTION SEG_CONST | |
| 1730 | |
| 1731 alignz 16 | |
| 1732 - global EXTN(jconst_rgb_ycc_convert_sse2) | |
| 1733 + global EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE | |
| 1734 | |
| 1735 EXTN(jconst_rgb_ycc_convert_sse2): | |
| 1736 | |
| 1737 Index: simd/jisseflt.asm | |
| 1738 =================================================================== | |
| 1739 --- simd/jisseflt.asm (revision 829) | |
| 1740 +++ simd/jisseflt.asm (working copy) | |
| 1741 @@ -37,7 +37,7 @@ | |
| 1742 SECTION SEG_CONST | |
| 1743 | |
| 1744 alignz 16 | |
| 1745 - global EXTN(jconst_idct_float_sse) | |
| 1746 + global EXTN(jconst_idct_float_sse) PRIVATE | |
| 1747 | |
| 1748 EXTN(jconst_idct_float_sse): | |
| 1749 | |
| 1750 @@ -73,7 +73,7 @@ | |
| 1751 ; FAST_FLOAT workspace[DCTSIZE2] | |
| 1752 | |
| 1753 align 16 | |
| 1754 - global EXTN(jsimd_idct_float_sse) | |
| 1755 + global EXTN(jsimd_idct_float_sse) PRIVATE | |
| 1756 | |
| 1757 EXTN(jsimd_idct_float_sse): | |
| 1758 push ebp | |
| 1759 Index: simd/jcqnts2i-64.asm | |
| 1760 =================================================================== | |
| 1761 --- simd/jcqnts2i-64.asm (revision 829) | |
| 1762 +++ simd/jcqnts2i-64.asm (working copy) | |
| 1763 @@ -36,7 +36,7 @@ | |
| 1764 ; r12 = DCTELEM * workspace | |
| 1765 | |
| 1766 align 16 | |
| 1767 - global EXTN(jsimd_convsamp_sse2) | |
| 1768 + global EXTN(jsimd_convsamp_sse2) PRIVATE | |
| 1769 | |
| 1770 EXTN(jsimd_convsamp_sse2): | |
| 1771 push rbp | |
| 1772 @@ -112,7 +112,7 @@ | |
| 1773 ; r12 = DCTELEM * workspace | |
| 1774 | |
| 1775 align 16 | |
| 1776 - global EXTN(jsimd_quantize_sse2) | |
| 1777 + global EXTN(jsimd_quantize_sse2) PRIVATE | |
| 1778 | |
| 1779 EXTN(jsimd_quantize_sse2): | |
| 1780 push rbp | |
| 1781 Index: simd/jdclrss2.asm | |
| 1782 =================================================================== | |
| 1783 --- simd/jdclrss2.asm (revision 829) | |
| 1784 +++ simd/jdclrss2.asm (working copy) | |
| 1785 @@ -40,7 +40,7 @@ | |
| 1786 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
| 1787 | |
| 1788 align 16 | |
| 1789 - global EXTN(jsimd_ycc_rgb_convert_sse2) | |
| 1790 + global EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE | |
| 1791 | |
| 1792 EXTN(jsimd_ycc_rgb_convert_sse2): | |
| 1793 push ebp | |
| 1794 Index: simd/jcqntsse.asm | |
| 1795 =================================================================== | |
| 1796 --- simd/jcqntsse.asm (revision 829) | |
| 1797 +++ simd/jcqntsse.asm (working copy) | |
| 1798 @@ -35,7 +35,7 @@ | |
| 1799 %define workspace ebp+16 ; FAST_FLOAT * workspace | |
| 1800 | |
| 1801 align 16 | |
| 1802 - global EXTN(jsimd_convsamp_float_sse) | |
| 1803 + global EXTN(jsimd_convsamp_float_sse) PRIVATE | |
| 1804 | |
| 1805 EXTN(jsimd_convsamp_float_sse): | |
| 1806 push ebp | |
| 1807 @@ -138,7 +138,7 @@ | |
| 1808 %define workspace ebp+16 ; FAST_FLOAT * workspace | |
| 1809 | |
| 1810 align 16 | |
| 1811 - global EXTN(jsimd_quantize_float_sse) | |
| 1812 + global EXTN(jsimd_quantize_float_sse) PRIVATE | |
| 1813 | |
| 1814 EXTN(jsimd_quantize_float_sse): | |
| 1815 push ebp | |
| 1816 Index: simd/jiss2int-64.asm | |
| 1817 =================================================================== | |
| 1818 --- simd/jiss2int-64.asm (revision 829) | |
| 1819 +++ simd/jiss2int-64.asm (working copy) | |
| 1820 @@ -67,7 +67,7 @@ | |
| 1821 SECTION SEG_CONST | |
| 1822 | |
| 1823 alignz 16 | |
| 1824 - global EXTN(jconst_idct_islow_sse2) | |
| 1825 + global EXTN(jconst_idct_islow_sse2) PRIVATE | |
| 1826 | |
| 1827 EXTN(jconst_idct_islow_sse2): | |
| 1828 | |
| 1829 @@ -106,7 +106,7 @@ | |
| 1830 %define WK_NUM 12 | |
| 1831 | |
| 1832 align 16 | |
| 1833 - global EXTN(jsimd_idct_islow_sse2) | |
| 1834 + global EXTN(jsimd_idct_islow_sse2) PRIVATE | |
| 1835 | |
| 1836 EXTN(jsimd_idct_islow_sse2): | |
| 1837 push rbp | |
| 1838 Index: simd/jfmmxfst.asm | |
| 1839 =================================================================== | |
| 1840 --- simd/jfmmxfst.asm (revision 829) | |
| 1841 +++ simd/jfmmxfst.asm (working copy) | |
| 1842 @@ -52,7 +52,7 @@ | |
| 1843 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
| 1844 | |
| 1845 alignz 16 | |
| 1846 - global EXTN(jconst_fdct_ifast_mmx) | |
| 1847 + global EXTN(jconst_fdct_ifast_mmx) PRIVATE | |
| 1848 | |
| 1849 EXTN(jconst_fdct_ifast_mmx): | |
| 1850 | |
| 1851 @@ -80,7 +80,7 @@ | |
| 1852 %define WK_NUM 2 | |
| 1853 | |
| 1854 align 16 | |
| 1855 - global EXTN(jsimd_fdct_ifast_mmx) | |
| 1856 + global EXTN(jsimd_fdct_ifast_mmx) PRIVATE | |
| 1857 | |
| 1858 EXTN(jsimd_fdct_ifast_mmx): | |
| 1859 push ebp | |
| 1860 Index: jdarith.c | |
| 1861 =================================================================== | |
| 1862 --- jdarith.c (revision 829) | |
| 1863 +++ jdarith.c (working copy) | |
| 1864 @@ -150,8 +150,8 @@ | |
| 1865 */ | |
| 1866 sv = *st; | |
| 1867 qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */ | |
| 1868 - nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */ | |
| 1869 - nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */ | |
| 1870 + nl = (unsigned char) qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */ | |
| 1871 + nm = (unsigned char) qe & 0xFF; qe >>= 8; /* Next_Index_MPS */ | |
| 1872 | |
| 1873 /* Decode & estimation procedures per sections D.2.4 & D.2.5 */ | |
| 1874 temp = e->a - qe; | |
| 1875 Index: jdhuff.c | |
| 1876 =================================================================== | |
| 1877 --- jdhuff.c (revision 1541) | |
| 1878 +++ jdhuff.c (working copy) | |
| 1879 @@ -662,7 +662,7 @@ | |
| 1880 d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn]; | |
| 1881 register int s, k, r, l; | |
| 1882 | |
| 1883 - HUFF_DECODE_FAST(s, l, dctbl); | |
| 1884 + HUFF_DECODE_FAST(s, l, dctbl, slow_decode_mcu); | |
| 1885 if (s) { | |
| 1886 FILL_BIT_BUFFER_FAST | |
| 1887 r = GET_BITS(s); | |
| 1888 @@ -679,7 +679,7 @@ | |
| 1889 if (entropy->ac_needed[blkn]) { | |
| 1890 | |
| 1891 for (k = 1; k < DCTSIZE2; k++) { | |
| 1892 - HUFF_DECODE_FAST(s, l, actbl); | |
| 1893 + HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu); | |
| 1894 r = s >> 4; | |
| 1895 s &= 15; | |
| 1896 | |
| 1897 @@ -698,7 +698,7 @@ | |
| 1898 } else { | |
| 1899 | |
| 1900 for (k = 1; k < DCTSIZE2; k++) { | |
| 1901 - HUFF_DECODE_FAST(s, l, actbl); | |
| 1902 + HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu); | |
| 1903 r = s >> 4; | |
| 1904 s &= 15; | |
| 1905 | |
| 1906 @@ -715,6 +715,7 @@ | |
| 1907 } | |
| 1908 | |
| 1909 if (cinfo->unread_marker != 0) { | |
| 1910 +slow_decode_mcu: | |
| 1911 cinfo->unread_marker = 0; | |
| 1912 return FALSE; | |
| 1913 } | |
| 1914 @@ -742,7 +743,7 @@ | |
| 1915 * this module, since we'll just re-assign them on the next call.) | |
| 1916 */ | |
| 1917 | |
| 1918 -#define BUFSIZE (DCTSIZE2 * 2) | |
| 1919 +#define BUFSIZE (DCTSIZE2 * 2u) | |
| 1920 | |
| 1921 METHODDEF(boolean) | |
| 1922 decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) | |
| 1923 Index: jdhuff.h | |
| 1924 =================================================================== | |
| 1925 --- jdhuff.h (revision 1541) | |
| 1926 +++ jdhuff.h (working copy) | |
| 1927 @@ -208,7 +208,7 @@ | |
| 1928 } \ | |
| 1929 } | |
| 1930 | |
| 1931 -#define HUFF_DECODE_FAST(s,nb,htbl) \ | |
| 1932 +#define HUFF_DECODE_FAST(s,nb,htbl,slowlabel) \ | |
| 1933 FILL_BIT_BUFFER_FAST; \ | |
| 1934 s = PEEK_BITS(HUFF_LOOKAHEAD); \ | |
| 1935 s = htbl->lookup[s]; \ | |
| 1936 @@ -225,7 +225,9 @@ | |
| 1937 s |= GET_BITS(1); \ | |
| 1938 nb++; \ | |
| 1939 } \ | |
| 1940 - s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) & 0xFF ]; \ | |
| 1941 + if (nb > 16) \ | |
| 1942 + goto slowlabel; \ | |
| 1943 + s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) ]; \ | |
| 1944 } | |
| 1945 | |
| 1946 /* Out-of-line case for Huffman code fetching */ | |
| 1947 | |
| 1948 Index: jchuff.c | |
| 1949 =================================================================== | |
| 1950 --- jchuff.c (revision 1219) | |
| 1951 +++ jchuff.c (revision 1220) | |
| 1952 @@ -22,8 +22,36 @@ | |
| 1953 #include "jchuff.h" /* Declarations shared with jcphuff.c */ | |
| 1954 #include <limits.h> | |
| 1955 | |
| 1956 +/* | |
| 1957 + * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be | |
| 1958 + * used for bit counting rather than the lookup table. This will reduce the | |
| 1959 + * memory footprint by 64k, which is important for some mobile applications | |
| 1960 + * that create many isolated instances of libjpeg-turbo (web browsers, for | |
| 1961 + * instance.) This may improve performance on some mobile platforms as well. | |
| 1962 + * This feature is enabled by default only on ARM processors, because some x86 | |
| 1963 + * chips have a slow implementation of bsr, and the use of clz/bsr cannot be | |
| 1964 + * shown to have a significant performance impact even on the x86 chips that | |
| 1965 + * have a fast implementation of it. When building for ARMv6, you can | |
| 1966 + * explicitly disable the use of clz/bsr by adding -mthumb to the compiler | |
| 1967 + * flags (this defines __thumb__). | |
| 1968 + */ | |
| 1969 + | |
| 1970 +/* NOTE: Both GCC and Clang define __GNUC__ */ | |
| 1971 +#if defined __GNUC__ && defined __arm__ | |
| 1972 +#if !defined __thumb__ || defined __thumb2__ | |
| 1973 +#define USE_CLZ_INTRINSIC | |
| 1974 +#endif | |
| 1975 +#endif | |
| 1976 + | |
| 1977 +#ifdef USE_CLZ_INTRINSIC | |
| 1978 +#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x)) | |
| 1979 +#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0) | |
| 1980 +#else | |
| 1981 static unsigned char jpeg_nbits_table[65536]; | |
| 1982 static int jpeg_nbits_table_init = 0; | |
| 1983 +#define JPEG_NBITS(x) (jpeg_nbits_table[x]) | |
| 1984 +#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x) | |
| 1985 +#endif | |
| 1986 | |
| 1987 #ifndef min | |
| 1988 #define min(a,b) ((a)<(b)?(a):(b)) | |
| 1989 @@ -272,6 +300,7 @@ | |
| 1990 dtbl->ehufsi[i] = huffsize[p]; | |
| 1991 } | |
| 1992 | |
| 1993 +#ifndef USE_CLZ_INTRINSIC | |
| 1994 if(!jpeg_nbits_table_init) { | |
| 1995 for(i = 0; i < 65536; i++) { | |
| 1996 int nbits = 0, temp = i; | |
| 1997 @@ -280,6 +309,7 @@ | |
| 1998 } | |
| 1999 jpeg_nbits_table_init = 1; | |
| 2000 } | |
| 2001 +#endif | |
| 2002 } | |
| 2003 | |
| 2004 | |
| 2005 @@ -482,7 +512,7 @@ | |
| 2006 temp2 += temp3; | |
| 2007 | |
| 2008 /* Find the number of bits needed for the magnitude of the coefficient */ | |
| 2009 - nbits = jpeg_nbits_table[temp]; | |
| 2010 + nbits = JPEG_NBITS(temp); | |
| 2011 | |
| 2012 /* Emit the Huffman-coded symbol for the number of bits */ | |
| 2013 code = dctbl->ehufco[nbits]; | |
| 2014 @@ -516,7 +546,7 @@ | |
| 2015 temp ^= temp3; \ | |
| 2016 temp -= temp3; \ | |
| 2017 temp2 += temp3; \ | |
| 2018 - nbits = jpeg_nbits_table[temp]; \ | |
| 2019 + nbits = JPEG_NBITS_NONZERO(temp); \ | |
| 2020 /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \ | |
| 2021 while (r > 15) { \ | |
| 2022 EMIT_BITS(code_0xf0, size_0xf0) \ | |
| 2023 Index: simd/jsimd_arm64.c | |
| 2024 =================================================================== | |
| 2025 --- /dev/null | |
| 2026 +++ simd/jsimd_arm64.c | |
| 2027 @@ -0,0 +1,544 @@ | |
| 2028 +/* | |
| 2029 + * jsimd_arm64.c | |
| 2030 + * | |
| 2031 + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | |
| 2032 + * Copyright 2009-2011, 2013-2014 D. R. Commander | |
| 2033 + * | |
| 2034 + * Based on the x86 SIMD extension for IJG JPEG library, | |
| 2035 + * Copyright (C) 1999-2006, MIYASAKA Masaru. | |
| 2036 + * For conditions of distribution and use, see copyright notice in jsimdext.inc | |
| 2037 + * | |
| 2038 + * This file contains the interface between the "normal" portions | |
| 2039 + * of the library and the SIMD implementations when running on a | |
| 2040 + * 64-bit ARM architecture. | |
| 2041 + */ | |
| 2042 + | |
| 2043 +#define JPEG_INTERNALS | |
| 2044 +#include "../jinclude.h" | |
| 2045 +#include "../jpeglib.h" | |
| 2046 +#include "../jsimd.h" | |
| 2047 +#include "../jdct.h" | |
| 2048 +#include "../jsimddct.h" | |
| 2049 +#include "jsimd.h" | |
| 2050 + | |
| 2051 +#include <stdio.h> | |
| 2052 +#include <string.h> | |
| 2053 +#include <ctype.h> | |
| 2054 + | |
| 2055 +static unsigned int simd_support = ~0; | |
| 2056 + | |
| 2057 +/* | |
| 2058 + * Check what SIMD accelerations are supported. | |
| 2059 + * | |
| 2060 + * FIXME: This code is racy under a multi-threaded environment. | |
| 2061 + */ | |
| 2062 + | |
| 2063 +/* | |
| 2064 + * ARMv8 architectures support NEON extensions by default. | |
| 2065 + * It is no longer optional as it was with ARMv7. | |
| 2066 + */ | |
| 2067 + | |
| 2068 + | |
| 2069 +LOCAL(void) | |
| 2070 +init_simd (void) | |
| 2071 +{ | |
| 2072 + char *env = NULL; | |
| 2073 + | |
| 2074 + if (simd_support != ~0U) | |
| 2075 + return; | |
| 2076 + | |
| 2077 + simd_support = 0; | |
| 2078 + | |
| 2079 + simd_support |= JSIMD_ARM_NEON; | |
| 2080 + | |
| 2081 + /* Force different settings through environment variables */ | |
| 2082 + env = getenv("JSIMD_FORCENEON"); | |
| 2083 + if ((env != NULL) && (strcmp(env, "1") == 0)) | |
| 2084 + simd_support &= JSIMD_ARM_NEON; | |
| 2085 + env = getenv("JSIMD_FORCENONE"); | |
| 2086 + if ((env != NULL) && (strcmp(env, "1") == 0)) | |
| 2087 + simd_support = 0; | |
| 2088 +} | |
| 2089 + | |
| 2090 +GLOBAL(int) | |
| 2091 +jsimd_can_rgb_ycc (void) | |
| 2092 +{ | |
| 2093 + init_simd(); | |
| 2094 + | |
| 2095 + return 0; | |
| 2096 +} | |
| 2097 + | |
| 2098 +GLOBAL(int) | |
| 2099 +jsimd_can_rgb_gray (void) | |
| 2100 +{ | |
| 2101 + init_simd(); | |
| 2102 + | |
| 2103 + return 0; | |
| 2104 +} | |
| 2105 + | |
| 2106 +GLOBAL(int) | |
| 2107 +jsimd_can_ycc_rgb (void) | |
| 2108 +{ | |
| 2109 + init_simd(); | |
| 2110 + | |
| 2111 + /* The code is optimised for these values only */ | |
| 2112 + if (BITS_IN_JSAMPLE != 8) | |
| 2113 + return 0; | |
| 2114 + if (sizeof(JDIMENSION) != 4) | |
| 2115 + return 0; | |
| 2116 + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) | |
| 2117 + return 0; | |
| 2118 + | |
| 2119 + if (simd_support & JSIMD_ARM_NEON) | |
| 2120 + return 1; | |
| 2121 + | |
| 2122 + return 0; | |
| 2123 +} | |
| 2124 + | |
| 2125 +GLOBAL(int) | |
| 2126 +jsimd_can_ycc_rgb565 (void) | |
| 2127 +{ | |
| 2128 + init_simd(); | |
| 2129 + | |
| 2130 + /* The code is optimised for these values only */ | |
| 2131 + if (BITS_IN_JSAMPLE != 8) | |
| 2132 + return 0; | |
| 2133 + if (sizeof(JDIMENSION) != 4) | |
| 2134 + return 0; | |
| 2135 + | |
| 2136 + if (simd_support & JSIMD_ARM_NEON) | |
| 2137 + return 1; | |
| 2138 + | |
| 2139 + return 0; | |
| 2140 +} | |
| 2141 + | |
| 2142 +GLOBAL(void) | |
| 2143 +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, | |
| 2144 + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, | |
| 2145 + JDIMENSION output_row, int num_rows) | |
| 2146 +{ | |
| 2147 +} | |
| 2148 + | |
| 2149 +GLOBAL(void) | |
| 2150 +jsimd_rgb_gray_convert (j_compress_ptr cinfo, | |
| 2151 + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, | |
| 2152 + JDIMENSION output_row, int num_rows) | |
| 2153 +{ | |
| 2154 +} | |
| 2155 + | |
| 2156 +GLOBAL(void) | |
| 2157 +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, | |
| 2158 + JSAMPIMAGE input_buf, JDIMENSION input_row, | |
| 2159 + JSAMPARRAY output_buf, int num_rows) | |
| 2160 +{ | |
| 2161 + void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); | |
| 2162 + | |
| 2163 + switch(cinfo->out_color_space) { | |
| 2164 + case JCS_EXT_RGB: | |
| 2165 + neonfct=jsimd_ycc_extrgb_convert_neon; | |
| 2166 + break; | |
| 2167 + case JCS_EXT_RGBX: | |
| 2168 + case JCS_EXT_RGBA: | |
| 2169 + neonfct=jsimd_ycc_extrgbx_convert_neon; | |
| 2170 + break; | |
| 2171 + case JCS_EXT_BGR: | |
| 2172 + neonfct=jsimd_ycc_extbgr_convert_neon; | |
| 2173 + break; | |
| 2174 + case JCS_EXT_BGRX: | |
| 2175 + case JCS_EXT_BGRA: | |
| 2176 + neonfct=jsimd_ycc_extbgrx_convert_neon; | |
| 2177 + break; | |
| 2178 + case JCS_EXT_XBGR: | |
| 2179 + case JCS_EXT_ABGR: | |
| 2180 + neonfct=jsimd_ycc_extxbgr_convert_neon; | |
| 2181 + break; | |
| 2182 + case JCS_EXT_XRGB: | |
| 2183 + case JCS_EXT_ARGB: | |
| 2184 + neonfct=jsimd_ycc_extxrgb_convert_neon; | |
| 2185 + break; | |
| 2186 + default: | |
| 2187 + neonfct=jsimd_ycc_extrgb_convert_neon; | |
| 2188 + break; | |
| 2189 + } | |
| 2190 + | |
| 2191 + if (simd_support & JSIMD_ARM_NEON) | |
| 2192 + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); | |
| 2193 +} | |
| 2194 + | |
| 2195 +GLOBAL(void) | |
| 2196 +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, | |
| 2197 + JSAMPIMAGE input_buf, JDIMENSION input_row, | |
| 2198 + JSAMPARRAY output_buf, int num_rows) | |
| 2199 +{ | |
| 2200 + if (simd_support & JSIMD_ARM_NEON) | |
| 2201 + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, | |
| 2202 + output_buf, num_rows); | |
| 2203 +} | |
| 2204 + | |
| 2205 +GLOBAL(int) | |
| 2206 +jsimd_can_h2v2_downsample (void) | |
| 2207 +{ | |
| 2208 + init_simd(); | |
| 2209 + | |
| 2210 + return 0; | |
| 2211 +} | |
| 2212 + | |
| 2213 +GLOBAL(int) | |
| 2214 +jsimd_can_h2v1_downsample (void) | |
| 2215 +{ | |
| 2216 + init_simd(); | |
| 2217 + | |
| 2218 + return 0; | |
| 2219 +} | |
| 2220 + | |
| 2221 +GLOBAL(void) | |
| 2222 +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, | |
| 2223 + JSAMPARRAY input_data, JSAMPARRAY output_data) | |
| 2224 +{ | |
| 2225 +} | |
| 2226 + | |
| 2227 +GLOBAL(void) | |
| 2228 +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, | |
| 2229 + JSAMPARRAY input_data, JSAMPARRAY output_data) | |
| 2230 +{ | |
| 2231 +} | |
| 2232 + | |
| 2233 +GLOBAL(int) | |
| 2234 +jsimd_can_h2v2_upsample (void) | |
| 2235 +{ | |
| 2236 + init_simd(); | |
| 2237 + | |
| 2238 + return 0; | |
| 2239 +} | |
| 2240 + | |
| 2241 +GLOBAL(int) | |
| 2242 +jsimd_can_h2v1_upsample (void) | |
| 2243 +{ | |
| 2244 + init_simd(); | |
| 2245 + | |
| 2246 + return 0; | |
| 2247 +} | |
| 2248 + | |
| 2249 +GLOBAL(void) | |
| 2250 +jsimd_h2v2_upsample (j_decompress_ptr cinfo, | |
| 2251 + jpeg_component_info * compptr, | |
| 2252 + JSAMPARRAY input_data, | |
| 2253 + JSAMPARRAY * output_data_ptr) | |
| 2254 +{ | |
| 2255 +} | |
| 2256 + | |
| 2257 +GLOBAL(void) | |
| 2258 +jsimd_h2v1_upsample (j_decompress_ptr cinfo, | |
| 2259 + jpeg_component_info * compptr, | |
| 2260 + JSAMPARRAY input_data, | |
| 2261 + JSAMPARRAY * output_data_ptr) | |
| 2262 +{ | |
| 2263 +} | |
| 2264 + | |
| 2265 +GLOBAL(int) | |
| 2266 +jsimd_can_h2v2_fancy_upsample (void) | |
| 2267 +{ | |
| 2268 + init_simd(); | |
| 2269 + | |
| 2270 + return 0; | |
| 2271 +} | |
| 2272 + | |
| 2273 +GLOBAL(int) | |
| 2274 +jsimd_can_h2v1_fancy_upsample (void) | |
| 2275 +{ | |
| 2276 + init_simd(); | |
| 2277 + | |
| 2278 + return 0; | |
| 2279 +} | |
| 2280 + | |
| 2281 +GLOBAL(void) | |
| 2282 +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, | |
| 2283 + jpeg_component_info * compptr, | |
| 2284 + JSAMPARRAY input_data, | |
| 2285 + JSAMPARRAY * output_data_ptr) | |
| 2286 +{ | |
| 2287 +} | |
| 2288 + | |
| 2289 +GLOBAL(void) | |
| 2290 +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, | |
| 2291 + jpeg_component_info * compptr, | |
| 2292 + JSAMPARRAY input_data, | |
| 2293 + JSAMPARRAY * output_data_ptr) | |
| 2294 +{ | |
| 2295 +} | |
| 2296 + | |
| 2297 +GLOBAL(int) | |
| 2298 +jsimd_can_h2v2_merged_upsample (void) | |
| 2299 +{ | |
| 2300 + init_simd(); | |
| 2301 + | |
| 2302 + return 0; | |
| 2303 +} | |
| 2304 + | |
| 2305 +GLOBAL(int) | |
| 2306 +jsimd_can_h2v1_merged_upsample (void) | |
| 2307 +{ | |
| 2308 + init_simd(); | |
| 2309 + | |
| 2310 + return 0; | |
| 2311 +} | |
| 2312 + | |
| 2313 +GLOBAL(void) | |
| 2314 +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, | |
| 2315 + JSAMPIMAGE input_buf, | |
| 2316 + JDIMENSION in_row_group_ctr, | |
| 2317 + JSAMPARRAY output_buf) | |
| 2318 +{ | |
| 2319 +} | |
| 2320 + | |
| 2321 +GLOBAL(void) | |
| 2322 +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, | |
| 2323 + JSAMPIMAGE input_buf, | |
| 2324 + JDIMENSION in_row_group_ctr, | |
| 2325 + JSAMPARRAY output_buf) | |
| 2326 +{ | |
| 2327 +} | |
| 2328 + | |
| 2329 +GLOBAL(int) | |
| 2330 +jsimd_can_convsamp (void) | |
| 2331 +{ | |
| 2332 + init_simd(); | |
| 2333 + | |
| 2334 + return 0; | |
| 2335 +} | |
| 2336 + | |
| 2337 +GLOBAL(int) | |
| 2338 +jsimd_can_convsamp_float (void) | |
| 2339 +{ | |
| 2340 + init_simd(); | |
| 2341 + | |
| 2342 + return 0; | |
| 2343 +} | |
| 2344 + | |
| 2345 +GLOBAL(void) | |
| 2346 +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, | |
| 2347 + DCTELEM * workspace) | |
| 2348 +{ | |
| 2349 +} | |
| 2350 + | |
| 2351 +GLOBAL(void) | |
| 2352 +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, | |
| 2353 + FAST_FLOAT * workspace) | |
| 2354 +{ | |
| 2355 +} | |
| 2356 + | |
| 2357 +GLOBAL(int) | |
| 2358 +jsimd_can_fdct_islow (void) | |
| 2359 +{ | |
| 2360 + init_simd(); | |
| 2361 + | |
| 2362 + return 0; | |
| 2363 +} | |
| 2364 + | |
| 2365 +GLOBAL(int) | |
| 2366 +jsimd_can_fdct_ifast (void) | |
| 2367 +{ | |
| 2368 + init_simd(); | |
| 2369 + | |
| 2370 + return 0; | |
| 2371 +} | |
| 2372 + | |
| 2373 +GLOBAL(int) | |
| 2374 +jsimd_can_fdct_float (void) | |
| 2375 +{ | |
| 2376 + init_simd(); | |
| 2377 + | |
| 2378 + return 0; | |
| 2379 +} | |
| 2380 + | |
| 2381 +GLOBAL(void) | |
| 2382 +jsimd_fdct_islow (DCTELEM * data) | |
| 2383 +{ | |
| 2384 +} | |
| 2385 + | |
| 2386 +GLOBAL(void) | |
| 2387 +jsimd_fdct_ifast (DCTELEM * data) | |
| 2388 +{ | |
| 2389 +} | |
| 2390 + | |
| 2391 +GLOBAL(void) | |
| 2392 +jsimd_fdct_float (FAST_FLOAT * data) | |
| 2393 +{ | |
| 2394 +} | |
| 2395 + | |
| 2396 +GLOBAL(int) | |
| 2397 +jsimd_can_quantize (void) | |
| 2398 +{ | |
| 2399 + init_simd(); | |
| 2400 + | |
| 2401 + return 0; | |
| 2402 +} | |
| 2403 + | |
| 2404 +GLOBAL(int) | |
| 2405 +jsimd_can_quantize_float (void) | |
| 2406 +{ | |
| 2407 + init_simd(); | |
| 2408 + | |
| 2409 + return 0; | |
| 2410 +} | |
| 2411 + | |
| 2412 +GLOBAL(void) | |
| 2413 +jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, | |
| 2414 + DCTELEM * workspace) | |
| 2415 +{ | |
| 2416 +} | |
| 2417 + | |
| 2418 +GLOBAL(void) | |
| 2419 +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, | |
| 2420 + FAST_FLOAT * workspace) | |
| 2421 +{ | |
| 2422 +} | |
| 2423 + | |
| 2424 +GLOBAL(int) | |
| 2425 +jsimd_can_idct_2x2 (void) | |
| 2426 +{ | |
| 2427 + init_simd(); | |
| 2428 + | |
| 2429 + /* The code is optimised for these values only */ | |
| 2430 + if (DCTSIZE != 8) | |
| 2431 + return 0; | |
| 2432 + if (sizeof(JCOEF) != 2) | |
| 2433 + return 0; | |
| 2434 + if (BITS_IN_JSAMPLE != 8) | |
| 2435 + return 0; | |
| 2436 + if (sizeof(JDIMENSION) != 4) | |
| 2437 + return 0; | |
| 2438 + if (sizeof(ISLOW_MULT_TYPE) != 2) | |
| 2439 + return 0; | |
| 2440 + | |
| 2441 + if (simd_support & JSIMD_ARM_NEON) | |
| 2442 + return 1; | |
| 2443 + | |
| 2444 + return 0; | |
| 2445 +} | |
| 2446 + | |
| 2447 +GLOBAL(int) | |
| 2448 +jsimd_can_idct_4x4 (void) | |
| 2449 +{ | |
| 2450 + init_simd(); | |
| 2451 + | |
| 2452 + /* The code is optimised for these values only */ | |
| 2453 + if (DCTSIZE != 8) | |
| 2454 + return 0; | |
| 2455 + if (sizeof(JCOEF) != 2) | |
| 2456 + return 0; | |
| 2457 + if (BITS_IN_JSAMPLE != 8) | |
| 2458 + return 0; | |
| 2459 + if (sizeof(JDIMENSION) != 4) | |
| 2460 + return 0; | |
| 2461 + if (sizeof(ISLOW_MULT_TYPE) != 2) | |
| 2462 + return 0; | |
| 2463 + | |
| 2464 + if (simd_support & JSIMD_ARM_NEON) | |
| 2465 + return 1; | |
| 2466 + | |
| 2467 + return 0; | |
| 2468 +} | |
| 2469 + | |
| 2470 +GLOBAL(void) | |
| 2471 +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, | |
| 2472 + JCOEFPTR coef_block, JSAMPARRAY output_buf, | |
| 2473 + JDIMENSION output_col) | |
| 2474 +{ | |
| 2475 + if (simd_support & JSIMD_ARM_NEON) | |
| 2476 + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, | |
| 2477 + output_col); | |
| 2478 +} | |
| 2479 + | |
| 2480 +GLOBAL(void) | |
| 2481 +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, | |
| 2482 + JCOEFPTR coef_block, JSAMPARRAY output_buf, | |
| 2483 + JDIMENSION output_col) | |
| 2484 +{ | |
| 2485 + if (simd_support & JSIMD_ARM_NEON) | |
| 2486 + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, | |
| 2487 + output_col); | |
| 2488 +} | |
| 2489 + | |
| 2490 +GLOBAL(int) | |
| 2491 +jsimd_can_idct_islow (void) | |
| 2492 +{ | |
| 2493 + init_simd(); | |
| 2494 + | |
| 2495 + /* The code is optimised for these values only */ | |
| 2496 + if (DCTSIZE != 8) | |
| 2497 + return 0; | |
| 2498 + if (sizeof(JCOEF) != 2) | |
| 2499 + return 0; | |
| 2500 + if (BITS_IN_JSAMPLE != 8) | |
| 2501 + return 0; | |
| 2502 + if (sizeof(JDIMENSION) != 4) | |
| 2503 + return 0; | |
| 2504 + if (sizeof(ISLOW_MULT_TYPE) != 2) | |
| 2505 + return 0; | |
| 2506 + | |
| 2507 + if (simd_support & JSIMD_ARM_NEON) | |
| 2508 + return 1; | |
| 2509 + | |
| 2510 + return 0; | |
| 2511 +} | |
| 2512 + | |
| 2513 +GLOBAL(int) | |
| 2514 +jsimd_can_idct_ifast (void) | |
| 2515 +{ | |
| 2516 + init_simd(); | |
| 2517 + | |
| 2518 + /* The code is optimised for these values only */ | |
| 2519 + if (DCTSIZE != 8) | |
| 2520 + return 0; | |
| 2521 + if (sizeof(JCOEF) != 2) | |
| 2522 + return 0; | |
| 2523 + if (BITS_IN_JSAMPLE != 8) | |
| 2524 + return 0; | |
| 2525 + if (sizeof(JDIMENSION) != 4) | |
| 2526 + return 0; | |
| 2527 + if (sizeof(IFAST_MULT_TYPE) != 2) | |
| 2528 + return 0; | |
| 2529 + if (IFAST_SCALE_BITS != 2) | |
| 2530 + return 0; | |
| 2531 + | |
| 2532 + if (simd_support & JSIMD_ARM_NEON) | |
| 2533 + return 1; | |
| 2534 + | |
| 2535 + return 0; | |
| 2536 +} | |
| 2537 + | |
| 2538 +GLOBAL(int) | |
| 2539 +jsimd_can_idct_float (void) | |
| 2540 +{ | |
| 2541 + init_simd(); | |
| 2542 + | |
| 2543 + return 0; | |
| 2544 +} | |
| 2545 + | |
| 2546 +GLOBAL(void) | |
| 2547 +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, | |
| 2548 + JCOEFPTR coef_block, JSAMPARRAY output_buf, | |
| 2549 + JDIMENSION output_col) | |
| 2550 +{ | |
| 2551 + if (simd_support & JSIMD_ARM_NEON) | |
| 2552 + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, | |
| 2553 + output_col); | |
| 2554 +} | |
| 2555 + | |
| 2556 +GLOBAL(void) | |
| 2557 +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, | |
| 2558 + JCOEFPTR coef_block, JSAMPARRAY output_buf, | |
| 2559 + JDIMENSION output_col) | |
| 2560 +{ | |
| 2561 + if (simd_support & JSIMD_ARM_NEON) | |
| 2562 + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, | |
| 2563 + output_col); | |
| 2564 +} | |
| 2565 + | |
| 2566 +GLOBAL(void) | |
| 2567 +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, | |
| 2568 + JCOEFPTR coef_block, JSAMPARRAY output_buf, | |
| 2569 + JDIMENSION output_col) | |
| 2570 +{ | |
| 2571 +} | |
| 2572 Index: simd/jsimd_arm64_neon.S | |
| 2573 new file mode 100644 | |
| 2574 =================================================================== | |
| 2575 --- /dev/null | |
| 2576 +++ simd/jsimd_arm64_neon.S | |
| 2577 @@ -0,0 +1,1861 @@ | |
| 2578 +/* | |
| 2579 + * ARMv8 NEON optimizations for libjpeg-turbo | |
| 2580 + * | |
| 2581 + * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | |
| 2582 + * All rights reserved. | |
| 2583 + * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | |
| 2584 + * Copyright (C) 2013-2014, Linaro Limited | |
| 2585 + * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> | |
| 2586 + * | |
| 2587 + * This software is provided 'as-is', without any express or implied | |
| 2588 + * warranty. In no event will the authors be held liable for any damages | |
| 2589 + * arising from the use of this software. | |
| 2590 + * | |
| 2591 + * Permission is granted to anyone to use this software for any purpose, | |
| 2592 + * including commercial applications, and to alter it and redistribute it | |
| 2593 + * freely, subject to the following restrictions: | |
| 2594 + * | |
| 2595 + * 1. The origin of this software must not be misrepresented; you must not | |
| 2596 + * claim that you wrote the original software. If you use this software | |
| 2597 + * in a product, an acknowledgment in the product documentation would be | |
| 2598 + * appreciated but is not required. | |
| 2599 + * 2. Altered source versions must be plainly marked as such, and must not be | |
| 2600 + * misrepresented as being the original software. | |
| 2601 + * 3. This notice may not be removed or altered from any source distribution. | |
| 2602 + */ | |
| 2603 + | |
| 2604 +#if defined(__linux__) && defined(__ELF__) | |
| 2605 +.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ | |
| 2606 +#endif | |
| 2607 + | |
| 2608 +.text | |
| 2609 +.arch armv8-a+fp+simd | |
| 2610 + | |
| 2611 + | |
| 2612 +#define RESPECT_STRICT_ALIGNMENT 1 | |
| 2613 + | |
| 2614 + | |
| 2615 +/*****************************************************************************/ | |
| 2616 + | |
| 2617 +/* Supplementary macro for setting function attributes */ | |
| 2618 +.macro asm_function fname | |
| 2619 +#ifdef __APPLE__ | |
| 2620 + .globl _\fname | |
| 2621 +_\fname: | |
| 2622 +#else | |
| 2623 + .global \fname | |
| 2624 +#ifdef __ELF__ | |
| 2625 + .hidden \fname | |
| 2626 + .type \fname, %function | |
| 2627 +#endif | |
| 2628 +\fname: | |
| 2629 +#endif | |
| 2630 +.endm | |
| 2631 + | |
| 2632 +/* Transpose elements of single 128 bit registers */ | |
| 2633 +.macro transpose_single x0,x1,xi,xilen,literal | |
| 2634 + ins \xi\xilen[0], \x0\xilen[0] | |
| 2635 + ins \x1\xilen[0], \x0\xilen[1] | |
| 2636 + trn1 \x0\literal, \x0\literal, \x1\literal | |
| 2637 + trn2 \x1\literal, \xi\literal, \x1\literal | |
| 2638 +.endm | |
| 2639 + | |
| 2640 +/* Transpose elements of 2 different registers */ | |
| 2641 +.macro transpose x0,x1,xi,xilen,literal | |
| 2642 + mov \xi\xilen, \x0\xilen | |
| 2643 + trn1 \x0\literal, \x0\literal, \x1\literal | |
| 2644 + trn2 \x1\literal, \xi\literal, \x1\literal | |
| 2645 +.endm | |
| 2646 + | |
| 2647 +/* Transpose a block of 4x4 coefficients in four 64-bit registers */ | |
| 2648 +.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen | |
| 2649 + mov \xi\xilen, \x0\xilen | |
| 2650 + trn1 \x0\x0len, \x0\x0len, \x2\x2len | |
| 2651 + trn2 \x2\x2len, \xi\x0len, \x2\x2len | |
| 2652 + mov \xi\xilen, \x1\xilen | |
| 2653 + trn1 \x1\x1len, \x1\x1len, \x3\x3len | |
| 2654 + trn2 \x3\x3len, \xi\x1len, \x3\x3len | |
| 2655 +.endm | |
| 2656 + | |
| 2657 +.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen | |
| 2658 + mov \xi\xilen, \x0\xilen | |
| 2659 + trn1 \x0\x0len, \x0\x0len, \x1\x1len | |
| 2660 + trn2 \x1\x2len, \xi\x0len, \x1\x2len | |
| 2661 + mov \xi\xilen, \x2\xilen | |
| 2662 + trn1 \x2\x2len, \x2\x2len, \x3\x3len | |
| 2663 + trn2 \x3\x2len, \xi\x1len, \x3\x3len | |
| 2664 +.endm | |
| 2665 +/* 4x4 transpose of x0..x3: runs the 16-bit step (.4h) followed by the 32-bit step (.2s); x5 is the scratch register handed to both steps (copied as .16b) and is clobbered. */ | |
| 2666 +.macro transpose_4x4 x0, x1, x2, x3,x5 | |
| 2667 + transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b | |
| 2668 + transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b | |
| 2669 +.endm | |
| 2670 + | |
| 2671 + | |
| 2672 +#define CENTERJSAMPLE 128 /* bias added to the final 8-bit samples in jsimd_idct_islow_neon (signed->unsigned conversion) */ | |
| 2673 + | |
| 2674 +/*****************************************************************************/ | |
| 2675 + | |
| 2676 +/* | |
| 2677 + * Perform dequantization and inverse DCT on one block of coefficients. | |
| 2678 + * | |
| 2679 + * GLOBAL(void) | |
| 2680 + * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, | |
| 2681 + * JSAMPARRAY output_buf, JDIMENSION output_col) | |
| 2682 + */ | |
| 2683 +/* Fixed-point multipliers: each value is the real number in its name times 2^13 (8192), rounded to the nearest integer, e.g. 0.541196100 * 8192 ~= 4433. */ | |
| 2684 +#define FIX_0_298631336 (2446) | |
| 2685 +#define FIX_0_390180644 (3196) | |
| 2686 +#define FIX_0_541196100 (4433) | |
| 2687 +#define FIX_0_765366865 (6270) | |
| 2688 +#define FIX_0_899976223 (7373) | |
| 2689 +#define FIX_1_175875602 (9633) | |
| 2690 +#define FIX_1_501321110 (12299) | |
| 2691 +#define FIX_1_847759065 (15137) | |
| 2692 +#define FIX_1_961570560 (16069) | |
| 2693 +#define FIX_2_053119869 (16819) | |
| 2694 +#define FIX_2_562915447 (20995) | |
| 2695 +#define FIX_3_072711026 (25172) | |
| 2696 +/* Pre-combined sums and differences of the multipliers above; these are the values stored in the constant table used by the NEON code. */ | |
| 2697 +#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) | |
| 2698 +#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) | |
| 2699 +#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) | |
| 2700 +#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) | |
| 2701 +#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) | |
| 2702 +#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) | |
| 2703 +#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) | |
| 2704 +#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) | |
| 2705 + | |
| 2706 +/* | |
| 2707 + * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. It takes the eight inputs of one 1-D pass and assigns tmp0..tmp3 and tmp10..tmp13, which (like DCTELEM, INT32 and MULTIPLY) must be provided by whatever code expands the macro. | |
| 2708 + * Uses some ideas from the comments in 'simd/jiss2int-64.asm'. NOTE(review): the macro is not expanded anywhere in the visible part of this file; it appears to serve as documentation for the q1..q7 register usage of the NEON passes - confirm. | |
| 2709 + */ | |
| 2710 +#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ | |
| 2711 +{ \ | |
| 2712 + DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ | |
| 2713 + INT32 q1, q2, q3, q4, q5, q6, q7; \ | |
| 2714 + INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ | |
| 2715 + \ | |
| 2716 + /* 1-D iDCT input data */ \ | |
| 2717 + row0 = xrow0; \ | |
| 2718 + row1 = xrow1; \ | |
| 2719 + row2 = xrow2; \ | |
| 2720 + row3 = xrow3; \ | |
| 2721 + row4 = xrow4; \ | |
| 2722 + row5 = xrow5; \ | |
| 2723 + row6 = xrow6; \ | |
| 2724 + row7 = xrow7; \ | |
| 2725 + \ | |
| 2726 + q5 = row7 + row3; \ | |
| 2727 + q4 = row5 + row1; \ | |
| 2728 + q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ | |
| 2729 + MULTIPLY(q4, FIX_1_175875602); \ | |
| 2730 + q7 = MULTIPLY(q5, FIX_1_175875602) + \ | |
| 2731 + MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ | |
| 2732 + q2 = MULTIPLY(row2, FIX_0_541196100) + \ | |
| 2733 + MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ | |
| 2734 + q4 = q6; \ | |
| 2735 + q3 = ((INT32) row0 - (INT32) row4) << 13; \ | |
| 2736 + q6 += MULTIPLY(row5, -FIX_2_562915447) + \ | |
| 2737 + MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ | |
| 2738 + /* now we can use q1 (reloadable constants have been used up) */ \ | |
| 2739 + q1 = q3 + q2; \ | |
| 2740 + q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ | |
| 2741 + MULTIPLY(row1, -FIX_0_899976223); \ | |
| 2742 + q5 = q7; \ | |
| 2743 + q1 = q1 + q6; \ | |
| 2744 + q7 += MULTIPLY(row7, -FIX_0_899976223) + \ | |
| 2745 + MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ | |
| 2746 + \ | |
| 2747 + /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ | |
| 2748 + tmp11_plus_tmp2 = q1; \ | |
| 2749 + row1 = 0; \ | |
| 2750 + \ | |
| 2751 + q1 = q1 - q6; \ | |
| 2752 + q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ | |
| 2753 + MULTIPLY(row3, -FIX_2_562915447); \ | |
| 2754 + q1 = q1 - q6; \ | |
| 2755 + q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ | |
| 2756 + MULTIPLY(row6, FIX_0_541196100); \ | |
| 2757 + q3 = q3 - q2; \ | |
| 2758 + \ | |
| 2759 + /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ | |
| 2760 + tmp11_minus_tmp2 = q1; \ | |
| 2761 + \ | |
| 2762 + q1 = ((INT32) row0 + (INT32) row4) << 13; \ | |
| 2763 + q2 = q1 + q6; \ | |
| 2764 + q1 = q1 - q6; \ | |
| 2765 + \ | |
| 2766 + /* pick up the results */ \ | |
| 2767 + tmp0 = q4; \ | |
| 2768 + tmp1 = q5; \ | |
| 2769 + tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ | |
| 2770 + tmp3 = q7; \ | |
| 2771 + tmp10 = q2; \ | |
| 2772 + tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ | |
| 2773 + tmp12 = q3; \ | |
| 2774 + tmp13 = q1; \ | |
| 2775 +} | |
| 2776 + | |
| 2777 +#define XFIX_0_899976223 v0.4h[0] /* XFIX_* name the vector lane that holds the matching FIX_* value once the table below has been loaded into v0-v2 */ | |
| 2778 +#define XFIX_0_541196100 v0.4h[1] | |
| 2779 +#define XFIX_2_562915447 v0.4h[2] | |
| 2780 +#define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3] | |
| 2781 +#define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0] | |
| 2782 +#define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1] | |
| 2783 +#define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2] | |
| 2784 +#define XFIX_1_175875602 v1.4h[3] | |
| 2785 +#define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0] | |
| 2786 +#define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1] | |
| 2787 +#define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2] | |
| 2788 +#define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3] | |
| 2789 + | |
| 2790 +.balign 16 | |
| 2791 +jsimd_idct_islow_neon_consts: | |
| 2792 + .short FIX_0_899976223 /* v0.4h[0] */ | |
| 2793 + .short FIX_0_541196100 /* v0.4h[1] */ | |
| 2794 + .short FIX_2_562915447 /* v0.4h[2] */ | |
| 2795 + .short FIX_0_298631336_MINUS_0_899976223 /* v0.4h[3] */ | |
| 2796 + .short FIX_1_501321110_MINUS_0_899976223 /* v1.4h[0] */ | |
| 2797 + .short FIX_2_053119869_MINUS_2_562915447 /* v1.4h[1] */ | |
| 2798 + .short FIX_0_541196100_PLUS_0_765366865 /* v1.4h[2] */ | |
| 2799 + .short FIX_1_175875602 /* v1.4h[3] */ | |
| 2800 + /* reloadable constants: v2 is also used as a scratch register by the IDCT passes, so this group (byte offset 16 of the table) is fetched again with "ld1 {v2.4h}, [x15]" before each pass */ | |
| 2801 + .short FIX_1_175875602_MINUS_0_390180644 /* v2.4h[0] */ | |
| 2802 + .short FIX_0_541196100_MINUS_1_847759065 /* v2.4h[1] */ | |
| 2803 + .short FIX_3_072711026_MINUS_2_562915447 /* v2.4h[2] */ | |
| 2804 + .short FIX_1_175875602_MINUS_1_961570560 /* v2.4h[3] */ | |
| 2805 + | |
| 2806 +asm_function jsimd_idct_islow_neon | |
| 2807 +/* Entry: x0 = dct_table, x1 = coef_block, x2 = output_buf, x3 = output_col (see the aliases below). TMP1-TMP3 reuse x0-x2 after the corresponding argument has been consumed; TMP4 is x15, which first holds the constant-table address. */ | |
| 2808 + DCT_TABLE .req x0 | |
| 2809 + COEF_BLOCK .req x1 | |
| 2810 + OUTPUT_BUF .req x2 | |
| 2811 + OUTPUT_COL .req x3 | |
| 2812 + TMP1 .req x0 | |
| 2813 + TMP2 .req x1 | |
| 2814 + TMP3 .req x2 | |
| 2815 + TMP4 .req x15 | |
| 2816 + | |
| 2817 + ROW0L .req v16 | |
| 2818 + ROW0R .req v17 | |
| 2819 + ROW1L .req v18 | |
| 2820 + ROW1R .req v19 | |
| 2821 + ROW2L .req v20 | |
| 2822 + ROW2R .req v21 | |
| 2823 + ROW3L .req v22 | |
| 2824 + ROW3R .req v23 | |
| 2825 + ROW4L .req v24 | |
| 2826 + ROW4R .req v25 | |
| 2827 + ROW5L .req v26 | |
| 2828 + ROW5R .req v27 | |
| 2829 + ROW6L .req v28 | |
| 2830 + ROW6R .req v29 | |
| 2831 + ROW7L .req v30 | |
| 2832 + ROW7R .req v31 | |
| 2833 + /* Save the low 64 bits (.8b) of all 32 NEON registers plus x15: 32 NEON registers * 8 bytes + 16 = 272 bytes. The post-incremented stores walk sp back up to its entry value, so the saved data ends up below sp. NOTE(review): confirm the target ABI tolerates live data below sp (e.g. that no asynchronous signal frame can overwrite it). */ | |
| 2834 + sub sp, sp, 272 | |
| 2835 + str x15, [sp], 16 | |
| 2836 + adr x15, jsimd_idct_islow_neon_consts | |
| 2837 + st1 {v0.8b - v3.8b}, [sp], 32 | |
| 2838 + st1 {v4.8b - v7.8b}, [sp], 32 | |
| 2839 + st1 {v8.8b - v11.8b}, [sp], 32 | |
| 2840 + st1 {v12.8b - v15.8b}, [sp], 32 | |
| 2841 + st1 {v16.8b - v19.8b}, [sp], 32 | |
| 2842 + st1 {v20.8b - v23.8b}, [sp], 32 | |
| 2843 + st1 {v24.8b - v27.8b}, [sp], 32 | |
| 2844 + st1 {v28.8b - v31.8b}, [sp], 32 | |
| 2845 + ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 /* load coefficients and multiply them by the dct_table entries, 32 bytes at a time */ | |
| 2846 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 | |
| 2847 + ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 | |
| 2848 + mul v16.4h, v16.4h, v0.4h | |
| 2849 + mul v17.4h, v17.4h, v1.4h | |
| 2850 + ins v16.2d[1], v17.2d[0] /* 128 bit q8 */ | |
| 2851 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 | |
| 2852 + mul v18.4h, v18.4h, v2.4h | |
| 2853 + mul v19.4h, v19.4h, v3.4h | |
| 2854 + ins v18.2d[1], v19.2d[0] /* 128 bit q9 */ | |
| 2855 + ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 | |
| 2856 + mul v20.4h, v20.4h, v4.4h | |
| 2857 + mul v21.4h, v21.4h, v5.4h | |
| 2858 + ins v20.2d[1], v21.2d[0] /* 128 bit q10 */ | |
| 2859 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 | |
| 2860 + mul v22.4h, v22.4h, v6.4h | |
| 2861 + mul v23.4h, v23.4h, v7.4h | |
| 2862 + ins v22.2d[1], v23.2d[0] /* 128 bit q11 */ | |
| 2863 + ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] /* no post-increment: COEF_BLOCK stays 96 bytes past the start of the block */ | |
| 2864 + mul v24.4h, v24.4h, v0.4h | |
| 2865 + mul v25.4h, v25.4h, v1.4h | |
| 2866 + ins v24.2d[1], v25.2d[0] /* 128 bit q12 */ | |
| 2867 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 | |
| 2868 + mul v28.4h, v28.4h, v4.4h | |
| 2869 + mul v29.4h, v29.4h, v5.4h | |
| 2870 + ins v28.2d[1], v29.2d[0] /* 128 bit q14 */ | |
| 2871 + mul v26.4h, v26.4h, v2.4h | |
| 2872 + mul v27.4h, v27.4h, v3.4h | |
| 2873 + ins v26.2d[1], v27.2d[0] /* 128 bit q13 */ | |
| 2874 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */ | |
| 2875 + add x15, x15, #16 /* x15 now points at the reloadable (v2) group of the constant table */ | |
| 2876 + mul v30.4h, v30.4h, v6.4h | |
| 2877 + mul v31.4h, v31.4h, v7.4h | |
| 2878 + ins v30.2d[1], v31.2d[0] /* 128 bit q15 */ | |
| 2879 + /* Go to the bottom of the stack: 352 bytes below the entry sp, i.e. 80 bytes under the 272-byte save area. x4/x5 and v8-v15 are stored there; the post-increments leave sp at the base of the 272-byte area. */ | |
| 2880 + sub sp, sp, 352 | |
| 2881 + stp x4, x5, [sp], 16 | |
| 2882 + st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ | |
| 2883 + st1 {v12.4h - v15.4h}, [sp], 32 | |
| 2884 + /* 1-D IDCT, pass 1, left 4x8 half */ | |
| 2885 + add v4.4h, ROW7L.4h, ROW3L.4h | |
| 2886 + add v5.4h, ROW5L.4h, ROW1L.4h | |
| 2887 + smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 | |
| 2888 + smlal v12.4s, v5.4h, XFIX_1_175875602 | |
| 2889 + smull v14.4s, v4.4h, XFIX_1_175875602 | |
| 2890 + /* Check for the zero coefficients in the right 4x8 half: interleaved with the arithmetic, the ldp/orr pairs below OR columns 4-7 of rows 1-7 of the coefficient block in memory into x0 (COEF_BLOCK is 96 bytes past the block start, hence the -96 bias) */ | |
| 2891 + smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644 | |
| 2892 + ssubl v6.4s, ROW0L.4h, ROW4L.4h | |
| 2893 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] | |
| 2894 + smull v4.4s, ROW2L.4h, XFIX_0_541196100 | |
| 2895 + smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065 | |
| 2896 + orr x0, x4, x5 | |
| 2897 + mov v8.16b, v12.16b | |
| 2898 + smlsl v12.4s, ROW5L.4h, XFIX_2_562915447 | |
| 2899 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] | |
| 2900 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 | |
| 2901 + shl v6.4s, v6.4s, #13 | |
| 2902 + orr x0, x0, x4 | |
| 2903 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 | |
| 2904 + orr x0, x0 , x5 | |
| 2905 + add v2.4s, v6.4s, v4.4s | |
| 2906 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] | |
| 2907 + mov v10.16b, v14.16b | |
| 2908 + add v2.4s, v2.4s, v12.4s | |
| 2909 + orr x0, x0, x4 | |
| 2910 + smlsl v14.4s, ROW7L.4h, XFIX_0_899976223 | |
| 2911 + orr x0, x0, x5 | |
| 2912 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 | |
| 2913 + rshrn ROW1L.4h, v2.4s, #11 | |
| 2914 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] | |
| 2915 + sub v2.4s, v2.4s, v12.4s | |
| 2916 + smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447 | |
| 2917 + orr x0, x0, x4 | |
| 2918 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 | |
| 2919 + orr x0, x0, x5 | |
| 2920 + sub v2.4s, v2.4s, v12.4s | |
| 2921 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 | |
| 2922 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] | |
| 2923 + smlal v12.4s, ROW6L.4h, XFIX_0_541196100 | |
| 2924 + sub v6.4s, v6.4s, v4.4s | |
| 2925 + orr x0, x0, x4 | |
| 2926 + rshrn ROW6L.4h, v2.4s, #11 | |
| 2927 + orr x0, x0, x5 | |
| 2928 + add v2.4s, v6.4s, v10.4s | |
| 2929 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] | |
| 2930 + sub v6.4s, v6.4s, v10.4s | |
| 2931 + saddl v10.4s, ROW0L.4h, ROW4L.4h | |
| 2932 + orr x0, x0, x4 | |
| 2933 + rshrn ROW2L.4h, v2.4s, #11 | |
| 2934 + orr x0, x0, x5 | |
| 2935 + rshrn ROW5L.4h, v6.4s, #11 | |
| 2936 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] | |
| 2937 + shl v10.4s, v10.4s, #13 | |
| 2938 + smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223 | |
| 2939 + orr x0, x0, x4 | |
| 2940 + add v4.4s, v10.4s, v12.4s | |
| 2941 + orr x0, x0, x5 | |
| 2942 + cmp x0, #0 /* orrs instruction removed: Z is set when columns 4-7 of rows 1-7 are all zero; the flags are consumed by the beq below */ | |
| 2943 + sub v2.4s, v10.4s, v12.4s | |
| 2944 + add v12.4s, v4.4s, v14.4s | |
| 2945 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] | |
| 2946 + sub v4.4s, v4.4s, v14.4s | |
| 2947 + add v10.4s, v2.4s, v8.4s | |
| 2948 + orr x0, x4, x5 /* x0 = columns 4-7 of row 0 only; tested at label 3 */ | |
| 2949 + sub v6.4s, v2.4s, v8.4s | |
| 2950 + /* pop {x4, x5}: step back down to the slot written by the stp above; sp is left just above it, at the saved v8-v15 */ | |
| 2951 + sub sp, sp, 80 | |
| 2952 + ldp x4, x5, [sp], 16 | |
| 2953 + rshrn ROW7L.4h, v4.4s, #11 | |
| 2954 + rshrn ROW3L.4h, v10.4s, #11 | |
| 2955 + rshrn ROW0L.4h, v12.4s, #11 | |
| 2956 + rshrn ROW4L.4h, v6.4s, #11 | |
| 2957 + | |
| 2958 + beq 3f /* Go to do some special handling for the sparse right
4x8 half */ | |
| 2959 + | |
| 2960 + /* 1-D IDCT, pass 1, right 4x8 half; the transposition of the left half computed above is interleaved with it */ | |
| 2961 + ld1 {v2.4h}, [x15] /* reload constants */ | |
| 2962 + add v10.4h, ROW7R.4h, ROW3R.4h | |
| 2963 + add v8.4h, ROW5R.4h, ROW1R.4h | |
| 2964 + /* Transpose ROW6L <-> ROW7L (v3 available free register) */ | |
| 2965 + transpose ROW6L, ROW7L, v3, .16b, .4h | |
| 2966 + smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560 | |
| 2967 + smlal v12.4s, v8.4h, XFIX_1_175875602 | |
| 2968 + /* Transpose ROW2L <-> ROW3L (v3 available free register) */ | |
| 2969 + transpose ROW2L, ROW3L, v3, .16b, .4h | |
| 2970 + smull v14.4s, v10.4h, XFIX_1_175875602 | |
| 2971 + smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644 | |
| 2972 + /* Transpose ROW0L <-> ROW1L (v3 available free register) */ | |
| 2973 + transpose ROW0L, ROW1L, v3, .16b, .4h | |
| 2974 + ssubl v6.4s, ROW0R.4h, ROW4R.4h | |
| 2975 + smull v4.4s, ROW2R.4h, XFIX_0_541196100 | |
| 2976 + smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 | |
| 2977 + /* Transpose ROW4L <-> ROW5L (v3 available free register) */ | |
| 2978 + transpose ROW4L, ROW5L, v3, .16b, .4h | |
| 2979 + mov v8.16b, v12.16b | |
| 2980 + smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 | |
| 2981 + smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447 | |
| 2982 + /* Transpose ROW1L <-> ROW3L (v3 available free register) */ | |
| 2983 + transpose ROW1L, ROW3L, v3, .16b, .2s | |
| 2984 + shl v6.4s, v6.4s, #13 | |
| 2985 + smlsl v8.4s, ROW1R.4h, XFIX_0_899976223 | |
| 2986 + /* Transpose ROW4L <-> ROW6L (v3 available free register) */ | |
| 2987 + transpose ROW4L, ROW6L, v3, .16b, .2s | |
| 2988 + add v2.4s, v6.4s, v4.4s | |
| 2989 + mov v10.16b, v14.16b | |
| 2990 + add v2.4s, v2.4s, v12.4s | |
| 2991 + /* Transpose ROW0L <-> ROW2L (v3 available free register) */ | |
| 2992 + transpose ROW0L, ROW2L, v3, .16b, .2s | |
| 2993 + smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 | |
| 2994 + smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223 | |
| 2995 + rshrn ROW1R.4h, v2.4s, #11 | |
| 2996 + /* Transpose ROW5L <-> ROW7L (v3 available free register) */ | |
| 2997 + transpose ROW5L, ROW7L, v3, .16b, .2s | |
| 2998 + sub v2.4s, v2.4s, v12.4s | |
| 2999 + smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 | |
| 3000 + smlsl v10.4s, ROW3R.4h, XFIX_2_562915447 | |
| 3001 + sub v2.4s, v2.4s, v12.4s | |
| 3002 + smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865 | |
| 3003 + smlal v12.4s, ROW6R.4h, XFIX_0_541196100 | |
| 3004 + sub v6.4s, v6.4s, v4.4s | |
| 3005 + rshrn ROW6R.4h, v2.4s, #11 | |
| 3006 + add v2.4s, v6.4s, v10.4s | |
| 3007 + sub v6.4s, v6.4s, v10.4s | |
| 3008 + saddl v10.4s, ROW0R.4h, ROW4R.4h | |
| 3009 + rshrn ROW2R.4h, v2.4s, #11 | |
| 3010 + rshrn ROW5R.4h, v6.4s, #11 | |
| 3011 + shl v10.4s, v10.4s, #13 | |
| 3012 + smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 | |
| 3013 + add v4.4s, v10.4s, v12.4s | |
| 3014 + sub v2.4s, v10.4s, v12.4s | |
| 3015 + add v12.4s, v4.4s, v14.4s | |
| 3016 + sub v4.4s, v4.4s, v14.4s | |
| 3017 + add v10.4s, v2.4s, v8.4s | |
| 3018 + sub v6.4s, v2.4s, v8.4s | |
| 3019 + rshrn ROW7R.4h, v4.4s, #11 | |
| 3020 + rshrn ROW3R.4h, v10.4s, #11 | |
| 3021 + rshrn ROW0R.4h, v12.4s, #11 | |
| 3022 + rshrn ROW4R.4h, v6.4s, #11 | |
| 3023 + /* Transpose right 4x8 half */ | |
| 3024 + transpose ROW6R, ROW7R, v3, .16b, .4h | |
| 3025 + transpose ROW2R, ROW3R, v3, .16b, .4h | |
| 3026 + transpose ROW0R, ROW1R, v3, .16b, .4h | |
| 3027 + transpose ROW4R, ROW5R, v3, .16b, .4h | |
| 3028 + transpose ROW1R, ROW3R, v3, .16b, .2s | |
| 3029 + transpose ROW4R, ROW6R, v3, .16b, .2s | |
| 3030 + transpose ROW0R, ROW2R, v3, .16b, .2s | |
| 3031 + transpose ROW5R, ROW7R, v3, .16b, .2s | |
| 3032 + | |
| 3033 +1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half. Results are narrowed with shrn #16. NOTE(review): in the "X <-> Y" remarks below the two named registers appear to exchange roles instead of being physically swapped - confirm. */ | |
| 3034 + ld1 {v2.4h}, [x15] /* reload constants */ | |
| 3035 + smull v12.4S, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.
4h */ | |
| 3036 + smlal v12.4s, ROW1L.4h, XFIX_1_175875602 | |
| 3037 + smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* R
OW7L.4h <-> ROW3R.4h */ | |
| 3038 + smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 | |
| 3039 + smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.
4h */ | |
| 3040 + smlal v14.4s, ROW3L.4h, XFIX_1_175875602 | |
| 3041 + smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* R
OW5L.4h <-> ROW1R.4h */ | |
| 3042 + smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 | |
| 3043 + ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
| 3044 + smull v4.4s, ROW2L.4h, XFIX_0_541196100 | |
| 3045 + smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* R
OW6L.4h <-> ROW2R.4h */ | |
| 3046 + mov v8.16b, v12.16b | |
| 3047 + smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.
4h */ | |
| 3048 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 | |
| 3049 + shl v6.4s, v6.4s, #13 | |
| 3050 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 | |
| 3051 + add v2.4s, v6.4s, v4.4s | |
| 3052 + mov v10.16b, v14.16b | |
| 3053 + add v2.4s, v2.4s, v12.4s | |
| 3054 + smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.
4h */ | |
| 3055 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 | |
| 3056 + shrn ROW1L.4h, v2.4s, #16 | |
| 3057 + sub v2.4s, v2.4s, v12.4s | |
| 3058 + smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* R
OW5L.4h <-> ROW1R.4h */ | |
| 3059 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 | |
| 3060 + sub v2.4s, v2.4s, v12.4s | |
| 3061 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 | |
| 3062 + smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.
4h */ | |
| 3063 + sub v6.4s, v6.4s, v4.4s | |
| 3064 + shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | |
| 3065 + add v2.4s, v6.4s, v10.4s | |
| 3066 + sub v6.4s, v6.4s, v10.4s | |
| 3067 + saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
| 3068 + shrn ROW2L.4h, v2.4s, #16 | |
| 3069 + shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | |
| 3070 + shl v10.4s, v10.4s, #13 | |
| 3071 + smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* R
OW7L.4h <-> ROW3R.4h */ | |
| 3072 + add v4.4s, v10.4s, v12.4s | |
| 3073 + sub v2.4s, v10.4s, v12.4s | |
| 3074 + add v12.4s, v4.4s, v14.4s | |
| 3075 + sub v4.4s, v4.4s, v14.4s | |
| 3076 + add v10.4s, v2.4s, v8.4s | |
| 3077 + sub v6.4s, v2.4s, v8.4s | |
| 3078 + shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | |
| 3079 + shrn ROW3L.4h, v10.4s, #16 | |
| 3080 + shrn ROW0L.4h, v12.4s, #16 | |
| 3081 + shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | |
| 3082 + /* 1-D IDCT, pass 2, right 4x8 half */ | |
| 3083 + ld1 {v2.4h}, [x15] /* reload constants */ | |
| 3084 + smull v12.4s, ROW5R.4h, XFIX_1_175875602 | |
| 3085 + smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.
4h */ | |
| 3086 + smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560 | |
| 3087 + smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* R
OW7L.4h <-> ROW3R.4h */ | |
| 3088 + smull v14.4s, ROW7R.4h, XFIX_1_175875602 | |
| 3089 + smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.
4h */ | |
| 3090 + smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644 | |
| 3091 + smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* R
OW5L.4h <-> ROW1R.4h */ | |
| 3092 + ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
| 3093 + smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.
4h */ | |
| 3094 + smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 | |
| 3095 + mov v8.16b, v12.16b | |
| 3096 + smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 | |
| 3097 + smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* R
OW7L.4h <-> ROW3R.4h */ | |
| 3098 + shl v6.4s, v6.4s, #13 | |
| 3099 + smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.
4h */ | |
| 3100 + add v2.4s, v6.4s, v4.4s | |
| 3101 + mov v10.16b, v14.16b | |
| 3102 + add v2.4s, v2.4s, v12.4s | |
| 3103 + smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 | |
| 3104 + smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* R
OW5L.4h <-> ROW1R.4h */ | |
| 3105 + shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | |
| 3106 + sub v2.4s, v2.4s, v12.4s | |
| 3107 + smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 | |
| 3108 + smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.
4h */ | |
| 3109 + sub v2.4s, v2.4s, v12.4s | |
| 3110 + smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* RO
W6L.4h <-> ROW2R.4h */ | |
| 3111 + smlal v12.4s, ROW6R.4h, XFIX_0_541196100 | |
| 3112 + sub v6.4s, v6.4s, v4.4s | |
| 3113 + shrn ROW6R.4h, v2.4s, #16 | |
| 3114 + add v2.4s, v6.4s, v10.4s | |
| 3115 + sub v6.4s, v6.4s, v10.4s | |
| 3116 + saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ | |
| 3117 + shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | |
| 3118 + shrn ROW5R.4h, v6.4s, #16 | |
| 3119 + shl v10.4s, v10.4s, #13 | |
| 3120 + smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 | |
| 3121 + add v4.4s, v10.4s, v12.4s | |
| 3122 + sub v2.4s, v10.4s, v12.4s | |
| 3123 + add v12.4s, v4.4s, v14.4s | |
| 3124 + sub v4.4s, v4.4s, v14.4s | |
| 3125 + add v10.4s, v2.4s, v8.4s | |
| 3126 + sub v6.4s, v2.4s, v8.4s | |
| 3127 + shrn ROW7R.4h, v4.4s, #16 | |
| 3128 + shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | |
| 3129 + shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | |
| 3130 + shrn ROW4R.4h, v6.4s, #16 | |
| 3131 + | |
| 3132 +2: /* Descale to 8-bit and range limit: each L/R register pair is merged into one 128-bit register, then narrowed with a saturating, rounding right shift by 2 (sqrshrn/sqrshrn2) */ | |
| 3133 + ins v16.2d[1], v17.2d[0] | |
| 3134 + ins v18.2d[1], v19.2d[0] | |
| 3135 + ins v20.2d[1], v21.2d[0] | |
| 3136 + ins v22.2d[1], v23.2d[0] | |
| 3137 + sqrshrn v16.8b, v16.8h, #2 | |
| 3138 + sqrshrn2 v16.16b, v18.8h, #2 | |
| 3139 + sqrshrn v18.8b, v20.8h, #2 | |
| 3140 + sqrshrn2 v18.16b, v22.8h, #2 | |
| 3141 + | |
| 3142 + /* vpop {v8.4h - v15.4h} */ /* restore NEON registers */ | |
| 3143 + ld1 {v8.4h - v11.4h}, [sp], 32 | |
| 3144 + ld1 {v12.4h - v15.4h}, [sp], 32 | |
| 3145 + ins v24.2d[1], v25.2d[0] | |
| 3146 + | |
| 3147 + sqrshrn v20.8b, v24.8h, #2 | |
| 3148 + /* Transpose the final 8-bit samples and do signed->unsigned conversion *
/ | |
| 3149 + /* trn1 v16.8h, v16.8h, v18.8h */ | |
| 3150 + transpose v16, v18, v3, .16b, .8h | |
| 3151 + ins v26.2d[1], v27.2d[0] | |
| 3152 + ins v28.2d[1], v29.2d[0] | |
| 3153 + ins v30.2d[1], v31.2d[0] | |
| 3154 + sqrshrn2 v20.16b, v26.8h, #2 | |
| 3155 + sqrshrn v22.8b, v28.8h, #2 | |
| 3156 + movi v0.16b, #(CENTERJSAMPLE) /* bias added to every sample below */ | |
| 3157 + sqrshrn2 v22.16b, v30.8h, #2 | |
| 3158 + transpose_single v16, v17, v3, .2d, .8b | |
| 3159 + transpose_single v18, v19, v3, .2d, .8b | |
| 3160 + add v16.8b, v16.8b, v0.8b | |
| 3161 + add v17.8b, v17.8b, v0.8b | |
| 3162 + add v18.8b, v18.8b, v0.8b | |
| 3163 + add v19.8b, v19.8b, v0.8b | |
| 3164 + transpose v20, v22, v3, .16b, .8h | |
| 3165 + /* Store results to the output buffer: row pointers are read pairwise from OUTPUT_BUF, OUTPUT_COL is added to each, and 8 bytes are stored per row */ | |
| 3166 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
| 3167 + add TMP1, TMP1, OUTPUT_COL | |
| 3168 + add TMP2, TMP2, OUTPUT_COL | |
| 3169 + st1 {v16.8b}, [TMP1] | |
| 3170 + transpose_single v20, v21, v3, .2d, .8b | |
| 3171 + st1 {v17.8b}, [TMP2] | |
| 3172 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
| 3173 + add TMP1, TMP1, OUTPUT_COL | |
| 3174 + add TMP2, TMP2, OUTPUT_COL | |
| 3175 + st1 {v18.8b}, [TMP1] | |
| 3176 + add v20.8b, v20.8b, v0.8b | |
| 3177 + add v21.8b, v21.8b, v0.8b | |
| 3178 + st1 {v19.8b}, [TMP2] | |
| 3179 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
| 3180 + ldp TMP3, TMP4, [OUTPUT_BUF] | |
| 3181 + add TMP1, TMP1, OUTPUT_COL | |
| 3182 + add TMP2, TMP2, OUTPUT_COL | |
| 3183 + add TMP3, TMP3, OUTPUT_COL | |
| 3184 + add TMP4, TMP4, OUTPUT_COL | |
| 3185 + transpose_single v22, v23, v3, .2d, .8b | |
| 3186 + st1 {v20.8b}, [TMP1] | |
| 3187 + add v22.8b, v22.8b, v0.8b | |
| 3188 + add v23.8b, v23.8b, v0.8b | |
| 3189 + st1 {v21.8b}, [TMP2] | |
| 3190 + st1 {v22.8b}, [TMP3] | |
| 3191 + st1 {v23.8b}, [TMP4] | |
| 3192 + ldr x15, [sp], 16 /* restore x15 and the low 64 bits of v0-v31 saved on entry; the post-increments bring sp back to its entry value */ | |
| 3193 + ld1 {v0.8b - v3.8b}, [sp], 32 | |
| 3194 + ld1 {v4.8b - v7.8b}, [sp], 32 | |
| 3195 + ld1 {v8.8b - v11.8b}, [sp], 32 | |
| 3196 + ld1 {v12.8b - v15.8b}, [sp], 32 | |
| 3197 + ld1 {v16.8b - v19.8b}, [sp], 32 | |
| 3198 + ld1 {v20.8b - v23.8b}, [sp], 32 | |
| 3199 + ld1 {v24.8b - v27.8b}, [sp], 32 | |
| 3200 + ld1 {v28.8b - v31.8b}, [sp], 32 | |
| 3201 + ret /* return through x30; replaces "blr x30", which is a call instruction that overwrites x30 */ | |
| 3202 + | |
| 3203 +3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros (reached when columns 4-7 of rows 1-7 of the coefficient block are all zero) */ | |
| 3204 + | |
| 3205 + /* Transpose left 4x8 half */ | |
| 3206 + transpose ROW6L, ROW7L, v3, .16b, .4h | |
| 3207 + transpose ROW2L, ROW3L, v3, .16b, .4h | |
| 3208 + transpose ROW0L, ROW1L, v3, .16b, .4h | |
| 3209 + transpose ROW4L, ROW5L, v3, .16b, .4h | |
| 3210 + shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ | |
| 3211 + transpose ROW1L, ROW3L, v3, .16b, .2s | |
| 3212 + transpose ROW4L, ROW6L, v3, .16b, .2s | |
| 3213 + transpose ROW0L, ROW2L, v3, .16b, .2s | |
| 3214 + transpose ROW5L, ROW7L, v3, .16b, .2s | |
| 3215 + cmp x0, #0 /* x0 holds columns 4-7 of row 0, loaded during pass 1 */ | |
| 3216 + beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second p
ass */ | |
| 3217 + | |
| 3218 + /* Only row 0 is non-zero for the right 4x8 half: broadcast each of its four (already shifted) elements into the ROWnR registers used by the normal second pass */ | |
| 3219 + dup ROW1R.4h, ROW0R.4h[1] | |
| 3220 + dup ROW2R.4h, ROW0R.4h[2] | |
| 3221 + dup ROW3R.4h, ROW0R.4h[3] | |
| 3222 + dup ROW4R.4h, ROW0R.4h[0] | |
| 3223 + dup ROW5R.4h, ROW0R.4h[1] | |
| 3224 + dup ROW6R.4h, ROW0R.4h[2] | |
| 3225 + dup ROW7R.4h, ROW0R.4h[3] | |
| 3226 + dup ROW0R.4h, ROW0R.4h[0] /* done last because ROW0R is the source of all the dups above */ | |
| 3227 + b 1b /* Go to 'normal' second pass */ | |
| 3228 + | |
| 3229 +4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half. Taken when columns 4-7 of every row of the coefficient block are zero, so only the ROWnL registers are read as inputs. */ | |
| 3230 + ld1 {v2.4h}, [x15] /* reload constants */ | |
| 3231 + smull v12.4s, ROW1L.4h, XFIX_1_175875602 | |
| 3232 + smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 | |
| 3233 + smull v14.4s, ROW3L.4h, XFIX_1_175875602 | |
| 3234 + smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 | |
| 3235 + smull v4.4s, ROW2L.4h, XFIX_0_541196100 | |
| 3236 + sshll v6.4s, ROW0L.4h, #13 | |
| 3237 + mov v8.16b, v12.16b | |
| 3238 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 | |
| 3239 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 | |
| 3240 + add v2.4s, v6.4s, v4.4s | |
| 3241 + mov v10.16b, v14.16b | |
| 3242 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 | |
| 3243 + add v2.4s, v2.4s, v12.4s | |
| 3244 + add v12.4s, v12.4s, v12.4s | |
| 3245 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 | |
| 3246 + shrn ROW1L.4h, v2.4s, #16 | |
| 3247 + sub v2.4s, v2.4s, v12.4s | |
| 3248 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 | |
| 3249 + sub v6.4s, v6.4s, v4.4s | |
| 3250 + shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | |
| 3251 + add v2.4s, v6.4s, v10.4s | |
| 3252 + sub v6.4s, v6.4s, v10.4s | |
| 3253 + sshll v10.4s, ROW0L.4h, #13 | |
| 3254 + shrn ROW2L.4h, v2.4s, #16 | |
| 3255 + shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | |
| 3256 + add v4.4s, v10.4s, v12.4s | |
| 3257 + sub v2.4s, v10.4s, v12.4s | |
| 3258 + add v12.4s, v4.4s, v14.4s | |
| 3259 + sub v4.4s, v4.4s, v14.4s | |
| 3260 + add v10.4s, v2.4s, v8.4s | |
| 3261 + sub v6.4s, v2.4s, v8.4s | |
| 3262 + shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | |
| 3263 + shrn ROW3L.4h, v10.4s, #16 | |
| 3264 + shrn ROW0L.4h, v12.4s, #16 | |
| 3265 + shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | |
| 3266 + /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ | |
| 3267 + ld1 {v2.4h}, [x15] /* reload constants */ | |
| 3268 + smull v12.4s, ROW5L.4h, XFIX_1_175875602 | |
| 3269 + smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 | |
| 3270 + smull v14.4s, ROW7L.4h, XFIX_1_175875602 | |
| 3271 + smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 | |
| 3272 + smull v4.4s, ROW6L.4h, XFIX_0_541196100 | |
| 3273 + sshll v6.4s, ROW4L.4h, #13 | |
| 3274 + mov v8.16b, v12.16b | |
| 3275 + smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 | |
| 3276 + smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 | |
| 3277 + add v2.4s, v6.4s, v4.4s | |
| 3278 + mov v10.16b, v14.16b | |
| 3279 + smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 | |
| 3280 + add v2.4s, v2.4s, v12.4s | |
| 3281 + add v12.4s, v12.4s, v12.4s | |
| 3282 + smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 | |
| 3283 + shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ | |
| 3284 + sub v2.4s, v2.4s, v12.4s | |
| 3285 + smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 | |
| 3286 + sub v6.4s, v6.4s, v4.4s | |
| 3287 + shrn ROW6R.4h, v2.4s, #16 | |
| 3288 + add v2.4s, v6.4s, v10.4s | |
| 3289 + sub v6.4s, v6.4s, v10.4s | |
| 3290 + sshll v10.4s, ROW4L.4h, #13 | |
| 3291 + shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ | |
| 3292 + shrn ROW5R.4h, v6.4s, #16 | |
| 3293 + add v4.4s, v10.4s, v12.4s | |
| 3294 + sub v2.4s, v10.4s, v12.4s | |
| 3295 + add v12.4s, v4.4s, v14.4s | |
| 3296 + sub v4.4s, v4.4s, v14.4s | |
| 3297 + add v10.4s, v2.4s, v8.4s | |
| 3298 + sub v6.4s, v2.4s, v8.4s | |
| 3299 + shrn ROW7R.4h, v4.4s, #16 | |
| 3300 + shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | |
| 3301 + shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | |
| 3302 + shrn ROW4R.4h, v6.4s, #16 | |
| 3303 + b 2b /* Go to epilogue */ | |
| 3304 + | |
| 3305 + .unreq DCT_TABLE /* release every register alias defined with .req at the top of jsimd_idct_islow_neon */ | |
| 3306 + .unreq COEF_BLOCK | |
| 3307 + .unreq OUTPUT_BUF | |
| 3308 + .unreq OUTPUT_COL | |
| 3309 + .unreq TMP1 | |
| 3310 + .unreq TMP2 | |
| 3311 + .unreq TMP3 | |
| 3312 + .unreq TMP4 | |
| 3313 + | |
| 3314 + .unreq ROW0L | |
| 3315 + .unreq ROW0R | |
| 3316 + .unreq ROW1L | |
| 3317 + .unreq ROW1R | |
| 3318 + .unreq ROW2L | |
| 3319 + .unreq ROW2R | |
| 3320 + .unreq ROW3L | |
| 3321 + .unreq ROW3R | |
| 3322 + .unreq ROW4L | |
| 3323 + .unreq ROW4R | |
| 3324 + .unreq ROW5L | |
| 3325 + .unreq ROW5R | |
| 3326 + .unreq ROW6L | |
| 3327 + .unreq ROW6R | |
| 3328 + .unreq ROW7L | |
| 3329 + .unreq ROW7R | |
| 3330 + | |
| 3331 + | |
| 3332 +/*****************************************************************************/ | |
| 3333 + | |
| 3334 +/* | |
| 3335 + * jsimd_idct_ifast_neon | |
| 3336 + * | |
| 3337 + * This function contains a fast, not so accurate integer implementation of | |
| 3338 + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations | |
| 3339 + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' | |
| 3340 + * function from jidctfst.c | |
| 3341 + * | |
| 3342 + * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. | |
| 3343 + * But in ARM NEON case some extra additions are required because VQDMULH | |
| 3344 + * instruction can't handle the constants larger than 1. So the expressions | |
| 3345 + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", | |
| 3346 + * which introduces an extra addition. Overall, there are 6 extra additions | |
| 3347 + * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. | |
| 3348 + */ | |
| 3349 + | |
| 3350 +#define XFIX_1_082392200 v0.4h[0] | |
| 3351 +#define XFIX_1_414213562 v0.4h[1] | |
| 3352 +#define XFIX_1_847759065 v0.4h[2] | |
| 3353 +#define XFIX_2_613125930 v0.4h[3] | |
| 3354 + /* The table below holds only the part of each multiplier above 1.0 (or 2.0), scaled by 128 * 256 = 32768 (Q15), the operand format SQDMULH expects; the code adds x (or 2x) back separately. */ | |
| 3355 +.balign 16 | |
| 3356 +jsimd_idct_ifast_neon_consts: | |
| 3357 + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200: 277/256 - 1 in Q15 */ | |
| 3358 + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562: 362/256 - 1 in Q15 */ | |
| 3359 + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065: 473/256 - 1 in Q15 */ | |
| 3360 + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930: 669/256 - 2 in Q15 */ | |
| 3361 + | |
| 3362 +asm_function jsimd_idct_ifast_neon | |
| 3363 + | |
| 3364 + DCT_TABLE .req x0 | |
| 3365 + COEF_BLOCK .req x1 | |
| 3366 + OUTPUT_BUF .req x2 | |
| 3367 + OUTPUT_COL .req x3 | |
| 3368 + TMP1 .req x0 /* reuses x0 once DCT_TABLE has been consumed */ | |
| 3369 + TMP2 .req x1 /* reuses x1 once COEF_BLOCK has been consumed */ | |
| 3370 + TMP3 .req x2 | |
| 3371 + TMP4 .req x22 | |
| 3372 + TMP5 .req x23 | |
| 3373 + | |
| 3374 + /* Load and dequantize coefficients into NEON registers | |
| 3375 + * with the following allocation (one 128-bit register per row): | |
| 3376 + * 0 1 2 3 | 4 5 6 7 | |
| 3377 + * ---------+-------- | |
| 3378 + * 0 | v8.d[0] | v8.d[1] ( v8.8h ) | |
| 3379 + * 1 | v9.d[0] | v9.d[1] ( v9.8h ) | |
| 3380 + * 2 | v10.d[0] | v10.d[1] ( v10.8h ) | |
| 3381 + * 3 | v11.d[0] | v11.d[1] ( v11.8h ) | |
| 3382 + * 4 | v12.d[0] | v12.d[1] ( v12.8h ) | |
| 3383 + * 5 | v13.d[0] | v13.d[1] ( v13.8h ) | |
| 3384 + * 6 | v14.d[0] | v14.d[1] ( v14.8h ) | |
| 3385 + * 7 | v15.d[0] | v15.d[1] ( v15.8h ) | |
| 3386 + */ | |
| 3387 + /* Save x22/x23 and the low 64 bits of v0-v19; sp stays at the bottom of the save area so nothing live is ever below sp */ | |
| 3388 + stp x22, x23, [sp, #-176]! /* allocate 16 + 5 * 32 bytes, x22/x23 at [sp] */ | |
| 3389 + add x9, sp, #16 /* x9 = store cursor (caller-saved scratch) */ | |
| 3390 + adr x23, jsimd_idct_ifast_neon_consts | |
| 3391 + st1 {v0.8b - v3.8b}, [x9], 32 | |
| 3392 + st1 {v4.8b - v7.8b}, [x9], 32 | |
| 3393 + st1 {v8.8b - v11.8b}, [x9], 32 | |
| 3394 + st1 {v12.8b - v15.8b}, [x9], 32 | |
| 3395 + st1 {v16.8b - v19.8b}, [x9], 32 | |
| 3396 + ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32 | |
| 3397 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 | |
| 3398 + ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32 | |
| 3399 + mul v8.8h, v8.8h, v0.8h | |
| 3400 + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 | |
| 3401 + mul v9.8h, v9.8h, v1.8h | |
| 3402 + ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32 | |
| 3403 + mul v10.8h, v10.8h, v2.8h | |
| 3404 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 | |
| 3405 + mul v11.8h, v11.8h, v3.8h | |
| 3406 + ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32 | |
| 3407 + mul v12.8h, v12.8h, v0.8h | |
| 3408 + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 | |
| 3409 + mul v14.8h, v14.8h, v2.8h | |
| 3410 + mul v13.8h, v13.8h, v1.8h | |
| 3411 + ld1 {v0.4h}, [x23] /* load constants */ | |
| 3412 + mul v15.8h, v15.8h, v3.8h | |
| 3413 + | |
| 3414 + /* 1-D IDCT, pass 1 */ | |
| 3415 + sub v2.8h, v10.8h, v14.8h | |
| 3416 + add v14.8h, v10.8h, v14.8h | |
| 3417 + sub v1.8h, v11.8h, v13.8h | |
| 3418 + add v13.8h, v11.8h, v13.8h | |
| 3419 + sub v5.8h, v9.8h, v15.8h | |
| 3420 + add v15.8h, v9.8h, v15.8h | |
| 3421 + sqdmulh v4.8h, v2.8h, XFIX_1_414213562 | |
| 3422 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930 | |
| 3423 + add v3.8h, v1.8h, v1.8h | |
| 3424 + sub v1.8h, v5.8h, v1.8h | |
| 3425 + add v10.8h, v2.8h, v4.8h | |
| 3426 + sqdmulh v4.8h, v1.8h, XFIX_1_847759065 | |
| 3427 + sub v2.8h, v15.8h, v13.8h | |
| 3428 + add v3.8h, v3.8h, v6.8h | |
| 3429 + sqdmulh v6.8h, v2.8h, XFIX_1_414213562 | |
| 3430 + add v1.8h, v1.8h, v4.8h | |
| 3431 + sqdmulh v4.8h, v5.8h, XFIX_1_082392200 | |
| 3432 + sub v10.8h, v10.8h, v14.8h | |
| 3433 + add v2.8h, v2.8h, v6.8h | |
| 3434 + sub v6.8h, v8.8h, v12.8h | |
| 3435 + add v12.8h, v8.8h, v12.8h | |
| 3436 + add v9.8h, v5.8h, v4.8h | |
| 3437 + add v5.8h, v6.8h, v10.8h | |
| 3438 + sub v10.8h, v6.8h, v10.8h | |
| 3439 + add v6.8h, v15.8h, v13.8h | |
| 3440 + add v8.8h, v12.8h, v14.8h | |
| 3441 + sub v3.8h, v6.8h, v3.8h | |
| 3442 + sub v12.8h, v12.8h, v14.8h | |
| 3443 + sub v3.8h, v3.8h, v1.8h | |
| 3444 + sub v1.8h, v9.8h, v1.8h | |
| 3445 + add v2.8h, v3.8h, v2.8h | |
| 3446 + sub v15.8h, v8.8h, v6.8h | |
| 3447 + add v1.8h, v1.8h, v2.8h | |
| 3448 + add v8.8h, v8.8h, v6.8h | |
| 3449 + add v14.8h, v5.8h, v3.8h | |
| 3450 + sub v9.8h, v5.8h, v3.8h | |
| 3451 + sub v13.8h, v10.8h, v2.8h | |
| 3452 + add v10.8h, v10.8h, v2.8h | |
| 3453 + /* Transpose v8-v9 (v18 is the scratch copy for every trn1/trn2 pair) */ | |
| 3454 + mov v18.16b, v8.16b | |
| 3455 + trn1 v8.8h, v8.8h, v9.8h | |
| 3456 + trn2 v9.8h, v18.8h, v9.8h | |
| 3457 + sub v11.8h, v12.8h, v1.8h | |
| 3458 + /* Transpose v14-v15 */ | |
| 3459 + mov v18.16b, v14.16b | |
| 3460 + trn1 v14.8h, v14.8h, v15.8h | |
| 3461 + trn2 v15.8h, v18.8h, v15.8h | |
| 3462 + add v12.8h, v12.8h, v1.8h | |
| 3463 + /* Transpose v10-v11 */ | |
| 3464 + mov v18.16b, v10.16b | |
| 3465 + trn1 v10.8h, v10.8h, v11.8h | |
| 3466 + trn2 v11.8h, v18.8h, v11.8h | |
| 3467 + /* Transpose v12-v13 */ | |
| 3468 + mov v18.16b, v12.16b | |
| 3469 + trn1 v12.8h, v12.8h, v13.8h | |
| 3470 + trn2 v13.8h, v18.8h, v13.8h | |
| 3471 + /* Transpose v9-v11 */ | |
| 3472 + mov v18.16b, v9.16b | |
| 3473 + trn1 v9.4s, v9.4s, v11.4s | |
| 3474 + trn2 v11.4s, v18.4s, v11.4s | |
| 3475 + /* Transpose v12-v14 */ | |
| 3476 + mov v18.16b, v12.16b | |
| 3477 + trn1 v12.4s, v12.4s, v14.4s | |
| 3478 + trn2 v14.4s, v18.4s, v14.4s | |
| 3479 + /* Transpose v8-v10 */ | |
| 3480 + mov v18.16b, v8.16b | |
| 3481 + trn1 v8.4s, v8.4s, v10.4s | |
| 3482 + trn2 v10.4s, v18.4s, v10.4s | |
| 3483 + /* Transpose v13-v15 */ | |
| 3484 + mov v18.16b, v13.16b | |
| 3485 + trn1 v13.4s, v13.4s, v15.4s | |
| 3486 + trn2 v15.4s, v18.4s, v15.4s | |
| 3487 + /* swap v14 low half with v10 high half (x22 is the scratch) */ | |
| 3488 + umov x22, v14.d[0] | |
| 3489 + ins v14.2d[0], v10.2d[1] | |
| 3490 + ins v10.2d[1], x22 | |
| 3491 + /* swap v13 low half with v9 high half */ | |
| 3492 + | |
| 3493 + umov x22, v13.d[0] | |
| 3494 + ins v13.2d[0], v9.2d[1] | |
| 3495 + ins v9.2d[1], x22 | |
| 3496 + /* 1-D IDCT, pass 2 */ | |
| 3497 + sub v2.8h, v10.8h, v14.8h | |
| 3498 + /* swap v15 low half with v11 high half */ | |
| 3499 + umov x22, v15.d[0] | |
| 3500 + ins v15.2d[0], v11.2d[1] | |
| 3501 + ins v11.2d[1], x22 | |
| 3502 + add v14.8h, v10.8h, v14.8h | |
| 3503 + /* swap v12 low half with v8 high half */ | |
| 3504 + umov x22, v12.d[0] | |
| 3505 + ins v12.2d[0], v8.2d[1] | |
| 3506 + ins v8.2d[1], x22 | |
| 3507 + sub v1.8h, v11.8h, v13.8h | |
| 3508 + add v13.8h, v11.8h, v13.8h | |
| 3509 + sub v5.8h, v9.8h, v15.8h | |
| 3510 + add v15.8h, v9.8h, v15.8h | |
| 3511 + sqdmulh v4.8h, v2.8h, XFIX_1_414213562 | |
| 3512 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930 | |
| 3513 + add v3.8h, v1.8h, v1.8h | |
| 3514 + sub v1.8h, v5.8h, v1.8h | |
| 3515 + add v10.8h, v2.8h, v4.8h | |
| 3516 + sqdmulh v4.8h, v1.8h, XFIX_1_847759065 | |
| 3517 + sub v2.8h, v15.8h, v13.8h | |
| 3518 + add v3.8h, v3.8h, v6.8h | |
| 3519 + sqdmulh v6.8h, v2.8h, XFIX_1_414213562 | |
| 3520 + add v1.8h, v1.8h, v4.8h | |
| 3521 + sqdmulh v4.8h, v5.8h, XFIX_1_082392200 | |
| 3522 + sub v10.8h, v10.8h, v14.8h | |
| 3523 + add v2.8h, v2.8h, v6.8h | |
| 3524 + sub v6.8h, v8.8h, v12.8h | |
| 3525 + add v12.8h, v8.8h, v12.8h | |
| 3526 + add v9.8h, v5.8h, v4.8h | |
| 3527 + add v5.8h, v6.8h, v10.8h | |
| 3528 + sub v10.8h, v6.8h, v10.8h | |
| 3529 + add v6.8h, v15.8h, v13.8h | |
| 3530 + add v8.8h, v12.8h, v14.8h | |
| 3531 + sub v3.8h, v6.8h, v3.8h | |
| 3532 + sub v12.8h, v12.8h, v14.8h | |
| 3533 + sub v3.8h, v3.8h, v1.8h | |
| 3534 + sub v1.8h, v9.8h, v1.8h | |
| 3535 + add v2.8h, v3.8h, v2.8h | |
| 3536 + sub v15.8h, v8.8h, v6.8h | |
| 3537 + add v1.8h, v1.8h, v2.8h | |
| 3538 + add v8.8h, v8.8h, v6.8h | |
| 3539 + add v14.8h, v5.8h, v3.8h | |
| 3540 + sub v9.8h, v5.8h, v3.8h | |
| 3541 + sub v13.8h, v10.8h, v2.8h | |
| 3542 + add v10.8h, v10.8h, v2.8h | |
| 3543 + sub v11.8h, v12.8h, v1.8h | |
| 3544 + add v12.8h, v12.8h, v1.8h | |
| 3545 + /* Descale to 8-bit and range limit: saturating >>5 to signed bytes, then add 0x80 to make them unsigned */ | |
| 3546 + movi v0.16b, #0x80 | |
| 3547 + sqshrn v8.8b, v8.8h, #5 | |
| 3548 + sqshrn2 v8.16b, v9.8h, #5 | |
| 3549 + sqshrn v9.8b, v10.8h, #5 | |
| 3550 + sqshrn2 v9.16b, v11.8h, #5 | |
| 3551 + sqshrn v10.8b, v12.8h, #5 | |
| 3552 + sqshrn2 v10.16b, v13.8h, #5 | |
| 3553 + sqshrn v11.8b, v14.8h, #5 | |
| 3554 + sqshrn2 v11.16b, v15.8h, #5 | |
| 3555 + add v8.16b, v8.16b, v0.16b | |
| 3556 + add v9.16b, v9.16b, v0.16b | |
| 3557 + add v10.16b, v10.16b, v0.16b | |
| 3558 + add v11.16b, v11.16b, v0.16b | |
| 3559 + /* Transpose the final 8-bit samples */ | |
| 3560 + /* Transpose v8-v9 */ | |
| 3561 + mov v18.16b, v8.16b | |
| 3562 + trn1 v8.8h, v8.8h, v9.8h | |
| 3563 + trn2 v9.8h, v18.8h, v9.8h | |
| 3564 + /* Transpose v10-v11 */ | |
| 3565 + mov v18.16b, v10.16b | |
| 3566 + trn1 v10.8h, v10.8h, v11.8h | |
| 3567 + trn2 v11.8h, v18.8h, v11.8h | |
| 3568 + /* Transpose v8-v10 */ | |
| 3569 + mov v18.16b, v8.16b | |
| 3570 + trn1 v8.4s, v8.4s, v10.4s | |
| 3571 + trn2 v10.4s, v18.4s, v10.4s | |
| 3572 + /* Transpose v9-v11 */ | |
| 3573 + mov v18.16b, v9.16b | |
| 3574 + trn1 v9.4s, v9.4s, v11.4s | |
| 3575 + trn2 v11.4s, v18.4s, v11.4s | |
| 3576 + /* copy the high half of v8 into v17 */ | |
| 3577 + ins v17.2d[0], v8.2d[1] | |
| 3578 + /* Byte-transpose v8 low half against its former high half (now v17) */ | |
| 3579 + mov v18.16b, v8.16b | |
| 3580 + trn1 v8.8b, v8.8b, v17.8b | |
| 3581 + trn2 v17.8b, v18.8b, v17.8b | |
| 3582 + /* same for v9, using v19 for its high half */ | |
| 3583 + ins v19.2d[0], v9.2d[1] | |
| 3584 + mov v18.16b, v9.16b | |
| 3585 + trn1 v9.8b, v9.8b, v19.8b | |
| 3586 + trn2 v19.8b, v18.8b, v19.8b | |
| 3587 + /* Store results to the output buffer: each ldp fetches two row pointers, OUTPUT_COL is added, 8 bytes are stored */ | |
| 3588 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
| 3589 + add TMP1, TMP1, OUTPUT_COL | |
| 3590 + add TMP2, TMP2, OUTPUT_COL | |
| 3591 + st1 {v8.8b}, [TMP1] | |
| 3592 + st1 {v17.8b}, [TMP2] | |
| 3593 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
| 3594 + add TMP1, TMP1, OUTPUT_COL | |
| 3595 + add TMP2, TMP2, OUTPUT_COL | |
| 3596 + st1 {v9.8b}, [TMP1] | |
| 3597 + /* same for v10, using v7 for its high half */ | |
| 3598 + ins v7.2d[0], v10.2d[1] | |
| 3599 + mov v18.16b, v10.16b | |
| 3600 + trn1 v10.8b, v10.8b, v7.8b | |
| 3601 + trn2 v7.8b, v18.8b, v7.8b | |
| 3602 + st1 {v19.8b}, [TMP2] | |
| 3603 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
| 3604 + ldp TMP4, TMP5, [OUTPUT_BUF], 16 | |
| 3605 + add TMP1, TMP1, OUTPUT_COL | |
| 3606 + add TMP2, TMP2, OUTPUT_COL | |
| 3607 + add TMP4, TMP4, OUTPUT_COL | |
| 3608 + add TMP5, TMP5, OUTPUT_COL | |
| 3609 + st1 {v10.8b}, [TMP1] | |
| 3610 + /* same for v11, using v16 for its high half */ | |
| 3611 + ins v16.2d[0], v11.2d[1] | |
| 3612 + mov v18.16b, v11.16b | |
| 3613 + trn1 v11.8b, v11.8b, v16.8b | |
| 3614 + trn2 v16.8b, v18.8b, v16.8b | |
| 3615 + st1 {v7.8b}, [TMP2] | |
| 3616 + st1 {v11.8b}, [TMP4] | |
| 3617 + st1 {v16.8b}, [TMP5] | |
| 3618 + /* Restore: sp still addresses the save area; the post-indexed loads below release all 176 bytes */ | |
| 3619 + ldp x22, x23, [sp], 16 | |
| 3620 + ld1 {v0.8b - v3.8b}, [sp], 32 | |
| 3621 + ld1 {v4.8b - v7.8b}, [sp], 32 | |
| 3622 + ld1 {v8.8b - v11.8b}, [sp], 32 | |
| 3623 + ld1 {v12.8b - v15.8b}, [sp], 32 | |
| 3624 + ld1 {v16.8b - v19.8b}, [sp], 32 | |
| 3625 + ret /* plain return through x30; blr would overwrite the link register */ | |
| 3626 + | |
| 3627 + .unreq DCT_TABLE | |
| 3628 + .unreq COEF_BLOCK | |
| 3629 + .unreq OUTPUT_BUF | |
| 3630 + .unreq OUTPUT_COL | |
| 3631 + .unreq TMP1 | |
| 3632 + .unreq TMP2 | |
| 3633 + .unreq TMP3 | |
| 3634 + .unreq TMP4 | |
| 3635 + .unreq TMP5 | |
| 3636 + | |
| 3637 +/*****************************************************************************/ | |
| 3638 + | |
| 3639 +/* | |
| 3640 + * jsimd_idct_4x4_neon | |
| 3641 + * | |
| 3642 + * This function contains inverse-DCT code for getting reduced-size | |
| 3643 + * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations | |
| 3644 + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' | |
| 3645 + * function from jpeg-6b (jidctred.c). | |
| 3646 + * | |
| 3647 + * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which | |
| 3648 + * requires much less arithmetic operations and hence should be faster. | |
| 3649 + * The primary purpose of this particular NEON optimized function is | |
| 3650 + * bit exact compatibility with jpeg-6b. | |
| 3651 + * | |
| 3652 + * TODO: a bit better instructions scheduling can be achieved by expanding | |
| 3653 + * idct_helper/transpose_4x4 macros and reordering instructions, | |
| 3654 + * but readability will suffer somewhat. | |
| 3655 + */ | |
| 3656 + | |
| 3657 +#define CONST_BITS 13 | |
| 3658 + /* FIX(x) below is x scaled by 2^CONST_BITS and rounded to the nearest integer */ | |
| 3659 +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ | |
| 3660 +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ | |
| 3661 +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ | |
| 3662 +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ | |
| 3663 +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ | |
| 3664 +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ | |
| 3665 +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ | |
| 3666 +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ | |
| 3667 +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ | |
| 3668 +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ | |
| 3669 +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ | |
| 3670 +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ | |
| 3671 +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ | |
| 3672 +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ | |
| 3673 + /* Twelve 16-bit multipliers; jsimd_idct_4x4_neon loads them into v0-v2 (the 32-byte load also fills v3 with whatever follows the table) */ | |
| 3674 +.balign 16 | |
| 3675 +jsimd_idct_4x4_neon_consts: | |
| 3676 + .short FIX_1_847759065 /* v0.4h[0] */ | |
| 3677 + .short -FIX_0_765366865 /* v0.4h[1] */ | |
| 3678 + .short -FIX_0_211164243 /* v0.4h[2] */ | |
| 3679 + .short FIX_1_451774981 /* v0.4h[3] */ | |
| 3680 + .short -FIX_2_172734803 /* v1.4h[0] */ | |
| 3681 + .short FIX_1_061594337 /* v1.4h[1] */ | |
| 3682 + .short -FIX_0_509795579 /* v1.4h[2] */ | |
| 3683 + .short -FIX_0_601344887 /* v1.4h[3] */ | |
| 3684 + .short FIX_0_899976223 /* v2.4h[0] */ | |
| 3685 + .short FIX_2_562915447 /* v2.4h[1] */ | |
| 3686 + .short 1 << (CONST_BITS+1) /* v2.4h[2] */ | |
| 3687 + .short 0 /* v2.4h[3] */ | |
| 3688 + /* idct_helper (4x4 variant): one 1-D pass over four 16-bit lanes. Inputs \x4..\x16 are .4h operands, results go to \y26..\y29, descaled by \shift. Expects v0-v2 to hold jsimd_idct_4x4_neon_consts; clobbers v20, v24, v26, v28, v30. */ | |
| 3689 +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 | |
| 3690 + smull v28.4s, \x4, v2.4h[2] /* v28 = x4 * (1 << (CONST_BITS+1)) */ | |
| 3691 + smlal v28.4s, \x8, v0.4h[0] | |
| 3692 + smlal v28.4s, \x14, v0.4h[1] | |
| 3693 + | |
| 3694 + smull v26.4s, \x16, v1.4h[2] /* v26 = 4-term sum over x16, x12, x10, x6 */ | |
| 3695 + smlal v26.4s, \x12, v1.4h[3] | |
| 3696 + smlal v26.4s, \x10, v2.4h[0] | |
| 3697 + smlal v26.4s, \x6, v2.4h[1] | |
| 3698 + | |
| 3699 + smull v30.4s, \x4, v2.4h[2] /* v30 = same x4 term as v28, minus the x8/x14 terms */ | |
| 3700 + smlsl v30.4s, \x8, v0.4h[0] | |
| 3701 + smlsl v30.4s, \x14, v0.4h[1] | |
| 3702 + | |
| 3703 + smull v24.4s, \x16, v0.4h[2] /* v24 = second 4-term sum over x16, x12, x10, x6 */ | |
| 3704 + smlal v24.4s, \x12, v0.4h[3] | |
| 3705 + smlal v24.4s, \x10, v1.4h[0] | |
| 3706 + smlal v24.4s, \x6, v1.4h[1] | |
| 3707 + | |
| 3708 + add v20.4s, v28.4s, v26.4s /* y26 <- v28 + v26, y29 <- v28 - v26 */ | |
| 3709 + sub v28.4s, v28.4s, v26.4s | |
| 3710 + | |
| 3711 +.if \shift > 16 /* rshrn to .4h only encodes shifts up to 16, so larger shifts round-shift first and then narrow */ | |
| 3712 + srshr v20.4s, v20.4s, #\shift | |
| 3713 + srshr v28.4s, v28.4s, #\shift | |
| 3714 + xtn \y26, v20.4s | |
| 3715 + xtn \y29, v28.4s | |
| 3716 +.else | |
| 3717 + rshrn \y26, v20.4s, #\shift | |
| 3718 + rshrn \y29, v28.4s, #\shift | |
| 3719 +.endif | |
| 3720 + | |
| 3721 + add v20.4s, v30.4s, v24.4s /* y27 <- v30 + v24, y28 <- v30 - v24 */ | |
| 3722 + sub v30.4s, v30.4s, v24.4s | |
| 3723 + | |
| 3724 +.if \shift > 16 | |
| 3725 + srshr v20.4s, v20.4s, #\shift | |
| 3726 + srshr v30.4s, v30.4s, #\shift | |
| 3727 + xtn \y27, v20.4s | |
| 3728 + xtn \y28, v30.4s | |
| 3729 +.else | |
| 3730 + rshrn \y27, v20.4s, #\shift | |
| 3731 + rshrn \y28, v30.4s, #\shift | |
| 3732 +.endif | |
| 3733 + | |
| 3734 +.endm | |
| 3735 + | |
| 3736 +asm_function jsimd_idct_4x4_neon | |
| 3737 + | |
| 3738 + DCT_TABLE .req x0 | |
| 3739 + COEF_BLOCK .req x1 | |
| 3740 + OUTPUT_BUF .req x2 | |
| 3741 + OUTPUT_COL .req x3 | |
| 3742 + TMP1 .req x0 | |
| 3743 + TMP2 .req x1 | |
| 3744 + TMP3 .req x2 | |
| 3745 + TMP4 .req x15 | |
| 3746 + | |
| 3747 + /* Save x15 and the low 64 bits of all NEON registers; sp stays at the bottom of the save area so nothing live is ever below sp */ | |
| 3748 + sub sp, sp, 272 | |
| 3749 + mov x9, sp /* x9 = store cursor (caller-saved scratch) */ | |
| 3750 + str x15, [x9], 16 | |
| 3751 + adr TMP4, jsimd_idct_4x4_neon_consts /* Load constants (v3.4h is just used for padding) */ | |
| 3752 + st1 {v0.8b - v3.8b}, [x9], 32 | |
| 3753 + st1 {v4.8b - v7.8b}, [x9], 32 | |
| 3754 + st1 {v8.8b - v11.8b}, [x9], 32 | |
| 3755 + st1 {v12.8b - v15.8b}, [x9], 32 | |
| 3756 + st1 {v16.8b - v19.8b}, [x9], 32 | |
| 3757 + st1 {v20.8b - v23.8b}, [x9], 32 | |
| 3758 + st1 {v24.8b - v27.8b}, [x9], 32 | |
| 3759 + st1 {v28.8b - v31.8b}, [x9], 32 | |
| 3760 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] | |
| 3761 + | |
| 3762 + /* Load all COEF_BLOCK into NEON registers with the following allocation: | |
| 3763 + * 0 1 2 3 | 4 5 6 7 | |
| 3764 + * ---------+-------- | |
| 3765 + * 0 | v4.4h | v5.4h | |
| 3766 + * 1 | v6.4h | v7.4h | |
| 3767 + * 2 | v8.4h | v9.4h | |
| 3768 + * 3 | v10.4h | v11.4h | |
| 3769 + * 4 | - | - | |
| 3770 + * 5 | v12.4h | v13.4h | |
| 3771 + * 6 | v14.4h | v15.4h | |
| 3772 + * 7 | v16.4h | v17.4h | |
| 3773 + */ | |
| 3774 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 | |
| 3775 + ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 | |
| 3776 + add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 */ | |
| 3777 + ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 | |
| 3778 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 | |
| 3779 + /* dequantize */ | |
| 3780 + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 | |
| 3781 + mul v4.4h, v4.4h, v18.4h | |
| 3782 + mul v5.4h, v5.4h, v19.4h | |
| 3783 + ins v4.2d[1], v5.2d[0] /* 128 bit q4 */ | |
| 3784 + ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 | |
| 3785 + mul v6.4h, v6.4h, v20.4h | |
| 3786 + mul v7.4h, v7.4h, v21.4h | |
| 3787 + ins v6.2d[1], v7.2d[0] /* 128 bit q6 */ | |
| 3788 + mul v8.4h, v8.4h, v22.4h | |
| 3789 + mul v9.4h, v9.4h, v23.4h | |
| 3790 + ins v8.2d[1], v9.2d[0] /* 128 bit q8 */ | |
| 3791 + add DCT_TABLE, DCT_TABLE, #16 /* skip the row-4 multipliers */ | |
| 3792 + ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 | |
| 3793 + mul v10.4h, v10.4h, v24.4h | |
| 3794 + mul v11.4h, v11.4h, v25.4h | |
| 3795 + ins v10.2d[1], v11.2d[0] /* 128 bit q10 */ | |
| 3796 + mul v12.4h, v12.4h, v26.4h | |
| 3797 + mul v13.4h, v13.4h, v27.4h | |
| 3798 + ins v12.2d[1], v13.2d[0] /* 128 bit q12 */ | |
| 3799 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 | |
| 3800 + mul v14.4h, v14.4h, v28.4h | |
| 3801 + mul v15.4h, v15.4h, v29.4h | |
| 3802 + ins v14.2d[1], v15.2d[0] /* 128 bit q14 */ | |
| 3803 + mul v16.4h, v16.4h, v30.4h | |
| 3804 + mul v17.4h, v17.4h, v31.4h | |
| 3805 + ins v16.2d[1], v17.2d[0] /* 128 bit q16 */ | |
| 3806 + | |
| 3807 + /* Pass 1 */ | |
| 3808 + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4
.4h, v6.4h, v8.4h, v10.4h | |
| 3809 + transpose_4x4 v4, v6, v8, v10, v3 | |
| 3810 + ins v10.2d[1], v11.2d[0] | |
| 3811 + idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5
.4h, v7.4h, v9.4h, v11.4h | |
| 3812 + transpose_4x4 v5, v7, v9, v11, v3 | |
| 3813 + ins v10.2d[1], v11.2d[0] | |
| 3814 + /* Pass 2 */ | |
| 3815 + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.
4h, v27.4h, v28.4h, v29.4h | |
| 3816 + transpose_4x4 v26, v27, v28, v29, v3 | |
| 3817 + | |
| 3818 + /* Range limit: add 0x80 to each 16-bit lane, then saturate to unsigned bytes */ | |
| 3819 + movi v30.8h, #0x80 | |
| 3820 + ins v26.2d[1], v27.2d[0] | |
| 3821 + ins v28.2d[1], v29.2d[0] | |
| 3822 + add v26.8h, v26.8h, v30.8h | |
| 3823 + add v28.8h, v28.8h, v30.8h | |
| 3824 + sqxtun v26.8b, v26.8h | |
| 3825 + sqxtun v27.8b, v28.8h | |
| 3826 + | |
| 3827 + /* Store results to the output buffer */ | |
| 3828 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 | |
| 3829 + ldp TMP3, TMP4, [OUTPUT_BUF] | |
| 3830 + add TMP1, TMP1, OUTPUT_COL | |
| 3831 + add TMP2, TMP2, OUTPUT_COL | |
| 3832 + add TMP3, TMP3, OUTPUT_COL | |
| 3833 + add TMP4, TMP4, OUTPUT_COL | |
| 3834 + | |
| 3835 +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT | |
| 3836 + /* We can use much less instructions on little endian systems if the | |
| 3837 + * OS kernel is not configured to trap unaligned memory accesses | |
| 3838 + */ | |
| 3839 + st1 {v26.s}[0], [TMP1], 4 | |
| 3840 + st1 {v27.s}[0], [TMP3], 4 | |
| 3841 + st1 {v26.s}[1], [TMP2], 4 | |
| 3842 + st1 {v27.s}[1], [TMP4], 4 | |
| 3843 +#else | |
| 3844 + st1 {v26.b}[0], [TMP1], 1 | |
| 3845 + st1 {v27.b}[0], [TMP3], 1 | |
| 3846 + st1 {v26.b}[1], [TMP1], 1 | |
| 3847 + st1 {v27.b}[1], [TMP3], 1 | |
| 3848 + st1 {v26.b}[2], [TMP1], 1 | |
| 3849 + st1 {v27.b}[2], [TMP3], 1 | |
| 3850 + st1 {v26.b}[3], [TMP1], 1 | |
| 3851 + st1 {v27.b}[3], [TMP3], 1 | |
| 3852 + | |
| 3853 + st1 {v26.b}[4], [TMP2], 1 | |
| 3854 + st1 {v27.b}[4], [TMP4], 1 | |
| 3855 + st1 {v26.b}[5], [TMP2], 1 | |
| 3856 + st1 {v27.b}[5], [TMP4], 1 | |
| 3857 + st1 {v26.b}[6], [TMP2], 1 | |
| 3858 + st1 {v27.b}[6], [TMP4], 1 | |
| 3859 + st1 {v26.b}[7], [TMP2], 1 | |
| 3860 + st1 {v27.b}[7], [TMP4], 1 | |
| 3861 +#endif | |
| 3862 + | |
| 3863 + /* Restore the saved registers */ | |
| 3864 + /* sp still addresses the save area; the post-indexed loads below release all 272 bytes */ | |
| 3865 + ldr x15, [sp], 16 | |
| 3866 + ld1 {v0.8b - v3.8b}, [sp], 32 | |
| 3867 + ld1 {v4.8b - v7.8b}, [sp], 32 | |
| 3868 + ld1 {v8.8b - v11.8b}, [sp], 32 | |
| 3869 + ld1 {v12.8b - v15.8b}, [sp], 32 | |
| 3870 + ld1 {v16.8b - v19.8b}, [sp], 32 | |
| 3871 + ld1 {v20.8b - v23.8b}, [sp], 32 | |
| 3872 + ld1 {v24.8b - v27.8b}, [sp], 32 | |
| 3873 + ld1 {v28.8b - v31.8b}, [sp], 32 | |
| 3874 + ret /* plain return through x30; blr would overwrite the link register */ | |
| 3875 + | |
| 3876 + .unreq DCT_TABLE | |
| 3877 + .unreq COEF_BLOCK | |
| 3878 + .unreq OUTPUT_BUF | |
| 3879 + .unreq OUTPUT_COL | |
| 3880 + .unreq TMP1 | |
| 3881 + .unreq TMP2 | |
| 3882 + .unreq TMP3 | |
| 3883 + .unreq TMP4 | |
| 3884 + | |
| 3885 +.purgem idct_helper | |
| 3886 + | |
| 3887 + | |
| 3888 +/*****************************************************************************/ | |
| 3889 + | |
| 3890 +/* | |
| 3891 + * jsimd_idct_2x2_neon | |
| 3892 + * | |
| 3893 + * This function contains inverse-DCT code for getting reduced-size | |
| 3894 + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations | |
| 3895 + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' | |
| 3896 + * function from jpeg-6b (jidctred.c). | |
| 3897 + * | |
| 3898 + * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which | |
| 3899 + * requires much less arithmetic operations and hence should be faster. | |
| 3900 + * The primary purpose of this particular NEON optimized function is | |
| 3901 + * bit exact compatibility with jpeg-6b. | |
| 3902 + */ | |
| 3903 + | |
| 3904 +.balign 8 | |
| 3905 +jsimd_idct_2x2_neon_consts: | |
| 3906 + .short -FIX_0_720959822 /* v14.4h[0] */ | |
| 3907 + .short FIX_0_850430095 /* v14.4h[1] */ | |
| 3908 + .short -FIX_1_272758580 /* v14.4h[2] */ | |
| 3909 + .short FIX_3_624509785 /* v14.4h[3] */ | |
| 3910 + /* idct_helper (2x2 variant): one 1-D pass; \y26 and \y27 receive (x4 << 15) plus/minus a 4-term sum, descaled by \shift. Expects v14 to hold jsimd_idct_2x2_neon_consts; clobbers v15, v20, v26. */ | |
| 3911 +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 | |
| 3912 + sshll v15.4s, \x4, #15 /* v15 = x4 << 15, widened to 32 bits */ | |
| 3913 + smull v26.4s, \x6, v14.4h[3] /* v26 = x6*c3 + x10*c2 + x12*c1 + x16*c0 */ | |
| 3914 + smlal v26.4s, \x10, v14.4h[2] | |
| 3915 + smlal v26.4s, \x12, v14.4h[1] | |
| 3916 + smlal v26.4s, \x16, v14.4h[0] | |
| 3917 + | |
| 3918 + add v20.4s, v15.4s, v26.4s | |
| 3919 + sub v15.4s, v15.4s, v26.4s | |
| 3920 + | |
| 3921 +.if \shift > 16 /* rshrn to .4h only encodes shifts up to 16, so larger shifts round-shift first and then narrow */ | |
| 3922 + srshr v20.4s, v20.4s, #\shift | |
| 3923 + srshr v15.4s, v15.4s, #\shift | |
| 3924 + xtn \y26, v20.4s | |
| 3925 + xtn \y27, v15.4s | |
| 3926 +.else | |
| 3927 + rshrn \y26, v20.4s, #\shift | |
| 3928 + rshrn \y27, v15.4s, #\shift | |
| 3929 +.endif | |
| 3930 + | |
| 3931 +.endm | |
| 3932 + | |
| 3933 +asm_function jsimd_idct_2x2_neon | |
| 3934 + | |
| 3935 + DCT_TABLE .req x0 | |
| 3936 + COEF_BLOCK .req x1 | |
| 3937 + OUTPUT_BUF .req x2 | |
| 3938 + OUTPUT_COL .req x3 | |
| 3939 + TMP1 .req x0 | |
| 3940 + TMP2 .req x15 | |
| 3941 + | |
| 3942 + /* Save x15 and the low 64 bits of the NEON registers used; sp stays at the bottom of the save area so nothing live is ever below sp */ | |
| 3943 + sub sp, sp, 208 | |
| 3944 + mov x9, sp /* x9 = store cursor (caller-saved scratch) */ | |
| 3945 + str x15, [x9], 16 | |
| 3946 + /* Load constants */ | |
| 3947 + adr TMP2, jsimd_idct_2x2_neon_consts | |
| 3948 + st1 {v4.8b - v7.8b}, [x9], 32 | |
| 3949 + st1 {v8.8b - v11.8b}, [x9], 32 | |
| 3950 + st1 {v12.8b - v15.8b}, [x9], 32 | |
| 3951 + st1 {v16.8b - v19.8b}, [x9], 32 | |
| 3952 + st1 {v21.8b - v22.8b}, [x9], 16 | |
| 3953 + st1 {v24.8b - v27.8b}, [x9], 32 | |
| 3954 + st1 {v30.8b - v31.8b}, [x9], 16 | |
| 3955 + ld1 {v14.4h}, [TMP2] | |
| 3956 + | |
| 3957 + /* Load all COEF_BLOCK into NEON registers with the following allocation: | |
| 3958 + * 0 1 2 3 | 4 5 6 7 | |
| 3959 + * ---------+-------- | |
| 3960 + * 0 | v4.4h | v5.4h | |
| 3961 + * 1 | v6.4h | v7.4h | |
| 3962 + * 2 | - | - | |
| 3963 + * 3 | v10.4h | v11.4h | |
| 3964 + * 4 | - | - | |
| 3965 + * 5 | v12.4h | v13.4h | |
| 3966 + * 6 | - | - | |
| 3967 + * 7 | v16.4h | v17.4h | |
| 3968 + */ | |
| 3969 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 | |
| 3970 + add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 2 */ | |
| 3971 + ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 | |
| 3972 + add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 */ | |
| 3973 + ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 | |
| 3974 + add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 6 */ | |
| 3975 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 | |
| 3976 + /* Dequantize */ | |
| 3977 + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 | |
| 3978 + mul v4.4h, v4.4h, v18.4h | |
| 3979 + mul v5.4h, v5.4h, v19.4h | |
| 3980 + ins v4.2d[1], v5.2d[0] | |
| 3981 + mul v6.4h, v6.4h, v20.4h | |
| 3982 + mul v7.4h, v7.4h, v21.4h | |
| 3983 + ins v6.2d[1], v7.2d[0] | |
| 3984 + add DCT_TABLE, DCT_TABLE, #16 | |
| 3985 + ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 | |
| 3986 + mul v10.4h, v10.4h, v24.4h | |
| 3987 + mul v11.4h, v11.4h, v25.4h | |
| 3988 + ins v10.2d[1], v11.2d[0] | |
| 3989 + add DCT_TABLE, DCT_TABLE, #16 | |
| 3990 + ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 | |
| 3991 + mul v12.4h, v12.4h, v26.4h | |
| 3992 + mul v13.4h, v13.4h, v27.4h | |
| 3993 + ins v12.2d[1], v13.2d[0] | |
| 3994 + add DCT_TABLE, DCT_TABLE, #16 | |
| 3995 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 | |
| 3996 + mul v16.4h, v16.4h, v30.4h | |
| 3997 + mul v17.4h, v17.4h, v31.4h | |
| 3998 + ins v16.2d[1], v17.2d[0] | |
| 3999 + | |
| 4000 + /* Pass 1 (the #else branch is the idct_helper computation written out by hand for both column halves) */ | |
| 4001 +#if 0 | |
| 4002 + idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h | |
| 4003 + transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h | |
| 4004 + idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h | |
| 4005 + transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h | |
| 4006 +#else | |
| 4007 + smull v26.4s, v6.4h, v14.4h[3] | |
| 4008 + smlal v26.4s, v10.4h, v14.4h[2] | |
| 4009 + smlal v26.4s, v12.4h, v14.4h[1] | |
| 4010 + smlal v26.4s, v16.4h, v14.4h[0] | |
| 4011 + smull v24.4s, v7.4h, v14.4h[3] | |
| 4012 + smlal v24.4s, v11.4h, v14.4h[2] | |
| 4013 + smlal v24.4s, v13.4h, v14.4h[1] | |
| 4014 + smlal v24.4s, v17.4h, v14.4h[0] | |
| 4015 + sshll v15.4s, v4.4h, #15 | |
| 4016 + sshll v30.4s, v5.4h, #15 | |
| 4017 + add v20.4s, v15.4s, v26.4s | |
| 4018 + sub v15.4s, v15.4s, v26.4s | |
| 4019 + rshrn v4.4h, v20.4s, #13 | |
| 4020 + rshrn v6.4h, v15.4s, #13 | |
| 4021 + add v20.4s, v30.4s, v24.4s | |
| 4022 + sub v15.4s, v30.4s, v24.4s | |
| 4023 + rshrn v5.4h, v20.4s, #13 | |
| 4024 + rshrn v7.4h, v15.4s, #13 | |
| 4025 + ins v4.2d[1], v5.2d[0] | |
| 4026 + ins v6.2d[1], v7.2d[0] | |
| 4027 + transpose v4, v6, v3, .16b, .8h | |
| 4028 + transpose v6, v10, v3, .16b, .4s | |
| 4029 + ins v11.2d[0], v10.2d[1] | |
| 4030 + ins v7.2d[0], v6.2d[1] | |
| 4031 +#endif | |
| 4032 + | |
| 4033 + /* Pass 2 */ | |
| 4034 + idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h | |
| 4035 + | |
| 4036 + /* Range limit: add 0x80 to each 16-bit lane, then saturate to unsigned bytes */ | |
| 4037 + movi v30.8h, #0x80 | |
| 4038 + ins v26.2d[1], v27.2d[0] | |
| 4039 + add v26.8h, v26.8h, v30.8h | |
| 4040 + sqxtun v30.8b, v26.8h | |
| 4041 + ins v26.2d[0], v30.2d[0] /* v26 bytes 0-7 now hold the narrowed samples */ | |
| 4042 + sqxtun v27.8b, v26.8h /* only bytes 4-5 of v27 are stored below */ | |
| 4043 + | |
| 4044 + /* Store results to the output buffer: two bytes to each of two rows */ | |
| 4045 + ldp TMP1, TMP2, [OUTPUT_BUF] | |
| 4046 + add TMP1, TMP1, OUTPUT_COL | |
| 4047 + add TMP2, TMP2, OUTPUT_COL | |
| 4048 + | |
| 4049 + st1 {v26.b}[0], [TMP1], 1 | |
| 4050 + st1 {v27.b}[4], [TMP1], 1 | |
| 4051 + st1 {v26.b}[1], [TMP2], 1 | |
| 4052 + st1 {v27.b}[5], [TMP2], 1 | |
| 4053 + | |
| 4054 + /* Restore: sp still addresses the save area; the post-indexed loads below release all 208 bytes */ | |
| 4055 + ldr x15, [sp], 16 | |
| 4056 + ld1 {v4.8b - v7.8b}, [sp], 32 | |
| 4057 + ld1 {v8.8b - v11.8b}, [sp], 32 | |
| 4058 + ld1 {v12.8b - v15.8b}, [sp], 32 | |
| 4059 + ld1 {v16.8b - v19.8b}, [sp], 32 | |
| 4060 + ld1 {v21.8b - v22.8b}, [sp], 16 | |
| 4061 + ld1 {v24.8b - v27.8b}, [sp], 32 | |
| 4062 + ld1 {v30.8b - v31.8b}, [sp], 16 | |
| 4063 + ret /* plain return through x30; blr would overwrite the link register */ | |
| 4064 + | |
| 4065 + .unreq DCT_TABLE | |
| 4066 + .unreq COEF_BLOCK | |
| 4067 + .unreq OUTPUT_BUF | |
| 4068 + .unreq OUTPUT_COL | |
| 4069 + .unreq TMP1 | |
| 4070 + .unreq TMP2 | |
| 4071 + | |
| 4072 +.purgem idct_helper | |
| 4073 + | |
| 4074 + | |
| 4075 +/*****************************************************************************/ | |
| 4076 + | |
| 4077 +/* | |
| 4078 + * jsimd_ycc_extrgb_convert_neon | |
| 4079 + * jsimd_ycc_extbgr_convert_neon | |
| 4080 + * jsimd_ycc_extrgbx_convert_neon | |
| 4081 + * jsimd_ycc_extbgrx_convert_neon | |
| 4082 + * jsimd_ycc_extxbgr_convert_neon | |
| 4083 + * jsimd_ycc_extxrgb_convert_neon | |
| 4084 + * | |
| 4085 + * Colorspace conversion YCbCr -> RGB | |
| 4086 + */ | |
| 4087 + | |
| 4088 + /* do_load size: read \size bytes from each of the U, V and Y pointers into v4, v5 and v0, advancing the pointers. Sizes 4, 2 and 1 fill byte lanes 0-3, 4-5 and 6 respectively. */ | |
| 4089 +.macro do_load size | |
| 4090 + .if \size == 8 | |
| 4091 + ld1 {v4.8b}, [U], 8 | |
| 4092 + ld1 {v5.8b}, [V], 8 | |
| 4093 + ld1 {v0.8b}, [Y], 8 | |
| 4094 + prfm PLDL1KEEP, [U, #64] /* prefetch 64 bytes ahead on each plane */ | |
| 4095 + prfm PLDL1KEEP, [V, #64] | |
| 4096 + prfm PLDL1KEEP, [Y, #64] | |
| 4097 + .elseif \size == 4 | |
| 4098 + ld1 {v4.b}[0], [U], 1 | |
| 4099 + ld1 {v4.b}[1], [U], 1 | |
| 4100 + ld1 {v4.b}[2], [U], 1 | |
| 4101 + ld1 {v4.b}[3], [U], 1 | |
| 4102 + ld1 {v5.b}[0], [V], 1 | |
| 4103 + ld1 {v5.b}[1], [V], 1 | |
| 4104 + ld1 {v5.b}[2], [V], 1 | |
| 4105 + ld1 {v5.b}[3], [V], 1 | |
| 4106 + ld1 {v0.b}[0], [Y], 1 | |
| 4107 + ld1 {v0.b}[1], [Y], 1 | |
| 4108 + ld1 {v0.b}[2], [Y], 1 | |
| 4109 + ld1 {v0.b}[3], [Y], 1 | |
| 4110 + .elseif \size == 2 | |
| 4111 + ld1 {v4.b}[4], [U], 1 | |
| 4112 + ld1 {v4.b}[5], [U], 1 | |
| 4113 + ld1 {v5.b}[4], [V], 1 | |
| 4114 + ld1 {v5.b}[5], [V], 1 | |
| 4115 + ld1 {v0.b}[4], [Y], 1 | |
| 4116 + ld1 {v0.b}[5], [Y], 1 | |
| 4117 + .elseif \size == 1 | |
| 4118 + ld1 {v4.b}[6], [U], 1 | |
| 4119 + ld1 {v5.b}[6], [V], 1 | |
| 4120 + ld1 {v0.b}[6], [Y], 1 | |
| 4121 + .else | |
| 4122 + .error unsupported macroblock size | |
| 4123 + .endif | |
| 4124 +.endm | |
| 4125 + /* do_store bpp, size: write \size pixels to the RGB pointer, advancing it. 24/32 bpp interleave v10-v12 / v10-v13 bytes; 16 bpp writes halfwords of v25. Sizes 4, 2 and 1 use lanes 0-3, 4-5 and 6 respectively. */ | |
| 4126 +.macro do_store bpp, size | |
| 4127 + .if \bpp == 24 | |
| 4128 + .if \size == 8 | |
| 4129 + st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 | |
| 4130 + .elseif \size == 4 | |
| 4131 + st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 | |
| 4132 + st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 | |
| 4133 + st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 | |
| 4134 + st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 | |
| 4135 + .elseif \size == 2 | |
| 4136 + st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 | |
| 4137 + st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 | |
| 4138 + .elseif \size == 1 | |
| 4139 + st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 | |
| 4140 + .else | |
| 4141 + .error unsupported macroblock size | |
| 4142 + .endif | |
| 4143 + .elseif \bpp == 32 | |
| 4144 + .if \size == 8 | |
| 4145 + st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 | |
| 4146 + .elseif \size == 4 | |
| 4147 + st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 | |
| 4148 + st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 | |
| 4149 + st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 | |
| 4150 + st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 | |
| 4151 + .elseif \size == 2 | |
| 4152 + st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 | |
| 4153 + st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 | |
| 4154 + .elseif \size == 1 | |
| 4155 + st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 | |
| 4156 + .else | |
| 4157 + .error unsupported macroblock size | |
| 4158 + .endif | |
| 4159 + .elseif \bpp==16 | |
| 4160 + .if \size == 8 | |
| 4161 + st1 {v25.8h}, [RGB],16 | |
| 4162 + .elseif \size == 4 | |
| 4163 + st1 {v25.4h}, [RGB],8 | |
| 4164 + .elseif \size == 2 | |
| 4165 + st1 {v25.h}[4], [RGB],2 | |
| 4166 + st1 {v25.h}[5], [RGB],2 | |
| 4167 + .elseif \size == 1 | |
| 4168 + st1 {v25.h}[6], [RGB],2 | |
| 4169 + .else | |
| 4170 + .error unsupported macroblock size | |
| 4171 + .endif | |
| 4172 + .else | |
| 4173 + .error unsupported bpp | |
| 4174 + .endif | |
| 4175 +.endm | |
| 4176 + | |
| 4177 +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs,
gsize, b_offs, bsize, defsize | |
| 4178 + | |
| 4179 +/* | |
| 4180 + * 2-stage pipelined YCbCr->RGB conversion. Expands to a constant table plus the function jsimd_ycc_<colorid>_convert_neon. bpp selects the packing (16 = RGB565 built in v25, otherwise one byte per channel); r_offs/g_offs/b_offs are the digit appended to "v1" to name each channel's output register and defsize is its arrangement suffix. rsize/gsize/bsize are not referenced in this macro. | |
| 4181 + */ | |
| 4182 + | |
| 4183 +.macro do_yuv_to_rgb_stage1 | |
| 4184 + uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ | |
| 4185 + uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ | |
| 4186 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | |
| 4187 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | |
| 4188 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | |
| 4189 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | |
| 4190 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | |
| 4191 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | |
| 4192 + smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ | |
| 4193 + smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ | |
| 4194 +.endm | |
| 4195 + | |
| 4196 +.macro do_yuv_to_rgb_stage2 | |
| 4197 + rshrn v20.4h, v20.4s, #15 /* green term: rounding shift by 15 */ | |
| 4198 + rshrn2 v20.8h, v22.4s, #15 | |
| 4199 + rshrn v24.4h, v24.4s, #14 /* red term: rounding shift by 14 */ | |
| 4200 + rshrn2 v24.8h, v26.4s, #14 | |
| 4201 + rshrn v28.4h, v28.4s, #14 /* blue term: rounding shift by 14 */ | |
| 4202 + rshrn2 v28.8h, v30.4s, #14 | |
| 4203 + uaddw v20.8h, v20.8h, v0.8b /* add the luma bytes held in v0 */ | |
| 4204 + uaddw v24.8h, v24.8h, v0.8b | |
| 4205 + uaddw v28.8h, v28.8h, v0.8b | |
| 4206 +.if \bpp != 16 | |
| 4207 + sqxtun v1\g_offs\defsize, v20.8h | |
| 4208 + sqxtun v1\r_offs\defsize, v24.8h | |
| 4209 + sqxtun v1\b_offs\defsize, v28.8h | |
| 4210 +.else | |
| 4211 + sqshlu v21.8h, v20.8h, #8 /* saturate to 8 bits, left in the top byte of each halfword */ | |
| 4212 + sqshlu v25.8h, v24.8h, #8 | |
| 4213 + sqshlu v29.8h, v28.8h, #8 | |
| 4214 + sri v25.8h, v21.8h, #5 /* insert green below the top 5 (red) bits */ | |
| 4215 + sri v25.8h, v29.8h, #11 /* insert blue below the top 11 bits */ | |
| 4216 +.endif | |
| 4217 + | |
| 4218 +.endm | |
| 4219 + | |
| 4220 +.macro do_yuv_to_rgb_stage2_store_load_stage1 | |
| 4221 + rshrn v20.4h, v20.4s, #15 | |
| 4222 + rshrn v24.4h, v24.4s, #14 | |
| 4223 + rshrn v28.4h, v28.4s, #14 | |
| 4224 + ld1 {v4.8b}, [U], 8 | |
| 4225 + rshrn2 v20.8h, v22.4s, #15 | |
| 4226 + rshrn2 v24.8h, v26.4s, #14 | |
| 4227 + rshrn2 v28.8h, v30.4s, #14 | |
| 4228 + ld1 {v5.8b}, [V], 8 | |
| 4229 + uaddw v20.8h, v20.8h, v0.8b | |
| 4230 + uaddw v24.8h, v24.8h, v0.8b | |
| 4231 + uaddw v28.8h, v28.8h, v0.8b | |
| 4232 +.if \bpp != 16 /**************** rgb24/rgb32 *********************************/ | |
| 4233 + sqxtun v1\g_offs\defsize, v20.8h | |
| 4234 + ld1 {v0.8b}, [Y], 8 | |
| 4235 + sqxtun v1\r_offs\defsize, v24.8h | |
| 4236 + prfm PLDL1KEEP, [U, #64] | |
| 4237 + prfm PLDL1KEEP, [V, #64] | |
| 4238 + prfm PLDL1KEEP, [Y, #64] | |
| 4239 + sqxtun v1\b_offs\defsize, v28.8h | |
| 4240 + uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ | |
| 4241 + uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ | |
| 4242 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | |
| 4243 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | |
| 4244 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | |
| 4245 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | |
| 4246 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | |
| 4247 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | |
| 4248 +.else /**************************** rgb565 ***********************************/ | |
| 4249 + sqshlu v21.8h, v20.8h, #8 | |
| 4250 + sqshlu v25.8h, v24.8h, #8 | |
| 4251 + sqshlu v29.8h, v28.8h, #8 | |
| 4252 + uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */ | |
| 4253 + uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */ | |
| 4254 + ld1 {v0.8b}, [Y], 8 | |
| 4255 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | |
| 4256 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | |
| 4257 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | |
| 4258 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | |
| 4259 + sri v25.8h, v21.8h, #5 | |
| 4260 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | |
| 4261 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | |
| 4262 + prfm PLDL1KEEP, [U, #64] | |
| 4263 + prfm PLDL1KEEP, [V, #64] | |
| 4264 + prfm PLDL1KEEP, [Y, #64] | |
| 4265 + sri v25.8h, v29.8h, #11 | |
| 4266 +.endif | |
| 4267 + do_store \bpp, 8 | |
| 4268 + smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ | |
| 4269 + smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ | |
| 4270 +.endm | |
| 4271 + | |
| 4272 +.macro do_yuv_to_rgb | |
| 4273 + do_yuv_to_rgb_stage1 | |
| 4274 + do_yuv_to_rgb_stage2 | |
| 4275 +.endm | |
| 4276 + | |
| 4277 +/* Apple gas crashes on adrl, work around that by using adr. | |
| 4278 + * But this requires a copy of these constants for each function. | |
| 4279 + */ | |
| 4280 + | |
| 4281 +.balign 16 | |
| 4282 +jsimd_ycc_\colorid\()_neon_consts: | |
| 4283 + .short 0, 0, 0, 0 /* padding, loaded into v0 */ | |
| 4284 + .short 22971, -11277, -23401, 29033 /* v1.4h[0..3]: v->red, u->green, v->green, u->blue multipliers */ | |
| 4285 + .short -128, -128, -128, -128 /* v2.8h: chroma bias, low half */ | |
| 4286 + .short -128, -128, -128, -128 /* v2.8h: chroma bias, high half */ | |
| 4287 + | |
| 4288 +asm_function jsimd_ycc_\colorid\()_convert_neon | |
| 4289 + OUTPUT_WIDTH .req x0 /* NOTE(review): x0, x2 and x4 are used as full 64-bit values; if the C prototype passes 32-bit integers their upper halves are unspecified - confirm against the caller */ | |
| 4290 + INPUT_BUF .req x1 | |
| 4291 + INPUT_ROW .req x2 | |
| 4292 + OUTPUT_BUF .req x3 | |
| 4293 + NUM_ROWS .req x4 | |
| 4294 + | |
| 4295 + INPUT_BUF0 .req x5 | |
| 4296 + INPUT_BUF1 .req x6 | |
| 4297 + INPUT_BUF2 .req INPUT_BUF | |
| 4298 + | |
| 4299 + RGB .req x7 | |
| 4300 + Y .req x8 | |
| 4301 + U .req x9 | |
| 4302 + V .req x10 | |
| 4303 + N .req x15 | |
| 4304 + | |
| 4305 + sub sp, sp, 336 /* reserve the whole save area; sp stays here until the epilogue so no saved data ever lies below sp */ | |
| 4306 + mov x16, sp /* x16 walks the save area (it is only reused as scratch after the saves are done) */ | |
| 4307 + str x15, [x16], 16 | |
| 4308 + adr x15, jsimd_ycc_\colorid\()_neon_consts | |
| 4309 + /* Save NEON registers */ | |
| 4310 + st1 {v0.8b - v3.8b}, [x16], 32 | |
| 4311 + st1 {v4.8b - v7.8b}, [x16], 32 | |
| 4312 + st1 {v8.8b - v11.8b}, [x16], 32 | |
| 4313 + st1 {v12.8b - v15.8b}, [x16], 32 | |
| 4314 + st1 {v16.8b - v19.8b}, [x16], 32 | |
| 4315 + st1 {v20.8b - v23.8b}, [x16], 32 | |
| 4316 + st1 {v24.8b - v27.8b}, [x16], 32 | |
| 4317 + st1 {v28.8b - v31.8b}, [x16], 32 | |
| 4318 + ld1 {v0.4h, v1.4h}, [x15], 16 /* v0 = padding row, v1 = the four multipliers */ | |
| 4319 + ld1 {v2.8h}, [x15] /* v2 = -128 in every halfword lane */ | |
| 4320 + | |
| 4321 + /* Save ARM registers and handle input arguments */ | |
| 4322 + /* push {x4, x5, x6, x7, x8, x9, x10, x30} */ | |
| 4323 + stp x4, x5, [x16], 16 | |
| 4324 + stp x6, x7, [x16], 16 | |
| 4325 + stp x8, x9, [x16], 16 | |
| 4326 + stp x10, x30, [x16], 16 | |
| 4327 + ldr INPUT_BUF0, [INPUT_BUF] | |
| 4328 + ldr INPUT_BUF1, [INPUT_BUF, 8] | |
| 4329 + ldr INPUT_BUF2, [INPUT_BUF, 16] | |
| 4330 + .unreq INPUT_BUF | |
| 4331 + | |
| 4332 + /* Preset v10 and v13 to 0xFF; for 32-bit pixels the one that is not overwritten as a colour channel supplies the filler byte */ | |
| 4333 + movi v10.16b, #255 | |
| 4334 + movi v13.16b, #255 | |
| 4335 + | |
| 4336 + /* Outer loop over scanlines */ | |
| 4337 + cmp NUM_ROWS, #1 | |
| 4338 + blt 9f | |
| 4339 +0: | |
| 4340 + lsl x16, INPUT_ROW, #3 /* byte offset of this row's pointer (8 bytes per entry) */ | |
| 4341 + ldr Y, [INPUT_BUF0, x16] | |
| 4342 + ldr U, [INPUT_BUF1, x16] | |
| 4343 + mov N, OUTPUT_WIDTH | |
| 4344 + ldr V, [INPUT_BUF2, x16] | |
| 4345 + add INPUT_ROW, INPUT_ROW, #1 | |
| 4346 + ldr RGB, [OUTPUT_BUF], #8 | |
| 4347 + | |
| 4348 + /* Inner loop over pixels */ | |
| 4349 + subs N, N, #8 | |
| 4350 + blt 3f | |
| 4351 + do_load 8 | |
| 4352 + do_yuv_to_rgb_stage1 | |
| 4353 + subs N, N, #8 | |
| 4354 + blt 2f | |
| 4355 +1: | |
| 4356 + do_yuv_to_rgb_stage2_store_load_stage1 | |
| 4357 + subs N, N, #8 | |
| 4358 + bge 1b | |
| 4359 +2: | |
| 4360 + do_yuv_to_rgb_stage2 | |
| 4361 + do_store \bpp, 8 | |
| 4362 + tst N, #7 /* the low three bits of N still equal the number of leftover pixels */ | |
| 4363 + beq 8f | |
| 4364 +3: | |
| 4365 + tst N, #4 | |
| 4366 + beq 3f | |
| 4367 + do_load 4 | |
| 4368 +3: | |
| 4369 + tst N, #2 | |
| 4370 + beq 4f | |
| 4371 + do_load 2 | |
| 4372 +4: | |
| 4373 + tst N, #1 | |
| 4374 + beq 5f | |
| 4375 + do_load 1 | |
| 4376 +5: | |
| 4377 + do_yuv_to_rgb | |
| 4378 + tst N, #4 | |
| 4379 + beq 6f | |
| 4380 + do_store \bpp, 4 | |
| 4381 +6: | |
| 4382 + tst N, #2 | |
| 4383 + beq 7f | |
| 4384 + do_store \bpp, 2 | |
| 4385 +7: | |
| 4386 + tst N, #1 | |
| 4387 + beq 8f | |
| 4388 + do_store \bpp, 1 | |
| 4389 +8: | |
| 4390 + subs NUM_ROWS, NUM_ROWS, #1 | |
| 4391 + bgt 0b | |
| 4392 +9: | |
| 4393 + /* Restore all registers and return */ | |
| 4394 + mov x16, sp /* sp still points at the bottom of the save area */ | |
| 4395 + ldr x15, [x16], 16 | |
| 4396 + ld1 {v0.8b - v3.8b}, [x16], 32 | |
| 4397 + ld1 {v4.8b - v7.8b}, [x16], 32 | |
| 4398 + ld1 {v8.8b - v11.8b}, [x16], 32 | |
| 4399 + ld1 {v12.8b - v15.8b}, [x16], 32 | |
| 4400 + ld1 {v16.8b - v19.8b}, [x16], 32 | |
| 4401 + ld1 {v20.8b - v23.8b}, [x16], 32 | |
| 4402 + ld1 {v24.8b - v27.8b}, [x16], 32 | |
| 4403 + ld1 {v28.8b - v31.8b}, [x16], 32 | |
| 4404 + ldp x4, x5, [x16], 16 /* pop {x4, x5, x6, x7, x8, x9, x10, x30} */ | |
| 4405 + ldp x6, x7, [x16], 16 | |
| 4406 + ldp x8, x9, [x16], 16 | |
| 4407 + ldp x10, x30, [x16], 16 | |
| 4408 + add sp, sp, 336 /* release the save area only after everything has been reloaded */ | |
| 4409 + br x30 | |
| 4410 + .unreq OUTPUT_WIDTH | |
| 4411 + .unreq INPUT_ROW | |
| 4412 + .unreq OUTPUT_BUF | |
| 4413 + .unreq NUM_ROWS | |
| 4414 + .unreq INPUT_BUF0 | |
| 4415 + .unreq INPUT_BUF1 | |
| 4416 + .unreq INPUT_BUF2 | |
| 4417 + .unreq RGB | |
| 4418 + .unreq Y | |
| 4419 + .unreq U | |
| 4420 + .unreq V | |
| 4421 + .unreq N | |
| 4422 + | |
| 4423 +.purgem do_yuv_to_rgb | |
| 4424 +.purgem do_yuv_to_rgb_stage1 | |
| 4425 +.purgem do_yuv_to_rgb_stage2 | |
| 4426 +.purgem do_yuv_to_rgb_stage2_store_load_stage1 | |
| 4427 +.endm | |
| 4428 + | |
| 4429 +/*--------------------------------- id ----- bpp R rsize G gsize B bsize
defsize; R/G/B give the digit N of the register v1N that receives that channel, and the bpp 16 path does not use them */ | |
| 4430 +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h,
.8b | |
| 4431 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h,
.8b | |
| 4432 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h,
.8b | |
| 4433 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h,
.8b | |
| 4434 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h,
.8b | |
| 4435 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h,
.8b | |
| 4436 +generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h,
.8b | |
| 4437 +.purgem do_load /* drop the load helper once every variant above has been expanded */ | |
| 4438 +.purgem do_store /* likewise for the store helper */ | |
| OLD | NEW |