Index: simd/jquanti-altivec.c
diff --git a/simd/jquanti-altivec.c b/simd/jquanti-altivec.c
new file mode 100644
index 0000000000000000000000000000000000000000..b3adab9ac2984d0b4c37ebe95646e83db7e73e19
--- /dev/null
+++ b/simd/jquanti-altivec.c
@@ -0,0 +1,293 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
+ * always get the data we want by using a single vector load (although we may
+ * have to permute the result).
+ */
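+/* (vec_ld() ignores the low four address bits, so a load from an address
+ * that is offset by 8 bytes still fetches the 16 bytes containing our
+ * samples; vec_lvsl() builds the permute map that rotates them to the
+ * front.)
+ */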
+#if __BIG_ENDIAN__
+
+#define LOAD_ROW(row) {  \
+  elemptr = sample_data[row] + start_col;  \
+  in##row = vec_ld(0, elemptr);  \
+  if ((size_t)elemptr & 15)  \
+    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr));  \
+}
+
+#else
+
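+/* VSX loads are not constrained to 16-byte boundaries, so the unaligned
+ * case needs no permute here. */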
+#define LOAD_ROW(row) {  \
+  elemptr = sample_data[row] + start_col;  \
+  in##row = vec_vsx_ld(0, elemptr);  \
+}
+
+#endif
+
+
+void
+jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
+                        DCTELEM *workspace)
+{
+  JSAMPROW elemptr;
+
+  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
+  __vector short out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
+  __vector unsigned char pb_zero = { __16X(0) };
+
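+  /* Fetch the 8x8 sample block, one 8-sample row per vector register. */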
+  LOAD_ROW(0);
+  LOAD_ROW(1);
+  LOAD_ROW(2);
+  LOAD_ROW(3);
+  LOAD_ROW(4);
+  LOAD_ROW(5);
+  LOAD_ROW(6);
+  LOAD_ROW(7);
+
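+  /* Zero-extend the first eight samples of each row from 8-bit to 16-bit.
+   * (VEC_UNPACKHU() relies on the pb_zero constant declared above.)
+   */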
+  out0 = (__vector short)VEC_UNPACKHU(in0);
+  out1 = (__vector short)VEC_UNPACKHU(in1);
+  out2 = (__vector short)VEC_UNPACKHU(in2);
+  out3 = (__vector short)VEC_UNPACKHU(in3);
+  out4 = (__vector short)VEC_UNPACKHU(in4);
+  out5 = (__vector short)VEC_UNPACKHU(in5);
+  out6 = (__vector short)VEC_UNPACKHU(in6);
+  out7 = (__vector short)VEC_UNPACKHU(in7);
+
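+  /* Level-shift the samples from [0, 255] to [-128, 127]; the forward DCT
+   * expects inputs centered on zero.
+   */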
+  out0 = vec_sub(out0, pw_centerjsamp);
+  out1 = vec_sub(out1, pw_centerjsamp);
+  out2 = vec_sub(out2, pw_centerjsamp);
+  out3 = vec_sub(out3, pw_centerjsamp);
+  out4 = vec_sub(out4, pw_centerjsamp);
+  out5 = vec_sub(out5, pw_centerjsamp);
+  out6 = vec_sub(out6, pw_centerjsamp);
+  out7 = vec_sub(out7, pw_centerjsamp);
+
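+  /* workspace is expected to be 16-byte-aligned (the library allocates its
+   * DCT workspaces with SIMD alignment), so aligned stores are safe here.
+   */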
+  vec_st(out0, 0, workspace);
+  vec_st(out1, 16, workspace);
+  vec_st(out2, 32, workspace);
+  vec_st(out3, 48, workspace);
+  vec_st(out4, 64, workspace);
+  vec_st(out5, 80, workspace);
+  vec_st(out6, 96, workspace);
+  vec_st(out7, 112, workspace);
+}
+
+
+#define WORD_BIT 16
+
+/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
+   We basically need an unsigned equivalent of vec_madds(). */
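+/* vec_mule() and vec_mulo() yield the full 32-bit products of the even-
+   and odd-numbered halfwords, and the vec_perm() below gathers the high
+   16 bits of each product back into element order, so effectively
+   out[i] = (uint16_t)(((uint32_t)vs0[i] * (uint32_t)vs1[i]) >> 16). */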
+
+#define MULTIPLY(vs0, vs1, out) {  \
+  tmpe = vec_mule((__vector unsigned short)vs0,  \
+                  (__vector unsigned short)vs1);  \
+  tmpo = vec_mulo((__vector unsigned short)vs0,  \
+                  (__vector unsigned short)vs1);  \
+  out = (__vector short)vec_perm((__vector unsigned short)tmpe,  \
+                                 (__vector unsigned short)tmpo,  \
+                                 shift_pack_index);  \
+}
+
+void
+jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
+                        DCTELEM *workspace)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
+    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
+    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
+    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
+  __vector unsigned int tmpe, tmpo;
+
+  /* Constants */
+  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
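+  /* vec_perm() byte indices 0-15 select from the first operand (the even
+   * products) and 16-31 from the second (the odd products).  The high
+   * halfword of each 32-bit product sits at byte offset 0 on big-endian
+   * machines and at byte offset 2 on little-endian ones, hence the two
+   * index tables. */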
+#if __BIG_ENDIAN__
+  __vector unsigned char shift_pack_index =
+    {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
+#else
+  __vector unsigned char shift_pack_index =
+    {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
+#endif
+
+  row0 = vec_ld(0, workspace);
+  row1 = vec_ld(16, workspace);
+  row2 = vec_ld(32, workspace);
+  row3 = vec_ld(48, workspace);
+  row4 = vec_ld(64, workspace);
+  row5 = vec_ld(80, workspace);
+  row6 = vec_ld(96, workspace);
+  row7 = vec_ld(112, workspace);
+
+  /* Branch-less absolute value */
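+  /* rowNs = rowN >> (WORD_BIT - 1) is 0 for non-negative elements and -1
+   * for negative ones, so (rowN ^ rowNs) - rowNs == |rowN|.  The sign
+   * masks are kept so that the signs can be restored after quantization.
+   */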
+  row0s = vec_sra(row0, pw_word_bit_m1);
+  row1s = vec_sra(row1, pw_word_bit_m1);
+  row2s = vec_sra(row2, pw_word_bit_m1);
+  row3s = vec_sra(row3, pw_word_bit_m1);
+  row4s = vec_sra(row4, pw_word_bit_m1);
+  row5s = vec_sra(row5, pw_word_bit_m1);
+  row6s = vec_sra(row6, pw_word_bit_m1);
+  row7s = vec_sra(row7, pw_word_bit_m1);
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+
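+  /* The divisors table consists of four DCTSIZE2-element sections
+   * (reciprocals, corrections, scales, and shifts), as computed by
+   * compute_reciprocal() in jcdctmgr.c.  DCTELEM is 16-bit, so the
+   * corrections start at byte offset DCTSIZE2 * 2 and the scales at
+   * DCTSIZE2 * 4.  Only the first three sections are needed here.
+   */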
+  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
+  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
+  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
+  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
+  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
+  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
+  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
+  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);
+
+  row0 = vec_add(row0, corr0);
+  row1 = vec_add(row1, corr1);
+  row2 = vec_add(row2, corr2);
+  row3 = vec_add(row3, corr3);
+  row4 = vec_add(row4, corr4);
+  row5 = vec_add(row5, corr5);
+  row6 = vec_add(row6, corr6);
+  row7 = vec_add(row7, corr7);
+
+  recip0 = vec_ld(0, divisors);
+  recip1 = vec_ld(16, divisors);
+  recip2 = vec_ld(32, divisors);
+  recip3 = vec_ld(48, divisors);
+  recip4 = vec_ld(64, divisors);
+  recip5 = vec_ld(80, divisors);
+  recip6 = vec_ld(96, divisors);
+  recip7 = vec_ld(112, divisors);
+
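+  /* First high multiply: ((|coef| + correction) * reciprocal) >> 16. */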
+  MULTIPLY(row0, recip0, row0);
+  MULTIPLY(row1, recip1, row1);
+  MULTIPLY(row2, recip2, row2);
+  MULTIPLY(row3, recip3, row3);
+  MULTIPLY(row4, recip4, row4);
+  MULTIPLY(row5, recip5, row5);
+  MULTIPLY(row6, recip6, row6);
+  MULTIPLY(row7, recip7, row7);
+
+  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
+  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
+  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
+  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
+  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
+  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
+  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
+  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
+
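+  /* The second high multiply applies the power-of-two scale factor,
+   * completing the division by the quantization step.
+   */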
+  MULTIPLY(row0, scale0, row0);
+  MULTIPLY(row1, scale1, row1);
+  MULTIPLY(row2, scale2, row2);
+  MULTIPLY(row3, scale3, row3);
+  MULTIPLY(row4, scale4, row4);
+  MULTIPLY(row5, scale5, row5);
+  MULTIPLY(row6, scale6, row6);
+  MULTIPLY(row7, scale7, row7);
+
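+  /* Re-apply the original signs. */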
+  row0 = vec_xor(row0, row0s);
+  row1 = vec_xor(row1, row1s);
+  row2 = vec_xor(row2, row2s);
+  row3 = vec_xor(row3, row3s);
+  row4 = vec_xor(row4, row4s);
+  row5 = vec_xor(row5, row5s);
+  row6 = vec_xor(row6, row6s);
+  row7 = vec_xor(row7, row7s);
+  row0 = vec_sub(row0, row0s);
+  row1 = vec_sub(row1, row1s);
+  row2 = vec_sub(row2, row2s);
+  row3 = vec_sub(row3, row3s);
+  row4 = vec_sub(row4, row4s);
+  row5 = vec_sub(row5, row5s);
+  row6 = vec_sub(row6, row6s);
+  row7 = vec_sub(row7, row7s);
+
+  vec_st(row0, 0, coef_block);
+  vec_st(row1, 16, coef_block);
+  vec_st(row2, 32, coef_block);
+  vec_st(row3, 48, coef_block);
+  vec_st(row4, 64, coef_block);
+  vec_st(row5, 80, coef_block);
+  vec_st(row6, 96, coef_block);
+  vec_st(row7, 112, coef_block);
+}