/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.
 * All rights reserved.
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */

#include "jsimd_altivec.h"

/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
 * always get the data we want by using a single vector load (although we may
 * have to permute the result).
 */
#if __BIG_ENDIAN__

#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_ld(0, elemptr); \
  if ((size_t)elemptr & 15) \
    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
}

#else

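/* vec_vsx_ld() does not require 16-byte alignment, so the VSX
   (little-endian) path can load each row directly, with no permute. */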
#define LOAD_ROW(row) { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_vsx_ld(0, elemptr); \
}

#endif


void
jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
                        DCTELEM *workspace)
{
  JSAMPROW elemptr;

  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
  __vector short out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
  __vector unsigned char pb_zero = { __16X(0) };

  LOAD_ROW(0);
  LOAD_ROW(1);
  LOAD_ROW(2);
  LOAD_ROW(3);
  LOAD_ROW(4);
  LOAD_ROW(5);
  LOAD_ROW(6);
  LOAD_ROW(7);

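  /* Zero-extend the first eight 8-bit samples of each row to 16 bits, then
     subtract CENTERJSAMPLE so that the values are centered around zero, as
     the forward DCT expects. */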
  out0 = (__vector short)VEC_UNPACKHU(in0);
  out1 = (__vector short)VEC_UNPACKHU(in1);
  out2 = (__vector short)VEC_UNPACKHU(in2);
  out3 = (__vector short)VEC_UNPACKHU(in3);
  out4 = (__vector short)VEC_UNPACKHU(in4);
  out5 = (__vector short)VEC_UNPACKHU(in5);
  out6 = (__vector short)VEC_UNPACKHU(in6);
  out7 = (__vector short)VEC_UNPACKHU(in7);

  out0 = vec_sub(out0, pw_centerjsamp);
  out1 = vec_sub(out1, pw_centerjsamp);
  out2 = vec_sub(out2, pw_centerjsamp);
  out3 = vec_sub(out3, pw_centerjsamp);
  out4 = vec_sub(out4, pw_centerjsamp);
  out5 = vec_sub(out5, pw_centerjsamp);
  out6 = vec_sub(out6, pw_centerjsamp);
  out7 = vec_sub(out7, pw_centerjsamp);

  vec_st(out0, 0, workspace);
  vec_st(out1, 16, workspace);
  vec_st(out2, 32, workspace);
  vec_st(out3, 48, workspace);
  vec_st(out4, 64, workspace);
  vec_st(out5, 80, workspace);
  vec_st(out6, 96, workspace);
  vec_st(out7, 112, workspace);
}

#define WORD_BIT 16

/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
   We basically need an unsigned equivalent of vec_madds(). */
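/* vec_mule() and vec_mulo() compute the full 32-bit products of the even-
   and odd-numbered halfword pairs, and the permute then gathers the high 16
   bits of each product back into element order, so MULTIPLY(vs0, vs1, out)
   effectively computes out = (vs0 * vs1) >> 16 for each element. */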

#define MULTIPLY(vs0, vs1, out) { \
  tmpe = vec_mule((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  tmpo = vec_mulo((__vector unsigned short)vs0, \
                  (__vector unsigned short)vs1); \
  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
                                 (__vector unsigned short)tmpo, \
                                 shift_pack_index); \
}

void
jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
                        DCTELEM *workspace)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
  __vector unsigned int tmpe, tmpo;

  /* Constants */
  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
#if __BIG_ENDIAN__
  __vector unsigned char shift_pack_index =
    {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
#else
  __vector unsigned char shift_pack_index =
    {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
#endif

  row0 = vec_ld(0, workspace);
  row1 = vec_ld(16, workspace);
  row2 = vec_ld(32, workspace);
  row3 = vec_ld(48, workspace);
  row4 = vec_ld(64, workspace);
  row5 = vec_ld(80, workspace);
  row6 = vec_ld(96, workspace);
  row7 = vec_ld(112, workspace);

  /* Branch-less absolute value */
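  /* An arithmetic right shift by 15 yields 0 for non-negative elements and
     -1 (all ones) for negative ones; XORing with that mask and then
     subtracting the mask negates exactly the negative elements. */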
  row0s = vec_sra(row0, pw_word_bit_m1);
  row1s = vec_sra(row1, pw_word_bit_m1);
  row2s = vec_sra(row2, pw_word_bit_m1);
  row3s = vec_sra(row3, pw_word_bit_m1);
  row4s = vec_sra(row4, pw_word_bit_m1);
  row5s = vec_sra(row5, pw_word_bit_m1);
  row6s = vec_sra(row6, pw_word_bit_m1);
  row7s = vec_sra(row7, pw_word_bit_m1);
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

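  /* divisors[] consists of consecutive DCTSIZE2-element blocks: reciprocals,
     then the correction terms loaded here, then the scale factors loaded
     further below; hence the byte offsets 0, DCTSIZE2 * 2, and
     DCTSIZE2 * 4. */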
  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);

  row0 = vec_add(row0, corr0);
  row1 = vec_add(row1, corr1);
  row2 = vec_add(row2, corr2);
  row3 = vec_add(row3, corr3);
  row4 = vec_add(row4, corr4);
  row5 = vec_add(row5, corr5);
  row6 = vec_add(row6, corr6);
  row7 = vec_add(row7, corr7);

  recip0 = vec_ld(0, divisors);
  recip1 = vec_ld(16, divisors);
  recip2 = vec_ld(32, divisors);
  recip3 = vec_ld(48, divisors);
  recip4 = vec_ld(64, divisors);
  recip5 = vec_ld(80, divisors);
  recip6 = vec_ld(96, divisors);
  recip7 = vec_ld(112, divisors);

  MULTIPLY(row0, recip0, row0);
  MULTIPLY(row1, recip1, row1);
  MULTIPLY(row2, recip2, row2);
  MULTIPLY(row3, recip3, row3);
  MULTIPLY(row4, recip4, row4);
  MULTIPLY(row5, recip5, row5);
  MULTIPLY(row6, recip6, row6);
  MULTIPLY(row7, recip7, row7);

  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);

  MULTIPLY(row0, scale0, row0);
  MULTIPLY(row1, scale1, row1);
  MULTIPLY(row2, scale2, row2);
  MULTIPLY(row3, scale3, row3);
  MULTIPLY(row4, scale4, row4);
  MULTIPLY(row5, scale5, row5);
  MULTIPLY(row6, scale6, row6);
  MULTIPLY(row7, scale7, row7);

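  /* Restore the original signs using the masks saved in row0s..row7s. */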
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

  vec_st(row0, 0, coef_block);
  vec_st(row1, 16, coef_block);
  vec_st(row2, 32, coef_block);
  vec_st(row3, 48, coef_block);
  vec_st(row4, 64, coef_block);
  vec_st(row5, 80, coef_block);
  vec_st(row6, 96, coef_block);
  vec_st(row7, 112, coef_block);
}