| OLD | NEW |
| (Empty) |
#include <string.h>

#include <tmmintrin.h>
#include <emmintrin.h>
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"
| 5 #pragma GCC push_options | |
| 6 #pragma GCC optimize("unroll-loops") | |
| 7 | |
| 8 DECLARE_ALIGNED(16, const unsigned char, filt1_4_h8[16]) = { | |
| 9 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 | |
| 10 }; | |
| 11 DECLARE_ALIGNED(16, const unsigned char, filt2_4_h8[16]) = { | |
| 12 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 | |
| 13 }; | |
| 14 DECLARE_ALIGNED(16, const unsigned char, filt1_global[16]) = { | |
| 15 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | |
| 16 }; | |
| 17 DECLARE_ALIGNED(16, const unsigned char, filt2_global[16]) = { | |
| 18 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 | |
| 19 }; | |
| 20 DECLARE_ALIGNED(16, const unsigned char, filt3_global[16]) = { | |
| 21 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 | |
| 22 }; | |
| 23 DECLARE_ALIGNED(16, const unsigned char, filt4_global[16]) = { | |
| 24 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 | |
| 25 }; | |
| 26 | |
| 27 void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, | |
| 28 unsigned int src_pixels_per_line, | |
| 29 unsigned char *output_ptr, | |
| 30 unsigned int output_pitch, | |
| 31 unsigned int output_height, | |
| 32 short *filter) { | |
| 33 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters, | |
| 34 thirdFilters, forthFilters, srcReg, srcRegFilt1, srcRegFilt2, | |
| 35 srcRegFilt3, srcRegFilt4; | |
| 36 unsigned int i; | |
| 37 | |
| 38 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0); | |
| 39 filtersReg = _mm_loadu_si128((__m128i *)filter); | |
| 40 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); | |
| 41 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); | |
| 42 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); | |
| 43 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); | |
| 44 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); | |
| 45 thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); | |
| 46 forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); | |
| 47 | |
| 48 for (i = 0; i < output_height; i++) { | |
| 49 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); | |
| 50 srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); | |
| 51 srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); | |
| 52 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | |
| 53 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | |
| 54 srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); | |
| 55 srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); | |
| 56 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); | |
| 57 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); | |
| 58 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); | |
| 59 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg127); | |
| 60 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); | |
| 61 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); | |
| 62 src_ptr += src_pixels_per_line; | |
| 63 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); | |
| 64 output_ptr += output_pitch; | |
| 65 } | |
| 66 } | |
| 67 | |
| 68 void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, | |
| 69 unsigned int src_pixels_per_line, | |
| 70 unsigned char *output_ptr, | |
| 71 unsigned int output_pitch, | |
| 72 unsigned int output_height, | |
| 73 short *filter) { | |
| 74 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters, | |
| 75 thirdFilters, forthFilters, srcReg, srcRegFilt1, srcRegFilt2, | |
| 76 srcRegFilt3, srcRegFilt4; | |
| 77 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | |
| 78 unsigned int i; | |
| 79 | |
| 80 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0); | |
| 81 filtersReg = _mm_loadu_si128((__m128i *)filter); | |
| 82 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | |
| 83 | |
| 84 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | |
| 85 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | |
| 86 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | |
| 87 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | |
| 88 | |
| 89 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); | |
| 90 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); | |
| 91 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); | |
| 92 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); | |
| 93 | |
| 94 for (i = 0 ; i < output_height ; i++) { | |
| 95 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); | |
| 96 srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); | |
| 97 srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); | |
| 98 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | |
| 99 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | |
| 100 srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); | |
| 101 srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); | |
| 102 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); | |
| 103 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); | |
| 104 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); | |
| 105 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); | |
| 106 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); | |
| 107 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg127); | |
| 108 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); | |
| 109 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); | |
| 110 src_ptr += src_pixels_per_line; | |
| 111 _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); | |
| 112 output_ptr += output_pitch; | |
| 113 } | |
| 114 } | |
| 115 | |
| 116 void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, | |
| 117 unsigned int src_pixels_per_line, | |
| 118 unsigned char *output_ptr, | |
| 119 unsigned int output_pitch, | |
| 120 unsigned int output_height, | |
| 121 short *filter) { | |
| 122 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters, | |
| 123 thirdFilters, forthFilters, srcReg1, srcReg2, srcRegFilt1_1, | |
| 124 srcRegFilt2_1, srcRegFilt2, srcRegFilt3; | |
| 125 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | |
| 126 unsigned int i; | |
| 127 | |
| 128 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0); | |
| 129 filtersReg = _mm_loadu_si128((__m128i *)filter); | |
| 130 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | |
| 131 | |
| 132 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | |
| 133 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | |
| 134 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | |
| 135 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | |
| 136 | |
| 137 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); | |
| 138 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); | |
| 139 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); | |
| 140 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); | |
| 141 | |
| 142 for (i = 0 ; i < output_height ; i++) { | |
| 143 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); | |
| 144 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, filt1Reg); | |
| 145 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt2Reg); | |
| 146 | |
| 147 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); | |
| 148 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | |
| 149 | |
| 150 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); | |
| 151 | |
| 152 srcRegFilt3 = _mm_shuffle_epi8(srcReg1, filt4Reg); | |
| 153 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt3Reg); | |
| 154 | |
| 155 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); | |
| 156 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); | |
| 157 | |
| 158 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt3); | |
| 159 | |
| 160 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); | |
| 161 | |
| 162 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); | |
| 163 | |
| 164 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, filt1Reg); | |
| 165 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt2Reg); | |
| 166 | |
| 167 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); | |
| 168 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | |
| 169 | |
| 170 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); | |
| 171 | |
| 172 srcRegFilt3 = _mm_shuffle_epi8(srcReg2, filt4Reg); | |
| 173 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt3Reg); | |
| 174 | |
| 175 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); | |
| 176 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); | |
| 177 | |
| 178 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt3); | |
| 179 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); | |
| 180 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg127); | |
| 181 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg127); | |
| 182 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); | |
| 183 srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); | |
| 184 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); | |
| 185 | |
| 186 src_ptr += src_pixels_per_line; | |
| 187 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); | |
| 188 output_ptr += output_pitch; | |
| 189 } | |
| 190 } | |
| 191 | |
| 192 void vp9_filter_block1d4_v8_intrin_ssse3(unsigned char *src_ptr, | |
| 193 unsigned int src_pitch, | |
| 194 unsigned char *output_ptr, | |
| 195 unsigned int out_pitch, | |
| 196 unsigned int output_height, | |
| 197 short *filter) { | |
| 198 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters, srcRegFilt1, | |
| 199 srcRegFilt2, srcRegFilt3, srcRegFilt4; | |
| 200 unsigned int i; | |
| 201 | |
| 202 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0); | |
| 203 filtersReg = _mm_loadu_si128((__m128i *)filter); | |
| 204 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | |
| 205 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); | |
| 206 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); | |
| 207 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); | |
| 208 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); | |
| 209 | |
| 210 for (i = 0 ; i < output_height ; i++) { | |
| 211 srcRegFilt1 = _mm_cvtsi32_si128(*((int*)&src_ptr[0])); | |
| 212 srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch)[0])); | |
| 213 | |
| 214 srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); | |
| 215 | |
| 216 srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*2)[0])); | |
| 217 srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*3)[0])); | |
| 218 | |
| 219 srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); | |
| 220 | |
| 221 srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*4)[0])); | |
| 222 srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*5)[0])); | |
| 223 | |
| 224 srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); | |
| 225 srcRegFilt1 = _mm_unpacklo_epi64(srcRegFilt1, srcRegFilt2); | |
| 226 | |
| 227 srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*6)[0])); | |
| 228 srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*7)[0])); | |
| 229 | |
| 230 srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt4, srcRegFilt2); | |
| 231 srcRegFilt3 = _mm_unpacklo_epi64(srcRegFilt3, srcRegFilt4); | |
| 232 | |
| 233 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | |
| 234 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); | |
| 235 srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8); | |
| 236 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_srli_si128(srcRegFilt3, 8)); | |
| 237 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); | |
| 238 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); | |
| 239 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg127); | |
| 240 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); | |
| 241 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); | |
| 242 | |
| 243 src_ptr += src_pitch; | |
| 244 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); | |
| 245 output_ptr += out_pitch; | |
| 246 } | |
| 247 } | |
| 248 | |
| 249 void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, | |
| 250 unsigned int src_pitch, | |
| 251 unsigned char *output_ptr, | |
| 252 unsigned int out_pitch, | |
| 253 unsigned int output_height, | |
| 254 short *filter) { | |
| 255 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters, | |
| 256 thirdFilters, forthFilters, srcRegFilt1, srcRegFilt2, srcRegFilt3, | |
| 257 srcRegFilt4, srcRegFilt5, srcRegFilt6; | |
| 258 unsigned int i; | |
| 259 | |
| 260 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0); | |
| 261 filtersReg = _mm_loadu_si128((__m128i *)filter); | |
| 262 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | |
| 263 | |
| 264 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | |
| 265 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | |
| 266 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | |
| 267 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | |
| 268 | |
| 269 for (i = 0 ; i < output_height ; i++) { | |
| 270 srcRegFilt1 = _mm_loadl_epi64((__m128i*)&src_ptr[0]); | |
| 271 srcRegFilt2 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch)[0]); | |
| 272 | |
| 273 srcRegFilt3 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch*2)[0]); | |
| 274 srcRegFilt4 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch*3)[0]); | |
| 275 | |
| 276 srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); | |
| 277 | |
| 278 srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); | |
| 279 | |
| 280 srcRegFilt2 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch*4)[0]); | |
| 281 srcRegFilt4 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch*5)[0]); | |
| 282 | |
| 283 srcRegFilt5 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch*6)[0]); | |
| 284 srcRegFilt6 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch*7)[0]); | |
| 285 | |
| 286 srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4); | |
| 287 srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6); | |
| 288 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | |
| 289 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); | |
| 290 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); | |
| 291 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); | |
| 292 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); | |
| 293 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); | |
| 294 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); | |
| 295 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg127); | |
| 296 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); | |
| 297 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); | |
| 298 | |
| 299 src_ptr += src_pitch; | |
| 300 _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); | |
| 301 output_ptr += out_pitch; | |
| 302 } | |
| 303 } | |
| 304 | |
| 305 void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, | |
| 306 unsigned int src_pitch, | |
| 307 unsigned char *output_ptr, | |
| 308 unsigned int out_pitch, | |
| 309 unsigned int output_height, | |
| 310 short *filter) { | |
| 311 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters, | |
| 312 thirdFilters, forthFilters, srcRegFilt1, srcRegFilt2, srcRegFilt3, | |
| 313 srcRegFilt4, srcRegFilt5, srcRegFilt6; | |
| 314 unsigned int i; | |
| 315 | |
| 316 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0); | |
| 317 filtersReg = _mm_loadu_si128((__m128i *)filter); | |
| 318 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | |
| 319 | |
| 320 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | |
| 321 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | |
| 322 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | |
| 323 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | |
| 324 | |
| 325 for (i = 0 ; i < output_height ; i++) { | |
| 326 srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr)); | |
| 327 srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)); | |
| 328 | |
| 329 srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)); | |
| 330 srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); | |
| 331 | |
| 332 srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); | |
| 333 srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); | |
| 334 srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2); | |
| 335 srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4); | |
| 336 | |
| 337 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); | |
| 338 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); | |
| 339 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | |
| 340 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); | |
| 341 | |
| 342 srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); | |
| 343 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); | |
| 344 | |
| 345 srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)); | |
| 346 srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)); | |
| 347 | |
| 348 srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); | |
| 349 srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); | |
| 350 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters); | |
| 351 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); | |
| 352 | |
| 353 srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt4); | |
| 354 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt6); | |
| 355 | |
| 356 srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)); | |
| 357 srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)); | |
| 358 | |
| 359 srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); | |
| 360 srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); | |
| 361 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, thirdFilters); | |
| 362 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, thirdFilters); | |
| 363 srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt4); | |
| 364 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt6); | |
| 365 srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg127); | |
| 366 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg127); | |
| 367 srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); | |
| 368 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); | |
| 369 srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); | |
| 370 | |
| 371 src_ptr += src_pitch; | |
| 372 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | |
| 373 output_ptr += out_pitch; | |
| 374 } | |
| 375 } | |
| OLD | NEW |