OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 26 matching lines...) Expand all Loading... |
37 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { | 37 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { |
38 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 | 38 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
39 }; | 39 }; |
40 | 40 |
41 void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, | 41 void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, |
42 unsigned int src_pixels_per_line, | 42 unsigned int src_pixels_per_line, |
43 unsigned char *output_ptr, | 43 unsigned char *output_ptr, |
44 unsigned int output_pitch, | 44 unsigned int output_pitch, |
45 unsigned int output_height, | 45 unsigned int output_height, |
46 int16_t *filter) { | 46 int16_t *filter) { |
47 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 47 __m128i firstFilters, secondFilters, shuffle1, shuffle2; |
48 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; | 48 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; |
49 __m128i addFilterReg64, filtersReg, srcReg, minReg; | 49 __m128i addFilterReg64, filtersReg, srcReg, minReg; |
50 unsigned int i; | 50 unsigned int i; |
51 | 51 |
52 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 52 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
53 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); | 53 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); |
54 filtersReg = _mm_loadu_si128((__m128i *)filter); | 54 filtersReg = _mm_loadu_si128((__m128i *)filter); |
55 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 55 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
56 // in both lanes of 128 bit register. | 56 // in both lanes of 128 bit register. |
57 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 57 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
58 | 58 |
59 // duplicate only the first 16 bits in the filter into the first lane | 59 // duplicate only the first 16 bits in the filter into the first lane |
60 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); | 60 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); |
61 // duplicate only the third 16 bits in the filter into the first lane | 61 // duplicate only the third 16 bits in the filter into the first lane |
62 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); | 62 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); |
63 // duplicate only the second 16 bits in the filter into the second lane | 63 // duplicate only the second 16 bits in the filter into the second lane |
| 64 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 |
64 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); | 65 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); |
65 // duplicate only the fourth 16 bits in the filter into the second lane | 66 // duplicate only the fourth 16 bits in the filter into the second lane |
| 67 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 |
66 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); | 68 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); |
67 | 69 |
68 // loading the local filters | 70 // loading the local filters |
69 thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); | 71 shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); |
70 forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); | 72 shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); |
71 | 73 |
72 for (i = 0; i < output_height; i++) { | 74 for (i = 0; i < output_height; i++) { |
73 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 75 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); |
74 | 76 |
75 // filter the source buffer | 77 // filter the source buffer |
76 srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); | 78 srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); |
77 srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); | 79 srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); |
78 | 80 |
79 // multiply 2 adjacent elements with the filter and add the result | 81 // multiply 2 adjacent elements with the filter and add the result |
80 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | 82 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); |
81 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | 83 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); |
82 | 84 |
83 // extract the higher half of the lane | 85 // extract the higher half of the lane |
84 srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); | 86 srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); |
85 srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); | 87 srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); |
86 | 88 |
87 minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); | 89 minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); |
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
157 | 159 |
158 // filter the source buffer | 160 // filter the source buffer |
159 srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); | 161 srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); |
160 srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); | 162 srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); |
161 | 163 |
162 // multiply 2 adjacent elements with the filter and add the result | 164 // multiply 2 adjacent elements with the filter and add the result |
163 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); | 165 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); |
164 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); | 166 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); |
165 | 167 |
166 // add and saturate all the results together | 168 // add and saturate all the results together |
167 minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3); | 169 minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); |
| 170 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); |
| 171 |
| 172 srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3); |
| 173 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); |
168 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); | 174 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); |
169 | |
170 srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3); | |
171 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); | |
172 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); | |
173 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); | 175 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); |
174 | 176 |
175 // shift by 7 bit each 16 bits | 177 // shift by 7 bit each 16 bits |
176 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); | 178 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); |
177 | 179 |
178 // shrink to 8 bit each 16 bits | 180 // shrink to 8 bit each 16 bits |
179 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); | 181 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); |
180 | 182 |
181 src_ptr+=src_pixels_per_line; | 183 src_ptr+=src_pixels_per_line; |
182 | 184 |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
222 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); | 224 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); |
223 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); | 225 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); |
224 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); | 226 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); |
225 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); | 227 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
226 | 228 |
227 for (i = 0; i < output_height; i++) { | 229 for (i = 0; i < output_height; i++) { |
228 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 230 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); |
229 | 231 |
230 // filter the source buffer | 232 // filter the source buffer |
231 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); | 233 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); |
232 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg); | 234 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); |
233 | 235 |
234 // multiply 2 adjacent elements with the filter and add the result | 236 // multiply 2 adjacent elements with the filter and add the result |
235 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); | 237 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); |
236 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | 238 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); |
237 | 239 |
238 // add and saturate the results together | 240 // add and saturate the results together |
239 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); | 241 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); |
240 | 242 |
241 // filter the source buffer | 243 // filter the source buffer |
242 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg); | 244 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); |
243 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); | 245 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); |
244 | 246 |
245 // multiply 2 adjacent elements with the filter and add the result | 247 // multiply 2 adjacent elements with the filter and add the result |
246 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); | 248 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); |
247 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); | 249 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); |
248 | 250 |
249 // add and saturate the results together | 251 // add and saturate the results together |
250 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 252 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
251 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | 253 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
252 | 254 |
253 // read the next 16 bytes, starting at src_ptr+5. | 255 // read the next 16 bytes, starting at src_ptr+5. |
254 // (the first 8 of them were already covered by the earlier load) | 256 // (the first 8 of them were already covered by the earlier load) |
255 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); | 257 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); |
256 | 258 |
257 // add and saturate the results together | 259 // add and saturate the results together |
258 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 260 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
259 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | 261 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
260 | 262 |
261 // filter the source buffer | 263 // filter the source buffer |
262 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); | 264 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); |
263 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg); | 265 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); |
264 | 266 |
265 // multiply 2 adjacent elements with the filter and add the result | 267 // multiply 2 adjacent elements with the filter and add the result |
266 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); | 268 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); |
267 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | 269 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); |
268 | 270 |
269 // add and saturate the results together | 271 // add and saturate the results together |
270 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); | 272 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); |
271 | 273 |
272 // filter the source buffer | 274 // filter the source buffer |
273 srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg); | 275 srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg); |
274 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); | 276 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); |
275 | 277 |
276 // multiply 2 adjacent elements with the filter and add the result | 278 // multiply 2 adjacent elements with the filter and add the result |
277 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); | 279 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); |
278 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); | 280 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); |
279 | 281 |
280 // add and saturate the results together | 282 // add and saturate the results together |
281 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, | 283 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, |
282 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | 284 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
283 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, | 285 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, |
284 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | 286 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
285 | 287 |
286 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); | 288 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); |
287 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); | 289 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); |
(...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
481 srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); | 483 srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); |
482 | 484 |
483 src_ptr+=src_pitch; | 485 src_ptr+=src_pitch; |
484 | 486 |
485 // save 16 bytes convolve result | 487 // save 16 bytes convolve result |
486 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 488 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
487 | 489 |
488 output_ptr+=out_pitch; | 490 output_ptr+=out_pitch; |
489 } | 491 } |
490 } | 492 } |
OLD | NEW |