OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 185 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
196 | 196 |
197 src_ptr+=src_pixels_per_line; | 197 src_ptr+=src_pixels_per_line; |
198 | 198 |
199 // save only 8 bytes | 199 // save only 8 bytes |
200 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); | 200 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
201 | 201 |
202 output_ptr+=output_pitch; | 202 output_ptr+=output_pitch; |
203 } | 203 } |
204 } | 204 } |
205 | 205 |
206 #if ARCH_X86_64 // 16-wide horizontal 8-tap filter; compiled for x86-64 builds only |
207 static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, |
208 ptrdiff_t src_pixels_per_line, |
209 uint8_t *output_ptr, |
210 ptrdiff_t output_pitch, |
211 uint32_t output_height, |
212 const int16_t *filter) { |
213 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; |
214 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
215 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
216 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; |
217 unsigned int i; |
218 |
219 // rounding constant: each 16-bit lane holds 64 (1 << 6), added before the >> 7 below |
220 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
221 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
222 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
223 // in both lanes of 128 bit register. |
224 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
225 |
226 // duplicate only the first 16 bits (first and second byte) |
227 // across 128 bit register |
228 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
229 // duplicate only the second 16 bits (third and fourth byte) |
230 // across 128 bit register |
231 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
232 // duplicate only the third 16 bits (fifth and sixth byte) |
233 // across 128 bit register |
234 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
235 // duplicate only the fourth 16 bits (seventh and eighth byte) |
236 // across 128 bit register |
237 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
238 |
239 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); |
240 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); |
241 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); |
242 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
243 |
244 for (i = 0; i < output_height; i++) { |
245 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
246 |
247 // filter the source buffer |
248 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); |
249 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); |
250 |
251 // multiply 2 adjacent elements with the filter and add the result |
252 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); |
253 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); |
254 |
255 // add and saturate the results together |
256 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); |
257 |
258 // filter the source buffer |
259 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); |
260 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); |
261 |
262 // multiply 2 adjacent elements with the filter and add the result |
263 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); |
264 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); |
265 |
266 // add and saturate the smaller of the two middle-tap sums first |
267 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
268 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
269 |
270 // reading the next 16 bytes. |
271 // (part of it was being read by earlier read) |
272 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); |
273 |
274 // then add and saturate the larger of the two middle-tap sums |
275 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
276 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
277 |
278 // filter the source buffer |
279 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); |
280 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); |
281 |
282 // multiply 2 adjacent elements with the filter and add the result |
283 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); |
284 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); |
285 |
286 // add and saturate the results together |
287 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); |
288 |
289 // filter the source buffer |
290 srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg); |
291 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); |
292 |
293 // multiply 2 adjacent elements with the filter and add the result |
294 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); |
295 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); |
296 |
297 // add and saturate: the smaller middle-tap sum first, then the larger |
298 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, |
299 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
300 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, |
301 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
302 |
303 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); |
304 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); |
305 |
306 // shift right by 7 in each 16-bit lane (the +64 above makes this a rounded divide by 128) |
307 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); |
308 srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); |
309 |
310 // shrink to 8 bit each 16 bits, the first lane contain the first |
311 // convolve result and the second lane contain the second convolve |
312 // result |
313 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); |
314 |
315 src_ptr+=src_pixels_per_line; |
316 |
317 // save 16 bytes |
318 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); |
319 |
320 output_ptr+=output_pitch; |
321 } |
322 } |
323 #endif // ARCH_X86_64 |
324 | |
325 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, | 206 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, |
326 ptrdiff_t src_pitch, | 207 ptrdiff_t src_pitch, |
327 uint8_t *output_ptr, | 208 uint8_t *output_ptr, |
328 ptrdiff_t out_pitch, | 209 ptrdiff_t out_pitch, |
329 uint32_t output_height, | 210 uint32_t output_height, |
330 const int16_t *filter) { | 211 const int16_t *filter) { |
331 __m128i addFilterReg64, filtersReg, minReg; | 212 __m128i addFilterReg64, filtersReg, minReg; |
332 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 213 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
333 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; | 214 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; |
334 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; | 215 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; |
(...skipping 185 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
520 srcReg7 = srcReg8; | 401 srcReg7 = srcReg8; |
521 | 402 |
522 // save 16 bytes convolve result | 403 // save 16 bytes convolve result |
523 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 404 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
524 | 405 |
525 output_ptr+=out_pitch; | 406 output_ptr+=out_pitch; |
526 } | 407 } |
527 } | 408 } |
528 #endif // ARCH_X86_64 | 409 #endif // ARCH_X86_64 |
529 | 410 |
530 #if ARCH_X86_64 | |
531 filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3; | |
532 filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3; | |
533 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; | |
534 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; | |
535 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; | |
536 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; | |
537 #define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3 | |
538 #define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3 | |
539 #define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3 | |
540 #define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3 | |
541 #define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3 | |
542 #else // ARCH_X86 | |
543 filter8_1dfunction vpx_filter_block1d16_v8_ssse3; | 411 filter8_1dfunction vpx_filter_block1d16_v8_ssse3; |
544 filter8_1dfunction vpx_filter_block1d16_h8_ssse3; | 412 filter8_1dfunction vpx_filter_block1d16_h8_ssse3; |
545 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; | 413 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; |
546 filter8_1dfunction vpx_filter_block1d8_h8_ssse3; | 414 filter8_1dfunction vpx_filter_block1d8_h8_ssse3; |
547 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; | 415 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; |
548 filter8_1dfunction vpx_filter_block1d4_h8_ssse3; | 416 filter8_1dfunction vpx_filter_block1d4_h8_ssse3; |
549 #endif // ARCH_X86_64 | |
550 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; | 417 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; |
551 filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; | 418 filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; |
552 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; | 419 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; |
553 filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; | 420 filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; |
554 filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; | 421 filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; |
555 filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; | 422 filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; |
556 | 423 |
557 filter8_1dfunction vpx_filter_block1d16_v2_ssse3; | 424 filter8_1dfunction vpx_filter_block1d16_v2_ssse3; |
558 filter8_1dfunction vpx_filter_block1d16_h2_ssse3; | 425 filter8_1dfunction vpx_filter_block1d16_h2_ssse3; |
559 filter8_1dfunction vpx_filter_block1d8_v2_ssse3; | 426 filter8_1dfunction vpx_filter_block1d8_v2_ssse3; |
(...skipping 596 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1156 // const int16_t *filter_x, int x_step_q4, | 1023 // const int16_t *filter_x, int x_step_q4, |
1157 // const int16_t *filter_y, int y_step_q4, | 1024 // const int16_t *filter_y, int y_step_q4, |
1158 // int w, int h); | 1025 // int w, int h); |
1159 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 1026 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
1160 // uint8_t *dst, ptrdiff_t dst_stride, | 1027 // uint8_t *dst, ptrdiff_t dst_stride, |
1161 // const int16_t *filter_x, int x_step_q4, | 1028 // const int16_t *filter_x, int x_step_q4, |
1162 // const int16_t *filter_y, int y_step_q4, | 1029 // const int16_t *filter_y, int y_step_q4, |
1163 // int w, int h); | 1030 // int w, int h); |
1164 FUN_CONV_2D(, ssse3); | 1031 FUN_CONV_2D(, ssse3); |
1165 FUN_CONV_2D(avg_ , ssse3); | 1032 FUN_CONV_2D(avg_ , ssse3); |
OLD | NEW |