Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(550)

Side by Side Diff: source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c

Issue 1339513003: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 185 matching lines...) Expand 10 before | Expand all | Expand 10 after
196 196
197 src_ptr+=src_pixels_per_line; 197 src_ptr+=src_pixels_per_line;
198 198
199 // save only 8 bytes 199 // save only 8 bytes
200 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); 200 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
201 201
202 output_ptr+=output_pitch; 202 output_ptr+=output_pitch;
203 } 203 }
204 } 204 }
205 205
206 #if ARCH_X86_64
// Applies an 8-tap filter along each row and writes 16 output bytes per row,
// for output_height rows. Per row it reads 16 bytes at src_ptr - 3 and 16
// bytes at src_ptr + 5 (unaligned loads), so source bytes src_ptr[-3..20]
// must be readable. The eight int16 taps in filter are packed to int8 with
// signed saturation, so taps are only represented exactly if they fit in
// int8. Results are rounded with +64, shifted right by 7, and clamped to
// 0..255. Both pointers advance by their respective pitch after every row.
// NOTE(review): filt1_global..filt4_global are defined outside this view;
// presumably they are the byte-shuffle masks that pair up neighbouring
// source pixels for each tap pair -- confirm against their definition.
207 static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
208 ptrdiff_t src_pixels_per_line,
209 uint8_t *output_ptr,
210 ptrdiff_t output_pitch,
211 uint32_t output_height,
212 const int16_t *filter) {
213 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
214 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
215 __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
216 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
217 unsigned int i;
218
219 // create a register with 64 in every 16-bit element (rounding term)
220 addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
221 filtersReg = _mm_loadu_si128((const __m128i *)filter);
222 // convert the 16-bit (short) taps to 8 bit (byte) with signed saturation;
223 // both 64-bit halves of the 128-bit register then hold the same data.
224 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
225
226 // duplicate only the first 16 bits (first and second byte)
227 // across 128 bit register
228 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
229 // duplicate only the second 16 bits (third and fourth byte)
230 // across 128 bit register
231 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
232 // duplicate only the third 16 bits (fifth and sixth byte)
233 // across 128 bit register
234 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
235 // duplicate only the fourth 16 bits (seventh and eighth byte)
236 // across 128 bit register
237 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
238
239 filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
240 filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
241 filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
242 filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
243
244 for (i = 0; i < output_height; i++) {
245 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
246
247 // shuffle the source bytes for the first and fourth tap pairs
248 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
249 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);
250
251 // multiply 2 adjacent elements with the filter and add the result
252 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
253 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
254
255 // add and saturate the results together
256 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
257
258 // shuffle the source bytes for the second and third tap pairs
259 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
260 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
261
262 // multiply 2 adjacent elements with the filter and add the result
263 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
264 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
265
266 // add the smaller of the two middle sums first (saturating add)
267 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
268 _mm_min_epi16(srcRegFilt3, srcRegFilt2));
269
270 // reading the next 16 bytes.
271 // (the first 8 of them were already covered by the earlier read)
272 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
273
274 // then add the larger of the two middle sums (saturating add)
275 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
276 _mm_max_epi16(srcRegFilt3, srcRegFilt2));
277
278 // shuffle the second load for the first and fourth tap pairs
279 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
280 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);
281
282 // multiply 2 adjacent elements with the filter and add the result
283 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
284 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
285
286 // add and saturate the results together
287 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
288
289 // shuffle the second load for the second and third tap pairs
290 srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);
291 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
292
293 // multiply 2 adjacent elements with the filter and add the result
294 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
295 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
296
297 // add the smaller middle sum first, then the larger (saturating adds)
298 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
299 _mm_min_epi16(srcRegFilt3, srcRegFilt2));
300 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
301 _mm_max_epi16(srcRegFilt3, srcRegFilt2));
302
303 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
304 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
305
306 // arithmetic shift right by 7 bits in each 16-bit element
307 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
308 srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
309
310 // pack each 16-bit value to 8 bits with unsigned saturation; the low
311 // 8 bytes hold the first convolve result and the high 8 bytes the
312 // second convolve result
313 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
314
315 src_ptr+=src_pixels_per_line;
316
317 // save 16 bytes (aligned store: output_ptr must be 16-byte aligned)
318 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
319
320 output_ptr+=output_pitch;
321 }
322 }
323 #endif // ARCH_X86_64
324
325 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, 206 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
326 ptrdiff_t src_pitch, 207 ptrdiff_t src_pitch,
327 uint8_t *output_ptr, 208 uint8_t *output_ptr,
328 ptrdiff_t out_pitch, 209 ptrdiff_t out_pitch,
329 uint32_t output_height, 210 uint32_t output_height,
330 const int16_t *filter) { 211 const int16_t *filter) {
331 __m128i addFilterReg64, filtersReg, minReg; 212 __m128i addFilterReg64, filtersReg, minReg;
332 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; 213 __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
333 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; 214 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
334 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; 215 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
(...skipping 185 matching lines...) Expand 10 before | Expand all | Expand 10 after
520 srcReg7 = srcReg8; 401 srcReg7 = srcReg8;
521 402
522 // save 16 bytes convolve result 403 // save 16 bytes convolve result
523 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); 404 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
524 405
525 output_ptr+=out_pitch; 406 output_ptr+=out_pitch;
526 } 407 }
527 } 408 }
528 #endif // ARCH_X86_64 409 #endif // ARCH_X86_64
529 410
530 #if ARCH_X86_64
531 filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3;
532 filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3;
533 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
534 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
535 filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
536 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
537 #define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3
538 #define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3
539 #define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
540 #define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
541 #define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
542 #else // ARCH_X86
543 filter8_1dfunction vpx_filter_block1d16_v8_ssse3; 411 filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
544 filter8_1dfunction vpx_filter_block1d16_h8_ssse3; 412 filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
545 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; 413 filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
546 filter8_1dfunction vpx_filter_block1d8_h8_ssse3; 414 filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
547 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; 415 filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
548 filter8_1dfunction vpx_filter_block1d4_h8_ssse3; 416 filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
549 #endif // ARCH_X86_64
550 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; 417 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
551 filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; 418 filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
552 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; 419 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
553 filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; 420 filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
554 filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; 421 filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
555 filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; 422 filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
556 423
557 filter8_1dfunction vpx_filter_block1d16_v2_ssse3; 424 filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
558 filter8_1dfunction vpx_filter_block1d16_h2_ssse3; 425 filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
559 filter8_1dfunction vpx_filter_block1d8_v2_ssse3; 426 filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
(...skipping 596 matching lines...) Expand 10 before | Expand all | Expand 10 after
1156 // const int16_t *filter_x, int x_step_q4, 1023 // const int16_t *filter_x, int x_step_q4,
1157 // const int16_t *filter_y, int y_step_q4, 1024 // const int16_t *filter_y, int y_step_q4,
1158 // int w, int h); 1025 // int w, int h);
1159 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, 1026 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
1160 // uint8_t *dst, ptrdiff_t dst_stride, 1027 // uint8_t *dst, ptrdiff_t dst_stride,
1161 // const int16_t *filter_x, int x_step_q4, 1028 // const int16_t *filter_x, int x_step_q4,
1162 // const int16_t *filter_y, int y_step_q4, 1029 // const int16_t *filter_y, int y_step_q4,
1163 // int w, int h); 1030 // int w, int h);
1164 FUN_CONV_2D(, ssse3); 1031 FUN_CONV_2D(, ssse3);
1165 FUN_CONV_2D(avg_ , ssse3); 1032 FUN_CONV_2D(avg_ , ssse3);
OLDNEW
« no previous file with comments | « source/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm ('k') | source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698