Chromium Code Reviews

Diff: source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c

Issue 1302353004: libvpx: Pull from upstream (Closed)
Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

// Due to a header conflict between math.h and intrinsics includes with ceil()
// in certain configurations under vs9 this include needs to precede
// tmmintrin.h.

#include <tmmintrin.h>

#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_dsp/x86/convolve.h"
+#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"

// filters only for the 4_h8 convolution
DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
(...skipping 165 matching lines...)

    src_ptr+=src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);

    output_ptr+=output_pitch;
  }
}

+#if ARCH_X86_64
static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
                                                 ptrdiff_t src_pixels_per_line,
                                                 uint8_t *output_ptr,
                                                 ptrdiff_t output_pitch,
                                                 uint32_t output_height,
                                                 const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
(...skipping 96 matching lines...)
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    src_ptr+=src_pixels_per_line;

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);

    output_ptr+=output_pitch;
  }
}
+#endif  // ARCH_X86_64

void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
                                         ptrdiff_t src_pitch,
                                         uint8_t *output_ptr,
                                         ptrdiff_t out_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
(...skipping 69 matching lines...)
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save only 8 bytes convolve result
    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);

    output_ptr+=out_pitch;
  }
}

+#if ARCH_X86_64
static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
                                                 ptrdiff_t src_pitch,
                                                 uint8_t *output_ptr,
                                                 ptrdiff_t out_pitch,
                                                 uint32_t output_height,
                                                 const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
(...skipping 94 matching lines...)
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save 16 bytes convolve result
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);

    output_ptr+=out_pitch;
  }
}
+#endif  // ARCH_X86_64

#if ARCH_X86_64
filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
#define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3
#define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3
(...skipping 47 matching lines...)
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
            ssse3);

-// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
+                      out0, out1, out2, out3, out4, out5, out6, out7) { \
+  const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \
+  const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \
+  const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \
+  const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \
+ \
+  const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \
+  const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \
+  const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \
+  const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \
+ \
+  const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \
+  const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \
+  const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \
+  const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \
+ \
+  out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \
+  out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \
+  out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \
+  out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \
+  out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \
+  out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \
+  out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \
+  out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \
+}
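
Note: TRANSPOSE_8X8 is a byte-level 8x8 transpose done in three unpack stages (8-bit, then 16-bit, then 32-bit). Tracking the data flow for the first two outputs, with each inN holding row N's bytes N0..N7 in its low half (annotation of mine, not part of the patch):

    /* stage 1: tr0_0 = 00 10 01 11 02 12 03 13 ...  rows 0 and 1 interleaved
     * stage 2: tr1_0 = 00 10 20 30 01 11 21 31 ...  rows 0..3 interleaved
     * stage 3: tr2_0 = 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
     * out0/out1 take the low/high 8 bytes of tr2_0, i.e. columns 0 and 1 of
     * the tile; the epi64 self-unpack just moves the chosen half into place. */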
+
+static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
+                                  uint8_t *dst, const int16_t *x_filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
+  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
+  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
+  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
+  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
+  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
+  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
+  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
+  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
+  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
+  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
+  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
+  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
+  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
+  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
+  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
+  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
+  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
+  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+  // add and saturate the results together
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, min_x2x1);
+  temp = _mm_adds_epi16(temp, max_x2x1);
+  // round and shift each 16-bit value by 7 bits
+  temp = _mm_mulhrs_epi16(temp, k_256);
+  // shrink each 16-bit value to 8 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only 8 bytes of the convolve result
+  _mm_storel_epi64((__m128i*)dst, temp);
+}
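
Note: per output pixel, the vector code above computes the same value as this scalar sketch (a reference model added for clarity; convolve8_px is a hypothetical name, not in the patch). _mm_maddubs_epi16 forms the dot product two taps at a time, and the min/max ordering of the two middle partial sums keeps the 16-bit saturating accumulation well-behaved (min + max equals x1 + x2 whenever nothing saturates):

    /* Scalar reference for one output pixel (illustration only). src points
     * at the first of the 8 samples under the window, filter at the 8 taps. */
    static uint8_t convolve8_px(const uint8_t *src, const int16_t *filter) {
      int k, sum = 0;
      for (k = 0; k < 8; ++k)
        sum += (int)src[k] * filter[k];
      /* _mm_mulhrs_epi16(x, 1 << 8) == (x + 64) >> 7: round to nearest,
       * then drop the FILTER_BITS = 7 fractional bits. */
      sum = (sum + 64) >> 7;
      if (sum < 0) sum = 0;      /* _mm_packus_epi16 clamps to 0..255 */
      if (sum > 255) sum = 255;
      return (uint8_t)sum;
    }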
+
+static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride) {
+  __m128i A, B, C, D, E, F, G, H;
+
+  A = _mm_loadl_epi64((const __m128i *)src);
+  B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
+  C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+  D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
+  E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
+  F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
+  G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
+  H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
+
+  TRANSPOSE_8X8(A, B, C, D, E, F, G, H,
+                A, B, C, D, E, F, G, H);
+
+  _mm_storel_epi64((__m128i*)dst, A);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H);
+}
+
+static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *x_filters,
+                                    int x0_q4, int x_step_q4, int w, int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+  int x, y, z;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  // This function processes 8x8 areas. The intermediate height is not always
+  // a multiple of 8, so force it to be a multiple of 8 here.
+  y = h + (8 - (h & 0x7));
+
+  do {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; x += 8) {
+      // process 8 src_x steps
+      for (z = 0; z < 8; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+        if (x_q4 & SUBPEL_MASK) {
+          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
+        } else {
+          int i;
+          for (i = 0; i < 8; ++i) {
+            temp[z * 8 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 8x8 filtered values back to dst
+      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
+    }
+
+    src += src_stride * 8;
+    dst += dst_stride * 8;
+  } while (y -= 8);
+}
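
Note on the q4 bookkeeping (values below are illustrative, my addition, not from the patch): positions are tracked in 1/16-pel units, so x_q4 >> SUBPEL_BITS is the integer source column and x_q4 & SUBPEL_MASK picks one of the 16 sub-pel kernels. Phase 0 needs no filtering, and because src was pre-shifted by SUBPEL_TAPS / 2 - 1 = 3, the unfiltered pixel is the centre tap of the 8-wide window, src_x[3]:

    /* Assuming x0_q4 = 8 and x_step_q4 = 24 (3/2 source pels per output):
     * z = 0: x_q4 =  8 -> column 0, phase 8 -> filter_horiz_w8_ssse3()
     * z = 1: x_q4 = 32 -> column 2, phase 0 -> copy src_x[3] per row
     * z = 2: x_q4 = 56 -> column 3, phase 8 -> filter_horiz_w8_ssse3() */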
+
+static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                  uint8_t *dst, const int16_t *filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
+  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  // TRANSPOSE...
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  //
+  // TO
+  //
+  // 00 10 20 30
+  // 01 11 21 31
+  // 02 12 22 32
+  // 03 13 23 33
+  // 04 14 24 34
+  // 05 15 25 35
+  // 06 16 26 36
+  // 07 17 27 37
+  //
+  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
+  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
+  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  // 02 03 12 13 22 23 32 33
+  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
+  // 06 07 16 17 26 27 36 37
+  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+  // add and saturate the results together
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, min_x2x1);
+  temp = _mm_adds_epi16(temp, max_x2x1);
+  // round and shift each 16-bit value by 7 bits
+  temp = _mm_mulhrs_epi16(temp, k_256);
+  // shrink each 16-bit value to 8 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only 4 bytes
+  *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride) {
+  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
+  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
+  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
+  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
+  // 00 10 01 11 02 12 03 13
+  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
+  // 20 30 21 31 22 32 23 33
+  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
+  B = _mm_srli_si128(A, 4);
+  C = _mm_srli_si128(A, 8);
+  D = _mm_srli_si128(A, 12);
+
+  *(int *)(dst) = _mm_cvtsi128_si32(A);
+  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
+  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
+  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
+}
+
+static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *x_filters,
+                                    int x0_q4, int x_step_q4, int w, int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+  int x, y, z;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  for (y = 0; y < h; y += 4) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; x += 4) {
+      // process 4 src_x steps
+      for (z = 0; z < 4; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+        if (x_q4 & SUBPEL_MASK) {
+          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
+        } else {
+          int i;
+          for (i = 0; i < 4; ++i) {
+            temp[z * 4 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 4x4 filtered values back to dst
+      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
+    }
+
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+  }
+}
+
+static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                 uint8_t *dst, const int16_t *filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
+  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
+  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
+  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
+  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
+  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
+  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
+  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
+  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
+  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
+  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
+  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+  // add and saturate the results together
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, min_x2x1);
+  temp = _mm_adds_epi16(temp, max_x2x1);
+  // round and shift each 16-bit value by 7 bits
+  temp = _mm_mulhrs_epi16(temp, k_256);
+  // shrink each 16-bit value to 8 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only 4 bytes
+  *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *y_filters,
+                                   int y0_q4, int y_step_q4, int w, int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (y = 0; y < h; ++y) {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+    } else {
+      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+    }
+
+    y_q4 += y_step_q4;
+  }
+}
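
Note (annotation only): the memcpy branch is exact, not an approximation. src was rewound by SUBPEL_TAPS / 2 - 1 = 3 rows, and the phase-0 entry of these kernel tables is a lone centre tap of 128, so filtering with it reduces to copying the row at the integer position:

    /* Scalar view of the phase-0 case (illustration only):
     *   dst[y * dst_stride + x]
     *       == src[((y_q4 >> SUBPEL_BITS) + 3) * src_stride + x]; */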
+
+static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                 uint8_t *dst, const int16_t *filter) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
+  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
+  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
+  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
+  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
+  // add and saturate the results together
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, min_x2x1);
+  temp = _mm_adds_epi16(temp, max_x2x1);
+  // round and shift each 16-bit value by 7 bits
+  temp = _mm_mulhrs_epi16(temp, k_256);
+  // shrink each 16-bit value to 8 bits
+  temp = _mm_packus_epi16(temp, temp);
+  // save only 8 bytes of the convolve result
+  _mm_storel_epi64((__m128i*)dst, temp);
+}
+
+static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *y_filters,
+                                   int y0_q4, int y_step_q4, int w, int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (y = 0; y < h; ++y) {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+    } else {
+      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+    }
+    y_q4 += y_step_q4;
+  }
+}
+
+static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                  uint8_t *dst, const int16_t *filter, int w) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  int i;
+
+  for (i = 0; i < w; i += 16) {
+    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
+    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+    const __m128i C =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+    const __m128i D =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+    const __m128i E =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+    const __m128i F =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+    const __m128i G =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+    const __m128i H =
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+    // merge the results together
+    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
+    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
+    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
+    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
+    // multiply 2 adjacent elements with the filter and add the result
+    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
+    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
+    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
+    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
+    // add and saturate the results together
+    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
+    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
+    // merge the results together
+    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
+    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
+    // multiply 2 adjacent elements with the filter and add the result
+    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
+    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
+    // merge the results together
+    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
+    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
+    // multiply 2 adjacent elements with the filter and add the result
+    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
+    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
+    // add and saturate the results together
+    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
+    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
+
+    // add and saturate the results together
+    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
+    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
+    // round and shift each 16-bit value by 7 bits
+    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
+    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
+    // shrink each 16-bit value to 8 bits; the low lane contains the first
+    // convolve result and the high lane contains the second
+    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
+    src_ptr += 16;
+    // save 16 bytes convolve result
+    _mm_store_si128((__m128i*)&dst[i], temp_hi);
+  }
+}
+
+static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *y_filters,
+                                    int y0_q4, int y_step_q4, int w, int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (y = 0; y < h; ++y) {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
+                            w);
+    } else {
+      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+    }
+    y_q4 += y_step_q4;
+  }
+}
+
+static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *const x_filters,
+                             int x0_q4, int x_step_q4,
+                             const InterpKernel *const y_filters,
+                             int y0_q4, int y_step_q4,
+                             int w, int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  if (w >= 8) {
+    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            w, intermediate_height);
+  } else {
+    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            w, intermediate_height);
+  }
+
+  if (w >= 16) {
+    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+  } else if (w == 8) {
+    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+  } else {
+    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+  }
+}
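
Note, working the comment's bound through with concrete numbers (my arithmetic, not from the patch):

    /* Worst case: h = 64, y_step_q4 = 32, y0_q4 <= 15:
     *   intermediate_height <= (((64 - 1) * 32 + 15) >> 4) + 8
     *                        = (2031 >> 4) + 8 = 126 + 8 = 134 rows,
     * inside the 135-row allowance. scaledconvolve_horiz_w8 then pads its
     * row count to a multiple of 8 (134 -> 136 rows written), which is what
     * the extra 8 rows in temp[(135 + 8) * 64] absorb. */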
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: This assumes that the filter table is 256-byte aligned.
+  // TODO(agrange) Modify to make independent of table alignment.
+  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+  return (int)((const InterpKernel *)(intptr_t)f - base);
+}
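
Note: the mask works because an InterpKernel table is exactly 256 bytes: 16 sub-pel phases x 8 taps x 2 bytes each. Given the 256-byte alignment the comment requires, clearing the low 8 address bits recovers the table base, and pointer subtraction yields the phase. A sketch of the round trip (illustration only):

    /* const int16_t *f = filters[phase];              // phase in 0..15
     * const InterpKernel *base = get_filter_base(f);  // == filters
     * assert(get_filter_offset(f, base) == phase);    // recovers the phase */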
+
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  scaledconvolve2d(src, src_stride, dst, dst_stride,
+                   filters_x, x0_q4, x_step_q4,
+                   filters_y, y0_q4, y_step_q4, w, h);
+}
+
+// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                          uint8_t *dst, ptrdiff_t dst_stride,
//                          const int16_t *filter_x, int x_step_q4,
//                          const int16_t *filter_y, int y_step_q4,
//                          int w, int h);
// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_ , ssse3);
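
Note: FUN_CONV_2D (from vpx_dsp/x86/convolve.h in this tree) composes the two 1-D passes bound above into the full vpx_convolve8_ssse3 and vpx_convolve8_avg_ssse3 entry points. Conceptually (a sketch only, not the macro's exact expansion):

    /* horizontal pass into a temp buffer tall enough for the 8-tap
     * vertical filter tails, then a vertical pass into dst:
     *   convolve8_horiz(src - 3 * src_stride, ..., temp, ...);
     *   convolve8_vert(temp + 3 * temp_stride, ..., dst, ...); */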
