Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(65)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c

Issue 592203002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 3555 matching lines...) Expand 10 before | Expand all | Expand 10 after
3566 RECON_AND_STORE(dest, in[31]); 3566 RECON_AND_STORE(dest, in[31]);
3567 3567
3568 dest += 8 - (stride * 32); 3568 dest += 8 - (stride * 32);
3569 } 3569 }
3570 } 3570 }
3571 3571
3572 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, 3572 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
3573 int stride) { 3573 int stride) {
3574 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3574 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3575 const __m128i final_rounding = _mm_set1_epi16(1<<5); 3575 const __m128i final_rounding = _mm_set1_epi16(1<<5);
3576 const __m128i zero = _mm_setzero_si128();
3576 3577
3577 // idct constants for each stage 3578 // idct constants for each stage
3578 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3579 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3579 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3580 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3580 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 3581 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3581 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 3582 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3582 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); 3583 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3583 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); 3584 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3584 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 3585 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3585 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 3586 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
3628 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 3629 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3629 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 3630 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3630 stp1_30, stp1_31; 3631 stp1_30, stp1_31;
3631 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 3632 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3632 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 3633 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3633 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 3634 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3634 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 3635 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3635 stp2_30, stp2_31; 3636 stp2_30, stp2_31;
3636 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3637 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3637 int i, j, i32; 3638 int i, j, i32;
3638 int zero_flag[2];
3639 3639
3640 for (i = 0; i < 4; i++) { 3640 for (i = 0; i < 4; i++) {
3641 i32 = (i << 5); 3641 i32 = (i << 5);
3642 // First 1-D idct 3642 // First 1-D idct
3643 // Load input data. 3643 // Load input data.
3644 LOAD_DQCOEFF(in[0], input); 3644 LOAD_DQCOEFF(in[0], input);
3645 LOAD_DQCOEFF(in[8], input); 3645 LOAD_DQCOEFF(in[8], input);
3646 LOAD_DQCOEFF(in[16], input); 3646 LOAD_DQCOEFF(in[16], input);
3647 LOAD_DQCOEFF(in[24], input); 3647 LOAD_DQCOEFF(in[24], input);
3648 LOAD_DQCOEFF(in[1], input); 3648 LOAD_DQCOEFF(in[1], input);
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
3703 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); 3703 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3704 3704
3705 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); 3705 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3706 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); 3706 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3707 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); 3707 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3708 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); 3708 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3709 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); 3709 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3710 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); 3710 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3711 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); 3711 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3712 3712
3713 zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]); 3713 if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
3714 zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
3715 zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
3716 zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
3717 zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
3718
3719 if (!zero_flag[0] && !zero_flag[1]) {
3720 col[i32 + 0] = _mm_setzero_si128(); 3714 col[i32 + 0] = _mm_setzero_si128();
3721 col[i32 + 1] = _mm_setzero_si128(); 3715 col[i32 + 1] = _mm_setzero_si128();
3722 col[i32 + 2] = _mm_setzero_si128(); 3716 col[i32 + 2] = _mm_setzero_si128();
3723 col[i32 + 3] = _mm_setzero_si128(); 3717 col[i32 + 3] = _mm_setzero_si128();
3724 col[i32 + 4] = _mm_setzero_si128(); 3718 col[i32 + 4] = _mm_setzero_si128();
3725 col[i32 + 5] = _mm_setzero_si128(); 3719 col[i32 + 5] = _mm_setzero_si128();
3726 col[i32 + 6] = _mm_setzero_si128(); 3720 col[i32 + 6] = _mm_setzero_si128();
3727 col[i32 + 7] = _mm_setzero_si128(); 3721 col[i32 + 7] = _mm_setzero_si128();
3728 col[i32 + 8] = _mm_setzero_si128(); 3722 col[i32 + 8] = _mm_setzero_si128();
3729 col[i32 + 9] = _mm_setzero_si128(); 3723 col[i32 + 9] = _mm_setzero_si128();
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after
3788 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); 3782 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3789 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); 3783 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3790 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); 3784 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3791 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); 3785 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3792 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); 3786 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3793 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); 3787 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3794 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); 3788 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3795 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); 3789 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3796 } 3790 }
3797 for (i = 0; i < 4; i++) { 3791 for (i = 0; i < 4; i++) {
3798 const __m128i zero = _mm_setzero_si128();
3799 // Second 1-D idct 3792 // Second 1-D idct
3800 j = i << 3; 3793 j = i << 3;
3801 3794
3802 // Transpose 32x8 block to 8x32 block 3795 // Transpose 32x8 block to 8x32 block
3803 array_transpose_8x8(col+j, in); 3796 array_transpose_8x8(col+j, in);
3804 array_transpose_8x8(col+j+32, in+8); 3797 array_transpose_8x8(col+j+32, in+8);
3805 array_transpose_8x8(col+j+64, in+16); 3798 array_transpose_8x8(col+j+64, in+16);
3806 array_transpose_8x8(col+j+96, in+24); 3799 array_transpose_8x8(col+j+96, in+24);
3807 3800
3808 IDCT32 3801 IDCT32
(...skipping 176 matching lines...) Expand 10 before | Expand all | Expand 10 after
3985 RECON_AND_STORE(dest, dc_value); 3978 RECON_AND_STORE(dest, dc_value);
3986 RECON_AND_STORE(dest, dc_value); 3979 RECON_AND_STORE(dest, dc_value);
3987 RECON_AND_STORE(dest, dc_value); 3980 RECON_AND_STORE(dest, dc_value);
3988 RECON_AND_STORE(dest, dc_value); 3981 RECON_AND_STORE(dest, dc_value);
3989 RECON_AND_STORE(dest, dc_value); 3982 RECON_AND_STORE(dest, dc_value);
3990 RECON_AND_STORE(dest, dc_value); 3983 RECON_AND_STORE(dest, dc_value);
3991 RECON_AND_STORE(dest, dc_value); 3984 RECON_AND_STORE(dest, dc_value);
3992 dest += 8 - (stride * 32); 3985 dest += 8 - (stride * 32);
3993 } 3986 }
3994 } 3987 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698