OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 3555 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3566 RECON_AND_STORE(dest, in[31]); | 3566 RECON_AND_STORE(dest, in[31]); |
3567 | 3567 |
3568 dest += 8 - (stride * 32); | 3568 dest += 8 - (stride * 32); |
3569 } | 3569 } |
3570 } | 3570 } |
3571 | 3571 |
3572 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, | 3572 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, |
3573 int stride) { | 3573 int stride) { |
3574 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 3574 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
3575 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 3575 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
| 3576 const __m128i zero = _mm_setzero_si128(); |
3576 | 3577 |
3577 // idct constants for each stage | 3578 // idct constants for each stage |
3578 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 3579 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
3579 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); | 3580 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
3580 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); | 3581 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
3581 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); | 3582 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); |
3582 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); | 3583 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
3583 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); | 3584 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); |
3584 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); | 3585 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); |
3585 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); | 3586 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3628 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, | 3629 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, |
3629 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, | 3630 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, |
3630 stp1_30, stp1_31; | 3631 stp1_30, stp1_31; |
3631 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, | 3632 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
3632 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, | 3633 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, |
3633 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, | 3634 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, |
3634 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, | 3635 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, |
3635 stp2_30, stp2_31; | 3636 stp2_30, stp2_31; |
3636 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 3637 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
3637 int i, j, i32; | 3638 int i, j, i32; |
3638 int zero_flag[2]; | |
3639 | 3639 |
3640 for (i = 0; i < 4; i++) { | 3640 for (i = 0; i < 4; i++) { |
3641 i32 = (i << 5); | 3641 i32 = (i << 5); |
3642 // First 1-D idct | 3642 // First 1-D idct |
3643 // Load input data. | 3643 // Load input data. |
3644 LOAD_DQCOEFF(in[0], input); | 3644 LOAD_DQCOEFF(in[0], input); |
3645 LOAD_DQCOEFF(in[8], input); | 3645 LOAD_DQCOEFF(in[8], input); |
3646 LOAD_DQCOEFF(in[16], input); | 3646 LOAD_DQCOEFF(in[16], input); |
3647 LOAD_DQCOEFF(in[24], input); | 3647 LOAD_DQCOEFF(in[24], input); |
3648 LOAD_DQCOEFF(in[1], input); | 3648 LOAD_DQCOEFF(in[1], input); |
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3703 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); | 3703 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); |
3704 | 3704 |
3705 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); | 3705 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
3706 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); | 3706 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
3707 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); | 3707 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
3708 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); | 3708 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
3709 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); | 3709 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
3710 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); | 3710 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
3711 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); | 3711 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
3712 | 3712 |
3713 zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]); | 3713 if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { |
3714 zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]); | |
3715 zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32); | |
3716 zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]); | |
3717 zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]); | |
3718 | |
3719 if (!zero_flag[0] && !zero_flag[1]) { | |
3720 col[i32 + 0] = _mm_setzero_si128(); | 3714 col[i32 + 0] = _mm_setzero_si128(); |
3721 col[i32 + 1] = _mm_setzero_si128(); | 3715 col[i32 + 1] = _mm_setzero_si128(); |
3722 col[i32 + 2] = _mm_setzero_si128(); | 3716 col[i32 + 2] = _mm_setzero_si128(); |
3723 col[i32 + 3] = _mm_setzero_si128(); | 3717 col[i32 + 3] = _mm_setzero_si128(); |
3724 col[i32 + 4] = _mm_setzero_si128(); | 3718 col[i32 + 4] = _mm_setzero_si128(); |
3725 col[i32 + 5] = _mm_setzero_si128(); | 3719 col[i32 + 5] = _mm_setzero_si128(); |
3726 col[i32 + 6] = _mm_setzero_si128(); | 3720 col[i32 + 6] = _mm_setzero_si128(); |
3727 col[i32 + 7] = _mm_setzero_si128(); | 3721 col[i32 + 7] = _mm_setzero_si128(); |
3728 col[i32 + 8] = _mm_setzero_si128(); | 3722 col[i32 + 8] = _mm_setzero_si128(); |
3729 col[i32 + 9] = _mm_setzero_si128(); | 3723 col[i32 + 9] = _mm_setzero_si128(); |
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3788 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); | 3782 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); |
3789 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); | 3783 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); |
3790 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); | 3784 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); |
3791 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); | 3785 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); |
3792 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); | 3786 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); |
3793 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); | 3787 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); |
3794 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); | 3788 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); |
3795 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); | 3789 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); |
3796 } | 3790 } |
3797 for (i = 0; i < 4; i++) { | 3791 for (i = 0; i < 4; i++) { |
3798 const __m128i zero = _mm_setzero_si128(); | |
3799 // Second 1-D idct | 3792 // Second 1-D idct |
3800 j = i << 3; | 3793 j = i << 3; |
3801 | 3794 |
3802 // Transpose 32x8 block to 8x32 block | 3795 // Transpose 32x8 block to 8x32 block |
3803 array_transpose_8x8(col+j, in); | 3796 array_transpose_8x8(col+j, in); |
3804 array_transpose_8x8(col+j+32, in+8); | 3797 array_transpose_8x8(col+j+32, in+8); |
3805 array_transpose_8x8(col+j+64, in+16); | 3798 array_transpose_8x8(col+j+64, in+16); |
3806 array_transpose_8x8(col+j+96, in+24); | 3799 array_transpose_8x8(col+j+96, in+24); |
3807 | 3800 |
3808 IDCT32 | 3801 IDCT32 |
(...skipping 176 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3985 RECON_AND_STORE(dest, dc_value); | 3978 RECON_AND_STORE(dest, dc_value); |
3986 RECON_AND_STORE(dest, dc_value); | 3979 RECON_AND_STORE(dest, dc_value); |
3987 RECON_AND_STORE(dest, dc_value); | 3980 RECON_AND_STORE(dest, dc_value); |
3988 RECON_AND_STORE(dest, dc_value); | 3981 RECON_AND_STORE(dest, dc_value); |
3989 RECON_AND_STORE(dest, dc_value); | 3982 RECON_AND_STORE(dest, dc_value); |
3990 RECON_AND_STORE(dest, dc_value); | 3983 RECON_AND_STORE(dest, dc_value); |
3991 RECON_AND_STORE(dest, dc_value); | 3984 RECON_AND_STORE(dest, dc_value); |
3992 dest += 8 - (stride * 32); | 3985 dest += 8 - (stride * 32); |
3993 } | 3986 } |
3994 } | 3987 } |
OLD | NEW |