| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <assert.h> | 11 #include <assert.h> |
| 12 #include <emmintrin.h> // SSE2 | 12 #include <emmintrin.h> // SSE2 |
| 13 |
| 14 #include "./vp9_rtcd.h" |
| 13 #include "vp9/common/vp9_idct.h" // for cospi constants | 15 #include "vp9/common/vp9_idct.h" // for cospi constants |
| 14 #include "vp9/encoder/vp9_dct.h" | 16 #include "vp9/encoder/vp9_dct.h" |
| 15 #include "vp9/encoder/x86/vp9_dct_sse2.h" | 17 #include "vp9/encoder/x86/vp9_dct_sse2.h" |
| 16 #include "vpx_ports/mem.h" | 18 #include "vpx_ports/mem.h" |
| 17 | 19 |
| 18 void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { | 20 void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { |
| 19 __m128i in0, in1; | 21 __m128i in0, in1; |
| 20 __m128i tmp; | 22 __m128i tmp; |
| 21 const __m128i zero = _mm_setzero_si128(); | 23 const __m128i zero = _mm_setzero_si128(); |
| 22 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); | 24 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); |
| (...skipping 66 matching lines...) |
| 89 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); | 91 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); |
| 90 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); | 92 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); |
| 91 | 93 |
| 92 // 00 10 20 30 01 11 21 31 | 94 // 00 10 20 30 01 11 21 31 |
| 93 // 02 12 22 32 03 13 23 33 | 95 // 02 12 22 32 03 13 23 33 |
| 94 // only use the first 4 16-bit integers | 96 // only use the first 4 16-bit integers |
| 95 res[1] = _mm_unpackhi_epi64(res[0], res[0]); | 97 res[1] = _mm_unpackhi_epi64(res[0], res[0]); |
| 96 res[3] = _mm_unpackhi_epi64(res[2], res[2]); | 98 res[3] = _mm_unpackhi_epi64(res[2], res[2]); |
| 97 } | 99 } |
| 98 | 100 |
| 99 void fdct4_sse2(__m128i *in) { | 101 static void fdct4_sse2(__m128i *in) { |
| 100 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); | 102 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
| 101 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 103 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 102 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); | 104 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
| 103 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 105 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
| 104 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 106 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 105 | 107 |
| 106 __m128i u[4], v[4]; | 108 __m128i u[4], v[4]; |
| 107 u[0]=_mm_unpacklo_epi16(in[0], in[1]); | 109 u[0]=_mm_unpacklo_epi16(in[0], in[1]); |
| 108 u[1]=_mm_unpacklo_epi16(in[3], in[2]); | 110 u[1]=_mm_unpacklo_epi16(in[3], in[2]); |
| 109 | 111 |
| (...skipping 12 matching lines...) |
| 122 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); | 124 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
| 123 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); | 125 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
| 124 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); | 126 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
| 125 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); | 127 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
| 126 | 128 |
| 127 in[0] = _mm_packs_epi32(u[0], u[1]); | 129 in[0] = _mm_packs_epi32(u[0], u[1]); |
| 128 in[1] = _mm_packs_epi32(u[2], u[3]); | 130 in[1] = _mm_packs_epi32(u[2], u[3]); |
| 129 transpose_4x4(in); | 131 transpose_4x4(in); |
| 130 } | 132 } |
| 131 | 133 |
| 132 void fadst4_sse2(__m128i *in) { | 134 static void fadst4_sse2(__m128i *in) { |
| 133 const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); | 135 const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); |
| 134 const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); | 136 const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); |
| 135 const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); | 137 const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); |
| 136 const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); | 138 const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); |
| 137 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); | 139 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); |
| 138 const __m128i kZero = _mm_set1_epi16(0); | 140 const __m128i kZero = _mm_set1_epi16(0); |
| 139 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 141 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 140 __m128i u[8], v[8]; | 142 __m128i u[8], v[8]; |
| 141 __m128i in7 = _mm_add_epi16(in[0], in[1]); | 143 __m128i in7 = _mm_add_epi16(in[0], in[1]); |
| 142 | 144 |
| (...skipping 681 matching lines...) |
| 824 // 00 10 20 30 40 50 60 70 | 826 // 00 10 20 30 40 50 60 70 |
| 825 // 01 11 21 31 41 51 61 71 | 827 // 01 11 21 31 41 51 61 71 |
| 826 // 02 12 22 32 42 52 62 72 | 828 // 02 12 22 32 42 52 62 72 |
| 827 // 03 13 23 33 43 53 63 73 | 829 // 03 13 23 33 43 53 63 73 |
| 828 // 04 14 24 34 44 54 64 74 | 830 // 04 14 24 34 44 54 64 74 |
| 829 // 05 15 25 35 45 55 65 75 | 831 // 05 15 25 35 45 55 65 75 |
| 830 // 06 16 26 36 46 56 66 76 | 832 // 06 16 26 36 46 56 66 76 |
| 831 // 07 17 27 37 47 57 67 77 | 833 // 07 17 27 37 47 57 67 77 |
| 832 } | 834 } |
| 833 | 835 |
| 834 void fdct8_sse2(__m128i *in) { | 836 static void fdct8_sse2(__m128i *in) { |
| 835 // constants | 837 // constants |
| 836 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); | 838 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
| 837 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 839 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 838 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 840 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
| 839 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 841 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
| 840 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | 842 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
| 841 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | 843 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
| 842 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); | 844 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); |
| 843 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 845 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
| 844 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 846 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| (...skipping 119 matching lines...) |
| 964 | 966 |
| 965 in[1] = _mm_packs_epi32(v0, v1); | 967 in[1] = _mm_packs_epi32(v0, v1); |
| 966 in[3] = _mm_packs_epi32(v4, v5); | 968 in[3] = _mm_packs_epi32(v4, v5); |
| 967 in[5] = _mm_packs_epi32(v2, v3); | 969 in[5] = _mm_packs_epi32(v2, v3); |
| 968 in[7] = _mm_packs_epi32(v6, v7); | 970 in[7] = _mm_packs_epi32(v6, v7); |
| 969 | 971 |
| 970 // transpose | 972 // transpose |
| 971 array_transpose_8x8(in, in); | 973 array_transpose_8x8(in, in); |
| 972 } | 974 } |
| 973 | 975 |
| 974 void fadst8_sse2(__m128i *in) { | 976 static void fadst8_sse2(__m128i *in) { |
| 975 // Constants | 977 // Constants |
| 976 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); | 978 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); |
| 977 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 979 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
| 978 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); | 980 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); |
| 979 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); | 981 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
| 980 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); | 982 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); |
| 981 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); | 983 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
| 982 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); | 984 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); |
| 983 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); | 985 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
| 984 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); | 986 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
| (...skipping 361 matching lines...) |
| 1346 } | 1348 } |
| 1347 | 1349 |
| 1348 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { | 1350 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { |
| 1349 // perform rounding operations | 1351 // perform rounding operations |
| 1350 right_shift_8x8(res0, 2); | 1352 right_shift_8x8(res0, 2); |
| 1351 right_shift_8x8(res0 + 8, 2); | 1353 right_shift_8x8(res0 + 8, 2); |
| 1352 right_shift_8x8(res1, 2); | 1354 right_shift_8x8(res1, 2); |
| 1353 right_shift_8x8(res1 + 8, 2); | 1355 right_shift_8x8(res1 + 8, 2); |
| 1354 } | 1356 } |
| 1355 | 1357 |
| 1356 void fdct16_8col(__m128i *in) { | 1358 static void fdct16_8col(__m128i *in) { |
| 1357 // perform 16x16 1-D DCT for 8 columns | 1359 // perform 16x16 1-D DCT for 8 columns |
| 1358 __m128i i[8], s[8], p[8], t[8], u[16], v[16]; | 1360 __m128i i[8], s[8], p[8], t[8], u[16], v[16]; |
| 1359 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); | 1361 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
| 1360 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 1362 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 1361 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 1363 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
| 1362 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 1364 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
| 1363 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); | 1365 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); |
| 1364 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 1366 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
| 1365 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | 1367 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
| 1366 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | 1368 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
| (...skipping 301 matching lines...) |
| 1668 in[1] = _mm_packs_epi32(v[0], v[1]); | 1670 in[1] = _mm_packs_epi32(v[0], v[1]); |
| 1669 in[9] = _mm_packs_epi32(v[2], v[3]); | 1671 in[9] = _mm_packs_epi32(v[2], v[3]); |
| 1670 in[5] = _mm_packs_epi32(v[4], v[5]); | 1672 in[5] = _mm_packs_epi32(v[4], v[5]); |
| 1671 in[13] = _mm_packs_epi32(v[6], v[7]); | 1673 in[13] = _mm_packs_epi32(v[6], v[7]); |
| 1672 in[3] = _mm_packs_epi32(v[8], v[9]); | 1674 in[3] = _mm_packs_epi32(v[8], v[9]); |
| 1673 in[11] = _mm_packs_epi32(v[10], v[11]); | 1675 in[11] = _mm_packs_epi32(v[10], v[11]); |
| 1674 in[7] = _mm_packs_epi32(v[12], v[13]); | 1676 in[7] = _mm_packs_epi32(v[12], v[13]); |
| 1675 in[15] = _mm_packs_epi32(v[14], v[15]); | 1677 in[15] = _mm_packs_epi32(v[14], v[15]); |
| 1676 } | 1678 } |
| 1677 | 1679 |
| 1678 void fadst16_8col(__m128i *in) { | 1680 static void fadst16_8col(__m128i *in) { |
| 1679 // perform 16x16 1-D ADST for 8 columns | 1681 // perform 16x16 1-D ADST for 8 columns |
| 1680 __m128i s[16], x[16], u[32], v[32]; | 1682 __m128i s[16], x[16], u[32], v[32]; |
| 1681 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); | 1683 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); |
| 1682 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 1684 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
| 1683 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); | 1685 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); |
| 1684 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); | 1686 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
| 1685 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); | 1687 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); |
| 1686 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); | 1688 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
| 1687 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); | 1689 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); |
| 1688 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); | 1690 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); |
| (...skipping 449 matching lines...) |
| 2138 in[8] = _mm_packs_epi32(v[2], v[3]); | 2140 in[8] = _mm_packs_epi32(v[2], v[3]); |
| 2139 in[9] = _mm_packs_epi32(v[10], v[11]); | 2141 in[9] = _mm_packs_epi32(v[10], v[11]); |
| 2140 in[10] = _mm_packs_epi32(v[14], v[15]); | 2142 in[10] = _mm_packs_epi32(v[14], v[15]); |
| 2141 in[11] = _mm_packs_epi32(v[6], v[7]); | 2143 in[11] = _mm_packs_epi32(v[6], v[7]); |
| 2142 in[12] = s[5]; | 2144 in[12] = s[5]; |
| 2143 in[13] = _mm_sub_epi16(kZero, s[13]); | 2145 in[13] = _mm_sub_epi16(kZero, s[13]); |
| 2144 in[14] = s[9]; | 2146 in[14] = s[9]; |
| 2145 in[15] = _mm_sub_epi16(kZero, s[1]); | 2147 in[15] = _mm_sub_epi16(kZero, s[1]); |
| 2146 } | 2148 } |
| 2147 | 2149 |
| 2148 void fdct16_sse2(__m128i *in0, __m128i *in1) { | 2150 static void fdct16_sse2(__m128i *in0, __m128i *in1) { |
| 2149 fdct16_8col(in0); | 2151 fdct16_8col(in0); |
| 2150 fdct16_8col(in1); | 2152 fdct16_8col(in1); |
| 2151 array_transpose_16x16(in0, in1); | 2153 array_transpose_16x16(in0, in1); |
| 2152 } | 2154 } |
| 2153 | 2155 |
| 2154 void fadst16_sse2(__m128i *in0, __m128i *in1) { | 2156 static void fadst16_sse2(__m128i *in0, __m128i *in1) { |
| 2155 fadst16_8col(in0); | 2157 fadst16_8col(in0); |
| 2156 fadst16_8col(in1); | 2158 fadst16_8col(in1); |
| 2157 array_transpose_16x16(in0, in1); | 2159 array_transpose_16x16(in0, in1); |
| 2158 } | 2160 } |
| 2159 | 2161 |
| 2160 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, | 2162 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, |
| 2161 int stride, int tx_type) { | 2163 int stride, int tx_type) { |
| 2162 __m128i in0[16], in1[16]; | 2164 __m128i in0[16], in1[16]; |
| 2163 | 2165 |
| 2164 switch (tx_type) { | 2166 switch (tx_type) { |
| (...skipping 162 matching lines...) |
| 2327 for (i = 0; i < 8; ++i) { | 2329 for (i = 0; i < 8; ++i) { |
| 2328 for (j = 0; j < 8; ++j) | 2330 for (j = 0; j < 8; ++j) |
| 2329 temp_in[j] = out[j + i * 8]; | 2331 temp_in[j] = out[j + i * 8]; |
| 2330 ht.rows(temp_in, temp_out); | 2332 ht.rows(temp_in, temp_out); |
| 2331 for (j = 0; j < 8; ++j) | 2333 for (j = 0; j < 8; ++j) |
| 2332 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; | 2334 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; |
| 2333 } | 2335 } |
| 2334 } | 2336 } |
| 2335 } | 2337 } |
| 2336 | 2338 |
| 2337 void vp9_highbd_fht16x16_sse2(int16_t *input, tran_low_t *output, | 2339 void vp9_highbd_fht16x16_sse2(const int16_t *input, tran_low_t *output, |
| 2338 int stride, int tx_type) { | 2340 int stride, int tx_type) { |
| 2339 if (tx_type == DCT_DCT) { | 2341 if (tx_type == DCT_DCT) { |
| 2340 vp9_highbd_fdct16x16_sse2(input, output, stride); | 2342 vp9_highbd_fdct16x16_sse2(input, output, stride); |
| 2341 } else { | 2343 } else { |
| 2342 tran_low_t out[256]; | 2344 tran_low_t out[256]; |
| 2343 tran_low_t *outptr = &out[0]; | 2345 tran_low_t *outptr = &out[0]; |
| 2344 int i, j; | 2346 int i, j; |
| 2345 tran_low_t temp_in[16], temp_out[16]; | 2347 tran_low_t temp_in[16], temp_out[16]; |
| 2346 const transform_2d ht = FHT_16[tx_type]; | 2348 const transform_2d ht = FHT_16[tx_type]; |
| 2347 | 2349 |
| (...skipping 13 matching lines...) |
| 2361 ht.rows(temp_in, temp_out); | 2363 ht.rows(temp_in, temp_out); |
| 2362 for (j = 0; j < 16; ++j) | 2364 for (j = 0; j < 16; ++j) |
| 2363 output[j + i * 16] = temp_out[j]; | 2365 output[j + i * 16] = temp_out[j]; |
| 2364 } | 2366 } |
| 2365 } | 2367 } |
| 2366 } | 2368 } |
| 2367 #endif // CONFIG_VP9_HIGHBITDEPTH | 2369 #endif // CONFIG_VP9_HIGHBITDEPTH |
| 2368 | 2370 |
| 2369 /* | 2371 /* |
| 2370 * The DCTnxn functions are defined using the macros below. The main code for | 2372 * The DCTnxn functions are defined using the macros below. The main code for |
| 2371 * them is in separate files (vp9/encoder/x86/vp9_dct_impl_sse2.c & | 2373 * them is in separate files (vp9/encoder/x86/vp9_dct_sse2_impl.h & |
| 2372 * vp9/encoder/x86/vp9_dct32x32_sse2.c) which are used by both the 8 bit code | 2374 * vp9/encoder/x86/vp9_dct32x32_sse2_impl.h) which are used by both the 8 bit code |
| 2373 * and the high bit depth code. | 2375 * and the high bit depth code. |
| 2374 */ | 2376 */ |
| 2375 | 2377 |
| 2376 #define DCT_HIGH_BIT_DEPTH 0 | 2378 #define DCT_HIGH_BIT_DEPTH 0 |
| 2377 | 2379 |
| 2378 #define FDCT4x4_2D vp9_fdct4x4_sse2 | 2380 #define FDCT4x4_2D vp9_fdct4x4_sse2 |
| 2379 #define FDCT8x8_2D vp9_fdct8x8_sse2 | 2381 #define FDCT8x8_2D vp9_fdct8x8_sse2 |
| 2380 #define FDCT16x16_2D vp9_fdct16x16_sse2 | 2382 #define FDCT16x16_2D vp9_fdct16x16_sse2 |
| 2381 #include "vp9/encoder/x86/vp9_dct_impl_sse2.c" | 2383 #include "vp9/encoder/x86/vp9_dct_sse2_impl.h" |
| 2382 #undef FDCT4x4_2D | 2384 #undef FDCT4x4_2D |
| 2383 #undef FDCT8x8_2D | 2385 #undef FDCT8x8_2D |
| 2384 #undef FDCT16x16_2D | 2386 #undef FDCT16x16_2D |
| 2385 | 2387 |
| 2386 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 | 2388 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 |
| 2387 #define FDCT32x32_HIGH_PRECISION 0 | 2389 #define FDCT32x32_HIGH_PRECISION 0 |
| 2388 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" | 2390 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" |
| 2389 #undef FDCT32x32_2D | 2391 #undef FDCT32x32_2D |
| 2390 #undef FDCT32x32_HIGH_PRECISION | 2392 #undef FDCT32x32_HIGH_PRECISION |
| 2391 | 2393 |
| 2392 #define FDCT32x32_2D vp9_fdct32x32_sse2 | 2394 #define FDCT32x32_2D vp9_fdct32x32_sse2 |
| 2393 #define FDCT32x32_HIGH_PRECISION 1 | 2395 #define FDCT32x32_HIGH_PRECISION 1 |
| 2394 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2396 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT |
| 2395 #undef FDCT32x32_2D | 2397 #undef FDCT32x32_2D |
| 2396 #undef FDCT32x32_HIGH_PRECISION | 2398 #undef FDCT32x32_HIGH_PRECISION |
| 2397 | 2399 |
| 2398 #undef DCT_HIGH_BIT_DEPTH | 2400 #undef DCT_HIGH_BIT_DEPTH |
| 2399 | 2401 |
| 2400 | 2402 |
| 2401 #if CONFIG_VP9_HIGHBITDEPTH | 2403 #if CONFIG_VP9_HIGHBITDEPTH |
| 2402 | 2404 |
| 2403 #define DCT_HIGH_BIT_DEPTH 1 | 2405 #define DCT_HIGH_BIT_DEPTH 1 |
| 2404 | 2406 |
| 2405 #define FDCT4x4_2D vp9_highbd_fdct4x4_sse2 | 2407 #define FDCT4x4_2D vp9_highbd_fdct4x4_sse2 |
| 2406 #define FDCT8x8_2D vp9_highbd_fdct8x8_sse2 | 2408 #define FDCT8x8_2D vp9_highbd_fdct8x8_sse2 |
| 2407 #define FDCT16x16_2D vp9_highbd_fdct16x16_sse2 | 2409 #define FDCT16x16_2D vp9_highbd_fdct16x16_sse2 |
| 2408 #include "vp9/encoder/x86/vp9_dct_impl_sse2.c" // NOLINT | 2410 #include "vp9/encoder/x86/vp9_dct_sse2_impl.h" // NOLINT |
| 2409 #undef FDCT4x4_2D | 2411 #undef FDCT4x4_2D |
| 2410 #undef FDCT8x8_2D | 2412 #undef FDCT8x8_2D |
| 2411 #undef FDCT16x16_2D | 2413 #undef FDCT16x16_2D |
| 2412 | 2414 |
| 2413 #define FDCT32x32_2D vp9_highbd_fdct32x32_rd_sse2 | 2415 #define FDCT32x32_2D vp9_highbd_fdct32x32_rd_sse2 |
| 2414 #define FDCT32x32_HIGH_PRECISION 0 | 2416 #define FDCT32x32_HIGH_PRECISION 0 |
| 2415 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2417 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT |
| 2416 #undef FDCT32x32_2D | 2418 #undef FDCT32x32_2D |
| 2417 #undef FDCT32x32_HIGH_PRECISION | 2419 #undef FDCT32x32_HIGH_PRECISION |
| 2418 | 2420 |
| 2419 #define FDCT32x32_2D vp9_highbd_fdct32x32_sse2 | 2421 #define FDCT32x32_2D vp9_highbd_fdct32x32_sse2 |
| 2420 #define FDCT32x32_HIGH_PRECISION 1 | 2422 #define FDCT32x32_HIGH_PRECISION 1 |
| 2421 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2423 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT |
| 2422 #undef FDCT32x32_2D | 2424 #undef FDCT32x32_2D |
| 2423 #undef FDCT32x32_HIGH_PRECISION | 2425 #undef FDCT32x32_HIGH_PRECISION |
| 2424 | 2426 |
| 2425 #undef DCT_HIGH_BIT_DEPTH | 2427 #undef DCT_HIGH_BIT_DEPTH |
| 2426 | 2428 |
| 2427 #endif // CONFIG_VP9_HIGHBITDEPTH | 2429 #endif // CONFIG_VP9_HIGHBITDEPTH |
| OLD | NEW |