Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(153)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c

Issue 111463005: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <emmintrin.h> // SSE2 11 #include <emmintrin.h> // SSE2
12 #include "vp9/common/vp9_idct.h" // for cospi constants 12 #include "vp9/common/vp9_idct.h" // for cospi constants
13 #include "vpx_ports/mem.h" 13 #include "vpx_ports/mem.h"
14 14
15 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { 15 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
16 // The 2D transform is done with two passes which are actually pretty 16 // The 2D transform is done with two passes which are actually pretty
17 // similar. In the first one, we transform the columns and transpose 17 // similar. In the first one, we transform the columns and transpose
18 // the results. In the second one, we transform the rows. To achieve that, 18 // the results. In the second one, we transform the rows. To achieve that,
19 // as the first pass results are transposed, we transpose the columns (that 19 // as the first pass results are transposed, we transpose the columns (that
20 // is the transposed rows) and transpose the results (so that it goes back 20 // is the transposed rows) and transpose the results (so that it goes back
21 // in normal/row positions). 21 // in normal/row positions).
22 int pass; 22 int pass;
23 // Constants 23 // Constants
24 // When we use them, in one case, they are all the same. In all others 24 // When we use them, in one case, they are all the same. In all others
25 // it's a pair of them that we need to repeat four times. This is done 25 // it's a pair of them that we need to repeat four times. This is done
26 // by constructing the 32 bit constant corresponding to that pair. 26 // by constructing the 32 bit constant corresponding to that pair.
27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
29 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 29 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
30 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 30 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
31 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 31 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
32 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); 32 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
33 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); 33 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
34 const __m128i kOne = _mm_set1_epi16(1); 34 const __m128i kOne = _mm_set1_epi16(1);
35 __m128i in0, in1, in2, in3; 35 __m128i in0, in1;
36 // Load inputs. 36 // Load inputs.
37 { 37 {
38 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 38 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
39 in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 39 in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
40 in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); 40 (input + 1 * stride)));
41 in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); 41 in1 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
42 in1 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)
43 (input + 3 * stride)), in1);
44
42 // x = x << 4 45 // x = x << 4
43 in0 = _mm_slli_epi16(in0, 4); 46 in0 = _mm_slli_epi16(in0, 4);
44 in1 = _mm_slli_epi16(in1, 4); 47 in1 = _mm_slli_epi16(in1, 4);
45 in2 = _mm_slli_epi16(in2, 4);
46 in3 = _mm_slli_epi16(in3, 4);
47 // if (i == 0 && input[0]) input[0] += 1; 48 // if (i == 0 && input[0]) input[0] += 1;
48 { 49 {
49 // The mask will only contain whether the first value is zero, all 50 // The mask will only contain whether the first value is zero, all
50 // other comparisons will fail as something shifted by 4 (above << 4) 51 // other comparisons will fail as something shifted by 4 (above << 4)
51 // can never be equal to one. To increment in the non-zero case, we 52 // can never be equal to one. To increment in the non-zero case, we
52 // add the mask and one for the first element: 53 // add the mask and one for the first element:
53 // - if zero, mask = -1, v = v - 1 + 1 = v 54 // - if zero, mask = -1, v = v - 1 + 1 = v
54 // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 55 // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
55 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); 56 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
56 in0 = _mm_add_epi16(in0, mask); 57 in0 = _mm_add_epi16(in0, mask);
57 in0 = _mm_add_epi16(in0, k__nonzero_bias_b); 58 in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
58 } 59 }
59 } 60 }
60 // Do the two transform/transpose passes 61 // Do the two transform/transpose passes
61 for (pass = 0; pass < 2; ++pass) { 62 for (pass = 0; pass < 2; ++pass) {
62 // Transform 1/2: Add/subtract 63 // Transform 1/2: Add/subtract
63 const __m128i r0 = _mm_add_epi16(in0, in3); 64 const __m128i r0 = _mm_add_epi16(in0, in1);
64 const __m128i r1 = _mm_add_epi16(in1, in2); 65 const __m128i r1 = _mm_sub_epi16(in0, in1);
65 const __m128i r2 = _mm_sub_epi16(in1, in2); 66 const __m128i r2 = _mm_unpacklo_epi64(r0, r1);
66 const __m128i r3 = _mm_sub_epi16(in0, in3); 67 const __m128i r3 = _mm_unpackhi_epi64(r0, r1);
67 // Transform 1/2: Interleave to do the multiply by constants which gets us 68 // Transform 1/2: Interleave to do the multiply by constants which gets us
68 // into 32 bits. 69 // into 32 bits.
69 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 70 const __m128i t0 = _mm_unpacklo_epi16(r2, r3);
70 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 71 const __m128i t2 = _mm_unpackhi_epi16(r2, r3);
71 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 72 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
72 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 73 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
73 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 74 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p08_p24);
74 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); 75 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_p24_m08);
75 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 76 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
76 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 77 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
77 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 78 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
78 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 79 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
79 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 80 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
80 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 81 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
81 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 82 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
82 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 83 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
83 // Combine and transpose 84 // Combine and transpose
84 const __m128i res0 = _mm_packs_epi32(w0, w2); 85 const __m128i res0 = _mm_packs_epi32(w0, w2);
85 const __m128i res1 = _mm_packs_epi32(w4, w6); 86 const __m128i res1 = _mm_packs_epi32(w4, w6);
86 // 00 01 02 03 20 21 22 23 87 // 00 01 02 03 20 21 22 23
87 // 10 11 12 13 30 31 32 33 88 // 10 11 12 13 30 31 32 33
88 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); 89 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
89 const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); 90 const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
90 // 00 10 01 11 02 12 03 13 91 // 00 10 01 11 02 12 03 13
91 // 20 30 21 31 22 32 23 33 92 // 20 30 21 31 22 32 23 33
92 in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 93 in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
93 in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 94 in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
95 in1 = _mm_shuffle_epi32(in1, 0x4E);
94 // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 96 // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1
95 // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3 97 // 02 12 22 32 03 13 23 33 in1 contains 2 followed by 3
96 if (0 == pass) { 98 }
97 // Extract values in the high part for second pass as transform code 99 in1 = _mm_shuffle_epi32(in1, 0x4E);
98 // only uses the first four values. 100 // Post-condition output and store it (v + 1) >> 2, taking advantage
99 in1 = _mm_unpackhi_epi64(in0, in0); 101 // of the fact 1/3 are stored just after 0/2.
100 in3 = _mm_unpackhi_epi64(in2, in2); 102 {
101 } else { 103 __m128i out01 = _mm_add_epi16(in0, kOne);
102 // Post-condition output and store it (v + 1) >> 2, taking advantage 104 __m128i out23 = _mm_add_epi16(in1, kOne);
103 // of the fact 1/3 are stored just after 0/2. 105 out01 = _mm_srai_epi16(out01, 2);
104 __m128i out01 = _mm_add_epi16(in0, kOne); 106 out23 = _mm_srai_epi16(out23, 2);
105 __m128i out23 = _mm_add_epi16(in2, kOne); 107 _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
106 out01 = _mm_srai_epi16(out01, 2); 108 _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
107 out23 = _mm_srai_epi16(out23, 2);
108 _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
109 _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
110 }
111 } 109 }
112 } 110 }
113 111
114 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, 112 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
115 int stride) { 113 int stride) {
116 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); 114 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
117 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); 115 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
118 __m128i mask; 116 __m128i mask;
119 117
120 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 118 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after
199 void fadst4_1d_sse2(__m128i *in) { 197 void fadst4_1d_sse2(__m128i *in) {
200 const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); 198 const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
201 const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); 199 const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
202 const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); 200 const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
203 const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); 201 const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
204 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); 202 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
205 const __m128i kZero = _mm_set1_epi16(0); 203 const __m128i kZero = _mm_set1_epi16(0);
206 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 204 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
207 __m128i u[8], v[8]; 205 __m128i u[8], v[8];
208 __m128i in7 = _mm_add_epi16(in[0], in[1]); 206 __m128i in7 = _mm_add_epi16(in[0], in[1]);
209 in7 = _mm_sub_epi16(in7, in[3]);
210 207
211 u[0] = _mm_unpacklo_epi16(in[0], in[1]); 208 u[0] = _mm_unpacklo_epi16(in[0], in[1]);
212 u[1] = _mm_unpacklo_epi16(in[2], in[3]); 209 u[1] = _mm_unpacklo_epi16(in[2], in[3]);
213 u[2] = _mm_unpacklo_epi16(in7, kZero); 210 u[2] = _mm_unpacklo_epi16(in7, kZero);
214 u[3] = _mm_unpacklo_epi16(in[2], kZero); 211 u[3] = _mm_unpacklo_epi16(in[2], kZero);
212 u[4] = _mm_unpacklo_epi16(in[3], kZero);
215 213
216 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 214 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
217 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 215 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
218 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 216 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
219 v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 217 v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
220 v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 218 v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
221 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 219 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
220 v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
222 221
223 u[0] = _mm_add_epi32(v[0], v[1]); 222 u[0] = _mm_add_epi32(v[0], v[1]);
224 u[1] = v[2]; 223 u[1] = _mm_sub_epi32(v[2], v[6]);
225 u[2] = _mm_add_epi32(v[3], v[4]); 224 u[2] = _mm_add_epi32(v[3], v[4]);
226 u[3] = _mm_sub_epi32(u[2], u[0]); 225 u[3] = _mm_sub_epi32(u[2], u[0]);
227 u[4] = _mm_slli_epi32(v[5], 2); 226 u[4] = _mm_slli_epi32(v[5], 2);
228 u[5] = _mm_sub_epi32(u[4], v[5]); 227 u[5] = _mm_sub_epi32(u[4], v[5]);
229 u[6] = _mm_add_epi32(u[3], u[5]); 228 u[6] = _mm_add_epi32(u[3], u[5]);
230 229
231 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 230 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
232 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 231 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
233 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 232 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
234 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 233 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
(...skipping 2334 matching lines...) Expand 10 before | Expand all | Expand 10 after
2569 #define FDCT32x32_HIGH_PRECISION 0 2568 #define FDCT32x32_HIGH_PRECISION 0
2570 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" 2569 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
2571 #undef FDCT32x32_2D 2570 #undef FDCT32x32_2D
2572 #undef FDCT32x32_HIGH_PRECISION 2571 #undef FDCT32x32_HIGH_PRECISION
2573 2572
2574 #define FDCT32x32_2D vp9_fdct32x32_sse2 2573 #define FDCT32x32_2D vp9_fdct32x32_sse2
2575 #define FDCT32x32_HIGH_PRECISION 1 2574 #define FDCT32x32_HIGH_PRECISION 1
2576 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT 2575 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
2577 #undef FDCT32x32_2D 2576 #undef FDCT32x32_2D
2578 #undef FDCT32x32_HIGH_PRECISION 2577 #undef FDCT32x32_HIGH_PRECISION
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c ('k') | source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698