Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(586)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c

Issue 54923004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <emmintrin.h> // SSE2 11 #include <emmintrin.h> // SSE2
12 #include "vp9/common/vp9_idct.h" // for cospi constants 12 #include "vp9/common/vp9_idct.h" // for cospi constants
13 #include "vpx_ports/mem.h" 13 #include "vpx_ports/mem.h"
14 14
15 void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { 15 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
16 // The 2D transform is done with two passes which are actually pretty 16 // The 2D transform is done with two passes which are actually pretty
17 // similar. In the first one, we transform the columns and transpose 17 // similar. In the first one, we transform the columns and transpose
18 // the results. In the second one, we transform the rows. To achieve that, 18 // the results. In the second one, we transform the rows. To achieve that,
19 // as the first pass results are transposed, we tranpose the columns (that 19 // as the first pass results are transposed, we tranpose the columns (that
20 // is the transposed rows) and transpose the results (so that it goes back 20 // is the transposed rows) and transpose the results (so that it goes back
21 // in normal/row positions). 21 // in normal/row positions).
22 const int stride = pitch >> 1;
23 int pass; 22 int pass;
24 // Constants 23 // Constants
25 // When we use them, in one case, they are all the same. In all others 24 // When we use them, in one case, they are all the same. In all others
26 // it's a pair of them that we need to repeat four times. This is done 25 // it's a pair of them that we need to repeat four times. This is done
27 // by constructing the 32 bit constant corresponding to that pair. 26 // by constructing the 32 bit constant corresponding to that pair.
28 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
29 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
30 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 29 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
31 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 30 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
32 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 31 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after
105 __m128i out01 = _mm_add_epi16(in0, kOne); 104 __m128i out01 = _mm_add_epi16(in0, kOne);
106 __m128i out23 = _mm_add_epi16(in2, kOne); 105 __m128i out23 = _mm_add_epi16(in2, kOne);
107 out01 = _mm_srai_epi16(out01, 2); 106 out01 = _mm_srai_epi16(out01, 2);
108 out23 = _mm_srai_epi16(out23, 2); 107 out23 = _mm_srai_epi16(out23, 2);
109 _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); 108 _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
110 _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); 109 _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
111 } 110 }
112 } 111 }
113 } 112 }
114 113
115 void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) { 114 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
116 vp9_short_fdct4x4_sse2(input, output, pitch); 115 int stride) {
117 vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);
118 }
119
120 static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
121 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); 116 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
122 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); 117 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
123 __m128i mask; 118 __m128i mask;
124 119
125 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 120 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
126 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 121 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
127 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); 122 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
128 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); 123 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
129 124
130 in[0] = _mm_slli_epi16(in[0], 4); 125 in[0] = _mm_slli_epi16(in[0], 4);
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
164 // 00 10 20 30 01 11 21 31 159 // 00 10 20 30 01 11 21 31
165 // 02 12 22 32 03 13 23 33 160 // 02 12 22 32 03 13 23 33
166 // only use the first 4 16-bit integers 161 // only use the first 4 16-bit integers
167 res[1] = _mm_unpackhi_epi64(res[0], res[0]); 162 res[1] = _mm_unpackhi_epi64(res[0], res[0]);
168 res[3] = _mm_unpackhi_epi64(res[2], res[2]); 163 res[3] = _mm_unpackhi_epi64(res[2], res[2]);
169 } 164 }
170 165
171 void fdct4_1d_sse2(__m128i *in) { 166 void fdct4_1d_sse2(__m128i *in) {
172 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 167 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
173 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 168 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
174 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 169 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
175 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 170 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
176 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 171 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
177 172
178 __m128i u[4], v[4]; 173 __m128i u[4], v[4];
179 u[0] = _mm_add_epi16(in[0], in[3]); 174 u[0]=_mm_unpacklo_epi16(in[0], in[1]);
180 u[1] = _mm_add_epi16(in[1], in[2]); 175 u[1]=_mm_unpacklo_epi16(in[3], in[2]);
181 u[2] = _mm_sub_epi16(in[1], in[2]);
182 u[3] = _mm_sub_epi16(in[0], in[3]);
183 176
184 v[0] = _mm_unpacklo_epi16(u[0], u[1]); 177 v[0] = _mm_add_epi16(u[0], u[1]);
185 v[1] = _mm_unpacklo_epi16(u[2], u[3]); 178 v[1] = _mm_sub_epi16(u[0], u[1]);
179
186 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 180 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
187 u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 181 u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
188 u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08); // 1 182 u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
189 u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3 183 u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
190 184
191 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 185 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
192 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 186 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
193 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 187 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
194 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 188 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
195 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 189 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
196 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 190 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
197 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 191 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
198 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 192 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
199 193
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
242 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 236 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
243 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 237 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
244 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 238 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
245 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 239 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
246 240
247 in[0] = _mm_packs_epi32(u[0], u[2]); 241 in[0] = _mm_packs_epi32(u[0], u[2]);
248 in[1] = _mm_packs_epi32(u[1], u[3]); 242 in[1] = _mm_packs_epi32(u[1], u[3]);
249 transpose_4x4(in); 243 transpose_4x4(in);
250 } 244 }
251 245
252 void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output, 246 void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
253 int stride, int tx_type) { 247 int stride, int tx_type) {
254 __m128i in[4]; 248 __m128i in[4];
255 load_buffer_4x4(input, in, stride); 249 load_buffer_4x4(input, in, stride);
256 switch (tx_type) { 250 switch (tx_type) {
257 case 0: // DCT_DCT 251 case 0: // DCT_DCT
258 fdct4_1d_sse2(in); 252 fdct4_1d_sse2(in);
259 fdct4_1d_sse2(in); 253 fdct4_1d_sse2(in);
260 break; 254 break;
261 case 1: // ADST_DCT 255 case 1: // ADST_DCT
262 fadst4_1d_sse2(in); 256 fadst4_1d_sse2(in);
263 fdct4_1d_sse2(in); 257 fdct4_1d_sse2(in);
264 break; 258 break;
265 case 2: // DCT_ADST 259 case 2: // DCT_ADST
266 fdct4_1d_sse2(in); 260 fdct4_1d_sse2(in);
267 fadst4_1d_sse2(in); 261 fadst4_1d_sse2(in);
268 break; 262 break;
269 case 3: // ADST_ADST 263 case 3: // ADST_ADST
270 fadst4_1d_sse2(in); 264 fadst4_1d_sse2(in);
271 fadst4_1d_sse2(in); 265 fadst4_1d_sse2(in);
272 break; 266 break;
273 default: 267 default:
274 assert(0); 268 assert(0);
275 break; 269 break;
276 } 270 }
277 write_buffer_4x4(output, in); 271 write_buffer_4x4(output, in);
278 } 272 }
279 273
280 void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { 274 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
281 const int stride = pitch >> 1;
282 int pass; 275 int pass;
283 // Constants 276 // Constants
284 // When we use them, in one case, they are all the same. In all others 277 // When we use them, in one case, they are all the same. In all others
285 // it's a pair of them that we need to repeat four times. This is done 278 // it's a pair of them that we need to repeat four times. This is done
286 // by constructing the 32 bit constant corresponding to that pair. 279 // by constructing the 32 bit constant corresponding to that pair.
287 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 280 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
288 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 281 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
289 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 282 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
290 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 283 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
291 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 284 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
(...skipping 236 matching lines...) Expand 10 before | Expand all | Expand 10 after
528 _mm_store_si128((__m128i *)(output + 2 * 8), in2); 521 _mm_store_si128((__m128i *)(output + 2 * 8), in2);
529 _mm_store_si128((__m128i *)(output + 3 * 8), in3); 522 _mm_store_si128((__m128i *)(output + 3 * 8), in3);
530 _mm_store_si128((__m128i *)(output + 4 * 8), in4); 523 _mm_store_si128((__m128i *)(output + 4 * 8), in4);
531 _mm_store_si128((__m128i *)(output + 5 * 8), in5); 524 _mm_store_si128((__m128i *)(output + 5 * 8), in5);
532 _mm_store_si128((__m128i *)(output + 6 * 8), in6); 525 _mm_store_si128((__m128i *)(output + 6 * 8), in6);
533 _mm_store_si128((__m128i *)(output + 7 * 8), in7); 526 _mm_store_si128((__m128i *)(output + 7 * 8), in7);
534 } 527 }
535 } 528 }
536 529
537 // load 8x8 array 530 // load 8x8 array
538 static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) { 531 static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
539 in[0] = _mm_load_si128((__m128i *)(input + 0 * stride)); 532 int stride) {
540 in[1] = _mm_load_si128((__m128i *)(input + 1 * stride)); 533 in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
541 in[2] = _mm_load_si128((__m128i *)(input + 2 * stride)); 534 in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
542 in[3] = _mm_load_si128((__m128i *)(input + 3 * stride)); 535 in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
543 in[4] = _mm_load_si128((__m128i *)(input + 4 * stride)); 536 in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
544 in[5] = _mm_load_si128((__m128i *)(input + 5 * stride)); 537 in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
545 in[6] = _mm_load_si128((__m128i *)(input + 6 * stride)); 538 in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
546 in[7] = _mm_load_si128((__m128i *)(input + 7 * stride)); 539 in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
540 in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
547 541
548 in[0] = _mm_slli_epi16(in[0], 2); 542 in[0] = _mm_slli_epi16(in[0], 2);
549 in[1] = _mm_slli_epi16(in[1], 2); 543 in[1] = _mm_slli_epi16(in[1], 2);
550 in[2] = _mm_slli_epi16(in[2], 2); 544 in[2] = _mm_slli_epi16(in[2], 2);
551 in[3] = _mm_slli_epi16(in[3], 2); 545 in[3] = _mm_slli_epi16(in[3], 2);
552 in[4] = _mm_slli_epi16(in[4], 2); 546 in[4] = _mm_slli_epi16(in[4], 2);
553 in[5] = _mm_slli_epi16(in[5], 2); 547 in[5] = _mm_slli_epi16(in[5], 2);
554 in[6] = _mm_slli_epi16(in[6], 2); 548 in[6] = _mm_slli_epi16(in[6], 2);
555 in[7] = _mm_slli_epi16(in[7], 2); 549 in[7] = _mm_slli_epi16(in[7], 2);
556 } 550 }
(...skipping 469 matching lines...) Expand 10 before | Expand all | Expand 10 after
1026 in[3] = _mm_sub_epi16(k__const_0, s2); 1020 in[3] = _mm_sub_epi16(k__const_0, s2);
1027 in[4] = s3; 1021 in[4] = s3;
1028 in[5] = _mm_sub_epi16(k__const_0, s7); 1022 in[5] = _mm_sub_epi16(k__const_0, s7);
1029 in[6] = s5; 1023 in[6] = s5;
1030 in[7] = _mm_sub_epi16(k__const_0, s1); 1024 in[7] = _mm_sub_epi16(k__const_0, s1);
1031 1025
1032 // transpose 1026 // transpose
1033 array_transpose_8x8(in, in); 1027 array_transpose_8x8(in, in);
1034 } 1028 }
1035 1029
1036 void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output, 1030 void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
1037 int stride, int tx_type) { 1031 int stride, int tx_type) {
1038 __m128i in[8]; 1032 __m128i in[8];
1039 load_buffer_8x8(input, in, stride); 1033 load_buffer_8x8(input, in, stride);
1040 switch (tx_type) { 1034 switch (tx_type) {
1041 case 0: // DCT_DCT 1035 case 0: // DCT_DCT
1042 fdct8_1d_sse2(in); 1036 fdct8_1d_sse2(in);
1043 fdct8_1d_sse2(in); 1037 fdct8_1d_sse2(in);
1044 break; 1038 break;
1045 case 1: // ADST_DCT 1039 case 1: // ADST_DCT
1046 fadst8_1d_sse2(in); 1040 fadst8_1d_sse2(in);
1047 fdct8_1d_sse2(in); 1041 fdct8_1d_sse2(in);
1048 break; 1042 break;
1049 case 2: // DCT_ADST 1043 case 2: // DCT_ADST
1050 fdct8_1d_sse2(in); 1044 fdct8_1d_sse2(in);
1051 fadst8_1d_sse2(in); 1045 fadst8_1d_sse2(in);
1052 break; 1046 break;
1053 case 3: // ADST_ADST 1047 case 3: // ADST_ADST
1054 fadst8_1d_sse2(in); 1048 fadst8_1d_sse2(in);
1055 fadst8_1d_sse2(in); 1049 fadst8_1d_sse2(in);
1056 break; 1050 break;
1057 default: 1051 default:
1058 assert(0); 1052 assert(0);
1059 break; 1053 break;
1060 } 1054 }
1061 right_shift_8x8(in, 1); 1055 right_shift_8x8(in, 1);
1062 write_buffer_8x8(output, in, 8); 1056 write_buffer_8x8(output, in, 8);
1063 } 1057 }
1064 1058
1065 void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { 1059 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
1066 // The 2D transform is done with two passes which are actually pretty 1060 // The 2D transform is done with two passes which are actually pretty
1067 // similar. In the first one, we transform the columns and transpose 1061 // similar. In the first one, we transform the columns and transpose
1068 // the results. In the second one, we transform the rows. To achieve that, 1062 // the results. In the second one, we transform the rows. To achieve that,
1069 // as the first pass results are transposed, we tranpose the columns (that 1063 // as the first pass results are transposed, we tranpose the columns (that
1070 // is the transposed rows) and transpose the results (so that it goes back 1064 // is the transposed rows) and transpose the results (so that it goes back
1071 // in normal/row positions). 1065 // in normal/row positions).
1072 const int stride = pitch >> 1;
1073 int pass; 1066 int pass;
1074 // We need an intermediate buffer between passes. 1067 // We need an intermediate buffer between passes.
1075 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); 1068 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
1076 int16_t *in = input; 1069 const int16_t *in = input;
1077 int16_t *out = intermediate; 1070 int16_t *out = intermediate;
1078 // Constants 1071 // Constants
1079 // When we use them, in one case, they are all the same. In all others 1072 // When we use them, in one case, they are all the same. In all others
1080 // it's a pair of them that we need to repeat four times. This is done 1073 // it's a pair of them that we need to repeat four times. This is done
1081 // by constructing the 32 bit constant corresponding to that pair. 1074 // by constructing the 32 bit constant corresponding to that pair.
1082 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1075 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1083 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1076 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1084 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 1077 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1085 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1078 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1086 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1079 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
(...skipping 594 matching lines...) Expand 10 before | Expand all | Expand 10 after
1681 _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); 1674 _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
1682 } 1675 }
1683 out += 8*16; 1676 out += 8*16;
1684 } 1677 }
1685 // Setup in/out for next pass. 1678 // Setup in/out for next pass.
1686 in = intermediate; 1679 in = intermediate;
1687 out = output; 1680 out = output;
1688 } 1681 }
1689 } 1682 }
1690 1683
1691 static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0, 1684 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
1692 __m128i *in1, int stride) { 1685 __m128i *in1, int stride) {
1693 // load first 8 columns 1686 // load first 8 columns
1694 load_buffer_8x8(input, in0, stride); 1687 load_buffer_8x8(input, in0, stride);
1695 load_buffer_8x8(input + 8 * stride, in0 + 8, stride); 1688 load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
1696 1689
1697 input += 8; 1690 input += 8;
1698 // load second 8 columns 1691 // load second 8 columns
1699 load_buffer_8x8(input, in1, stride); 1692 load_buffer_8x8(input, in1, stride);
1700 load_buffer_8x8(input + 8 * stride, in1 + 8, stride); 1693 load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
1701 } 1694 }
(...skipping 831 matching lines...) Expand 10 before | Expand all | Expand 10 after
2533 fdct16_1d_8col(in1); 2526 fdct16_1d_8col(in1);
2534 array_transpose_16x16(in0, in1); 2527 array_transpose_16x16(in0, in1);
2535 } 2528 }
2536 2529
2537 void fadst16_1d_sse2(__m128i *in0, __m128i *in1) { 2530 void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
2538 fadst16_1d_8col(in0); 2531 fadst16_1d_8col(in0);
2539 fadst16_1d_8col(in1); 2532 fadst16_1d_8col(in1);
2540 array_transpose_16x16(in0, in1); 2533 array_transpose_16x16(in0, in1);
2541 } 2534 }
2542 2535
2543 void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output, 2536 void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
2544 int stride, int tx_type) { 2537 int stride, int tx_type) {
2545 __m128i in0[16], in1[16]; 2538 __m128i in0[16], in1[16];
2546 load_buffer_16x16(input, in0, in1, stride); 2539 load_buffer_16x16(input, in0, in1, stride);
2547 switch (tx_type) { 2540 switch (tx_type) {
2548 case 0: // DCT_DCT 2541 case 0: // DCT_DCT
2549 fdct16_1d_sse2(in0, in1); 2542 fdct16_1d_sse2(in0, in1);
2550 right_shift_16x16(in0, in1); 2543 right_shift_16x16(in0, in1);
2551 fdct16_1d_sse2(in0, in1); 2544 fdct16_1d_sse2(in0, in1);
2552 break; 2545 break;
2553 case 1: // ADST_DCT 2546 case 1: // ADST_DCT
(...skipping 11 matching lines...) Expand all
2565 right_shift_16x16(in0, in1); 2558 right_shift_16x16(in0, in1);
2566 fadst16_1d_sse2(in0, in1); 2559 fadst16_1d_sse2(in0, in1);
2567 break; 2560 break;
2568 default: 2561 default:
2569 assert(0); 2562 assert(0);
2570 break; 2563 break;
2571 } 2564 }
2572 write_buffer_16x16(output, in0, in1, 16); 2565 write_buffer_16x16(output, in0, in1, 16);
2573 } 2566 }
2574 2567
2575 #define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2 2568 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2
2576 #define FDCT32x32_HIGH_PRECISION 0 2569 #define FDCT32x32_HIGH_PRECISION 0
2577 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" 2570 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
2578 #undef FDCT32x32_2D 2571 #undef FDCT32x32_2D
2579 #undef FDCT32x32_HIGH_PRECISION 2572 #undef FDCT32x32_HIGH_PRECISION
2580 2573
2581 #define FDCT32x32_2D vp9_short_fdct32x32_sse2 2574 #define FDCT32x32_2D vp9_fdct32x32_sse2
2582 #define FDCT32x32_HIGH_PRECISION 1 2575 #define FDCT32x32_HIGH_PRECISION 1
2583 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT 2576 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
2584 #undef FDCT32x32_2D 2577 #undef FDCT32x32_2D
2585 #undef FDCT32x32_HIGH_PRECISION 2578 #undef FDCT32x32_HIGH_PRECISION
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c ('k') | source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698