OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <emmintrin.h> // SSE2 | 11 #include <emmintrin.h> // SSE2 |
12 #include "vp9/common/vp9_idct.h" // for cospi constants | 12 #include "vp9/common/vp9_idct.h" // for cospi constants |
13 #include "vpx_ports/mem.h" | 13 #include "vpx_ports/mem.h" |
14 | 14 |
15 void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { | 15 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { |
16 // The 2D transform is done with two passes which are actually pretty | 16 // The 2D transform is done with two passes which are actually pretty |
17 // similar. In the first one, we transform the columns and transpose | 17 // similar. In the first one, we transform the columns and transpose |
18 // the results. In the second one, we transform the rows. To achieve that, | 18 // the results. In the second one, we transform the rows. To achieve that, |
19 // as the first pass results are transposed, we transpose the columns (that | 18 // as the first pass results are transposed, we transpose the columns (that |
20 // is the transposed rows) and transpose the results (so that it goes back | 20 // is the transposed rows) and transpose the results (so that it goes back |
21 // in normal/row positions). | 21 // in normal/row positions). |
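In scalar terms the two passes described in the comment above amount to this sketch (transform_cols and transpose are hypothetical helpers; the real code fuses both into the SIMD that follows):

    for (pass = 0; pass < 2; ++pass) {
      transform_cols(buf);  /* 1-D transform down each column */
      transpose(buf);       /* leaves results transposed for the next pass */
    }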
22 const int stride = pitch >> 1; | |
23 int pass; | 22 int pass; |
24 // Constants | 23 // Constants |
25 // When we use them, in one case, they are all the same. In all others | 24 // When we use them, in one case, they are all the same. In all others |
26 // it's a pair of them that we need to repeat four times. This is done | 25 // it's a pair of them that we need to repeat four times. This is done |
27 // by constructing the 32 bit constant corresponding to that pair. | 26 // by constructing the 32 bit constant corresponding to that pair. |
28 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
29 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
30 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 29 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
31 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 30 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
32 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 31 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
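A note on how these pair constants are consumed (assuming pair_set_epi16 from vp9_idct.h replicates the 16-bit pair (a, b) across the register): _mm_madd_epi16 multiplies corresponding 16-bit lanes and sums adjacent products, so each 32-bit result lane is x[2i]*a + x[2i+1]*b. For example:

    /* one rotation/butterfly step per 32-bit lane:
       even*cospi_24_64 + odd*cospi_8_64 */
    __m128i t = _mm_madd_epi16(v, k__cospi_p24_p08);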
(...skipping 72 matching lines...) |
105 __m128i out01 = _mm_add_epi16(in0, kOne); | 104 __m128i out01 = _mm_add_epi16(in0, kOne); |
106 __m128i out23 = _mm_add_epi16(in2, kOne); | 105 __m128i out23 = _mm_add_epi16(in2, kOne); |
107 out01 = _mm_srai_epi16(out01, 2); | 106 out01 = _mm_srai_epi16(out01, 2); |
108 out23 = _mm_srai_epi16(out23, 2); | 107 out23 = _mm_srai_epi16(out23, 2); |
109 _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); | 108 _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); |
110 _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); | 109 _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); |
111 } | 110 } |
112 } | 111 } |
113 } | 112 } |
114 | 113 |
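Note that the rename also changes the units of the last argument: the old pitch was in bytes, hence the removed `const int stride = pitch >> 1;` to recover an int16_t element stride, while the new stride is already in elements. A hypothetical call site (src_diff, coeff, and bw are illustrative names, not code from this patch):

    vp9_short_fdct4x4_sse2(src_diff, coeff, 2 * bw);  /* before: bytes */
    vp9_fdct4x4_sse2(src_diff, coeff, bw);            /* after: elements */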
115 void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) { | 114 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, |
116 vp9_short_fdct4x4_sse2(input, output, pitch); | 115 int stride) { |
117 vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch); | |
118 } | |
119 | |
120 static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) { | |
121 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); | 116 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); |
122 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); | 117 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); |
123 __m128i mask; | 118 __m128i mask; |
124 | 119 |
125 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); | 120 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); |
126 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); | 121 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); |
127 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); | 122 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); |
128 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); | 123 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); |
129 | 124 |
130 in[0] = _mm_slli_epi16(in[0], 4); | 125 in[0] = _mm_slli_epi16(in[0], 4); |
(...skipping 33 matching lines...) |
164 // 00 10 20 30 01 11 21 31 | 159 // 00 10 20 30 01 11 21 31 |
165 // 02 12 22 32 03 13 23 33 | 160 // 02 12 22 32 03 13 23 33 |
166 // only use the first 4 16-bit integers | 161 // only use the first 4 16-bit integers |
167 res[1] = _mm_unpackhi_epi64(res[0], res[0]); | 162 res[1] = _mm_unpackhi_epi64(res[0], res[0]); |
168 res[3] = _mm_unpackhi_epi64(res[2], res[2]); | 163 res[3] = _mm_unpackhi_epi64(res[2], res[2]); |
169 } | 164 } |
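The tail of transpose_4x4 leans on _mm_unpackhi_epi64 duplicating a register's high 64 bits; in lane terms:

    /* res[0] = 00 10 20 30 | 01 11 21 31  (transposed rows 0 and 1) */
    /* res[1] = 01 11 21 31 | 01 11 21 31  (row 1 moved to the low half) */

so each res[i] carries its transposed row in the low four 16-bit lanes, which is all the later code reads.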
170 | 165 |
171 void fdct4_1d_sse2(__m128i *in) { | 166 void fdct4_1d_sse2(__m128i *in) { |
172 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 167 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
173 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 168 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
174 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 169 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
175 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 170 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
176 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 171 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
177 | 172 |
178 __m128i u[4], v[4]; | 173 __m128i u[4], v[4]; |
179 u[0] = _mm_add_epi16(in[0], in[3]); | 174 u[0] = _mm_unpacklo_epi16(in[0], in[1]); |
180 u[1] = _mm_add_epi16(in[1], in[2]); | 175 u[1] = _mm_unpacklo_epi16(in[3], in[2]); |
181 u[2] = _mm_sub_epi16(in[1], in[2]); | |
182 u[3] = _mm_sub_epi16(in[0], in[3]); | |
183 | 176 |
184 v[0] = _mm_unpacklo_epi16(u[0], u[1]); | 177 v[0] = _mm_add_epi16(u[0], u[1]); |
185 v[1] = _mm_unpacklo_epi16(u[2], u[3]); | 178 v[1] = _mm_sub_epi16(u[0], u[1]); |
| 179 |
186 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 | 180 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 |
187 u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 | 181 u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 |
188 u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08); // 1 | 182 u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 |
189 u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3 | 183 u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 |
190 | 184 |
191 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); | 185 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); |
192 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); | 186 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); |
193 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); | 187 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); |
194 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); | 188 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); |
195 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); | 189 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
196 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); | 190 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
197 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); | 191 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
198 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); | 192 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
199 | 193 |
(...skipping 42 matching lines...) |
242 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); | 236 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
243 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); | 237 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
244 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); | 238 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
245 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); | 239 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
246 | 240 |
247 in[0] = _mm_packs_epi32(u[0], u[2]); | 241 in[0] = _mm_packs_epi32(u[0], u[2]); |
248 in[1] = _mm_packs_epi32(u[1], u[3]); | 242 in[1] = _mm_packs_epi32(u[1], u[3]); |
249 transpose_4x4(in); | 243 transpose_4x4(in); |
250 } | 244 } |
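For reference, the interleave/madd sequence in fdct4_1d_sse2 computes the standard 4-point DCT butterfly per column; a scalar sketch (fdct_round_shift stands for the DCT_CONST_ROUNDING add plus DCT_CONST_BITS shift):

    const int s0 = in0 + in3, s1 = in1 + in2;  /* v[0] lane pairs */
    const int d0 = in0 - in3, d1 = in1 - in2;  /* v[1] lane pairs */
    out0 = fdct_round_shift((s0 + s1) * cospi_16_64);             /* p16_p16 */
    out2 = fdct_round_shift((s0 - s1) * cospi_16_64);             /* p16_m16 */
    out1 = fdct_round_shift(d0 * cospi_8_64 + d1 * cospi_24_64);  /* p08_p24 */
    out3 = fdct_round_shift(d0 * cospi_24_64 - d1 * cospi_8_64);  /* p24_m08 */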
251 | 245 |
252 void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output, | 246 void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, |
253 int stride, int tx_type) { | 247 int stride, int tx_type) { |
254 __m128i in[4]; | 248 __m128i in[4]; |
255 load_buffer_4x4(input, in, stride); | 249 load_buffer_4x4(input, in, stride); |
256 switch (tx_type) { | 250 switch (tx_type) { |
257 case 0: // DCT_DCT | 251 case 0: // DCT_DCT |
258 fdct4_1d_sse2(in); | 252 fdct4_1d_sse2(in); |
259 fdct4_1d_sse2(in); | 253 fdct4_1d_sse2(in); |
260 break; | 254 break; |
261 case 1: // ADST_DCT | 255 case 1: // ADST_DCT |
262 fadst4_1d_sse2(in); | 256 fadst4_1d_sse2(in); |
263 fdct4_1d_sse2(in); | 257 fdct4_1d_sse2(in); |
264 break; | 258 break; |
265 case 2: // DCT_ADST | 259 case 2: // DCT_ADST |
266 fdct4_1d_sse2(in); | 260 fdct4_1d_sse2(in); |
267 fadst4_1d_sse2(in); | 261 fadst4_1d_sse2(in); |
268 break; | 262 break; |
269 case 3: // ADST_ADST | 263 case 3: // ADST_ADST |
270 fadst4_1d_sse2(in); | 264 fadst4_1d_sse2(in); |
271 fadst4_1d_sse2(in); | 265 fadst4_1d_sse2(in); |
272 break; | 266 break; |
273 default: | 267 default: |
274 assert(0); | 268 assert(0); |
275 break; | 269 break; |
276 } | 270 } |
277 write_buffer_4x4(output, in); | 271 write_buffer_4x4(output, in); |
278 } | 272 } |
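Since each 1-D helper ends by transposing its result (transpose_4x4 is visible at the end of fadst4_1d_sse2 above), two successive calls yield the full 2-D transform with no separate row code; tx_type only selects which pair of transforms runs. A hypothetical caller:

    vp9_short_fht4x4_sse2(src_diff, coeff, bw, 1);  /* 1 == ADST_DCT above */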
279 | 273 |
280 void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { | 274 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { |
281 const int stride = pitch >> 1; | |
282 int pass; | 275 int pass; |
283 // Constants | 276 // Constants |
284 // When we use them, in one case, they are all the same. In all others | 277 // When we use them, in one case, they are all the same. In all others |
285 // it's a pair of them that we need to repeat four times. This is done | 278 // it's a pair of them that we need to repeat four times. This is done |
286 // by constructing the 32 bit constant corresponding to that pair. | 279 // by constructing the 32 bit constant corresponding to that pair. |
287 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 280 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
288 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 281 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
289 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 282 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
290 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 283 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
291 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | 284 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
(...skipping 236 matching lines...) |
528 _mm_store_si128((__m128i *)(output + 2 * 8), in2); | 521 _mm_store_si128((__m128i *)(output + 2 * 8), in2); |
529 _mm_store_si128((__m128i *)(output + 3 * 8), in3); | 522 _mm_store_si128((__m128i *)(output + 3 * 8), in3); |
530 _mm_store_si128((__m128i *)(output + 4 * 8), in4); | 523 _mm_store_si128((__m128i *)(output + 4 * 8), in4); |
531 _mm_store_si128((__m128i *)(output + 5 * 8), in5); | 524 _mm_store_si128((__m128i *)(output + 5 * 8), in5); |
532 _mm_store_si128((__m128i *)(output + 6 * 8), in6); | 525 _mm_store_si128((__m128i *)(output + 6 * 8), in6); |
533 _mm_store_si128((__m128i *)(output + 7 * 8), in7); | 526 _mm_store_si128((__m128i *)(output + 7 * 8), in7); |
534 } | 527 } |
535 } | 528 } |
536 | 529 |
537 // load 8x8 array | 530 // load 8x8 array |
538 static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) { | 531 static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, |
539 in[0] = _mm_load_si128((__m128i *)(input + 0 * stride)); | 532 int stride) { |
540 in[1] = _mm_load_si128((__m128i *)(input + 1 * stride)); | 533 in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); |
541 in[2] = _mm_load_si128((__m128i *)(input + 2 * stride)); | 534 in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); |
542 in[3] = _mm_load_si128((__m128i *)(input + 3 * stride)); | 535 in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); |
543 in[4] = _mm_load_si128((__m128i *)(input + 4 * stride)); | 536 in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); |
544 in[5] = _mm_load_si128((__m128i *)(input + 5 * stride)); | 537 in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); |
545 in[6] = _mm_load_si128((__m128i *)(input + 6 * stride)); | 538 in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); |
546 in[7] = _mm_load_si128((__m128i *)(input + 7 * stride)); | 539 in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); |
| 540 in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); |
547 | 541 |
548 in[0] = _mm_slli_epi16(in[0], 2); | 542 in[0] = _mm_slli_epi16(in[0], 2); |
549 in[1] = _mm_slli_epi16(in[1], 2); | 543 in[1] = _mm_slli_epi16(in[1], 2); |
550 in[2] = _mm_slli_epi16(in[2], 2); | 544 in[2] = _mm_slli_epi16(in[2], 2); |
551 in[3] = _mm_slli_epi16(in[3], 2); | 545 in[3] = _mm_slli_epi16(in[3], 2); |
552 in[4] = _mm_slli_epi16(in[4], 2); | 546 in[4] = _mm_slli_epi16(in[4], 2); |
553 in[5] = _mm_slli_epi16(in[5], 2); | 547 in[5] = _mm_slli_epi16(in[5], 2); |
554 in[6] = _mm_slli_epi16(in[6], 2); | 548 in[6] = _mm_slli_epi16(in[6], 2); |
555 in[7] = _mm_slli_epi16(in[7], 2); | 549 in[7] = _mm_slli_epi16(in[7], 2); |
556 } | 550 } |
(...skipping 469 matching lines...) |
1026 in[3] = _mm_sub_epi16(k__const_0, s2); | 1020 in[3] = _mm_sub_epi16(k__const_0, s2); |
1027 in[4] = s3; | 1021 in[4] = s3; |
1028 in[5] = _mm_sub_epi16(k__const_0, s7); | 1022 in[5] = _mm_sub_epi16(k__const_0, s7); |
1029 in[6] = s5; | 1023 in[6] = s5; |
1030 in[7] = _mm_sub_epi16(k__const_0, s1); | 1024 in[7] = _mm_sub_epi16(k__const_0, s1); |
1031 | 1025 |
1032 // transpose | 1026 // transpose |
1033 array_transpose_8x8(in, in); | 1027 array_transpose_8x8(in, in); |
1034 } | 1028 } |
1035 | 1029 |
1036 void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output, | 1030 void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, |
1037 int stride, int tx_type) { | 1031 int stride, int tx_type) { |
1038 __m128i in[8]; | 1032 __m128i in[8]; |
1039 load_buffer_8x8(input, in, stride); | 1033 load_buffer_8x8(input, in, stride); |
1040 switch (tx_type) { | 1034 switch (tx_type) { |
1041 case 0: // DCT_DCT | 1035 case 0: // DCT_DCT |
1042 fdct8_1d_sse2(in); | 1036 fdct8_1d_sse2(in); |
1043 fdct8_1d_sse2(in); | 1037 fdct8_1d_sse2(in); |
1044 break; | 1038 break; |
1045 case 1: // ADST_DCT | 1039 case 1: // ADST_DCT |
1046 fadst8_1d_sse2(in); | 1040 fadst8_1d_sse2(in); |
1047 fdct8_1d_sse2(in); | 1041 fdct8_1d_sse2(in); |
1048 break; | 1042 break; |
1049 case 2: // DCT_ADST | 1043 case 2: // DCT_ADST |
1050 fdct8_1d_sse2(in); | 1044 fdct8_1d_sse2(in); |
1051 fadst8_1d_sse2(in); | 1045 fadst8_1d_sse2(in); |
1052 break; | 1046 break; |
1053 case 3: // ADST_ADST | 1047 case 3: // ADST_ADST |
1054 fadst8_1d_sse2(in); | 1048 fadst8_1d_sse2(in); |
1055 fadst8_1d_sse2(in); | 1049 fadst8_1d_sse2(in); |
1056 break; | 1050 break; |
1057 default: | 1051 default: |
1058 assert(0); | 1052 assert(0); |
1059 break; | 1053 break; |
1060 } | 1054 } |
1061 right_shift_8x8(in, 1); | 1055 right_shift_8x8(in, 1); |
1062 write_buffer_8x8(output, in, 8); | 1056 write_buffer_8x8(output, in, 8); |
1063 } | 1057 } |
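A scaling note, reading the visible code: load_buffer_8x8 pre-shifts every input row left by 2 (a x4 gain, presumably for fixed-point headroom through the two passes), and right_shift_8x8(in, 1) above drops one of those bits before write_buffer_8x8, leaving the coefficients at the intended output scale.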
1064 | 1058 |
1065 void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { | 1059 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { |
1066 // The 2D transform is done with two passes which are actually pretty | 1060 // The 2D transform is done with two passes which are actually pretty |
1067 // similar. In the first one, we transform the columns and transpose | 1061 // similar. In the first one, we transform the columns and transpose |
1068 // the results. In the second one, we transform the rows. To achieve that, | 1062 // the results. In the second one, we transform the rows. To achieve that, |
1069 // as the first pass results are transposed, we transpose the columns (that | 1063 // as the first pass results are transposed, we transpose the columns (that |
1070 // is the transposed rows) and transpose the results (so that it goes back | 1064 // is the transposed rows) and transpose the results (so that it goes back |
1071 // in normal/row positions). | 1065 // in normal/row positions). |
1072 const int stride = pitch >> 1; | |
1073 int pass; | 1066 int pass; |
1074 // We need an intermediate buffer between passes. | 1067 // We need an intermediate buffer between passes. |
1075 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); | 1068 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); |
1076 int16_t *in = input; | 1069 const int16_t *in = input; |
1077 int16_t *out = intermediate; | 1070 int16_t *out = intermediate; |
1078 // Constants | 1071 // Constants |
1079 // When we use them, in one case, they are all the same. In all others | 1072 // When we use them, in one case, they are all the same. In all others |
1080 // it's a pair of them that we need to repeat four times. This is done | 1073 // it's a pair of them that we need to repeat four times. This is done |
1081 // by constructing the 32 bit constant corresponding to that pair. | 1074 // by constructing the 32 bit constant corresponding to that pair. |
1082 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 1075 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
1083 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 1076 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
1084 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 1077 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
1085 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); | 1078 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
1086 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 1079 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
(...skipping 594 matching lines...) |
1681 _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); | 1674 _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); |
1682 } | 1675 } |
1683 out += 8*16; | 1676 out += 8*16; |
1684 } | 1677 } |
1685 // Setup in/out for next pass. | 1678 // Setup in/out for next pass. |
1686 in = intermediate; | 1679 in = intermediate; |
1687 out = output; | 1680 out = output; |
1688 } | 1681 } |
1689 } | 1682 } |
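The in/out swap at the bottom of the pass loop is a simple ping-pong over the aligned scratch buffer:

    /* pass 0: in = input,        out = intermediate  (columns, transposed) */
    /* pass 1: in = intermediate, out = output        (rows, back in order) */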
1690 | 1683 |
1691 static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0, | 1684 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0, |
1692 __m128i *in1, int stride) { | 1685 __m128i *in1, int stride) { |
1693 // load first 8 columns | 1686 // load first 8 columns |
1694 load_buffer_8x8(input, in0, stride); | 1687 load_buffer_8x8(input, in0, stride); |
1695 load_buffer_8x8(input + 8 * stride, in0 + 8, stride); | 1688 load_buffer_8x8(input + 8 * stride, in0 + 8, stride); |
1696 | 1689 |
1697 input += 8; | 1690 input += 8; |
1698 // load second 8 columns | 1691 // load second 8 columns |
1699 load_buffer_8x8(input, in1, stride); | 1692 load_buffer_8x8(input, in1, stride); |
1700 load_buffer_8x8(input + 8 * stride, in1 + 8, stride); | 1693 load_buffer_8x8(input + 8 * stride, in1 + 8, stride); |
1701 } | 1694 } |
(...skipping 831 matching lines...) |
2533 fdct16_1d_8col(in1); | 2526 fdct16_1d_8col(in1); |
2534 array_transpose_16x16(in0, in1); | 2527 array_transpose_16x16(in0, in1); |
2535 } | 2528 } |
2536 | 2529 |
2537 void fadst16_1d_sse2(__m128i *in0, __m128i *in1) { | 2530 void fadst16_1d_sse2(__m128i *in0, __m128i *in1) { |
2538 fadst16_1d_8col(in0); | 2531 fadst16_1d_8col(in0); |
2539 fadst16_1d_8col(in1); | 2532 fadst16_1d_8col(in1); |
2540 array_transpose_16x16(in0, in1); | 2533 array_transpose_16x16(in0, in1); |
2541 } | 2534 } |
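Both 16-point wrappers above follow the same shape: one __m128i holds eight int16_t lanes, so a 16-wide block is carried as the in0/in1 halves, each half gets the 1-D transform independently, and array_transpose_16x16 recombines and transposes them in one step.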
2542 | 2535 |
2543 void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output, | 2536 void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, |
2544 int stride, int tx_type) { | 2537 int stride, int tx_type) { |
2545 __m128i in0[16], in1[16]; | 2538 __m128i in0[16], in1[16]; |
2546 load_buffer_16x16(input, in0, in1, stride); | 2539 load_buffer_16x16(input, in0, in1, stride); |
2547 switch (tx_type) { | 2540 switch (tx_type) { |
2548 case 0: // DCT_DCT | 2541 case 0: // DCT_DCT |
2549 fdct16_1d_sse2(in0, in1); | 2542 fdct16_1d_sse2(in0, in1); |
2550 right_shift_16x16(in0, in1); | 2543 right_shift_16x16(in0, in1); |
2551 fdct16_1d_sse2(in0, in1); | 2544 fdct16_1d_sse2(in0, in1); |
2552 break; | 2545 break; |
2553 case 1: // ADST_DCT | 2546 case 1: // ADST_DCT |
(...skipping 11 matching lines...) |
2565 right_shift_16x16(in0, in1); | 2558 right_shift_16x16(in0, in1); |
2566 fadst16_1d_sse2(in0, in1); | 2559 fadst16_1d_sse2(in0, in1); |
2567 break; | 2560 break; |
2568 default: | 2561 default: |
2569 assert(0); | 2562 assert(0); |
2570 break; | 2563 break; |
2571 } | 2564 } |
2572 write_buffer_16x16(output, in0, in1, 16); | 2565 write_buffer_16x16(output, in0, in1, 16); |
2573 } | 2566 } |
2574 | 2567 |
2575 #define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2 | 2568 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 |
2576 #define FDCT32x32_HIGH_PRECISION 0 | 2569 #define FDCT32x32_HIGH_PRECISION 0 |
2577 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" | 2570 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" |
2578 #undef FDCT32x32_2D | 2571 #undef FDCT32x32_2D |
2579 #undef FDCT32x32_HIGH_PRECISION | 2572 #undef FDCT32x32_HIGH_PRECISION |
2580 | 2573 |
2581 #define FDCT32x32_2D vp9_short_fdct32x32_sse2 | 2574 #define FDCT32x32_2D vp9_fdct32x32_sse2 |
2582 #define FDCT32x32_HIGH_PRECISION 1 | 2575 #define FDCT32x32_HIGH_PRECISION 1 |
2583 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2576 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT |
2584 #undef FDCT32x32_2D | 2577 #undef FDCT32x32_2D |
2585 #undef FDCT32x32_HIGH_PRECISION | 2578 #undef FDCT32x32_HIGH_PRECISION |
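The #define/#include/#undef pairs above instantiate a single shared implementation file twice under different names and precision settings, C's macro-parameterized stand-in for a template. In outline (illustrative, not the actual contents of vp9_dct32x32_sse2.c):

    void FDCT32x32_2D(const int16_t *input, int16_t *output, int stride) {
      /* shared SSE2 32x32 forward DCT; rounding behavior is chosen by
         the FDCT32x32_HIGH_PRECISION macro at compile time */
    }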