Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(810)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c

Issue 54923004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <emmintrin.h> // SSE2 11 #include <emmintrin.h> // SSE2
12 #include "vp9/common/vp9_idct.h" // for cospi constants 12 #include "vp9/common/vp9_idct.h" // for cospi constants
13 #include "vpx_ports/mem.h" 13 #include "vpx_ports/mem.h"
14 14
15 #if FDCT32x32_HIGH_PRECISION 15 #if FDCT32x32_HIGH_PRECISION
16 static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { 16 static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
17 __m128i buf0, buf1; 17 __m128i buf0, buf1;
18 buf0 = _mm_mul_epu32(a, b); 18 buf0 = _mm_mul_epu32(a, b);
19 a = _mm_srli_epi64(a, 32); 19 a = _mm_srli_epi64(a, 32);
20 b = _mm_srli_epi64(b, 32); 20 b = _mm_srli_epi64(b, 32);
21 buf1 = _mm_mul_epu32(a, b); 21 buf1 = _mm_mul_epu32(a, b);
22 return _mm_add_epi64(buf0, buf1); 22 return _mm_add_epi64(buf0, buf1);
23 } 23 }
24 24
25 static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { 25 static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
26 __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); 26 __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
27 __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); 27 __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
28 return _mm_unpacklo_epi64(buf0, buf1); 28 return _mm_unpacklo_epi64(buf0, buf1);
29 } 29 }
30
31 static INLINE __m128i k_cvtlo_epi16(__m128i a, __m128i mask16, __m128i kZero) {
32 // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers
33 __m128i sign_bit = _mm_and_si128(a, mask16);
34 __m128i b = _mm_unpacklo_epi16(a, kZero);
35 sign_bit = _mm_cmplt_epi16(sign_bit, kZero);
36 sign_bit = _mm_unpacklo_epi16(kZero, sign_bit);
37 return _mm_or_si128(sign_bit, b);
38 }
39
40 static INLINE __m128i k_cvthi_epi16(__m128i a, __m128i mask16, __m128i kZero) {
41 // convert the upper 4 signed 16-bit integers into 4 signed 32-bit integers
42 __m128i sign_bit = _mm_and_si128(a, mask16);
43 __m128i b = _mm_unpackhi_epi16(a, kZero);
44 sign_bit = _mm_cmplt_epi16(sign_bit, kZero);
45 sign_bit = _mm_unpackhi_epi16(kZero, sign_bit);
46 return _mm_or_si128(sign_bit, b);
47 }
48 #endif 30 #endif
49 31
50 void FDCT32x32_2D(int16_t *input, 32 void FDCT32x32_2D(const int16_t *input,
51 int16_t *output_org, int pitch) { 33 int16_t *output_org, int stride) {
52 // Calculate pre-multiplied strides 34 // Calculate pre-multiplied strides
53 const int str1 = pitch >> 1; 35 const int str1 = stride;
54 const int str2 = pitch; 36 const int str2 = 2 * stride;
55 const int str3 = pitch + str1; 37 const int str3 = 2 * stride + str1;
56 // We need an intermediate buffer between passes. 38 // We need an intermediate buffer between passes.
57 DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); 39 DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
58 // Constants 40 // Constants
59 // When we use them, in one case, they are all the same. In all others 41 // When we use them, in one case, they are all the same. In all others
60 // it's a pair of them that we need to repeat four times. This is done 42 // it's a pair of them that we need to repeat four times. This is done
61 // by constructing the 32 bit constant corresponding to that pair. 43 // by constructing the 32 bit constant corresponding to that pair.
62 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64); 44 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64);
63 const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); 45 const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
64 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 46 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
65 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 47 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
104 int column_start; 86 int column_start;
105 for (column_start = 0; column_start < 32; column_start += 8) { 87 for (column_start = 0; column_start < 32; column_start += 8) {
106 __m128i step1[32]; 88 __m128i step1[32];
107 __m128i step2[32]; 89 __m128i step2[32];
108 __m128i step3[32]; 90 __m128i step3[32];
109 __m128i out[32]; 91 __m128i out[32];
110 // Stage 1 92 // Stage 1
111 // Note: even though all the loads below are aligned, using the aligned 93 // Note: even though all the loads below are aligned, using the aligned
112 // intrinsic makes the code slightly slower. 94 // intrinsic makes the code slightly slower.
113 if (0 == pass) { 95 if (0 == pass) {
114 int16_t *in = &input[column_start]; 96 const int16_t *in = &input[column_start];
115 // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; 97 // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
116 // Note: the next four blocks could be in a loop. That would help the 98 // Note: the next four blocks could be in a loop. That would help the
117 // instruction cache but is actually slower. 99 // instruction cache but is actually slower.
118 { 100 {
119 int16_t *ina = in + 0 * str1; 101 const int16_t *ina = in + 0 * str1;
120 int16_t *inb = in + 31 * str1; 102 const int16_t *inb = in + 31 * str1;
121 __m128i *step1a = &step1[ 0]; 103 __m128i *step1a = &step1[ 0];
122 __m128i *step1b = &step1[31]; 104 __m128i *step1b = &step1[31];
123 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); 105 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
124 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); 106 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
125 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); 107 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
126 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); 108 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
127 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); 109 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
128 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); 110 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
129 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); 111 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
130 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); 112 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
131 step1a[ 0] = _mm_add_epi16(ina0, inb0); 113 step1a[ 0] = _mm_add_epi16(ina0, inb0);
132 step1a[ 1] = _mm_add_epi16(ina1, inb1); 114 step1a[ 1] = _mm_add_epi16(ina1, inb1);
133 step1a[ 2] = _mm_add_epi16(ina2, inb2); 115 step1a[ 2] = _mm_add_epi16(ina2, inb2);
134 step1a[ 3] = _mm_add_epi16(ina3, inb3); 116 step1a[ 3] = _mm_add_epi16(ina3, inb3);
135 step1b[-3] = _mm_sub_epi16(ina3, inb3); 117 step1b[-3] = _mm_sub_epi16(ina3, inb3);
136 step1b[-2] = _mm_sub_epi16(ina2, inb2); 118 step1b[-2] = _mm_sub_epi16(ina2, inb2);
137 step1b[-1] = _mm_sub_epi16(ina1, inb1); 119 step1b[-1] = _mm_sub_epi16(ina1, inb1);
138 step1b[-0] = _mm_sub_epi16(ina0, inb0); 120 step1b[-0] = _mm_sub_epi16(ina0, inb0);
139 step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); 121 step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
140 step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); 122 step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
141 step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); 123 step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
142 step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); 124 step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
143 step1b[-3] = _mm_slli_epi16(step1b[-3], 2); 125 step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
144 step1b[-2] = _mm_slli_epi16(step1b[-2], 2); 126 step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
145 step1b[-1] = _mm_slli_epi16(step1b[-1], 2); 127 step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
146 step1b[-0] = _mm_slli_epi16(step1b[-0], 2); 128 step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
147 } 129 }
148 { 130 {
149 int16_t *ina = in + 4 * str1; 131 const int16_t *ina = in + 4 * str1;
150 int16_t *inb = in + 27 * str1; 132 const int16_t *inb = in + 27 * str1;
151 __m128i *step1a = &step1[ 4]; 133 __m128i *step1a = &step1[ 4];
152 __m128i *step1b = &step1[27]; 134 __m128i *step1b = &step1[27];
153 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); 135 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
154 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); 136 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
155 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); 137 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
156 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); 138 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
157 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); 139 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
158 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); 140 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
159 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); 141 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
160 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); 142 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
161 step1a[ 0] = _mm_add_epi16(ina0, inb0); 143 step1a[ 0] = _mm_add_epi16(ina0, inb0);
162 step1a[ 1] = _mm_add_epi16(ina1, inb1); 144 step1a[ 1] = _mm_add_epi16(ina1, inb1);
163 step1a[ 2] = _mm_add_epi16(ina2, inb2); 145 step1a[ 2] = _mm_add_epi16(ina2, inb2);
164 step1a[ 3] = _mm_add_epi16(ina3, inb3); 146 step1a[ 3] = _mm_add_epi16(ina3, inb3);
165 step1b[-3] = _mm_sub_epi16(ina3, inb3); 147 step1b[-3] = _mm_sub_epi16(ina3, inb3);
166 step1b[-2] = _mm_sub_epi16(ina2, inb2); 148 step1b[-2] = _mm_sub_epi16(ina2, inb2);
167 step1b[-1] = _mm_sub_epi16(ina1, inb1); 149 step1b[-1] = _mm_sub_epi16(ina1, inb1);
168 step1b[-0] = _mm_sub_epi16(ina0, inb0); 150 step1b[-0] = _mm_sub_epi16(ina0, inb0);
169 step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); 151 step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
170 step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); 152 step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
171 step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); 153 step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
172 step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); 154 step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
173 step1b[-3] = _mm_slli_epi16(step1b[-3], 2); 155 step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
174 step1b[-2] = _mm_slli_epi16(step1b[-2], 2); 156 step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
175 step1b[-1] = _mm_slli_epi16(step1b[-1], 2); 157 step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
176 step1b[-0] = _mm_slli_epi16(step1b[-0], 2); 158 step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
177 } 159 }
178 { 160 {
179 int16_t *ina = in + 8 * str1; 161 const int16_t *ina = in + 8 * str1;
180 int16_t *inb = in + 23 * str1; 162 const int16_t *inb = in + 23 * str1;
181 __m128i *step1a = &step1[ 8]; 163 __m128i *step1a = &step1[ 8];
182 __m128i *step1b = &step1[23]; 164 __m128i *step1b = &step1[23];
183 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); 165 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
184 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); 166 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
185 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); 167 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
186 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); 168 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
187 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); 169 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
188 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); 170 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
189 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); 171 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
190 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); 172 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
191 step1a[ 0] = _mm_add_epi16(ina0, inb0); 173 step1a[ 0] = _mm_add_epi16(ina0, inb0);
192 step1a[ 1] = _mm_add_epi16(ina1, inb1); 174 step1a[ 1] = _mm_add_epi16(ina1, inb1);
193 step1a[ 2] = _mm_add_epi16(ina2, inb2); 175 step1a[ 2] = _mm_add_epi16(ina2, inb2);
194 step1a[ 3] = _mm_add_epi16(ina3, inb3); 176 step1a[ 3] = _mm_add_epi16(ina3, inb3);
195 step1b[-3] = _mm_sub_epi16(ina3, inb3); 177 step1b[-3] = _mm_sub_epi16(ina3, inb3);
196 step1b[-2] = _mm_sub_epi16(ina2, inb2); 178 step1b[-2] = _mm_sub_epi16(ina2, inb2);
197 step1b[-1] = _mm_sub_epi16(ina1, inb1); 179 step1b[-1] = _mm_sub_epi16(ina1, inb1);
198 step1b[-0] = _mm_sub_epi16(ina0, inb0); 180 step1b[-0] = _mm_sub_epi16(ina0, inb0);
199 step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); 181 step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
200 step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); 182 step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
201 step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); 183 step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
202 step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); 184 step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
203 step1b[-3] = _mm_slli_epi16(step1b[-3], 2); 185 step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
204 step1b[-2] = _mm_slli_epi16(step1b[-2], 2); 186 step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
205 step1b[-1] = _mm_slli_epi16(step1b[-1], 2); 187 step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
206 step1b[-0] = _mm_slli_epi16(step1b[-0], 2); 188 step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
207 } 189 }
208 { 190 {
209 int16_t *ina = in + 12 * str1; 191 const int16_t *ina = in + 12 * str1;
210 int16_t *inb = in + 19 * str1; 192 const int16_t *inb = in + 19 * str1;
211 __m128i *step1a = &step1[12]; 193 __m128i *step1a = &step1[12];
212 __m128i *step1b = &step1[19]; 194 __m128i *step1b = &step1[19];
213 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); 195 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
214 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); 196 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
215 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); 197 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
216 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); 198 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
217 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); 199 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
218 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); 200 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
219 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); 201 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
220 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); 202 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
(...skipping 931 matching lines...) Expand 10 before | Expand all | Expand 10 after
1152 out[29] = _mm_packs_epi32(out_29_6, out_29_7); 1134 out[29] = _mm_packs_epi32(out_29_6, out_29_7);
1153 out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); 1135 out[ 3] = _mm_packs_epi32(out_03_6, out_03_7);
1154 out[19] = _mm_packs_epi32(out_19_6, out_19_7); 1136 out[19] = _mm_packs_epi32(out_19_6, out_19_7);
1155 out[11] = _mm_packs_epi32(out_11_6, out_11_7); 1137 out[11] = _mm_packs_epi32(out_11_6, out_11_7);
1156 out[27] = _mm_packs_epi32(out_27_6, out_27_7); 1138 out[27] = _mm_packs_epi32(out_27_6, out_27_7);
1157 } 1139 }
1158 #if FDCT32x32_HIGH_PRECISION 1140 #if FDCT32x32_HIGH_PRECISION
1159 } else { 1141 } else {
1160 __m128i lstep1[64], lstep2[64], lstep3[64]; 1142 __m128i lstep1[64], lstep2[64], lstep3[64];
1161 __m128i u[32], v[32], sign[16]; 1143 __m128i u[32], v[32], sign[16];
1162 const __m128i mask16 = _mm_set1_epi32(0x80008000);
1163 const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); 1144 const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
1164 // start using 32-bit operations 1145 // start using 32-bit operations
1165 // stage 3 1146 // stage 3
1166 { 1147 {
1167 // expanding to 32-bit length prior to addition operations 1148 // expanding to 32-bit length prior to addition operations
1168 lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero); 1149 lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero);
1169 lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero); 1150 lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero);
1170 lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero); 1151 lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero);
1171 lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero); 1152 lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero);
1172 lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero); 1153 lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero);
1173 lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero); 1154 lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero);
1174 lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero); 1155 lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero);
1175 lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero); 1156 lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero);
1176 lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero); 1157 lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero);
1177 lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero); 1158 lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero);
1178 lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero); 1159 lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero);
1179 lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero); 1160 lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero);
1180 lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero); 1161 lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero);
1181 lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero); 1162 lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero);
1182 lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero); 1163 lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero);
1183 lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero); 1164 lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero);
1165 lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne);
1166 lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne);
1167 lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne);
1168 lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne);
1169 lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne);
1170 lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne);
1171 lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne);
1172 lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne);
1173 lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne);
1174 lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne);
1175 lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
1176 lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
1177 lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
1178 lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
1179 lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
1180 lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
1184 1181
1185 lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]); 1182 lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
1186 lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]); 1183 lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
1187 lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]); 1184 lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]);
1188 lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]); 1185 lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]);
1189 lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]); 1186 lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]);
1190 lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]); 1187 lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]);
1191 lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]); 1188 lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]);
1192 lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]); 1189 lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]);
1193 lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]); 1190 lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]);
(...skipping 30 matching lines...) Expand all
1224 lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); 1221 lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
1225 lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); 1222 lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
1226 lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); 1223 lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
1227 lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); 1224 lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
1228 lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); 1225 lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
1229 lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); 1226 lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
1230 lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); 1227 lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
1231 lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); 1228 lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
1232 } 1229 }
1233 { 1230 {
1234 lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero); 1231 lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
1235 lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero); 1232 lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
1236 lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero); 1233 lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
1237 lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero); 1234 lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
1238 lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero); 1235 lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
1239 lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero); 1236 lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
1240 lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero); 1237 lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
1241 lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero); 1238 lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
1242 lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero); 1239 lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
1243 lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero); 1240 lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
1244 lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero); 1241 lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
1245 lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero); 1242 lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
1246 lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero); 1243 lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
1247 lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero); 1244 lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
1248 lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero); 1245 lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
1249 lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero); 1246 lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
1247 lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
1248 lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
1249 lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
1250 lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
1251 lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
1252 lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
1253 lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
1254 lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
1255 lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
1256 lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
1257 lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
1258 lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
1259 lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
1260 lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
1261 lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
1262 lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
1250 1263
1251 lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero); 1264 lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
1252 lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero); 1265 lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
1253 lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero); 1266 lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
1254 lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero); 1267 lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
1255 lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero); 1268 lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
1256 lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero); 1269 lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
1257 lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero); 1270 lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
1258 lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero); 1271 lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
1259 lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero); 1272 lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
1260 lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero); 1273 lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
1261 lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero); 1274 lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
1262 lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero); 1275 lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
1263 lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero); 1276 lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
1264 lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero); 1277 lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
1265 lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero); 1278 lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
1266 lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero); 1279 lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
1280 lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
1281 lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
1282 lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
1283 lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
1284 lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
1285 lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
1286 lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
1287 lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
1288 lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
1289 lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
1290 lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
1291 lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
1292 lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
1293 lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
1294 lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
1295 lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
1267 1296
1268 lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); 1297 lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
1269 lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); 1298 lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
1299
1270 lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); 1300 lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
1271 lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); 1301 lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
1272 lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); 1302 lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
1273 lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); 1303 lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
1274 lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); 1304 lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
1275 lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); 1305 lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
1276 lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); 1306 lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
1277 lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); 1307 lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
1278 lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); 1308 lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
1279 lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); 1309 lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
(...skipping 15 matching lines...) Expand all
1295 lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); 1325 lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
1296 lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); 1326 lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
1297 lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); 1327 lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
1298 lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); 1328 lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
1299 lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); 1329 lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
1300 } 1330 }
1301 1331
1302 // stage 4 1332 // stage 4
1303 { 1333 {
1304 // expanding to 32-bit length prior to addition operations 1334 // expanding to 32-bit length prior to addition operations
1305 lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero); 1335 lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero);
1306 lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero); 1336 lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero);
1307 lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero); 1337 lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero);
1308 lstep2[19] = k_cvthi_epi16(step2[ 9], mask16, kZero); 1338 lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero);
1309 lstep2[28] = k_cvtlo_epi16(step2[14], mask16, kZero); 1339 lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
1310 lstep2[29] = k_cvthi_epi16(step2[14], mask16, kZero); 1340 lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
1311 lstep2[30] = k_cvtlo_epi16(step2[15], mask16, kZero); 1341 lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
1312 lstep2[31] = k_cvthi_epi16(step2[15], mask16, kZero); 1342 lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
1343 lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
1344 lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
1345 lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
1346 lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
1347 lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
1348 lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
1349 lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
1350 lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
1313 1351
1314 lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]); 1352 lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]);
1315 lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]); 1353 lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]);
1316 lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]); 1354 lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]);
1317 lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]); 1355 lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]);
1318 lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]); 1356 lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]);
1319 lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]); 1357 lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]);
1320 lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]); 1358 lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]);
1321 lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]); 1359 lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]);
1322 lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); 1360 lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
1323 lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); 1361 lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
1324 lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); 1362 lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
1325 lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); 1363 lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
1326 lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); 1364 lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
1327 lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]); 1365 lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
1328 lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); 1366 lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
1329 lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); 1367 lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
1330 lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); 1368 lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
1331 lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]); 1369 lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
1332 lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); 1370 lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
1333 lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); 1371 lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
1334 lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); 1372 lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
1335 lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); 1373 lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
1336 lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); 1374 lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
1337 lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); 1375 lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
1338 } 1376 }
1339 { 1377 {
1340 // to be continued... 1378 // to be continued...
1341 // 1379 //
1342 const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); 1380 const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
1343 const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); 1381 const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
1344 1382
1345 u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); 1383 u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
1346 u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); 1384 u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
1347 u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); 1385 u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
1348 u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); 1386 u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
1349 1387
1350 // TODO(jingning): manually inline k_madd_epi32_ to further hide 1388 // TODO(jingning): manually inline k_madd_epi32_ to further hide
1351 // instruction latency. 1389 // instruction latency.
1352 v[ 0] = k_madd_epi32(u[0], k32_p16_m16); 1390 v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
1353 v[ 1] = k_madd_epi32(u[1], k32_p16_m16); 1391 v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
1354 v[ 2] = k_madd_epi32(u[2], k32_p16_m16); 1392 v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
1355 v[ 3] = k_madd_epi32(u[3], k32_p16_m16); 1393 v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
1356 v[ 4] = k_madd_epi32(u[0], k32_p16_p16); 1394 v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
1357 v[ 5] = k_madd_epi32(u[1], k32_p16_p16); 1395 v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
1358 v[ 6] = k_madd_epi32(u[2], k32_p16_p16); 1396 v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
1359 v[ 7] = k_madd_epi32(u[3], k32_p16_p16); 1397 v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
1360 1398
1361 u[0] = k_packs_epi64(v[0], v[1]); 1399 u[0] = k_packs_epi64(v[0], v[1]);
1362 u[1] = k_packs_epi64(v[2], v[3]); 1400 u[1] = k_packs_epi64(v[2], v[3]);
1363 u[2] = k_packs_epi64(v[4], v[5]); 1401 u[2] = k_packs_epi64(v[4], v[5]);
1364 u[3] = k_packs_epi64(v[6], v[7]); 1402 u[3] = k_packs_epi64(v[6], v[7]);
1365 1403
1366 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1404 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1367 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1405 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1368 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1406 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1369 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1407 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1370 1408
1371 lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1409 lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1372 lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1410 lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1373 lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1411 lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1374 lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1412 lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1375 } 1413 }
1376 { 1414 {
1377 const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); 1415 const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
1378 const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); 1416 const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
1379 const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); 1417 const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
1380 1418
1381 u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); 1419 u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
1382 u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); 1420 u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
1383 u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); 1421 u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
1384 u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); 1422 u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
(...skipping 1255 matching lines...) Expand 10 before | Expand all | Expand 10 after
2640 _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4); 2678 _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4);
2641 _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); 2679 _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5);
2642 _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); 2680 _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6);
2643 _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); 2681 _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7);
2644 // Process next 8x8 2682 // Process next 8x8
2645 output += 8; 2683 output += 8;
2646 } 2684 }
2647 } 2685 }
2648 } 2686 }
2649 } 2687 }
2650 } 2688 } // NOLINT
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_variance_c.c ('k') | source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698