OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <emmintrin.h> // SSE2 | 11 #include <emmintrin.h> // SSE2 |
12 #include "vp9/common/vp9_idct.h" // for cospi constants | 12 #include "vp9/common/vp9_idct.h" // for cospi constants |
13 #include "vpx_ports/mem.h" | 13 #include "vpx_ports/mem.h" |
14 | 14 |
15 #if FDCT32x32_HIGH_PRECISION | 15 #if FDCT32x32_HIGH_PRECISION |
16 static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { | 16 static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { |
17 __m128i buf0, buf1; | 17 __m128i buf0, buf1; |
18 buf0 = _mm_mul_epu32(a, b); | 18 buf0 = _mm_mul_epu32(a, b); |
19 a = _mm_srli_epi64(a, 32); | 19 a = _mm_srli_epi64(a, 32); |
20 b = _mm_srli_epi64(b, 32); | 20 b = _mm_srli_epi64(b, 32); |
21 buf1 = _mm_mul_epu32(a, b); | 21 buf1 = _mm_mul_epu32(a, b); |
22 return _mm_add_epi64(buf0, buf1); | 22 return _mm_add_epi64(buf0, buf1); |
23 } | 23 } |
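Aside: k_madd_epi32 is the 32-bit counterpart of _mm_madd_epi16 — it forms two 64-bit sums {a0*b0 + a1*b1, a2*b2 + a3*b3} from adjacent lane pairs. _mm_mul_epu32 is an unsigned multiply, but in the calls visible here only the low 32 bits of each sum survive (k_packs_epi64 truncates them), and signed and unsigned products agree modulo 2^32, so the result is still correct. A scalar sketch of one output lane (illustrative only):

  static int64_t k_madd_epi32_lane(int32_t a0, int32_t b0,
                                   int32_t a1, int32_t b1) {
    // One 64-bit lane: the sum of two adjacent 32x32->64 products.
    return (int64_t)a0 * b0 + (int64_t)a1 * b1;
  }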
24 | 24 |
25 static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { | 25 static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { |
26 __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); | 26 __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); |
27 __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); | 27 __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); |
28 return _mm_unpacklo_epi64(buf0, buf1); | 28 return _mm_unpacklo_epi64(buf0, buf1); |
29 } | 29 } |
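Despite the "packs" in its name, k_packs_epi64 does not saturate: the shuffles select the low 32 bits of each 64-bit lane, so the result is {a0, a1, b0, b1} reduced modulo 2^32. A scalar sketch (illustrative only):

  static void k_packs_epi64_sketch(const int64_t a[2], const int64_t b[2],
                                   int32_t out[4]) {
    // Truncate each 64-bit lane to its low 32 bits, in order a0 a1 b0 b1.
    out[0] = (int32_t)a[0];
    out[1] = (int32_t)a[1];
    out[2] = (int32_t)b[0];
    out[3] = (int32_t)b[1];
  }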
30 | |
31 static INLINE __m128i k_cvtlo_epi16(__m128i a, __m128i mask16, __m128i kZero) { | |
32 // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers | |
33 __m128i sign_bit = _mm_and_si128(a, mask16); | |
34 __m128i b = _mm_unpacklo_epi16(a, kZero); | |
35 sign_bit = _mm_cmplt_epi16(sign_bit, kZero); | |
36 sign_bit = _mm_unpacklo_epi16(kZero, sign_bit); | |
37 return _mm_or_si128(sign_bit, b); | |
38 } | |
39 | |
40 static INLINE __m128i k_cvthi_epi16(__m128i a, __m128i mask16, __m128i kZero) { | |
 41 // convert the upper 4 signed 16-bit integers into 4 signed 32-bit integers |
42 __m128i sign_bit = _mm_and_si128(a, mask16); | |
43 __m128i b = _mm_unpackhi_epi16(a, kZero); | |
44 sign_bit = _mm_cmplt_epi16(sign_bit, kZero); | |
45 sign_bit = _mm_unpackhi_epi16(kZero, sign_bit); | |
46 return _mm_or_si128(sign_bit, b); | |
47 } | |
48 #endif | 30 #endif |
49 | 31 |
50 void FDCT32x32_2D(int16_t *input, | 32 void FDCT32x32_2D(const int16_t *input, |
51 int16_t *output_org, int pitch) { | 33 int16_t *output_org, int stride) { |
52 // Calculate pre-multiplied strides | 34 // Calculate pre-multiplied strides |
53 const int str1 = pitch >> 1; | 35 const int str1 = stride; |
54 const int str2 = pitch; | 36 const int str2 = 2 * stride; |
55 const int str3 = pitch + str1; | 37 const int str3 = 2 * stride + str1; |
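(Here str1, str2 and str3 are one, two and three rows of int16_t: the new signature takes an element stride, where the old pitch was evidently a byte count, hence the former >> 1.)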
56 // We need an intermediate buffer between passes. | 38 // We need an intermediate buffer between passes. |
57 DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); | 39 DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); |
58 // Constants | 40 // Constants |
 59 // In one use they are all the same. In all other uses we need a pair | 41 // In one use they are all the same. In all other uses we need a pair |
 60 // of them repeated four times, which is done by constructing the | 42 // of them repeated four times, which is done by constructing the |
 61 // 32-bit constant corresponding to that pair. | 43 // 32-bit constant corresponding to that pair. |
62 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64); | 44 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64); |
63 const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); | 45 const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); |
64 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 46 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
65 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); | 47 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
(...skipping 38 matching lines...)
104 int column_start; | 86 int column_start; |
105 for (column_start = 0; column_start < 32; column_start += 8) { | 87 for (column_start = 0; column_start < 32; column_start += 8) { |
106 __m128i step1[32]; | 88 __m128i step1[32]; |
107 __m128i step2[32]; | 89 __m128i step2[32]; |
108 __m128i step3[32]; | 90 __m128i step3[32]; |
109 __m128i out[32]; | 91 __m128i out[32]; |
110 // Stage 1 | 92 // Stage 1 |
111 // Note: even though all the loads below are aligned, using the aligned | 93 // Note: even though all the loads below are aligned, using the aligned |
 112 // intrinsic makes the code slightly slower. | 94 // intrinsic makes the code slightly slower. |
113 if (0 == pass) { | 95 if (0 == pass) { |
114 int16_t *in = &input[column_start]; | 96 const int16_t *in = &input[column_start]; |
115 // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; | 97 // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; |
116 // Note: the next four blocks could be in a loop. That would help the | 98 // Note: the next four blocks could be in a loop. That would help the |
117 // instruction cache but is actually slower. | 99 // instruction cache but is actually slower. |
118 { | 100 { |
119 int16_t *ina = in + 0 * str1; | 101 const int16_t *ina = in + 0 * str1; |
120 int16_t *inb = in + 31 * str1; | 102 const int16_t *inb = in + 31 * str1; |
121 __m128i *step1a = &step1[ 0]; | 103 __m128i *step1a = &step1[ 0]; |
122 __m128i *step1b = &step1[31]; | 104 __m128i *step1b = &step1[31]; |
123 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); | 105 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); |
124 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); | 106 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); |
125 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); | 107 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); |
126 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); | 108 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); |
127 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); | 109 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); |
128 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); | 110 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); |
129 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); | 111 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); |
130 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); | 112 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); |
131 step1a[ 0] = _mm_add_epi16(ina0, inb0); | 113 step1a[ 0] = _mm_add_epi16(ina0, inb0); |
132 step1a[ 1] = _mm_add_epi16(ina1, inb1); | 114 step1a[ 1] = _mm_add_epi16(ina1, inb1); |
133 step1a[ 2] = _mm_add_epi16(ina2, inb2); | 115 step1a[ 2] = _mm_add_epi16(ina2, inb2); |
134 step1a[ 3] = _mm_add_epi16(ina3, inb3); | 116 step1a[ 3] = _mm_add_epi16(ina3, inb3); |
135 step1b[-3] = _mm_sub_epi16(ina3, inb3); | 117 step1b[-3] = _mm_sub_epi16(ina3, inb3); |
136 step1b[-2] = _mm_sub_epi16(ina2, inb2); | 118 step1b[-2] = _mm_sub_epi16(ina2, inb2); |
137 step1b[-1] = _mm_sub_epi16(ina1, inb1); | 119 step1b[-1] = _mm_sub_epi16(ina1, inb1); |
138 step1b[-0] = _mm_sub_epi16(ina0, inb0); | 120 step1b[-0] = _mm_sub_epi16(ina0, inb0); |
139 step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); | 121 step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); |
140 step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); | 122 step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); |
141 step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); | 123 step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); |
142 step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); | 124 step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); |
143 step1b[-3] = _mm_slli_epi16(step1b[-3], 2); | 125 step1b[-3] = _mm_slli_epi16(step1b[-3], 2); |
144 step1b[-2] = _mm_slli_epi16(step1b[-2], 2); | 126 step1b[-2] = _mm_slli_epi16(step1b[-2], 2); |
145 step1b[-1] = _mm_slli_epi16(step1b[-1], 2); | 127 step1b[-1] = _mm_slli_epi16(step1b[-1], 2); |
146 step1b[-0] = _mm_slli_epi16(step1b[-0], 2); | 128 step1b[-0] = _mm_slli_epi16(step1b[-0], 2); |
147 } | 129 } |
148 { | 130 { |
149 int16_t *ina = in + 4 * str1; | 131 const int16_t *ina = in + 4 * str1; |
150 int16_t *inb = in + 27 * str1; | 132 const int16_t *inb = in + 27 * str1; |
151 __m128i *step1a = &step1[ 4]; | 133 __m128i *step1a = &step1[ 4]; |
152 __m128i *step1b = &step1[27]; | 134 __m128i *step1b = &step1[27]; |
153 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); | 135 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); |
154 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); | 136 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); |
155 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); | 137 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); |
156 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); | 138 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); |
157 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); | 139 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); |
158 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); | 140 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); |
159 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); | 141 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); |
160 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); | 142 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); |
161 step1a[ 0] = _mm_add_epi16(ina0, inb0); | 143 step1a[ 0] = _mm_add_epi16(ina0, inb0); |
162 step1a[ 1] = _mm_add_epi16(ina1, inb1); | 144 step1a[ 1] = _mm_add_epi16(ina1, inb1); |
163 step1a[ 2] = _mm_add_epi16(ina2, inb2); | 145 step1a[ 2] = _mm_add_epi16(ina2, inb2); |
164 step1a[ 3] = _mm_add_epi16(ina3, inb3); | 146 step1a[ 3] = _mm_add_epi16(ina3, inb3); |
165 step1b[-3] = _mm_sub_epi16(ina3, inb3); | 147 step1b[-3] = _mm_sub_epi16(ina3, inb3); |
166 step1b[-2] = _mm_sub_epi16(ina2, inb2); | 148 step1b[-2] = _mm_sub_epi16(ina2, inb2); |
167 step1b[-1] = _mm_sub_epi16(ina1, inb1); | 149 step1b[-1] = _mm_sub_epi16(ina1, inb1); |
168 step1b[-0] = _mm_sub_epi16(ina0, inb0); | 150 step1b[-0] = _mm_sub_epi16(ina0, inb0); |
169 step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); | 151 step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); |
170 step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); | 152 step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); |
171 step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); | 153 step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); |
172 step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); | 154 step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); |
173 step1b[-3] = _mm_slli_epi16(step1b[-3], 2); | 155 step1b[-3] = _mm_slli_epi16(step1b[-3], 2); |
174 step1b[-2] = _mm_slli_epi16(step1b[-2], 2); | 156 step1b[-2] = _mm_slli_epi16(step1b[-2], 2); |
175 step1b[-1] = _mm_slli_epi16(step1b[-1], 2); | 157 step1b[-1] = _mm_slli_epi16(step1b[-1], 2); |
176 step1b[-0] = _mm_slli_epi16(step1b[-0], 2); | 158 step1b[-0] = _mm_slli_epi16(step1b[-0], 2); |
177 } | 159 } |
178 { | 160 { |
179 int16_t *ina = in + 8 * str1; | 161 const int16_t *ina = in + 8 * str1; |
180 int16_t *inb = in + 23 * str1; | 162 const int16_t *inb = in + 23 * str1; |
181 __m128i *step1a = &step1[ 8]; | 163 __m128i *step1a = &step1[ 8]; |
182 __m128i *step1b = &step1[23]; | 164 __m128i *step1b = &step1[23]; |
183 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); | 165 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); |
184 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); | 166 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); |
185 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); | 167 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); |
186 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); | 168 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); |
187 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); | 169 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); |
188 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); | 170 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); |
189 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); | 171 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); |
190 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); | 172 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); |
191 step1a[ 0] = _mm_add_epi16(ina0, inb0); | 173 step1a[ 0] = _mm_add_epi16(ina0, inb0); |
192 step1a[ 1] = _mm_add_epi16(ina1, inb1); | 174 step1a[ 1] = _mm_add_epi16(ina1, inb1); |
193 step1a[ 2] = _mm_add_epi16(ina2, inb2); | 175 step1a[ 2] = _mm_add_epi16(ina2, inb2); |
194 step1a[ 3] = _mm_add_epi16(ina3, inb3); | 176 step1a[ 3] = _mm_add_epi16(ina3, inb3); |
195 step1b[-3] = _mm_sub_epi16(ina3, inb3); | 177 step1b[-3] = _mm_sub_epi16(ina3, inb3); |
196 step1b[-2] = _mm_sub_epi16(ina2, inb2); | 178 step1b[-2] = _mm_sub_epi16(ina2, inb2); |
197 step1b[-1] = _mm_sub_epi16(ina1, inb1); | 179 step1b[-1] = _mm_sub_epi16(ina1, inb1); |
198 step1b[-0] = _mm_sub_epi16(ina0, inb0); | 180 step1b[-0] = _mm_sub_epi16(ina0, inb0); |
199 step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); | 181 step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); |
200 step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); | 182 step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); |
201 step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); | 183 step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); |
202 step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); | 184 step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); |
203 step1b[-3] = _mm_slli_epi16(step1b[-3], 2); | 185 step1b[-3] = _mm_slli_epi16(step1b[-3], 2); |
204 step1b[-2] = _mm_slli_epi16(step1b[-2], 2); | 186 step1b[-2] = _mm_slli_epi16(step1b[-2], 2); |
205 step1b[-1] = _mm_slli_epi16(step1b[-1], 2); | 187 step1b[-1] = _mm_slli_epi16(step1b[-1], 2); |
206 step1b[-0] = _mm_slli_epi16(step1b[-0], 2); | 188 step1b[-0] = _mm_slli_epi16(step1b[-0], 2); |
207 } | 189 } |
208 { | 190 { |
209 int16_t *ina = in + 12 * str1; | 191 const int16_t *ina = in + 12 * str1; |
210 int16_t *inb = in + 19 * str1; | 192 const int16_t *inb = in + 19 * str1; |
211 __m128i *step1a = &step1[12]; | 193 __m128i *step1a = &step1[12]; |
212 __m128i *step1b = &step1[19]; | 194 __m128i *step1b = &step1[19]; |
213 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); | 195 const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); |
214 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); | 196 const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); |
215 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); | 197 const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); |
216 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); | 198 const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); |
217 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); | 199 const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); |
218 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); | 200 const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); |
219 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); | 201 const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); |
220 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); | 202 const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); |
(...skipping 931 matching lines...)
1152 out[29] = _mm_packs_epi32(out_29_6, out_29_7); | 1134 out[29] = _mm_packs_epi32(out_29_6, out_29_7); |
1153 out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); | 1135 out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); |
1154 out[19] = _mm_packs_epi32(out_19_6, out_19_7); | 1136 out[19] = _mm_packs_epi32(out_19_6, out_19_7); |
1155 out[11] = _mm_packs_epi32(out_11_6, out_11_7); | 1137 out[11] = _mm_packs_epi32(out_11_6, out_11_7); |
1156 out[27] = _mm_packs_epi32(out_27_6, out_27_7); | 1138 out[27] = _mm_packs_epi32(out_27_6, out_27_7); |
1157 } | 1139 } |
1158 #if FDCT32x32_HIGH_PRECISION | 1140 #if FDCT32x32_HIGH_PRECISION |
1159 } else { | 1141 } else { |
1160 __m128i lstep1[64], lstep2[64], lstep3[64]; | 1142 __m128i lstep1[64], lstep2[64], lstep3[64]; |
1161 __m128i u[32], v[32], sign[16]; | 1143 __m128i u[32], v[32], sign[16]; |
1162 const __m128i mask16 = _mm_set1_epi32(0x80008000); | |
1163 const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); | 1144 const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); |
1164 // start using 32-bit operations | 1145 // start using 32-bit operations |
1165 // stage 3 | 1146 // stage 3 |
1166 { | 1147 { |
 1167 // expanding to 32-bit length prior to the addition operations | 1148 // expanding to 32-bit length prior to the addition operations |
1168 lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero); | 1149 lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero); |
1169 lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero); | 1150 lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero); |
1170 lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero); | 1151 lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero); |
1171 lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero); | 1152 lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero); |
1172 lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero); | 1153 lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero); |
1173 lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero); | 1154 lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero); |
1174 lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero); | 1155 lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero); |
1175 lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero); | 1156 lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero); |
1176 lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero); | 1157 lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero); |
1177 lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero); | 1158 lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero); |
1178 lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero); | 1159 lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero); |
1179 lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero); | 1160 lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero); |
1180 lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero); | 1161 lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero); |
1181 lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero); | 1162 lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero); |
1182 lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero); | 1163 lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero); |
1183 lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero); | 1164 lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero); |
| 1165 lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne); |
| 1166 lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne); |
| 1167 lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne); |
| 1168 lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne); |
| 1169 lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne); |
| 1170 lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne); |
| 1171 lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne); |
| 1172 lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne); |
| 1173 lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne); |
| 1174 lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne); |
| 1175 lstep2[10] = _mm_madd_epi16(lstep2[10], kOne); |
| 1176 lstep2[11] = _mm_madd_epi16(lstep2[11], kOne); |
| 1177 lstep2[12] = _mm_madd_epi16(lstep2[12], kOne); |
| 1178 lstep2[13] = _mm_madd_epi16(lstep2[13], kOne); |
| 1179 lstep2[14] = _mm_madd_epi16(lstep2[14], kOne); |
| 1180 lstep2[15] = _mm_madd_epi16(lstep2[15], kOne); |
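The _mm_unpack*_epi16/_mm_madd_epi16 pairs above replace the deleted k_cvtlo_epi16/k_cvthi_epi16 helpers: unpacking against kZero zero-extends four int16 lanes to 32 bits, and the madd against kOne then computes x[i]*1 + 0*1 with signed 16-bit multiplies, restoring the sign bits. SSE2 lacks _mm_cvtepi16_epi32 (that is SSE4.1), so this is a two-instruction signed 16->32 widening. A sketch of the low-half case, assuming kZero and kOne are the _mm_set1_epi16(0) / _mm_set1_epi16(1) constants defined in the elided hunk above:

  static __m128i widen_lo_epi16(__m128i x, __m128i kZero, __m128i kOne) {
    // Zero-extend the low four 16-bit lanes into 32-bit lanes, ...
    const __m128i zero_extended = _mm_unpacklo_epi16(x, kZero);
    // ... then madd by 1: each 32-bit lane becomes the signed product
    // x[i] * 1 + 0 * 1, i.e. sign_extend(x[i]).
    return _mm_madd_epi16(zero_extended, kOne);
  }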
1184 | 1181 |
1185 lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]); | 1182 lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]); |
1186 lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]); | 1183 lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]); |
1187 lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]); | 1184 lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]); |
1188 lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]); | 1185 lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]); |
1189 lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]); | 1186 lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]); |
1190 lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]); | 1187 lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]); |
1191 lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]); | 1188 lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]); |
1192 lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]); | 1189 lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]); |
1193 lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]); | 1190 lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]); |
(...skipping 30 matching lines...)
1224 lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); | 1221 lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); |
1225 lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); | 1222 lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); |
1226 lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); | 1223 lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); |
1227 lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); | 1224 lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); |
1228 lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); | 1225 lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); |
1229 lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); | 1226 lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); |
1230 lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); | 1227 lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); |
1231 lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); | 1228 lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); |
1232 } | 1229 } |
1233 { | 1230 { |
1234 lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero); | 1231 lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero); |
1235 lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero); | 1232 lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero); |
1236 lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero); | 1233 lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero); |
1237 lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero); | 1234 lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero); |
1238 lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero); | 1235 lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero); |
1239 lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero); | 1236 lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero); |
1240 lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero); | 1237 lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero); |
1241 lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero); | 1238 lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero); |
1242 lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero); | 1239 lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero); |
1243 lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero); | 1240 lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero); |
1244 lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero); | 1241 lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero); |
1245 lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero); | 1242 lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero); |
1246 lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero); | 1243 lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero); |
1247 lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero); | 1244 lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero); |
1248 lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero); | 1245 lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero); |
1249 lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero); | 1246 lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero); |
| 1247 lstep2[40] = _mm_madd_epi16(lstep2[40], kOne); |
| 1248 lstep2[41] = _mm_madd_epi16(lstep2[41], kOne); |
| 1249 lstep2[42] = _mm_madd_epi16(lstep2[42], kOne); |
| 1250 lstep2[43] = _mm_madd_epi16(lstep2[43], kOne); |
| 1251 lstep2[44] = _mm_madd_epi16(lstep2[44], kOne); |
| 1252 lstep2[45] = _mm_madd_epi16(lstep2[45], kOne); |
| 1253 lstep2[46] = _mm_madd_epi16(lstep2[46], kOne); |
| 1254 lstep2[47] = _mm_madd_epi16(lstep2[47], kOne); |
| 1255 lstep2[48] = _mm_madd_epi16(lstep2[48], kOne); |
| 1256 lstep2[49] = _mm_madd_epi16(lstep2[49], kOne); |
| 1257 lstep2[50] = _mm_madd_epi16(lstep2[50], kOne); |
| 1258 lstep2[51] = _mm_madd_epi16(lstep2[51], kOne); |
| 1259 lstep2[52] = _mm_madd_epi16(lstep2[52], kOne); |
| 1260 lstep2[53] = _mm_madd_epi16(lstep2[53], kOne); |
| 1261 lstep2[54] = _mm_madd_epi16(lstep2[54], kOne); |
| 1262 lstep2[55] = _mm_madd_epi16(lstep2[55], kOne); |
1250 | 1263 |
1251 lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero); | 1264 lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero); |
1252 lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero); | 1265 lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero); |
1253 lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero); | 1266 lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero); |
1254 lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero); | 1267 lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero); |
1255 lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero); | 1268 lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero); |
1256 lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero); | 1269 lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero); |
1257 lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero); | 1270 lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero); |
1258 lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero); | 1271 lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero); |
1259 lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero); | 1272 lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero); |
1260 lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero); | 1273 lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero); |
1261 lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero); | 1274 lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero); |
1262 lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero); | 1275 lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero); |
1263 lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero); | 1276 lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero); |
1264 lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero); | 1277 lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero); |
1265 lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero); | 1278 lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero); |
1266 lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero); | 1279 lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero); |
| 1280 lstep1[32] = _mm_madd_epi16(lstep1[32], kOne); |
| 1281 lstep1[33] = _mm_madd_epi16(lstep1[33], kOne); |
| 1282 lstep1[34] = _mm_madd_epi16(lstep1[34], kOne); |
| 1283 lstep1[35] = _mm_madd_epi16(lstep1[35], kOne); |
| 1284 lstep1[36] = _mm_madd_epi16(lstep1[36], kOne); |
| 1285 lstep1[37] = _mm_madd_epi16(lstep1[37], kOne); |
| 1286 lstep1[38] = _mm_madd_epi16(lstep1[38], kOne); |
| 1287 lstep1[39] = _mm_madd_epi16(lstep1[39], kOne); |
| 1288 lstep1[56] = _mm_madd_epi16(lstep1[56], kOne); |
| 1289 lstep1[57] = _mm_madd_epi16(lstep1[57], kOne); |
| 1290 lstep1[58] = _mm_madd_epi16(lstep1[58], kOne); |
| 1291 lstep1[59] = _mm_madd_epi16(lstep1[59], kOne); |
| 1292 lstep1[60] = _mm_madd_epi16(lstep1[60], kOne); |
| 1293 lstep1[61] = _mm_madd_epi16(lstep1[61], kOne); |
| 1294 lstep1[62] = _mm_madd_epi16(lstep1[62], kOne); |
| 1295 lstep1[63] = _mm_madd_epi16(lstep1[63], kOne); |
1267 | 1296 |
1268 lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); | 1297 lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); |
1269 lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); | 1298 lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); |
| 1299 |
1270 lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); | 1300 lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); |
1271 lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); | 1301 lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); |
1272 lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); | 1302 lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); |
1273 lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); | 1303 lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); |
1274 lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); | 1304 lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); |
1275 lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); | 1305 lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); |
1276 lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); | 1306 lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); |
1277 lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); | 1307 lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); |
1278 lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); | 1308 lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); |
1279 lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); | 1309 lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); |
(...skipping 15 matching lines...)
1295 lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); | 1325 lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); |
1296 lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); | 1326 lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); |
1297 lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); | 1327 lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); |
1298 lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); | 1328 lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); |
1299 lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); | 1329 lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); |
1300 } | 1330 } |
1301 | 1331 |
1302 // stage 4 | 1332 // stage 4 |
1303 { | 1333 { |
 1304 // expanding to 32-bit length prior to the addition operations | 1334 // expanding to 32-bit length prior to the addition operations |
1305 lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero); | 1335 lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero); |
1306 lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero); | 1336 lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero); |
1307 lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero); | 1337 lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero); |
1308 lstep2[19] = k_cvthi_epi16(step2[ 9], mask16, kZero); | 1338 lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero); |
1309 lstep2[28] = k_cvtlo_epi16(step2[14], mask16, kZero); | 1339 lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero); |
1310 lstep2[29] = k_cvthi_epi16(step2[14], mask16, kZero); | 1340 lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero); |
1311 lstep2[30] = k_cvtlo_epi16(step2[15], mask16, kZero); | 1341 lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero); |
1312 lstep2[31] = k_cvthi_epi16(step2[15], mask16, kZero); | 1342 lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero); |
| 1343 lstep2[16] = _mm_madd_epi16(lstep2[16], kOne); |
| 1344 lstep2[17] = _mm_madd_epi16(lstep2[17], kOne); |
| 1345 lstep2[18] = _mm_madd_epi16(lstep2[18], kOne); |
| 1346 lstep2[19] = _mm_madd_epi16(lstep2[19], kOne); |
| 1347 lstep2[28] = _mm_madd_epi16(lstep2[28], kOne); |
| 1348 lstep2[29] = _mm_madd_epi16(lstep2[29], kOne); |
| 1349 lstep2[30] = _mm_madd_epi16(lstep2[30], kOne); |
| 1350 lstep2[31] = _mm_madd_epi16(lstep2[31], kOne); |
1313 | 1351 |
1314 lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]); | 1352 lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]); |
1315 lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]); | 1353 lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]); |
1316 lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]); | 1354 lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]); |
1317 lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]); | 1355 lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]); |
1318 lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]); | 1356 lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]); |
1319 lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]); | 1357 lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]); |
1320 lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]); | 1358 lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]); |
1321 lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]); | 1359 lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]); |
1322 lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); | 1360 lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); |
1323 lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); | 1361 lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); |
1324 lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); | 1362 lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); |
1325 lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); | 1363 lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); |
1326 lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); | 1364 lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); |
1327 lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]); | 1365 lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]); |
1328 lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); | 1366 lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); |
1329 lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); | 1367 lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); |
1330 lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); | 1368 lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); |
1331 lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]); | 1369 lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]); |
1332 lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); | 1370 lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); |
1333 lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); | 1371 lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); |
1334 lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); | 1372 lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); |
1335 lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); | 1373 lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); |
1336 lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); | 1374 lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); |
1337 lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); | 1375 lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); |
1338 } | 1376 } |
1339 { | 1377 { |
1340 // to be continued... | 1378 // to be continued... |
1341 // | 1379 // |
1342 const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); | 1380 const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); |
1343 const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); | 1381 const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); |
1344 | 1382 |
1345 u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); | 1383 u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); |
1346 u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); | 1384 u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); |
1347 u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); | 1385 u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); |
1348 u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); | 1386 u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); |
1349 | 1387 |
1350 // TODO(jingning): manually inline k_madd_epi32_ to further hide | 1388 // TODO(jingning): manually inline k_madd_epi32_ to further hide |
1351 // instruction latency. | 1389 // instruction latency. |
1352 v[ 0] = k_madd_epi32(u[0], k32_p16_m16); | 1390 v[ 0] = k_madd_epi32(u[0], k32_p16_m16); |
1353 v[ 1] = k_madd_epi32(u[1], k32_p16_m16); | 1391 v[ 1] = k_madd_epi32(u[1], k32_p16_m16); |
1354 v[ 2] = k_madd_epi32(u[2], k32_p16_m16); | 1392 v[ 2] = k_madd_epi32(u[2], k32_p16_m16); |
1355 v[ 3] = k_madd_epi32(u[3], k32_p16_m16); | 1393 v[ 3] = k_madd_epi32(u[3], k32_p16_m16); |
1356 v[ 4] = k_madd_epi32(u[0], k32_p16_p16); | 1394 v[ 4] = k_madd_epi32(u[0], k32_p16_p16); |
1357 v[ 5] = k_madd_epi32(u[1], k32_p16_p16); | 1395 v[ 5] = k_madd_epi32(u[1], k32_p16_p16); |
1358 v[ 6] = k_madd_epi32(u[2], k32_p16_p16); | 1396 v[ 6] = k_madd_epi32(u[2], k32_p16_p16); |
1359 v[ 7] = k_madd_epi32(u[3], k32_p16_p16); | 1397 v[ 7] = k_madd_epi32(u[3], k32_p16_p16); |
1360 | 1398 |
1361 u[0] = k_packs_epi64(v[0], v[1]); | 1399 u[0] = k_packs_epi64(v[0], v[1]); |
1362 u[1] = k_packs_epi64(v[2], v[3]); | 1400 u[1] = k_packs_epi64(v[2], v[3]); |
1363 u[2] = k_packs_epi64(v[4], v[5]); | 1401 u[2] = k_packs_epi64(v[4], v[5]); |
1364 u[3] = k_packs_epi64(v[6], v[7]); | 1402 u[3] = k_packs_epi64(v[6], v[7]); |
1365 | 1403 |
1366 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); | 1404 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); |
1367 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); | 1405 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); |
1368 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); | 1406 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); |
1369 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); | 1407 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); |
1370 | 1408 |
1371 lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); | 1409 lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
1372 lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); | 1410 lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
1373 lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); | 1411 lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
1374 lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); | 1412 lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
1375 } | 1413 } |
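Each block like the one above is a fixed-point rotation butterfly evaluated at 32-bit precision: interleave the two inputs, multiply-accumulate against a (cospi, ±cospi) pair, then round and shift by DCT_CONST_BITS (14 in vp9_idct.h; k__DCT_CONST_ROUNDING is 1 << 13). A scalar model of one output lane (a sketch; it assumes the intermediate sum fits in 32 bits, which the k_packs_epi64 truncation relies on):

  static int32_t butterfly_lane(int32_t a, int32_t b, int32_t c0, int32_t c1) {
    // (a * c0 + b * c1 + 2^(DCT_CONST_BITS - 1)) >> DCT_CONST_BITS
    const int64_t sum = (int64_t)a * c0 + (int64_t)b * c1;
    return (int32_t)((sum + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
  }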
1376 { | 1414 { |
1377 const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); | 1415 const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); |
1378 const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); | 1416 const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); |
1379 const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); | 1417 const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); |
1380 | 1418 |
1381 u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); | 1419 u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); |
1382 u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); | 1420 u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); |
1383 u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); | 1421 u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); |
1384 u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); | 1422 u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); |
(...skipping 1255 matching lines...)
2640 _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4); | 2678 _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4); |
2641 _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); | 2679 _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); |
2642 _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); | 2680 _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); |
2643 _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); | 2681 _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); |
2644 // Process next 8x8 | 2682 // Process next 8x8 |
2645 output += 8; | 2683 output += 8; |
2646 } | 2684 } |
2647 } | 2685 } |
2648 } | 2686 } |
2649 } | 2687 } |
2650 } | 2688 } // NOLINT |