Chromium Code Reviews

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c

Issue 111463005: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years ago
OLD | NEW
1 /* 1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <assert.h> 11 #include <assert.h>
12 #include <emmintrin.h> // SSE2 12 #include <emmintrin.h> // SSE2
13 #include "./vpx_config.h" 13 #include "./vpx_config.h"
14 #include "vpx/vpx_integer.h" 14 #include "vpx/vpx_integer.h"
15 #include "vp9/common/vp9_common.h" 15 #include "vp9/common/vp9_common.h"
16 #include "vp9/common/vp9_idct.h" 16 #include "vp9/common/vp9_idct.h"
17 17
18 #define RECON_AND_STORE4X4(dest, in_x) \
19 { \
20 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
21 d0 = _mm_unpacklo_epi8(d0, zero); \
22 d0 = _mm_add_epi16(in_x, d0); \
23 d0 = _mm_packus_epi16(d0, d0); \
24 *(int *)dest = _mm_cvtsi128_si32(d0); \
25 dest += stride; \
26 }
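
Note: RECON_AND_STORE4X4 moves to file scope so the DC-only path further down can expand it too; it assumes a __m128i named zero and an int stride are visible at the expansion site. One expansion reconstructs a single 4-pixel row and advances dest; a scalar sketch (residual is a hypothetical name for the lanes of in_x, clip_pixel is the saturating helper from vp9_common.h):

    // Widen 4 dest bytes to 16 bits, add the residual row, saturate
    // back to [0, 255], store, then step dest to the next row.
    int j;
    for (j = 0; j < 4; ++j)
      dest[j] = clip_pixel(dest[j] + residual[j]);
    dest += stride;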
27
18 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 28 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
19 const __m128i zero = _mm_setzero_si128(); 29 const __m128i zero = _mm_setzero_si128();
20 const __m128i eight = _mm_set1_epi16(8); 30 const __m128i eight = _mm_set1_epi16(8);
21 const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, 31 const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
22 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, 32 (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
23 (int16_t)cospi_24_64, (int16_t)-cospi_8_64, 33 (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
24 (int16_t)cospi_8_64, (int16_t)cospi_24_64); 34 (int16_t)cospi_8_64, (int16_t)cospi_24_64);
25 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 35 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
26 __m128i input0, input1, input2, input3; 36 __m128i input0, input1, input2, input3;
27 37
28 // Rows 38 // Rows
29 input0 = _mm_loadl_epi64((const __m128i *)input); 39 input0 = _mm_load_si128((const __m128i *)input);
30 input1 = _mm_loadl_epi64((const __m128i *)(input + 4)); 40 input2 = _mm_load_si128((const __m128i *)(input + 8));
31 input2 = _mm_loadl_epi64((const __m128i *)(input + 8));
32 input3 = _mm_loadl_epi64((const __m128i *)(input + 12));
33 41
34 // Construct i3, i1, i3, i1, i2, i0, i2, i0 42 // Construct i3, i1, i3, i1, i2, i0, i2, i0
35 input0 = _mm_shufflelo_epi16(input0, 0xd8); 43 input0 = _mm_shufflelo_epi16(input0, 0xd8);
36 input1 = _mm_shufflelo_epi16(input1, 0xd8); 44 input0 = _mm_shufflehi_epi16(input0, 0xd8);
37 input2 = _mm_shufflelo_epi16(input2, 0xd8); 45 input2 = _mm_shufflelo_epi16(input2, 0xd8);
38 input3 = _mm_shufflelo_epi16(input3, 0xd8); 46 input2 = _mm_shufflehi_epi16(input2, 0xd8);
39 47
48 input1 = _mm_unpackhi_epi32(input0, input0);
40 input0 = _mm_unpacklo_epi32(input0, input0); 49 input0 = _mm_unpacklo_epi32(input0, input0);
41 input1 = _mm_unpacklo_epi32(input1, input1); 50 input3 = _mm_unpackhi_epi32(input2, input2);
42 input2 = _mm_unpacklo_epi32(input2, input2); 51 input2 = _mm_unpacklo_epi32(input2, input2);
43 input3 = _mm_unpacklo_epi32(input3, input3);
44 52
45 // Stage 1 53 // Stage 1
46 input0 = _mm_madd_epi16(input0, cst); 54 input0 = _mm_madd_epi16(input0, cst);
47 input1 = _mm_madd_epi16(input1, cst); 55 input1 = _mm_madd_epi16(input1, cst);
48 input2 = _mm_madd_epi16(input2, cst); 56 input2 = _mm_madd_epi16(input2, cst);
49 input3 = _mm_madd_epi16(input3, cst); 57 input3 = _mm_madd_epi16(input3, cst);
50 58
51 input0 = _mm_add_epi32(input0, rounding); 59 input0 = _mm_add_epi32(input0, rounding);
52 input1 = _mm_add_epi32(input1, rounding); 60 input1 = _mm_add_epi32(input1, rounding);
53 input2 = _mm_add_epi32(input2, rounding); 61 input2 = _mm_add_epi32(input2, rounding);
54 input3 = _mm_add_epi32(input3, rounding); 62 input3 = _mm_add_epi32(input3, rounding);
55 63
56 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); 64 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
57 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); 65 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
58 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); 66 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
59 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); 67 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
60 68
61 // Stage 2 69 // Stage 2
62 input0 = _mm_packs_epi32(input0, zero); 70 input0 = _mm_packs_epi32(input0, input1);
63 input1 = _mm_packs_epi32(input1, zero); 71 input1 = _mm_packs_epi32(input2, input3);
64 input2 = _mm_packs_epi32(input2, zero);
65 input3 = _mm_packs_epi32(input3, zero);
66 72
67 // Transpose 73 // Transpose
68 input1 = _mm_unpacklo_epi16(input0, input1); 74 input2 = _mm_unpacklo_epi16(input0, input1);
69 input3 = _mm_unpacklo_epi16(input2, input3); 75 input3 = _mm_unpackhi_epi16(input0, input1);
70 input0 = _mm_unpacklo_epi32(input1, input3); 76 input0 = _mm_unpacklo_epi32(input2, input3);
71 input1 = _mm_unpackhi_epi32(input1, input3); 77 input1 = _mm_unpackhi_epi32(input2, input3);
72 78
73 // Switch columns 2 and 3; we then have: 79 // Switch columns 2 and 3; we then have:
74 // input2: columns 1, 0; input3: columns 2, 3. 80 // input2: columns 1, 0; input3: columns 2, 3.
75 input1 = _mm_shuffle_epi32(input1, 0x4e); 81 input1 = _mm_shuffle_epi32(input1, 0x4e);
76 input2 = _mm_add_epi16(input0, input1); 82 input2 = _mm_add_epi16(input0, input1);
77 input3 = _mm_sub_epi16(input0, input1); 83 input3 = _mm_sub_epi16(input0, input1);
78 84
79 // Columns 85 // Columns
80 // Construct i3, i1, i3, i1, i2, i0, i2, i0 86 // Construct i3, i1, i3, i1, i2, i0, i2, i0
81 input0 = _mm_shufflelo_epi16(input2, 0xd8); 87 input0 = _mm_unpacklo_epi32(input2, input2);
82 input1 = _mm_shufflehi_epi16(input2, 0xd8); 88 input1 = _mm_unpackhi_epi32(input2, input2);
83 input2 = _mm_shufflehi_epi16(input3, 0xd8); 89 input2 = _mm_unpackhi_epi32(input3, input3);
84 input3 = _mm_shufflelo_epi16(input3, 0xd8);
85
86 input0 = _mm_unpacklo_epi32(input0, input0);
87 input1 = _mm_unpackhi_epi32(input1, input1);
88 input2 = _mm_unpackhi_epi32(input2, input2);
89 input3 = _mm_unpacklo_epi32(input3, input3); 90 input3 = _mm_unpacklo_epi32(input3, input3);
90 91
91 // Stage 1 92 // Stage 1
92 input0 = _mm_madd_epi16(input0, cst); 93 input0 = _mm_madd_epi16(input0, cst);
93 input1 = _mm_madd_epi16(input1, cst); 94 input1 = _mm_madd_epi16(input1, cst);
94 input2 = _mm_madd_epi16(input2, cst); 95 input2 = _mm_madd_epi16(input2, cst);
95 input3 = _mm_madd_epi16(input3, cst); 96 input3 = _mm_madd_epi16(input3, cst);
96 97
97 input0 = _mm_add_epi32(input0, rounding); 98 input0 = _mm_add_epi32(input0, rounding);
98 input1 = _mm_add_epi32(input1, rounding); 99 input1 = _mm_add_epi32(input1, rounding);
99 input2 = _mm_add_epi32(input2, rounding); 100 input2 = _mm_add_epi32(input2, rounding);
100 input3 = _mm_add_epi32(input3, rounding); 101 input3 = _mm_add_epi32(input3, rounding);
101 102
102 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); 103 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
103 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); 104 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
104 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); 105 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
105 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); 106 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
106 107
107 // Stage 2 108 // Stage 2
108 input0 = _mm_packs_epi32(input0, zero); 109 input0 = _mm_packs_epi32(input0, input2);
109 input1 = _mm_packs_epi32(input1, zero); 110 input1 = _mm_packs_epi32(input1, input3);
110 input2 = _mm_packs_epi32(input2, zero);
111 input3 = _mm_packs_epi32(input3, zero);
112 111
113 // Transpose 112 // Transpose
114 input1 = _mm_unpacklo_epi16(input0, input1); 113 input2 = _mm_unpacklo_epi16(input0, input1);
115 input3 = _mm_unpacklo_epi16(input2, input3); 114 input3 = _mm_unpackhi_epi16(input0, input1);
116 input0 = _mm_unpacklo_epi32(input1, input3); 115 input0 = _mm_unpacklo_epi32(input2, input3);
117 input1 = _mm_unpackhi_epi32(input1, input3); 116 input1 = _mm_unpackhi_epi32(input2, input3);
118 117
119 // Switch columns 2 and 3; we then have: 118 // Switch columns 2 and 3; we then have:
120 // input2: columns 1, 0; input3: columns 2, 3. 119 // input2: columns 1, 0; input3: columns 2, 3.
121 input1 = _mm_shuffle_epi32(input1, 0x4e); 120 input1 = _mm_shuffle_epi32(input1, 0x4e);
122 input2 = _mm_add_epi16(input0, input1); 121 input2 = _mm_add_epi16(input0, input1);
123 input3 = _mm_sub_epi16(input0, input1); 122 input3 = _mm_sub_epi16(input0, input1);
124 123
125 // Final round and shift 124 // Final round and shift
126 input2 = _mm_add_epi16(input2, eight); 125 input2 = _mm_add_epi16(input2, eight);
127 input3 = _mm_add_epi16(input3, eight); 126 input3 = _mm_add_epi16(input3, eight);
128 127
129 input2 = _mm_srai_epi16(input2, 4); 128 input2 = _mm_srai_epi16(input2, 4);
130 input3 = _mm_srai_epi16(input3, 4); 129 input3 = _mm_srai_epi16(input3, 4);
131 130
132 #define RECON_AND_STORE4X4(dest, in_x) \ 131 // Reconstruction and Store
133 { \ 132 {
134 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ 133 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
135 d0 = _mm_unpacklo_epi8(d0, zero); \ 134 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
136 d0 = _mm_add_epi16(in_x, d0); \ 135 d0 = _mm_unpacklo_epi32(d0,
137 d0 = _mm_packus_epi16(d0, d0); \ 136 _mm_cvtsi32_si128(*(const int *) (dest + stride)));
138 *(int *)dest = _mm_cvtsi128_si32(d0); \ 137 d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
139 dest += stride; \ 138 *(const int *) (dest + stride * 3)), d2);
139 d0 = _mm_unpacklo_epi8(d0, zero);
140 d2 = _mm_unpacklo_epi8(d2, zero);
141 d0 = _mm_add_epi16(d0, input2);
142 d2 = _mm_add_epi16(d2, input3);
143 d0 = _mm_packus_epi16(d0, d2);
144 // store input0
145 *(int *)dest = _mm_cvtsi128_si32(d0);
146 // store input1
147 d0 = _mm_srli_si128(d0, 4);
148 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
149 // store input2
150 d0 = _mm_srli_si128(d0, 4);
151 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
152 // store input3
153 d0 = _mm_srli_si128(d0, 4);
154 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
140 } 155 }
141
142 input0 = _mm_srli_si128(input2, 8);
143 input1 = _mm_srli_si128(input3, 8);
144
145 RECON_AND_STORE4X4(dest, input2);
146 RECON_AND_STORE4X4(dest, input0);
147 RECON_AND_STORE4X4(dest, input1);
148 RECON_AND_STORE4X4(dest, input3);
149 } 156 }
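
For reference, the behavior the vectorized path above must preserve is the plain-C 4x4 pipeline: a 1-D 4-point IDCT over the rows, the same over the columns, a rounding shift by 4, then a saturating add into dest. A sketch (hedged; idct4_1d as in the scalar vp9_idct.c, helper names approximate):

    void idct4x4_16_add_ref(const int16_t *input, uint8_t *dest, int stride) {
      int16_t out[4 * 4], temp_in[4], temp_out[4];
      int i, j;
      for (i = 0; i < 4; ++i)                       // rows
        idct4_1d(input + 4 * i, out + 4 * i);
      for (i = 0; i < 4; ++i) {                     // columns
        for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
        idct4_1d(temp_in, temp_out);
        for (j = 0; j < 4; ++j)                     // round, shift, reconstruct
          dest[j * stride + i] = clip_pixel(
              dest[j * stride + i] + ROUND_POWER_OF_TWO(temp_out[j], 4));
      }
    }

One subtlety in the reconstruction block above: input3 leaves the final butterfly with its rows ordered (row 3, row 2), so d2 is gathered to match, and the last two stores go to dest + stride * 3 and then dest + stride * 2.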
150 157
151 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 158 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
152 __m128i dc_value; 159 __m128i dc_value;
153 const __m128i zero = _mm_setzero_si128(); 160 const __m128i zero = _mm_setzero_si128();
154 int a; 161 int a;
155 162
156 a = dct_const_round_shift(input[0] * cospi_16_64); 163 a = dct_const_round_shift(input[0] * cospi_16_64);
157 a = dct_const_round_shift(a * cospi_16_64); 164 a = dct_const_round_shift(a * cospi_16_64);
158 a = ROUND_POWER_OF_TWO(a, 4); 165 a = ROUND_POWER_OF_TWO(a, 4);
159 166
160 dc_value = _mm_set1_epi16(a); 167 dc_value = _mm_set1_epi16(a);
161 168
162 RECON_AND_STORE4X4(dest, dc_value); 169 RECON_AND_STORE4X4(dest, dc_value);
163 RECON_AND_STORE4X4(dest, dc_value); 170 RECON_AND_STORE4X4(dest, dc_value);
164 RECON_AND_STORE4X4(dest, dc_value); 171 RECON_AND_STORE4X4(dest, dc_value);
165 RECON_AND_STORE4X4(dest, dc_value); 172 RECON_AND_STORE4X4(dest, dc_value);
166 } 173 }
167 174
168 static INLINE void transpose_4x4(__m128i *res) { 175 static INLINE void transpose_4x4(__m128i *res) {
169 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); 176 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
170 const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); 177 const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
171 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
172 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
173 178
174 res[1] = _mm_unpackhi_epi64(res[0], res[0]); 179 res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
175 res[3] = _mm_unpackhi_epi64(res[2], res[2]); 180 res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
176 } 181 }
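
With two rows packed per register, the rewritten transpose needs only four unpacks. A lane-by-lane sketch of the movement, as read from the code (a..d are rows 0..3 of the 4x4 block):

    // res[0] = a0 a1 a2 a3 | b0 b1 b2 b3      (rows 0 and 1)
    // res[1] = c0 c1 c2 c3 | d0 d1 d2 d3      (rows 2 and 3)
    // tr0_0  = a0 c0 a1 c1 a2 c2 a3 c3
    // tr0_1  = b0 d0 b1 d1 b2 d2 b3 d3
    // res[0] = a0 b0 c0 d0 | a1 b1 c1 d1      (columns 0 and 1)
    // res[1] = a2 b2 c2 d2 | a3 b3 c3 d3      (columns 2 and 3)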
177 182
178 static void idct4_1d_sse2(__m128i *in) { 183 static void idct4_1d_sse2(__m128i *in) {
179 const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); 184 const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
180 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 185 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
181 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 186 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
182 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 187 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
183 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 188 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
184 __m128i u[8], v[8]; 189 __m128i u[8], v[8];
185 190
186 transpose_4x4(in); 191 transpose_4x4(in);
187 // stage 1 192 // stage 1
188 u[0] = _mm_unpacklo_epi16(in[0], in[2]); 193 u[0] = _mm_unpacklo_epi16(in[0], in[1]);
189 u[1] = _mm_unpacklo_epi16(in[1], in[3]); 194 u[1] = _mm_unpackhi_epi16(in[0], in[1]);
190 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 195 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
191 v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 196 v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
192 v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 197 v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
193 v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 198 v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
194 199
195 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 200 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
196 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 201 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
197 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 202 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
198 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 203 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
199 204
200 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 205 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
201 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 206 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
202 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 207 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
203 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 208 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
204 209
205 u[0] = _mm_packs_epi32(v[0], v[2]); 210 u[0] = _mm_packs_epi32(v[0], v[1]);
206 u[1] = _mm_packs_epi32(v[1], v[3]); 211 u[1] = _mm_packs_epi32(v[3], v[2]);
207 u[2] = _mm_unpackhi_epi64(u[0], u[0]);
208 u[3] = _mm_unpackhi_epi64(u[1], u[1]);
209 212
210 // stage 2 213 // stage 2
211 in[0] = _mm_add_epi16(u[0], u[3]); 214 in[0] = _mm_add_epi16(u[0], u[1]);
212 in[1] = _mm_add_epi16(u[1], u[2]); 215 in[1] = _mm_sub_epi16(u[0], u[1]);
213 in[2] = _mm_sub_epi16(u[1], u[2]); 216 in[1] = _mm_shuffle_epi32(in[1], 0x4E);
214 in[3] = _mm_sub_epi16(u[0], u[3]);
215 } 217 }
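
For comparison, the scalar 4-point IDCT this routine vectorizes is roughly the following (a sketch, per the reference idct4_1d in vp9_idct.c):

    static void idct4_1d_ref(const int16_t *in, int16_t *out) {
      int16_t step[4];
      // stage 1: two fixed-point rotations (DCT_CONST_BITS == 14)
      step[0] = dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
      step[1] = dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
      step[2] = dct_const_round_shift(in[1] * cospi_24_64 - in[3] * cospi_8_64);
      step[3] = dct_const_round_shift(in[1] * cospi_8_64 + in[3] * cospi_24_64);
      // stage 2: output butterflies
      out[0] = step[0] + step[3];
      out[1] = step[1] + step[2];
      out[2] = step[1] - step[2];
      out[3] = step[0] - step[3];
    }

In the new SSE2 code u[0] holds (step0 | step1) and u[1] holds (step3 | step2), so the add/sub pair plus the 0x4E half-swap yields all four outputs in just two registers.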
216 218
217 static void iadst4_1d_sse2(__m128i *in) { 219 static void iadst4_1d_sse2(__m128i *in) {
218 const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); 220 const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
219 const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); 221 const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
220 const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); 222 const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
221 const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); 223 const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
222 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); 224 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
223 const __m128i kZero = _mm_set1_epi16(0); 225 const __m128i kZero = _mm_set1_epi16(0);
224 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 226 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
225 __m128i u[8], v[8], in7; 227 __m128i u[8], v[8], in7;
226 228
227 transpose_4x4(in); 229 transpose_4x4(in);
228 in7 = _mm_add_epi16(in[0], in[3]); 230 in7 = _mm_srli_si128(in[1], 8);
229 in7 = _mm_sub_epi16(in7, in[2]); 231 in7 = _mm_add_epi16(in7, in[0]);
232 in7 = _mm_sub_epi16(in7, in[1]);
230 233
231 u[0] = _mm_unpacklo_epi16(in[0], in[2]); 234 u[0] = _mm_unpacklo_epi16(in[0], in[1]);
232 u[1] = _mm_unpacklo_epi16(in[1], in[3]); 235 u[1] = _mm_unpackhi_epi16(in[0], in[1]);
233 u[2] = _mm_unpacklo_epi16(in7, kZero); 236 u[2] = _mm_unpacklo_epi16(in7, kZero);
234 u[3] = _mm_unpacklo_epi16(in[1], kZero); 237 u[3] = _mm_unpackhi_epi16(in[0], kZero);
235 238
236 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 239 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
237 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 240 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
238 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 241 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
239 v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 242 v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
240 v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 243 v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
241 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 244 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
242 245
243 u[0] = _mm_add_epi32(v[0], v[1]); 246 u[0] = _mm_add_epi32(v[0], v[1]);
244 u[1] = _mm_add_epi32(v[3], v[4]); 247 u[1] = _mm_add_epi32(v[3], v[4]);
245 u[2] = v[2]; 248 u[2] = v[2];
246 u[3] = _mm_add_epi32(u[0], u[1]); 249 u[3] = _mm_add_epi32(u[0], u[1]);
247 u[4] = _mm_slli_epi32(v[5], 2); 250 u[4] = _mm_slli_epi32(v[5], 2);
248 u[5] = _mm_add_epi32(u[3], v[5]); 251 u[5] = _mm_add_epi32(u[3], v[5]);
249 u[6] = _mm_sub_epi32(u[5], u[4]); 252 u[6] = _mm_sub_epi32(u[5], u[4]);
250 253
251 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 254 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
252 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 255 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
253 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 256 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
254 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 257 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
255 258
256 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 259 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
257 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 260 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
258 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 261 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
259 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 262 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
260 263
261 in[0] = _mm_packs_epi32(u[0], u[2]); 264 in[0] = _mm_packs_epi32(u[0], u[1]);
262 in[1] = _mm_packs_epi32(u[1], u[3]); 265 in[1] = _mm_packs_epi32(u[2], u[3]);
263 in[2] = _mm_unpackhi_epi64(in[0], in[0]);
264 in[3] = _mm_unpackhi_epi64(in[1], in[1]);
265 } 266 }
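
The ADST counterpart follows the scalar iadst4_1d from vp9_idct.c, which computes (hedged sketch, zero-input early-out omitted):

    static void iadst4_1d_ref(const int16_t *in, int16_t *out) {
      int s0 = sinpi_1_9 * in[0], s1 = sinpi_2_9 * in[0];
      int s2 = sinpi_3_9 * in[1];
      int s3 = sinpi_4_9 * in[2], s4 = sinpi_1_9 * in[2];
      int s5 = sinpi_2_9 * in[3], s6 = sinpi_4_9 * in[3];
      int s7 = in[0] - in[2] + in[3];
      int x0 = s0 + s3 + s5;
      int x1 = s1 - s4 - s6;
      int x2 = sinpi_3_9 * s7;
      int x3 = s2;
      out[0] = dct_const_round_shift(x0 + x3);
      out[1] = dct_const_round_shift(x1 + x3);
      out[2] = dct_const_round_shift(x2);
      out[3] = dct_const_round_shift(x0 + x1 - x3);
    }

The in7 register above is s7, and the shift-by-2 trick (u[4] = v[5] << 2) evaluates x0 + x1 - x3 as (x0 + x1 + 3*s2) - 4*s2 without a separate subtract chain.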
266 267
267 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, 268 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
268 int tx_type) { 269 int tx_type) {
269 __m128i in[4]; 270 __m128i in[2];
270 const __m128i zero = _mm_setzero_si128(); 271 const __m128i zero = _mm_setzero_si128();
271 const __m128i eight = _mm_set1_epi16(8); 272 const __m128i eight = _mm_set1_epi16(8);
272 273
273 in[0] = _mm_loadl_epi64((const __m128i *)input); 274 in[0] = _mm_loadu_si128((const __m128i *)(input));
274 in[1] = _mm_loadl_epi64((const __m128i *)(input + 4)); 275 in[1] = _mm_loadu_si128((const __m128i *)(input + 8));
275 in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
276 in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));
277 276
278 switch (tx_type) { 277 switch (tx_type) {
279 case 0: // DCT_DCT 278 case 0: // DCT_DCT
280 idct4_1d_sse2(in); 279 idct4_1d_sse2(in);
281 idct4_1d_sse2(in); 280 idct4_1d_sse2(in);
282 break; 281 break;
283 case 1: // ADST_DCT 282 case 1: // ADST_DCT
284 idct4_1d_sse2(in); 283 idct4_1d_sse2(in);
285 iadst4_1d_sse2(in); 284 iadst4_1d_sse2(in);
286 break; 285 break;
287 case 2: // DCT_ADST 286 case 2: // DCT_ADST
288 iadst4_1d_sse2(in); 287 iadst4_1d_sse2(in);
289 idct4_1d_sse2(in); 288 idct4_1d_sse2(in);
290 break; 289 break;
291 case 3: // ADST_ADST 290 case 3: // ADST_ADST
292 iadst4_1d_sse2(in); 291 iadst4_1d_sse2(in);
293 iadst4_1d_sse2(in); 292 iadst4_1d_sse2(in);
294 break; 293 break;
295 default: 294 default:
296 assert(0); 295 assert(0);
297 break; 296 break;
298 } 297 }
299 298
300 // Final round and shift 299 // Final round and shift
301 in[0] = _mm_add_epi16(in[0], eight); 300 in[0] = _mm_add_epi16(in[0], eight);
302 in[1] = _mm_add_epi16(in[1], eight); 301 in[1] = _mm_add_epi16(in[1], eight);
303 in[2] = _mm_add_epi16(in[2], eight);
304 in[3] = _mm_add_epi16(in[3], eight);
305 302
306 in[0] = _mm_srai_epi16(in[0], 4); 303 in[0] = _mm_srai_epi16(in[0], 4);
307 in[1] = _mm_srai_epi16(in[1], 4); 304 in[1] = _mm_srai_epi16(in[1], 4);
308 in[2] = _mm_srai_epi16(in[2], 4);
309 in[3] = _mm_srai_epi16(in[3], 4);
310 305
311 RECON_AND_STORE4X4(dest, in[0]); 306 // Reconstruction and Store
312 RECON_AND_STORE4X4(dest, in[1]); 307 {
313 RECON_AND_STORE4X4(dest, in[2]); 308 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
314 RECON_AND_STORE4X4(dest, in[3]); 309 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
310 d0 = _mm_unpacklo_epi32(d0,
311 _mm_cvtsi32_si128(*(const int *) (dest + stride)));
312 d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
313 *(const int *) (dest + stride * 3)));
314 d0 = _mm_unpacklo_epi8(d0, zero);
315 d2 = _mm_unpacklo_epi8(d2, zero);
316 d0 = _mm_add_epi16(d0, in[0]);
317 d2 = _mm_add_epi16(d2, in[1]);
318 d0 = _mm_packus_epi16(d0, d2);
319 // store result[0]
320 *(int *)dest = _mm_cvtsi128_si32(d0);
321 // store result[1]
322 d0 = _mm_srli_si128(d0, 4);
323 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
324 // store result[2]
325 d0 = _mm_srli_si128(d0, 4);
326 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
327 // store result[3]
328 d0 = _mm_srli_si128(d0, 4);
329 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
330 }
315 } 331 }
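
As a usage note, the tx_type cases simply chain the two 1-D helpers; each begins with transpose_4x4, so successive calls cover the two transform directions of the 4x4 block:

    // 0: DCT_DCT    idct4  then idct4
    // 1: ADST_DCT   idct4  then iadst4
    // 2: DCT_ADST   iadst4 then idct4
    // 3: ADST_ADST  iadst4 then iadst4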
316 332
317 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ 333 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
318 out0, out1, out2, out3, out4, out5, out6, out7) \ 334 out0, out1, out2, out3, out4, out5, out6, out7) \
319 { \ 335 { \
320 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 336 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
321 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 337 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
322 const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ 338 const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
323 const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ 339 const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
324 const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ 340 const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
(...skipping 83 matching lines...)
408 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ 424 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
409 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ 425 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
410 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ 426 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
411 \ 427 \
412 res0 = _mm_packs_epi32(tmp0, tmp1); \ 428 res0 = _mm_packs_epi32(tmp0, tmp1); \
413 res1 = _mm_packs_epi32(tmp2, tmp3); \ 429 res1 = _mm_packs_epi32(tmp2, tmp3); \
414 res2 = _mm_packs_epi32(tmp4, tmp5); \ 430 res2 = _mm_packs_epi32(tmp4, tmp5); \
415 res3 = _mm_packs_epi32(tmp6, tmp7); \ 431 res3 = _mm_packs_epi32(tmp6, tmp7); \
416 } 432 }
417 433
434 #define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
435 { \
436 tmp0 = _mm_madd_epi16(lo_0, cst0); \
437 tmp1 = _mm_madd_epi16(hi_0, cst0); \
438 tmp2 = _mm_madd_epi16(lo_0, cst1); \
439 tmp3 = _mm_madd_epi16(hi_0, cst1); \
440 \
441 tmp0 = _mm_add_epi32(tmp0, rounding); \
442 tmp1 = _mm_add_epi32(tmp1, rounding); \
443 tmp2 = _mm_add_epi32(tmp2, rounding); \
444 tmp3 = _mm_add_epi32(tmp3, rounding); \
445 \
446 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
447 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
448 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
449 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
450 \
451 res0 = _mm_packs_epi32(tmp0, tmp1); \
452 res1 = _mm_packs_epi32(tmp2, tmp3); \
453 }
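
MULTIPLICATION_AND_ADD_2 is the half-width sibling of MULTIPLICATION_AND_ADD above, for butterflies that need only one interleaved input pair. Per output lane it is the vector form of the scalar rounding helper (a sketch; x0/x1 are the interleaved 16-bit inputs, c0/c1 one constant pair from cst0 or cst1):

    // _mm_madd_epi16 forms x0 * c0 + x1 * c1 in each 32-bit lane;
    // adding DCT_CONST_ROUNDING and shifting by DCT_CONST_BITS is
    // dct_const_round_shift(), and _mm_packs_epi32 saturates the
    // results back to int16.
    res_lane = dct_const_round_shift(x0 * c0 + x1 * c1);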
454
418 #define IDCT8_1D \ 455 #define IDCT8_1D \
419 /* Stage1 */ \ 456 /* Stage1 */ \
420 { \ 457 { \
421 const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ 458 const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
422 const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ 459 const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
423 const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ 460 const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
424 const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ 461 const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
425 \ 462 \
426 MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ 463 MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
427 stg1_1, stg1_2, stg1_3, stp1_4, \ 464 stg1_1, stg1_2, stg1_3, stp1_4, \
(...skipping 178 matching lines...)
606 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); 643 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
607 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); 644 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
608 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); 645 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
609 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); 646 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
610 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); 647 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
611 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); 648 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
612 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); 649 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
613 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); 650 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
614 } 651 }
615 652
653 static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
654 const __m128i zero = _mm_setzero_si128();
655 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
656 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
657 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
658 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
659
660 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
661 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
662 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
663 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
664
665 out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
666 out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
667 out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
668 out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
669 out[4] = out[5] = out[6] = out[7] = zero;
670 }
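
The new 4X8 helper, as read from the code, transposes the four low 16-bit lanes of in[0..7] (an 8x4 tile) into out[0..3] and clears out[4..7], so callers can treat the result as an 8x8 block whose bottom half is known to be zero:

    // in[k], low half  = row k, columns 0..3          (k = 0..7)
    // out[j]           = column j across those 8 rows (j = 0..3)
    // out[4..7]        = zero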
671
616 static void idct8_1d_sse2(__m128i *in) { 672 static void idct8_1d_sse2(__m128i *in) {
617 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 673 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
618 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 674 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
619 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 675 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
620 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 676 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
621 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 677 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
622 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 678 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
623 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 679 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
624 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 680 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
625 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 681 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
(...skipping 469 matching lines...)
1095 RECON_AND_STORE(dest, in3); 1151 RECON_AND_STORE(dest, in3);
1096 RECON_AND_STORE(dest, in4); 1152 RECON_AND_STORE(dest, in4);
1097 RECON_AND_STORE(dest, in5); 1153 RECON_AND_STORE(dest, in5);
1098 RECON_AND_STORE(dest, in6); 1154 RECON_AND_STORE(dest, in6);
1099 RECON_AND_STORE(dest, in7); 1155 RECON_AND_STORE(dest, in7);
1100 } 1156 }
1101 1157
1102 #define IDCT16_1D \ 1158 #define IDCT16_1D \
1103 /* Stage2 */ \ 1159 /* Stage2 */ \
1104 { \ 1160 { \
1105 const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ 1161 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
1106 const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ 1162 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
1107 const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ 1163 const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
1108 const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ 1164 const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
1109 const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ 1165 const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
1110 const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ 1166 const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
1111 const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ 1167 const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
1112 const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ 1168 const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
1113 \ 1169 \
1114 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ 1170 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
1115 stg2_0, stg2_1, stg2_2, stg2_3, \ 1171 stg2_0, stg2_1, stg2_2, stg2_3, \
1116 stp2_8, stp2_15, stp2_9, stp2_14) \ 1172 stp2_8, stp2_15, stp2_9, stp2_14) \
1117 \ 1173 \
1118 MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ 1174 MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
1119 stg2_4, stg2_5, stg2_6, stg2_7, \ 1175 stg2_4, stg2_5, stg2_6, stg2_7, \
1120 stp2_10, stp2_13, stp2_11, stp2_12) \ 1176 stp2_10, stp2_13, stp2_11, stp2_12) \
1121 } \ 1177 } \
1122 \ 1178 \
1123 /* Stage3 */ \ 1179 /* Stage3 */ \
1124 { \ 1180 { \
1125 const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \ 1181 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
1126 const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \ 1182 const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
1127 const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \ 1183 const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
1128 const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \ 1184 const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
1129 \ 1185 \
1130 MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ 1186 MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
1131 stg3_0, stg3_1, stg3_2, stg3_3, \ 1187 stg3_0, stg3_1, stg3_2, stg3_3, \
1132 stp1_4, stp1_7, stp1_5, stp1_6) \ 1188 stp1_4, stp1_7, stp1_5, stp1_6) \
1133 \ 1189 \
1134 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ 1190 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
1135 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ 1191 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
1136 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ 1192 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
1137 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ 1193 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
1138 \ 1194 \
1139 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ 1195 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
1140 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ 1196 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
1141 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ 1197 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
1142 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ 1198 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
1143 } \ 1199 } \
1144 \ 1200 \
1145 /* Stage4 */ \ 1201 /* Stage4 */ \
1146 { \ 1202 { \
1147 const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \ 1203 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
1148 const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \ 1204 const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
1149 const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \ 1205 const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
1150 const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \ 1206 const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
1151 \ 1207 \
1152 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 1208 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1153 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 1209 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1154 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1210 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1155 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1211 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1156 \ 1212 \
1157 MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ 1213 MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
1158 stg4_0, stg4_1, stg4_2, stg4_3, \ 1214 stg4_0, stg4_1, stg4_2, stg4_3, \
1159 stp2_0, stp2_1, stp2_2, stp2_3) \ 1215 stp2_0, stp2_1, stp2_2, stp2_3) \
1160 \ 1216 \
(...skipping 91 matching lines...)
1252 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1308 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1253 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1309 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1254 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 1310 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1255 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1311 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1256 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 1312 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1257 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1313 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1258 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1314 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1259 1315
1260 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1316 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1261 1317
1262 __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, 1318 __m128i in[16], l[16], r[16], *curr1;
1263 in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
1264 in10 = zero, in11 = zero, in12 = zero, in13 = zero,
1265 in14 = zero, in15 = zero;
1266 __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
1267 l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
1268 l12 = zero, l13 = zero, l14 = zero, l15 = zero;
1269 __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
1270 r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
1271 r12 = zero, r13 = zero, r14 = zero, r15 = zero;
1272 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 1319 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1273 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 1320 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1274 stp1_8_0, stp1_12_0; 1321 stp1_8_0, stp1_12_0;
1275 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 1322 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1276 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; 1323 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1277 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1324 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1278 int i; 1325 int i;
1279 1326
1280 // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct. 1327 curr1 = l;
1281 for (i = 0; i < 4; i++) { 1328 for (i = 0; i < 2; i++) {
1282 // 1-D idct 1329 // 1-D idct
1283 if (i < 2) {
1284 if (i == 1) input += 128;
1285 1330
1286 // Load input data. 1331 // Load input data.
1287 in0 = _mm_load_si128((const __m128i *)input); 1332 in[0] = _mm_load_si128((const __m128i *)input);
1288 in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); 1333 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
1289 in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); 1334 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
1290 in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); 1335 in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
1291 in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); 1336 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
1292 in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); 1337 in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
1293 in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); 1338 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
1294 in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); 1339 in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
1295 in4 = _mm_load_si128((const __m128i *)(input + 8 * 8)); 1340 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
1296 in12 = _mm_load_si128((const __m128i *)(input + 8 * 9)); 1341 in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
1297 in5 = _mm_load_si128((const __m128i *)(input + 8 * 10)); 1342 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
1298 in13 = _mm_load_si128((const __m128i *)(input + 8 * 11)); 1343 in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
1299 in6 = _mm_load_si128((const __m128i *)(input + 8 * 12)); 1344 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
1300 in14 = _mm_load_si128((const __m128i *)(input + 8 * 13)); 1345 in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
1301 in7 = _mm_load_si128((const __m128i *)(input + 8 * 14)); 1346 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
1302 in15 = _mm_load_si128((const __m128i *)(input + 8 * 15)); 1347 in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
1303 1348
1304 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 1349 array_transpose_8x8(in, in);
1305 in4, in5, in6, in7); 1350 array_transpose_8x8(in+8, in+8);
1306 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
1307 in10, in11, in12, in13, in14, in15);
1308 }
1309 1351
1310 if (i == 2) { 1352 IDCT16_1D
1311 TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
1312 in5, in6, in7);
1313 TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
1314 in13, in14, in15);
1315 }
1316 1353
1317 if (i == 3) { 1354 // Stage7
1318 TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, 1355 curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1319 in4, in5, in6, in7); 1356 curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1320 TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, 1357 curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1321 in12, in13, in14, in15); 1358 curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1322 } 1359 curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1360 curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1361 curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1362 curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1363 curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1364 curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1365 curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1366 curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1367 curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1368 curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1369 curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1370 curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1323 1371
1324 IDCT16_1D 1372 curr1 = r;
1373 input += 128;
1374 }
1375 for (i = 0; i < 2; i++) {
1376 // 1-D idct
1377 array_transpose_8x8(l+i*8, in);
1378 array_transpose_8x8(r+i*8, in+8);
1325 1379
1326 // Stage7 1380 IDCT16_1D
1327 if (i == 0) { 1381
1328 // Left 8x16
1329 l0 = _mm_add_epi16(stp2_0, stp1_15);
1330 l1 = _mm_add_epi16(stp2_1, stp1_14);
1331 l2 = _mm_add_epi16(stp2_2, stp2_13);
1332 l3 = _mm_add_epi16(stp2_3, stp2_12);
1333 l4 = _mm_add_epi16(stp2_4, stp2_11);
1334 l5 = _mm_add_epi16(stp2_5, stp2_10);
1335 l6 = _mm_add_epi16(stp2_6, stp1_9);
1336 l7 = _mm_add_epi16(stp2_7, stp1_8);
1337 l8 = _mm_sub_epi16(stp2_7, stp1_8);
1338 l9 = _mm_sub_epi16(stp2_6, stp1_9);
1339 l10 = _mm_sub_epi16(stp2_5, stp2_10);
1340 l11 = _mm_sub_epi16(stp2_4, stp2_11);
1341 l12 = _mm_sub_epi16(stp2_3, stp2_12);
1342 l13 = _mm_sub_epi16(stp2_2, stp2_13);
1343 l14 = _mm_sub_epi16(stp2_1, stp1_14);
1344 l15 = _mm_sub_epi16(stp2_0, stp1_15);
1345 } else if (i == 1) {
1346 // Right 8x16
1347 r0 = _mm_add_epi16(stp2_0, stp1_15);
1348 r1 = _mm_add_epi16(stp2_1, stp1_14);
1349 r2 = _mm_add_epi16(stp2_2, stp2_13);
1350 r3 = _mm_add_epi16(stp2_3, stp2_12);
1351 r4 = _mm_add_epi16(stp2_4, stp2_11);
1352 r5 = _mm_add_epi16(stp2_5, stp2_10);
1353 r6 = _mm_add_epi16(stp2_6, stp1_9);
1354 r7 = _mm_add_epi16(stp2_7, stp1_8);
1355 r8 = _mm_sub_epi16(stp2_7, stp1_8);
1356 r9 = _mm_sub_epi16(stp2_6, stp1_9);
1357 r10 = _mm_sub_epi16(stp2_5, stp2_10);
1358 r11 = _mm_sub_epi16(stp2_4, stp2_11);
1359 r12 = _mm_sub_epi16(stp2_3, stp2_12);
1360 r13 = _mm_sub_epi16(stp2_2, stp2_13);
1361 r14 = _mm_sub_epi16(stp2_1, stp1_14);
1362 r15 = _mm_sub_epi16(stp2_0, stp1_15);
1363 } else {
1364 // 2-D 1382 // 2-D
1365 in0 = _mm_add_epi16(stp2_0, stp1_15); 1383 in[0] = _mm_add_epi16(stp2_0, stp1_15);
1366 in1 = _mm_add_epi16(stp2_1, stp1_14); 1384 in[1] = _mm_add_epi16(stp2_1, stp1_14);
1367 in2 = _mm_add_epi16(stp2_2, stp2_13); 1385 in[2] = _mm_add_epi16(stp2_2, stp2_13);
1368 in3 = _mm_add_epi16(stp2_3, stp2_12); 1386 in[3] = _mm_add_epi16(stp2_3, stp2_12);
1369 in4 = _mm_add_epi16(stp2_4, stp2_11); 1387 in[4] = _mm_add_epi16(stp2_4, stp2_11);
1370 in5 = _mm_add_epi16(stp2_5, stp2_10); 1388 in[5] = _mm_add_epi16(stp2_5, stp2_10);
1371 in6 = _mm_add_epi16(stp2_6, stp1_9); 1389 in[6] = _mm_add_epi16(stp2_6, stp1_9);
1372 in7 = _mm_add_epi16(stp2_7, stp1_8); 1390 in[7] = _mm_add_epi16(stp2_7, stp1_8);
1373 in8 = _mm_sub_epi16(stp2_7, stp1_8); 1391 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1374 in9 = _mm_sub_epi16(stp2_6, stp1_9); 1392 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1375 in10 = _mm_sub_epi16(stp2_5, stp2_10); 1393 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1376 in11 = _mm_sub_epi16(stp2_4, stp2_11); 1394 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1377 in12 = _mm_sub_epi16(stp2_3, stp2_12); 1395 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1378 in13 = _mm_sub_epi16(stp2_2, stp2_13); 1396 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1379 in14 = _mm_sub_epi16(stp2_1, stp1_14); 1397 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1380 in15 = _mm_sub_epi16(stp2_0, stp1_15); 1398 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1381 1399
1382 // Final rounding and shift 1400 // Final rounding and shift
1383 in0 = _mm_adds_epi16(in0, final_rounding); 1401 in[0] = _mm_adds_epi16(in[0], final_rounding);
1384 in1 = _mm_adds_epi16(in1, final_rounding); 1402 in[1] = _mm_adds_epi16(in[1], final_rounding);
1385 in2 = _mm_adds_epi16(in2, final_rounding); 1403 in[2] = _mm_adds_epi16(in[2], final_rounding);
1386 in3 = _mm_adds_epi16(in3, final_rounding); 1404 in[3] = _mm_adds_epi16(in[3], final_rounding);
1387 in4 = _mm_adds_epi16(in4, final_rounding); 1405 in[4] = _mm_adds_epi16(in[4], final_rounding);
1388 in5 = _mm_adds_epi16(in5, final_rounding); 1406 in[5] = _mm_adds_epi16(in[5], final_rounding);
1389 in6 = _mm_adds_epi16(in6, final_rounding); 1407 in[6] = _mm_adds_epi16(in[6], final_rounding);
1390 in7 = _mm_adds_epi16(in7, final_rounding); 1408 in[7] = _mm_adds_epi16(in[7], final_rounding);
1391 in8 = _mm_adds_epi16(in8, final_rounding); 1409 in[8] = _mm_adds_epi16(in[8], final_rounding);
1392 in9 = _mm_adds_epi16(in9, final_rounding); 1410 in[9] = _mm_adds_epi16(in[9], final_rounding);
1393 in10 = _mm_adds_epi16(in10, final_rounding); 1411 in[10] = _mm_adds_epi16(in[10], final_rounding);
1394 in11 = _mm_adds_epi16(in11, final_rounding); 1412 in[11] = _mm_adds_epi16(in[11], final_rounding);
1395 in12 = _mm_adds_epi16(in12, final_rounding); 1413 in[12] = _mm_adds_epi16(in[12], final_rounding);
1396 in13 = _mm_adds_epi16(in13, final_rounding); 1414 in[13] = _mm_adds_epi16(in[13], final_rounding);
1397 in14 = _mm_adds_epi16(in14, final_rounding); 1415 in[14] = _mm_adds_epi16(in[14], final_rounding);
1398 in15 = _mm_adds_epi16(in15, final_rounding); 1416 in[15] = _mm_adds_epi16(in[15], final_rounding);
1399 1417
1400 in0 = _mm_srai_epi16(in0, 6); 1418 in[0] = _mm_srai_epi16(in[0], 6);
1401 in1 = _mm_srai_epi16(in1, 6); 1419 in[1] = _mm_srai_epi16(in[1], 6);
1402 in2 = _mm_srai_epi16(in2, 6); 1420 in[2] = _mm_srai_epi16(in[2], 6);
1403 in3 = _mm_srai_epi16(in3, 6); 1421 in[3] = _mm_srai_epi16(in[3], 6);
1404 in4 = _mm_srai_epi16(in4, 6); 1422 in[4] = _mm_srai_epi16(in[4], 6);
1405 in5 = _mm_srai_epi16(in5, 6); 1423 in[5] = _mm_srai_epi16(in[5], 6);
1406 in6 = _mm_srai_epi16(in6, 6); 1424 in[6] = _mm_srai_epi16(in[6], 6);
1407 in7 = _mm_srai_epi16(in7, 6); 1425 in[7] = _mm_srai_epi16(in[7], 6);
1408 in8 = _mm_srai_epi16(in8, 6); 1426 in[8] = _mm_srai_epi16(in[8], 6);
1409 in9 = _mm_srai_epi16(in9, 6); 1427 in[9] = _mm_srai_epi16(in[9], 6);
1410 in10 = _mm_srai_epi16(in10, 6); 1428 in[10] = _mm_srai_epi16(in[10], 6);
1411 in11 = _mm_srai_epi16(in11, 6); 1429 in[11] = _mm_srai_epi16(in[11], 6);
1412 in12 = _mm_srai_epi16(in12, 6); 1430 in[12] = _mm_srai_epi16(in[12], 6);
1413 in13 = _mm_srai_epi16(in13, 6); 1431 in[13] = _mm_srai_epi16(in[13], 6);
1414 in14 = _mm_srai_epi16(in14, 6); 1432 in[14] = _mm_srai_epi16(in[14], 6);
1415 in15 = _mm_srai_epi16(in15, 6); 1433 in[15] = _mm_srai_epi16(in[15], 6);
1416 1434
1417 RECON_AND_STORE(dest, in0); 1435 RECON_AND_STORE(dest, in[0]);
1418 RECON_AND_STORE(dest, in1); 1436 RECON_AND_STORE(dest, in[1]);
1419 RECON_AND_STORE(dest, in2); 1437 RECON_AND_STORE(dest, in[2]);
1420 RECON_AND_STORE(dest, in3); 1438 RECON_AND_STORE(dest, in[3]);
1421 RECON_AND_STORE(dest, in4); 1439 RECON_AND_STORE(dest, in[4]);
1422 RECON_AND_STORE(dest, in5); 1440 RECON_AND_STORE(dest, in[5]);
1423 RECON_AND_STORE(dest, in6); 1441 RECON_AND_STORE(dest, in[6]);
1424 RECON_AND_STORE(dest, in7); 1442 RECON_AND_STORE(dest, in[7]);
1425 RECON_AND_STORE(dest, in8); 1443 RECON_AND_STORE(dest, in[8]);
1426 RECON_AND_STORE(dest, in9); 1444 RECON_AND_STORE(dest, in[9]);
1427 RECON_AND_STORE(dest, in10); 1445 RECON_AND_STORE(dest, in[10]);
1428 RECON_AND_STORE(dest, in11); 1446 RECON_AND_STORE(dest, in[11]);
1429 RECON_AND_STORE(dest, in12); 1447 RECON_AND_STORE(dest, in[12]);
1430 RECON_AND_STORE(dest, in13); 1448 RECON_AND_STORE(dest, in[13]);
1431 RECON_AND_STORE(dest, in14); 1449 RECON_AND_STORE(dest, in[14]);
1432 RECON_AND_STORE(dest, in15); 1450 RECON_AND_STORE(dest, in[15]);
1433 1451
1434 dest += 8 - (stride * 16); 1452 dest += 8 - (stride * 16);
1435 }
1436 } 1453 }
1437 } 1454 }
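
The restructured 16x16 path runs two passes over two 8x16 halves instead of the old four-iteration loop with per-iteration branches. A hedged outline of the new control flow:

    // pass 1 (i = 0, 1): load an 8x16 coefficient half, transpose its
    //   two 8x8 tiles, run IDCT16_1D, and write the stage-7 butterfly
    //   output through curr1 into l[] (left half), then r[] (right);
    // pass 2 (i = 0, 1): transpose tiles of l[] and r[] back into
    //   in[0..15], run IDCT16_1D again, apply the final butterflies,
    //   round and shift by 6, then RECON_AND_STORE all 16 rows.

Replacing the sixteen named xmm variables with the in[16], l[16], and r[16] arrays is what lets both passes share a single loop body.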
1438 1455
1439 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 1456 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
1440 __m128i dc_value; 1457 __m128i dc_value;
1441 const __m128i zero = _mm_setzero_si128(); 1458 const __m128i zero = _mm_setzero_si128();
1442 int a, i; 1459 int a, i;
1443 1460
1444 a = dct_const_round_shift(input[0] * cospi_16_64); 1461 a = dct_const_round_shift(input[0] * cospi_16_64);
1445 a = dct_const_round_shift(a * cospi_16_64); 1462 a = dct_const_round_shift(a * cospi_16_64);
(...skipping 999 matching lines...)
2445 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 2462 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2446 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 2463 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2447 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 2464 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
2448 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 2465 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
2449 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2466 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2450 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 2467 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2451 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 2468 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2452 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); 2469 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2453 2470
2454 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 2471 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2455 2472 __m128i in[16], l[16];
2456 __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
2457 in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
2458 in10 = zero, in11 = zero, in12 = zero, in13 = zero,
2459 in14 = zero, in15 = zero;
2460 __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
2461 l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
2462 l12 = zero, l13 = zero, l14 = zero, l15 = zero;
2463
2464 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 2473 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
2465 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 2474 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2466 stp1_8_0, stp1_12_0; 2475 stp1_8_0, stp1_12_0;
2467 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 2476 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2468 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; 2477 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
2469 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 2478 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2470 int i; 2479 int i;
2480 in[4] = in[5] = in[6] = in[7] = in[12] = in[13] = in[14] = in[15] = zero;
2471 // 1-D idct. Load input data. 2481 // 1-D idct. Load input data.
2472 in0 = _mm_load_si128((const __m128i *)input); 2482 in[0] = _mm_load_si128((const __m128i *)input);
2473 in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); 2483 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
2474 in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); 2484 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
2475 in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); 2485 in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
2476 in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); 2486 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
2477 in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); 2487 in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
2478 in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); 2488 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
2479 in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); 2489 in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
2480 2490
2481 TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); 2491 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1], in[2], in[3]);
2482 TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); 2492 TRANSPOSE_8X4(in[8], in[9], in[10], in[11], in[8], in[9], in[10], in[11]);
2483 2493
2484 // Stage2 2494 // Stage2
2485 { 2495 {
2486 const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); 2496 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], in[11]);
2487 const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); 2497 const __m128i lo_9_7 = _mm_unpackhi_epi16(in[8], in[3]);
2488 const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); 2498 const __m128i lo_5_11 = _mm_unpackhi_epi16(in[2], in[9]);
2489 const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); 2499 const __m128i lo_13_3 = _mm_unpackhi_epi16(in[10], in[1]);
2490 2500
2491 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); 2501 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2492 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); 2502 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2493 tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); 2503 tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
2494 tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); 2504 tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
2495 tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); 2505 tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
2496 tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); 2506 tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
2497 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); 2507 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2498 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); 2508 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2499 2509
(...skipping 21 matching lines...)
2521 stp2_14 = _mm_packs_epi32(tmp6, zero); 2531 stp2_14 = _mm_packs_epi32(tmp6, zero);
2522 2532
2523 stp2_10 = _mm_packs_epi32(tmp1, zero); 2533 stp2_10 = _mm_packs_epi32(tmp1, zero);
2524 stp2_13 = _mm_packs_epi32(tmp3, zero); 2534 stp2_13 = _mm_packs_epi32(tmp3, zero);
2525 stp2_11 = _mm_packs_epi32(tmp5, zero); 2535 stp2_11 = _mm_packs_epi32(tmp5, zero);
2526 stp2_12 = _mm_packs_epi32(tmp7, zero); 2536 stp2_12 = _mm_packs_epi32(tmp7, zero);
2527 } 2537 }
2528 2538
2529 // Stage3 2539 // Stage3
2530 { 2540 {
2531 const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); 2541 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], in[11]);
2532 const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); 2542 const __m128i lo_10_6 = _mm_unpacklo_epi16(in[9], in[3]);
2533 2543
2534 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); 2544 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2535 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); 2545 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2536 tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); 2546 tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
2537 tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); 2547 tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);
2538 2548
2539 tmp0 = _mm_add_epi32(tmp0, rounding); 2549 tmp0 = _mm_add_epi32(tmp0, rounding);
2540 tmp2 = _mm_add_epi32(tmp2, rounding); 2550 tmp2 = _mm_add_epi32(tmp2, rounding);
2541 tmp4 = _mm_add_epi32(tmp4, rounding); 2551 tmp4 = _mm_add_epi32(tmp4, rounding);
2542 tmp6 = _mm_add_epi32(tmp6, rounding); 2552 tmp6 = _mm_add_epi32(tmp6, rounding);
(...skipping 14 matching lines...)
2557 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); 2567 stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
2558 2568
2559 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); 2569 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
2560 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); 2570 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
2561 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); 2571 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
2562 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); 2572 stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
2563 } 2573 }
2564 2574
2565 // Stage4 2575 // Stage4
2566 { 2576 {
2567 const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); 2577 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);
2568 const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); 2578 const __m128i lo_4_12 = _mm_unpacklo_epi16(in[2], in[10]);
2569 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); 2579 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
2570 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); 2580 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2571 2581
2572 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); 2582 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2573 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); 2583 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2574 tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); 2584 tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
2575 tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); 2585 tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
2576 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); 2586 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2577 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); 2587 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2578 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); 2588 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
(...skipping 88 matching lines...)
2667 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); 2677 stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
2668 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); 2678 stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
2669 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); 2679 stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
2670 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); 2680 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
2671 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); 2681 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
2672 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); 2682 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
2673 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); 2683 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
2674 } 2684 }
2675 2685
2676 // Stage7. Left 8x16 only. 2686 // Stage7. Left 8x16 only.
2677 l0 = _mm_add_epi16(stp2_0, stp1_15); 2687 l[0] = _mm_add_epi16(stp2_0, stp1_15);
2678 l1 = _mm_add_epi16(stp2_1, stp1_14); 2688 l[1] = _mm_add_epi16(stp2_1, stp1_14);
2679 l2 = _mm_add_epi16(stp2_2, stp2_13); 2689 l[2] = _mm_add_epi16(stp2_2, stp2_13);
2680 l3 = _mm_add_epi16(stp2_3, stp2_12); 2690 l[3] = _mm_add_epi16(stp2_3, stp2_12);
2681 l4 = _mm_add_epi16(stp2_4, stp2_11); 2691 l[4] = _mm_add_epi16(stp2_4, stp2_11);
2682 l5 = _mm_add_epi16(stp2_5, stp2_10); 2692 l[5] = _mm_add_epi16(stp2_5, stp2_10);
2683 l6 = _mm_add_epi16(stp2_6, stp1_9); 2693 l[6] = _mm_add_epi16(stp2_6, stp1_9);
2684 l7 = _mm_add_epi16(stp2_7, stp1_8); 2694 l[7] = _mm_add_epi16(stp2_7, stp1_8);
2685 l8 = _mm_sub_epi16(stp2_7, stp1_8); 2695 l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2686 l9 = _mm_sub_epi16(stp2_6, stp1_9); 2696 l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2687 l10 = _mm_sub_epi16(stp2_5, stp2_10); 2697 l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2688 l11 = _mm_sub_epi16(stp2_4, stp2_11); 2698 l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2689 l12 = _mm_sub_epi16(stp2_3, stp2_12); 2699 l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2690 l13 = _mm_sub_epi16(stp2_2, stp2_13); 2700 l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2691 l14 = _mm_sub_epi16(stp2_1, stp1_14); 2701 l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2692 l15 = _mm_sub_epi16(stp2_0, stp1_15); 2702 l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2693 2703
2694 // 2-D idct. We do 2 8x16 blocks. 2704 // 2-D idct. We do 2 8x16 blocks.
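Only the left 8x16 half of the stage-7 result above carries data, which is why in[8..15] are zeroed inside the loop; this appears to be a reduced-coefficient 16x16 path. Each iteration, as a descriptive sketch:

    /* Per iteration (sketch): transpose one 4x8 quarter of l[] into
       in[0..7], zero in[8..15] (those rows hold no coefficients here),
       run the shared 1-D 16-point transform, combine in stage 7, then
       round by (x + 32) >> 6 and reconstruct one 8-pixel-wide strip. */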
2695 for (i = 0; i < 2; i++) { 2705 for (i = 0; i < 2; i++) {
2696 if (i == 0) 2706 array_transpose_4X8(l + 8*i, in);
2697 TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, 2707 in[8] = in[9] = in[10] = in[11] = in[12] = in[13] = in[14] = in[15] = zero;
2698 in5, in6, in7);
2699
2700 if (i == 1)
2701 TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
2702 in4, in5, in6, in7);
2703
2704 in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
2705 2708
2706 IDCT16_1D 2709 IDCT16_1D
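    /* IDCT16_1D expands to the shared one-dimensional 16-point transform
       macro, defined earlier in this file (outside the chunk shown here). */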
2707 2710
2708 // Stage7 2711 // Stage7
2709 in0 = _mm_add_epi16(stp2_0, stp1_15); 2712 in[0] = _mm_add_epi16(stp2_0, stp1_15);
2710 in1 = _mm_add_epi16(stp2_1, stp1_14); 2713 in[1] = _mm_add_epi16(stp2_1, stp1_14);
2711 in2 = _mm_add_epi16(stp2_2, stp2_13); 2714 in[2] = _mm_add_epi16(stp2_2, stp2_13);
2712 in3 = _mm_add_epi16(stp2_3, stp2_12); 2715 in[3] = _mm_add_epi16(stp2_3, stp2_12);
2713 in4 = _mm_add_epi16(stp2_4, stp2_11); 2716 in[4] = _mm_add_epi16(stp2_4, stp2_11);
2714 in5 = _mm_add_epi16(stp2_5, stp2_10); 2717 in[5] = _mm_add_epi16(stp2_5, stp2_10);
2715 in6 = _mm_add_epi16(stp2_6, stp1_9); 2718 in[6] = _mm_add_epi16(stp2_6, stp1_9);
2716 in7 = _mm_add_epi16(stp2_7, stp1_8); 2719 in[7] = _mm_add_epi16(stp2_7, stp1_8);
2717 in8 = _mm_sub_epi16(stp2_7, stp1_8); 2720 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2718 in9 = _mm_sub_epi16(stp2_6, stp1_9); 2721 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2719 in10 = _mm_sub_epi16(stp2_5, stp2_10); 2722 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2720 in11 = _mm_sub_epi16(stp2_4, stp2_11); 2723 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2721 in12 = _mm_sub_epi16(stp2_3, stp2_12); 2724 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2722 in13 = _mm_sub_epi16(stp2_2, stp2_13); 2725 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2723 in14 = _mm_sub_epi16(stp2_1, stp1_14); 2726 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2724 in15 = _mm_sub_epi16(stp2_0, stp1_15); 2727 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2725 2728
2726 // Final rounding and shift 2729 // Final rounding and shift
2727 in0 = _mm_adds_epi16(in0, final_rounding); 2730 in[0] = _mm_adds_epi16(in[0], final_rounding);
2728 in1 = _mm_adds_epi16(in1, final_rounding); 2731 in[1] = _mm_adds_epi16(in[1], final_rounding);
2729 in2 = _mm_adds_epi16(in2, final_rounding); 2732 in[2] = _mm_adds_epi16(in[2], final_rounding);
2730 in3 = _mm_adds_epi16(in3, final_rounding); 2733 in[3] = _mm_adds_epi16(in[3], final_rounding);
2731 in4 = _mm_adds_epi16(in4, final_rounding); 2734 in[4] = _mm_adds_epi16(in[4], final_rounding);
2732 in5 = _mm_adds_epi16(in5, final_rounding); 2735 in[5] = _mm_adds_epi16(in[5], final_rounding);
2733 in6 = _mm_adds_epi16(in6, final_rounding); 2736 in[6] = _mm_adds_epi16(in[6], final_rounding);
2734 in7 = _mm_adds_epi16(in7, final_rounding); 2737 in[7] = _mm_adds_epi16(in[7], final_rounding);
2735 in8 = _mm_adds_epi16(in8, final_rounding); 2738 in[8] = _mm_adds_epi16(in[8], final_rounding);
2736 in9 = _mm_adds_epi16(in9, final_rounding); 2739 in[9] = _mm_adds_epi16(in[9], final_rounding);
2737 in10 = _mm_adds_epi16(in10, final_rounding); 2740 in[10] = _mm_adds_epi16(in[10], final_rounding);
2738 in11 = _mm_adds_epi16(in11, final_rounding); 2741 in[11] = _mm_adds_epi16(in[11], final_rounding);
2739 in12 = _mm_adds_epi16(in12, final_rounding); 2742 in[12] = _mm_adds_epi16(in[12], final_rounding);
2740 in13 = _mm_adds_epi16(in13, final_rounding); 2743 in[13] = _mm_adds_epi16(in[13], final_rounding);
2741 in14 = _mm_adds_epi16(in14, final_rounding); 2744 in[14] = _mm_adds_epi16(in[14], final_rounding);
2742 in15 = _mm_adds_epi16(in15, final_rounding); 2745 in[15] = _mm_adds_epi16(in[15], final_rounding);
2743 2746
2744 in0 = _mm_srai_epi16(in0, 6); 2747 in[0] = _mm_srai_epi16(in[0], 6);
2745 in1 = _mm_srai_epi16(in1, 6); 2748 in[1] = _mm_srai_epi16(in[1], 6);
2746 in2 = _mm_srai_epi16(in2, 6); 2749 in[2] = _mm_srai_epi16(in[2], 6);
2747 in3 = _mm_srai_epi16(in3, 6); 2750 in[3] = _mm_srai_epi16(in[3], 6);
2748 in4 = _mm_srai_epi16(in4, 6); 2751 in[4] = _mm_srai_epi16(in[4], 6);
2749 in5 = _mm_srai_epi16(in5, 6); 2752 in[5] = _mm_srai_epi16(in[5], 6);
2750 in6 = _mm_srai_epi16(in6, 6); 2753 in[6] = _mm_srai_epi16(in[6], 6);
2751 in7 = _mm_srai_epi16(in7, 6); 2754 in[7] = _mm_srai_epi16(in[7], 6);
2752 in8 = _mm_srai_epi16(in8, 6); 2755 in[8] = _mm_srai_epi16(in[8], 6);
2753 in9 = _mm_srai_epi16(in9, 6); 2756 in[9] = _mm_srai_epi16(in[9], 6);
2754 in10 = _mm_srai_epi16(in10, 6); 2757 in[10] = _mm_srai_epi16(in[10], 6);
2755 in11 = _mm_srai_epi16(in11, 6); 2758 in[11] = _mm_srai_epi16(in[11], 6);
2756 in12 = _mm_srai_epi16(in12, 6); 2759 in[12] = _mm_srai_epi16(in[12], 6);
2757 in13 = _mm_srai_epi16(in13, 6); 2760 in[13] = _mm_srai_epi16(in[13], 6);
2758 in14 = _mm_srai_epi16(in14, 6); 2761 in[14] = _mm_srai_epi16(in[14], 6);
2759 in15 = _mm_srai_epi16(in15, 6); 2762 in[15] = _mm_srai_epi16(in[15], 6);
2760 2763
2761 RECON_AND_STORE(dest, in0); 2764 RECON_AND_STORE(dest, in[0]);
2762 RECON_AND_STORE(dest, in1); 2765 RECON_AND_STORE(dest, in[1]);
2763 RECON_AND_STORE(dest, in2); 2766 RECON_AND_STORE(dest, in[2]);
2764 RECON_AND_STORE(dest, in3); 2767 RECON_AND_STORE(dest, in[3]);
2765 RECON_AND_STORE(dest, in4); 2768 RECON_AND_STORE(dest, in[4]);
2766 RECON_AND_STORE(dest, in5); 2769 RECON_AND_STORE(dest, in[5]);
2767 RECON_AND_STORE(dest, in6); 2770 RECON_AND_STORE(dest, in[6]);
2768 RECON_AND_STORE(dest, in7); 2771 RECON_AND_STORE(dest, in[7]);
2769 RECON_AND_STORE(dest, in8); 2772 RECON_AND_STORE(dest, in[8]);
2770 RECON_AND_STORE(dest, in9); 2773 RECON_AND_STORE(dest, in[9]);
2771 RECON_AND_STORE(dest, in10); 2774 RECON_AND_STORE(dest, in[10]);
2772 RECON_AND_STORE(dest, in11); 2775 RECON_AND_STORE(dest, in[11]);
2773 RECON_AND_STORE(dest, in12); 2776 RECON_AND_STORE(dest, in[12]);
2774 RECON_AND_STORE(dest, in13); 2777 RECON_AND_STORE(dest, in[13]);
2775 RECON_AND_STORE(dest, in14); 2778 RECON_AND_STORE(dest, in[14]);
2776 RECON_AND_STORE(dest, in15); 2779 RECON_AND_STORE(dest, in[15]);
2777 2780
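Each RECON_AND_STORE above adds one rounded row to the prediction in dest with unsigned saturation and steps dest down by stride (hence the rewind below to the top of the next 8-pixel-wide strip). The combined round/shift/reconstruct step for one row, as a scalar sketch (recon_store_row is an illustrative name):

    #include <stdint.h>

    /* Scalar sketch of "add 1 << 5, shift by 6, add to prediction,
       saturate to [0, 255]" for one 8-sample row. */
    static void recon_store_row(uint8_t *dest, const int16_t *row) {
      int j;
      for (j = 0; j < 8; ++j) {
        const int v = dest[j] + ((row[j] + 32) >> 6);
        dest[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      }
    }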
2778 dest += 8 - (stride * 16); 2781 dest += 8 - (stride * 16);
2779 } 2782 }
2780 } 2783 }
2781 2784
2782 #define LOAD_DQCOEFF(reg, input) \ 2785 #define LOAD_DQCOEFF(reg, input) \
2783 { \ 2786 { \
2784 reg = _mm_load_si128((const __m128i *) input); \ 2787 reg = _mm_load_si128((const __m128i *) input); \
2785 input += 8; \ 2788 input += 8; \
2786 } \ 2789 } \
2787 2790
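LOAD_DQCOEFF performs one aligned 128-bit load (eight int16 coefficients) and advances the input pointer. The same step written as a plain function, for clarity (load_dqcoeff is an illustrative name; the buffer must be 16-byte aligned for _mm_load_si128):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Function form of the LOAD_DQCOEFF macro above (sketch only). */
    static __m128i load_dqcoeff(const int16_t **input) {
      const __m128i reg = _mm_load_si128((const __m128i *)*input);
      *input += 8;                     /* advance past the 8 coefficients */
      return reg;
    }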
2791 #define IDCT32_1D_34 \
2792 /* Stage1 */ \
2793 { \
2794 const __m128i zero = _mm_setzero_si128();\
2795 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2796 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2797 \
2798 const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
2799 const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2800 \
2801 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2802 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2803 \
2804 const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2805 const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2806 \
2807 MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
2808 stg1_1, stp1_16, stp1_31); \
2809 MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
2810 stg1_7, stp1_19, stp1_28); \
2811 MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
2812 stg1_9, stp1_20, stp1_27); \
2813 MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
2814 stg1_15, stp1_23, stp1_24); \
2815 } \
2816 \
2817 /* Stage2 */ \
2818 { \
2819 const __m128i zero = _mm_setzero_si128();\
2820 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2821 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2822 \
2823 const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2824 const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2825 \
2826 MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2827 stg2_1, stp2_8, stp2_15); \
2828 MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2829 stg2_7, stp2_11, stp2_12); \
2830 \
2831 stp2_16 = stp1_16; \
2832 stp2_19 = stp1_19; \
2833 \
2834 stp2_20 = stp1_20; \
2835 stp2_23 = stp1_23; \
2836 \
2837 stp2_24 = stp1_24; \
2838 stp2_27 = stp1_27; \
2839 \
2840 stp2_28 = stp1_28; \
2841 stp2_31 = stp1_31; \
2842 } \
2843 \
2844 /* Stage3 */ \
2845 { \
2846 const __m128i zero = _mm_setzero_si128();\
2847 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2848 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2849 \
2850 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2851 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2852 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2853 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2854 \
2855 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2856 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2857 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2858 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
2859 \
2860 MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2861 stg3_1, stp1_4, stp1_7); \
2862 \
2863 stp1_8 = stp2_8; \
2864 stp1_11 = stp2_11; \
2865 stp1_12 = stp2_12; \
2866 stp1_15 = stp2_15; \
2867 \
2868 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2869 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2870 stp1_18, stp1_29) \
2871 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2872 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2873 stp1_22, stp1_25) \
2874 \
2875 stp1_16 = stp2_16; \
2876 stp1_31 = stp2_31; \
2877 stp1_19 = stp2_19; \
2878 stp1_20 = stp2_20; \
2879 stp1_23 = stp2_23; \
2880 stp1_24 = stp2_24; \
2881 stp1_27 = stp2_27; \
2882 stp1_28 = stp2_28; \
2883 } \
2884 \
2885 /* Stage4 */ \
2886 { \
2887 const __m128i zero = _mm_setzero_si128();\
2888 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2889 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2890 \
2891 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2892 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2893 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2894 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2895 \
2896 MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2897 stg4_1, stp2_0, stp2_1); \
2898 \
2899 stp2_4 = stp1_4; \
2900 stp2_5 = stp1_4; \
2901 stp2_6 = stp1_7; \
2902 stp2_7 = stp1_7; \
2903 \
2904 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2905 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2906 stp2_10, stp2_13) \
2907 \
2908 stp2_8 = stp1_8; \
2909 stp2_15 = stp1_15; \
2910 stp2_11 = stp1_11; \
2911 stp2_12 = stp1_12; \
2912 \
2913 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2914 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2915 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2916 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2917 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2918 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2919 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2920 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2921 \
2922 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2923 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2924 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2925 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2926 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2927 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2928 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2929 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2930 } \
2931 \
2932 /* Stage5 */ \
2933 { \
2934 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2935 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2936 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2937 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2938 \
2939 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2940 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2941 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2942 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2943 \
2944 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2945 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2946 \
2947 stp1_0 = stp2_0; \
2948 stp1_1 = stp2_1; \
2949 stp1_2 = stp2_1; \
2950 stp1_3 = stp2_0; \
2951 \
2952 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2953 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2954 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2955 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2956 \
2957 tmp0 = _mm_add_epi32(tmp0, rounding); \
2958 tmp1 = _mm_add_epi32(tmp1, rounding); \
2959 tmp2 = _mm_add_epi32(tmp2, rounding); \
2960 tmp3 = _mm_add_epi32(tmp3, rounding); \
2961 \
2962 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2963 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2964 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2965 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2966 \
2967 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2968 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2969 \
2970 stp1_4 = stp2_4; \
2971 stp1_7 = stp2_7; \
2972 \
2973 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2974 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2975 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2976 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2977 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2978 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2979 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2980 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2981 \
2982 stp1_16 = stp2_16; \
2983 stp1_17 = stp2_17; \
2984 \
2985 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2986 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2987 stp1_19, stp1_28) \
2988 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2989 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2990 stp1_21, stp1_26) \
2991 \
2992 stp1_22 = stp2_22; \
2993 stp1_23 = stp2_23; \
2994 stp1_24 = stp2_24; \
2995 stp1_25 = stp2_25; \
2996 stp1_30 = stp2_30; \
2997 stp1_31 = stp2_31; \
2998 } \
2999 \
3000 /* Stage6 */ \
3001 { \
3002 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3003 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3004 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
3005 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
3006 \
3007 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
3008 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
3009 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
3010 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
3011 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
3012 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
3013 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
3014 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
3015 \
3016 stp2_8 = stp1_8; \
3017 stp2_9 = stp1_9; \
3018 stp2_14 = stp1_14; \
3019 stp2_15 = stp1_15; \
3020 \
3021 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
3022 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
3023 stp2_13, stp2_11, stp2_12) \
3024 \
3025 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
3026 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
3027 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
3028 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
3029 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
3030 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
3031 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
3032 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
3033 \
3034 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
3035 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
3036 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
3037 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
3038 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
3039 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
3040 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
3041 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
3042 } \
3043 \
3044 /* Stage7 */ \
3045 { \
3046 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3047 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3048 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3049 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3050 \
3051 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3052 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3053 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3054 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3055 \
3056 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3057 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3058 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3059 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3060 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3061 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3062 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3063 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3064 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3065 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3066 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3067 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3068 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3069 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3070 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3071 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3072 \
3073 stp1_16 = stp2_16; \
3074 stp1_17 = stp2_17; \
3075 stp1_18 = stp2_18; \
3076 stp1_19 = stp2_19; \
3077 \
3078 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3079 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3080 stp1_21, stp1_26) \
3081 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3082 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3083 stp1_23, stp1_24) \
3084 \
3085 stp1_28 = stp2_28; \
3086 stp1_29 = stp2_29; \
3087 stp1_30 = stp2_30; \
3088 stp1_31 = stp2_31; \
3089 }
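IDCT32_1D_34 is a reduced 1-D 32-point transform for blocks whose nonzero coefficients all fall in the first eight inputs; pairing each live input with a zero vector in the unpacks keeps the madd constant layout intact while the partner term vanishes. One lane in scalar form (half_butterfly_lane is an illustrative name; 1 << 13 and 14 are DCT_CONST_ROUNDING and DCT_CONST_BITS per vp9_idct.h):

    #include <stdint.h>

    /* Zero-partner butterfly lane: a*c0 + 0*c1 reduces to a*c0. */
    static int16_t half_butterfly_lane(int16_t a, int c0) {
      return (int16_t)((a * c0 + (1 << 13)) >> 14);
    }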
3090
3091
2788 #define IDCT32_1D \ 3092 #define IDCT32_1D \
2789 /* Stage1 */ \ 3093 /* Stage1 */ \
2790 { \ 3094 { \
2791 const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ 3095 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
2792 const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ 3096 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
2793 const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ 3097 const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
2794 const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ 3098 const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
2795 \ 3099 \
2796 const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ 3100 const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
2797 const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ 3101 const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
2798 const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \ 3102 const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
2799 const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ 3103 const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
2800 \ 3104 \
2801 const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ 3105 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
2802 const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ 3106 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
2803 const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \ 3107 const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
2804 const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ 3108 const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
2805 \ 3109 \
2806 const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ 3110 const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
2807 const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ 3111 const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
2808 const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ 3112 const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
2809 const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ 3113 const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
2810 \ 3114 \
2811 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ 3115 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
2812 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ 3116 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
2813 stp1_17, stp1_30) \ 3117 stp1_17, stp1_30) \
2814 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ 3118 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
2815 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ 3119 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
2816 stp1_19, stp1_28) \ 3120 stp1_19, stp1_28) \
2817 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ 3121 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
2818 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ 3122 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
2819 stp1_21, stp1_26) \ 3123 stp1_21, stp1_26) \
2820 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ 3124 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
2821 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ 3125 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
2822 stp1_23, stp1_24) \ 3126 stp1_23, stp1_24) \
2823 } \ 3127 } \
2824 \ 3128 \
2825 /* Stage2 */ \ 3129 /* Stage2 */ \
2826 { \ 3130 { \
2827 const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ 3131 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
2828 const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ 3132 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
2829 const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ 3133 const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
2830 const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ 3134 const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
2831 \ 3135 \
2832 const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ 3136 const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
2833 const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ 3137 const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
2834 const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ 3138 const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
2835 const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ 3139 const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
2836 \ 3140 \
2837 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ 3141 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
2838 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ 3142 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
2839 stp2_14) \ 3143 stp2_14) \
2840 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ 3144 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
2841 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ 3145 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
2842 stp2_11, stp2_12) \ 3146 stp2_11, stp2_12) \
2843 \ 3147 \
2844 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ 3148 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
2845 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ 3149 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
(...skipping 11 matching lines...)
2857 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ 3161 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
2858 \ 3162 \
2859 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ 3163 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
2860 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ 3164 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
2861 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ 3165 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
2862 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ 3166 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
2863 } \ 3167 } \
2864 \ 3168 \
2865 /* Stage3 */ \ 3169 /* Stage3 */ \
2866 { \ 3170 { \
2867 const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ 3171 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
2868 const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ 3172 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
2869 const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ 3173 const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
2870 const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ 3174 const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
2871 \ 3175 \
2872 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ 3176 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
2873 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ 3177 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
2874 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 3178 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2875 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 3179 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2876 \ 3180 \
2877 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 3181 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2878 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 3182 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2879 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 3183 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2880 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 3184 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
(...skipping 23 matching lines...)
2904 stp1_19 = stp2_19; \ 3208 stp1_19 = stp2_19; \
2905 stp1_20 = stp2_20; \ 3209 stp1_20 = stp2_20; \
2906 stp1_23 = stp2_23; \ 3210 stp1_23 = stp2_23; \
2907 stp1_24 = stp2_24; \ 3211 stp1_24 = stp2_24; \
2908 stp1_27 = stp2_27; \ 3212 stp1_27 = stp2_27; \
2909 stp1_28 = stp2_28; \ 3213 stp1_28 = stp2_28; \
2910 } \ 3214 } \
2911 \ 3215 \
2912 /* Stage4 */ \ 3216 /* Stage4 */ \
2913 { \ 3217 { \
2914 const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ 3218 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
2915 const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ 3219 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
2916 const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ 3220 const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
2917 const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ 3221 const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
2918 \ 3222 \
2919 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 3223 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
2920 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 3224 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
2921 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 3225 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2922 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 3226 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2923 \ 3227 \
2924 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ 3228 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
2925 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ 3229 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
2926 stp2_2, stp2_3) \ 3230 stp2_2, stp2_3) \
2927 \ 3231 \
(...skipping 236 matching lines...)
3164 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 3468 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3165 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 3469 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3166 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 3470 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3167 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 3471 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3168 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 3472 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3169 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 3473 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3170 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 3474 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3171 3475
3172 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 3476 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3173 3477
3174 __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, 3478 __m128i in[32], col[32];
3175 in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
3176 in24, in25, in26, in27, in28, in29, in30, in31;
3177 __m128i col[128];
3178 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 3479 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3179 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 3480 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3180 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 3481 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3181 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 3482 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3182 stp1_30, stp1_31; 3483 stp1_30, stp1_31;
3183 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 3484 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3184 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 3485 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3185 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 3486 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3186 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 3487 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3187 stp2_30, stp2_31; 3488 stp2_30, stp2_31;
3188 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3489 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3189 int i, j, i32; 3490 int i;
3190 3491 // Load input data.
3191 // We work on an 8x32 block each time, and loop 8 times for 2-D 32x32 idct. 3492 LOAD_DQCOEFF(in[0], input);
3192 for (i = 0; i < 8; i++) { 3493 LOAD_DQCOEFF(in[8], input);
3193 i32 = (i << 5); 3494 LOAD_DQCOEFF(in[16], input);
3194 if (i == 0) { 3495 LOAD_DQCOEFF(in[24], input);
3195 // First 1-D idct: first 8 rows 3496 LOAD_DQCOEFF(in[1], input);
3196 // Load input data. 3497 LOAD_DQCOEFF(in[9], input);
3197 LOAD_DQCOEFF(in0, input); 3498 LOAD_DQCOEFF(in[17], input);
3198 LOAD_DQCOEFF(in8, input); 3499 LOAD_DQCOEFF(in[25], input);
3199 LOAD_DQCOEFF(in16, input); 3500 LOAD_DQCOEFF(in[2], input);
3200 LOAD_DQCOEFF(in24, input); 3501 LOAD_DQCOEFF(in[10], input);
3201 LOAD_DQCOEFF(in1, input); 3502 LOAD_DQCOEFF(in[18], input);
3202 LOAD_DQCOEFF(in9, input); 3503 LOAD_DQCOEFF(in[26], input);
3203 LOAD_DQCOEFF(in17, input); 3504 LOAD_DQCOEFF(in[3], input);
3204 LOAD_DQCOEFF(in25, input); 3505 LOAD_DQCOEFF(in[11], input);
3205 LOAD_DQCOEFF(in2, input); 3506 LOAD_DQCOEFF(in[19], input);
3206 LOAD_DQCOEFF(in10, input); 3507 LOAD_DQCOEFF(in[27], input);
3207 LOAD_DQCOEFF(in18, input); 3508
3208 LOAD_DQCOEFF(in26, input); 3509 LOAD_DQCOEFF(in[4], input);
3209 LOAD_DQCOEFF(in3, input); 3510 LOAD_DQCOEFF(in[12], input);
3210 LOAD_DQCOEFF(in11, input); 3511 LOAD_DQCOEFF(in[20], input);
3211 LOAD_DQCOEFF(in19, input); 3512 LOAD_DQCOEFF(in[28], input);
3212 LOAD_DQCOEFF(in27, input); 3513 LOAD_DQCOEFF(in[5], input);
3213 3514 LOAD_DQCOEFF(in[13], input);
3214 LOAD_DQCOEFF(in4, input); 3515 LOAD_DQCOEFF(in[21], input);
3215 LOAD_DQCOEFF(in12, input); 3516 LOAD_DQCOEFF(in[29], input);
3216 LOAD_DQCOEFF(in20, input); 3517 LOAD_DQCOEFF(in[6], input);
3217 LOAD_DQCOEFF(in28, input); 3518 LOAD_DQCOEFF(in[14], input);
3218 LOAD_DQCOEFF(in5, input); 3519 LOAD_DQCOEFF(in[22], input);
3219 LOAD_DQCOEFF(in13, input); 3520 LOAD_DQCOEFF(in[30], input);
3220 LOAD_DQCOEFF(in21, input); 3521 LOAD_DQCOEFF(in[7], input);
3221 LOAD_DQCOEFF(in29, input); 3522 LOAD_DQCOEFF(in[15], input);
3222 LOAD_DQCOEFF(in6, input); 3523 LOAD_DQCOEFF(in[23], input);
3223 LOAD_DQCOEFF(in14, input); 3524 LOAD_DQCOEFF(in[31], input);
3224 LOAD_DQCOEFF(in22, input); 3525
3225 LOAD_DQCOEFF(in30, input); 3526 array_transpose_8x8(in, in);
3226 LOAD_DQCOEFF(in7, input); 3527 array_transpose_8x8(in+8, in+8);
3227 LOAD_DQCOEFF(in15, input); 3528 array_transpose_8x8(in+16, in+16);
3228 LOAD_DQCOEFF(in23, input); 3529 array_transpose_8x8(in+24, in+24);
3229 LOAD_DQCOEFF(in31, input); 3530
3230 3531 IDCT32_1D
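    /* First pass (sketch): the four array_transpose_8x8 calls above put the
       8x32 strip of dequantized rows into column order, one coefficient row
       per SIMD lane, so IDCT32_1D transforms eight rows at once. */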
3532
3533 // 1_D: Store 32 intermediate results for each 8x32 block.
3534 col[0] = _mm_add_epi16(stp1_0, stp1_31);
3535 col[1] = _mm_add_epi16(stp1_1, stp1_30);
3536 col[2] = _mm_add_epi16(stp1_2, stp1_29);
3537 col[3] = _mm_add_epi16(stp1_3, stp1_28);
3538 col[4] = _mm_add_epi16(stp1_4, stp1_27);
3539 col[5] = _mm_add_epi16(stp1_5, stp1_26);
3540 col[6] = _mm_add_epi16(stp1_6, stp1_25);
3541 col[7] = _mm_add_epi16(stp1_7, stp1_24);
3542 col[8] = _mm_add_epi16(stp1_8, stp1_23);
3543 col[9] = _mm_add_epi16(stp1_9, stp1_22);
3544 col[10] = _mm_add_epi16(stp1_10, stp1_21);
3545 col[11] = _mm_add_epi16(stp1_11, stp1_20);
3546 col[12] = _mm_add_epi16(stp1_12, stp1_19);
3547 col[13] = _mm_add_epi16(stp1_13, stp1_18);
3548 col[14] = _mm_add_epi16(stp1_14, stp1_17);
3549 col[15] = _mm_add_epi16(stp1_15, stp1_16);
3550 col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3551 col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3552 col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3553 col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3554 col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3555 col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3556 col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3557 col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3558 col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3559 col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3560 col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3561 col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3562 col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3563 col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3564 col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3565 col[31] = _mm_sub_epi16(stp1_0, stp1_31);
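    /* Second pass (sketch): each iteration below transposes one 8x8 tile of
       the 8x32 intermediate strip in col[] into in[0..7] and runs the
       reduced 1-D transform down those columns before reconstruction. */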
3566 for (i = 0; i < 4; i++) {
3567 const __m128i zero = _mm_setzero_si128();
3231 // Transpose 32x8 block to 8x32 block 3568 // Transpose 32x8 block to 8x32 block
3232 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 3569 array_transpose_8x8(col+i*8, in);
3233 in4, in5, in6, in7); 3570 IDCT32_1D_34
3234 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
3235 in10, in11, in12, in13, in14, in15);
3236 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
3237 in18, in19, in20, in21, in22, in23);
3238 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
3239 in26, in27, in28, in29, in30, in31);
3240 } else if (i < 4) {
3241 // First 1-D idct: next 24 zero-coeff rows
3242 col[i32 + 0] = _mm_setzero_si128();
3243 col[i32 + 1] = _mm_setzero_si128();
3244 col[i32 + 2] = _mm_setzero_si128();
3245 col[i32 + 3] = _mm_setzero_si128();
3246 col[i32 + 4] = _mm_setzero_si128();
3247 col[i32 + 5] = _mm_setzero_si128();
3248 col[i32 + 6] = _mm_setzero_si128();
3249 col[i32 + 7] = _mm_setzero_si128();
3250 col[i32 + 8] = _mm_setzero_si128();
3251 col[i32 + 9] = _mm_setzero_si128();
3252 col[i32 + 10] = _mm_setzero_si128();
3253 col[i32 + 11] = _mm_setzero_si128();
3254 col[i32 + 12] = _mm_setzero_si128();
3255 col[i32 + 13] = _mm_setzero_si128();
3256 col[i32 + 14] = _mm_setzero_si128();
3257 col[i32 + 15] = _mm_setzero_si128();
3258 col[i32 + 16] = _mm_setzero_si128();
3259 col[i32 + 17] = _mm_setzero_si128();
3260 col[i32 + 18] = _mm_setzero_si128();
3261 col[i32 + 19] = _mm_setzero_si128();
3262 col[i32 + 20] = _mm_setzero_si128();
3263 col[i32 + 21] = _mm_setzero_si128();
3264 col[i32 + 22] = _mm_setzero_si128();
3265 col[i32 + 23] = _mm_setzero_si128();
3266 col[i32 + 24] = _mm_setzero_si128();
3267 col[i32 + 25] = _mm_setzero_si128();
3268 col[i32 + 26] = _mm_setzero_si128();
3269 col[i32 + 27] = _mm_setzero_si128();
3270 col[i32 + 28] = _mm_setzero_si128();
3271 col[i32 + 29] = _mm_setzero_si128();
3272 col[i32 + 30] = _mm_setzero_si128();
3273 col[i32 + 31] = _mm_setzero_si128();
3274 continue;
3275 } else {
3276 // Second 1-D idct
3277 j = i - 4;
3278
3279 // Transpose 32x8 block to 8x32 block
3280 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3281 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3282 col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
3283 in5, in6, in7);
3284 j += 4;
3285 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3286 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3287 col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
3288 in11, in12, in13, in14, in15);
3289 j += 4;
3290 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3291 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3292 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
3293 in19, in20, in21, in22, in23);
3294 j += 4;
3295 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3296 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3297 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
3298 in28, in29, in30, in31);
3299 }
3300
3301 IDCT32_1D
3302
3303 // final stage
3304 if (i < 4) {
3305 // 1_D: Store 32 intermediate results for each 8x32 block.
3306 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3307 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3308 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3309 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3310 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3311 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3312 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3313 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3314 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3315 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3316 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3317 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3318 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3319 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3320 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3321 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3322 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3323 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3324 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3325 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3326 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3327 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3328 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3329 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3330 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3331 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3332 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3333 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3334 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3335 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3336 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3337 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3338 } else {
3339 const __m128i zero = _mm_setzero_si128();
3340 3571
3341 // 2_D: Calculate the results and store them to destination. 3572 // 2_D: Calculate the results and store them to destination.
3342 in0 = _mm_add_epi16(stp1_0, stp1_31); 3573 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3343 in1 = _mm_add_epi16(stp1_1, stp1_30); 3574 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3344 in2 = _mm_add_epi16(stp1_2, stp1_29); 3575 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3345 in3 = _mm_add_epi16(stp1_3, stp1_28); 3576 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3346 in4 = _mm_add_epi16(stp1_4, stp1_27); 3577 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3347 in5 = _mm_add_epi16(stp1_5, stp1_26); 3578 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3348 in6 = _mm_add_epi16(stp1_6, stp1_25); 3579 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3349 in7 = _mm_add_epi16(stp1_7, stp1_24); 3580 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3350 in8 = _mm_add_epi16(stp1_8, stp1_23); 3581 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3351 in9 = _mm_add_epi16(stp1_9, stp1_22); 3582 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3352 in10 = _mm_add_epi16(stp1_10, stp1_21); 3583 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3353 in11 = _mm_add_epi16(stp1_11, stp1_20); 3584 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3354 in12 = _mm_add_epi16(stp1_12, stp1_19); 3585 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3355 in13 = _mm_add_epi16(stp1_13, stp1_18); 3586 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3356 in14 = _mm_add_epi16(stp1_14, stp1_17); 3587 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3357 in15 = _mm_add_epi16(stp1_15, stp1_16); 3588 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3358 in16 = _mm_sub_epi16(stp1_15, stp1_16); 3589 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3359 in17 = _mm_sub_epi16(stp1_14, stp1_17); 3590 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3360 in18 = _mm_sub_epi16(stp1_13, stp1_18); 3591 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3361 in19 = _mm_sub_epi16(stp1_12, stp1_19); 3592 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3362 in20 = _mm_sub_epi16(stp1_11, stp1_20); 3593 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3363 in21 = _mm_sub_epi16(stp1_10, stp1_21); 3594 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3364 in22 = _mm_sub_epi16(stp1_9, stp1_22); 3595 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3365 in23 = _mm_sub_epi16(stp1_8, stp1_23); 3596 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3366 in24 = _mm_sub_epi16(stp1_7, stp1_24); 3597 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3367 in25 = _mm_sub_epi16(stp1_6, stp1_25); 3598 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3368 in26 = _mm_sub_epi16(stp1_5, stp1_26); 3599 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3369 in27 = _mm_sub_epi16(stp1_4, stp1_27); 3600 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3370 in28 = _mm_sub_epi16(stp1_3, stp1_28); 3601 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3371 in29 = _mm_sub_epi16(stp1_2, stp1_29); 3602 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3372 in30 = _mm_sub_epi16(stp1_1, stp1_30); 3603 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3373 in31 = _mm_sub_epi16(stp1_0, stp1_31); 3604 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3374 3605
3375 // Final rounding and shift 3606 // Final rounding and shift
3376 in0 = _mm_adds_epi16(in0, final_rounding); 3607 in[0] = _mm_adds_epi16(in[0], final_rounding);
3377 in1 = _mm_adds_epi16(in1, final_rounding); 3608 in[1] = _mm_adds_epi16(in[1], final_rounding);
3378 in2 = _mm_adds_epi16(in2, final_rounding); 3609 in[2] = _mm_adds_epi16(in[2], final_rounding);
3379 in3 = _mm_adds_epi16(in3, final_rounding); 3610 in[3] = _mm_adds_epi16(in[3], final_rounding);
3380 in4 = _mm_adds_epi16(in4, final_rounding); 3611 in[4] = _mm_adds_epi16(in[4], final_rounding);
3381 in5 = _mm_adds_epi16(in5, final_rounding); 3612 in[5] = _mm_adds_epi16(in[5], final_rounding);
3382 in6 = _mm_adds_epi16(in6, final_rounding); 3613 in[6] = _mm_adds_epi16(in[6], final_rounding);
3383 in7 = _mm_adds_epi16(in7, final_rounding); 3614 in[7] = _mm_adds_epi16(in[7], final_rounding);
3384 in8 = _mm_adds_epi16(in8, final_rounding); 3615 in[8] = _mm_adds_epi16(in[8], final_rounding);
3385 in9 = _mm_adds_epi16(in9, final_rounding); 3616 in[9] = _mm_adds_epi16(in[9], final_rounding);
3386 in10 = _mm_adds_epi16(in10, final_rounding); 3617 in[10] = _mm_adds_epi16(in[10], final_rounding);
3387 in11 = _mm_adds_epi16(in11, final_rounding); 3618 in[11] = _mm_adds_epi16(in[11], final_rounding);
3388 in12 = _mm_adds_epi16(in12, final_rounding); 3619 in[12] = _mm_adds_epi16(in[12], final_rounding);
3389 in13 = _mm_adds_epi16(in13, final_rounding); 3620 in[13] = _mm_adds_epi16(in[13], final_rounding);
3390 in14 = _mm_adds_epi16(in14, final_rounding); 3621 in[14] = _mm_adds_epi16(in[14], final_rounding);
3391 in15 = _mm_adds_epi16(in15, final_rounding); 3622 in[15] = _mm_adds_epi16(in[15], final_rounding);
3392 in16 = _mm_adds_epi16(in16, final_rounding); 3623 in[16] = _mm_adds_epi16(in[16], final_rounding);
3393 in17 = _mm_adds_epi16(in17, final_rounding); 3624 in[17] = _mm_adds_epi16(in[17], final_rounding);
3394 in18 = _mm_adds_epi16(in18, final_rounding); 3625 in[18] = _mm_adds_epi16(in[18], final_rounding);
3395 in19 = _mm_adds_epi16(in19, final_rounding); 3626 in[19] = _mm_adds_epi16(in[19], final_rounding);
3396 in20 = _mm_adds_epi16(in20, final_rounding); 3627 in[20] = _mm_adds_epi16(in[20], final_rounding);
3397 in21 = _mm_adds_epi16(in21, final_rounding); 3628 in[21] = _mm_adds_epi16(in[21], final_rounding);
3398 in22 = _mm_adds_epi16(in22, final_rounding); 3629 in[22] = _mm_adds_epi16(in[22], final_rounding);
3399 in23 = _mm_adds_epi16(in23, final_rounding); 3630 in[23] = _mm_adds_epi16(in[23], final_rounding);
3400 in24 = _mm_adds_epi16(in24, final_rounding); 3631 in[24] = _mm_adds_epi16(in[24], final_rounding);
3401 in25 = _mm_adds_epi16(in25, final_rounding); 3632 in[25] = _mm_adds_epi16(in[25], final_rounding);
3402 in26 = _mm_adds_epi16(in26, final_rounding); 3633 in[26] = _mm_adds_epi16(in[26], final_rounding);
3403 in27 = _mm_adds_epi16(in27, final_rounding); 3634 in[27] = _mm_adds_epi16(in[27], final_rounding);
3404 in28 = _mm_adds_epi16(in28, final_rounding); 3635 in[28] = _mm_adds_epi16(in[28], final_rounding);
3405 in29 = _mm_adds_epi16(in29, final_rounding); 3636 in[29] = _mm_adds_epi16(in[29], final_rounding);
3406 in30 = _mm_adds_epi16(in30, final_rounding); 3637 in[30] = _mm_adds_epi16(in[30], final_rounding);
3407 in31 = _mm_adds_epi16(in31, final_rounding); 3638 in[31] = _mm_adds_epi16(in[31], final_rounding);
3408 3639
3409 in0 = _mm_srai_epi16(in0, 6); 3640 in[0] = _mm_srai_epi16(in[0], 6);
3410 in1 = _mm_srai_epi16(in1, 6); 3641 in[1] = _mm_srai_epi16(in[1], 6);
3411 in2 = _mm_srai_epi16(in2, 6); 3642 in[2] = _mm_srai_epi16(in[2], 6);
3412 in3 = _mm_srai_epi16(in3, 6); 3643 in[3] = _mm_srai_epi16(in[3], 6);
3413 in4 = _mm_srai_epi16(in4, 6); 3644 in[4] = _mm_srai_epi16(in[4], 6);
3414 in5 = _mm_srai_epi16(in5, 6); 3645 in[5] = _mm_srai_epi16(in[5], 6);
3415 in6 = _mm_srai_epi16(in6, 6); 3646 in[6] = _mm_srai_epi16(in[6], 6);
3416 in7 = _mm_srai_epi16(in7, 6); 3647 in[7] = _mm_srai_epi16(in[7], 6);
3417 in8 = _mm_srai_epi16(in8, 6); 3648 in[8] = _mm_srai_epi16(in[8], 6);
3418 in9 = _mm_srai_epi16(in9, 6); 3649 in[9] = _mm_srai_epi16(in[9], 6);
3419 in10 = _mm_srai_epi16(in10, 6); 3650 in[10] = _mm_srai_epi16(in[10], 6);
3420 in11 = _mm_srai_epi16(in11, 6); 3651 in[11] = _mm_srai_epi16(in[11], 6);
3421 in12 = _mm_srai_epi16(in12, 6); 3652 in[12] = _mm_srai_epi16(in[12], 6);
3422 in13 = _mm_srai_epi16(in13, 6); 3653 in[13] = _mm_srai_epi16(in[13], 6);
3423 in14 = _mm_srai_epi16(in14, 6); 3654 in[14] = _mm_srai_epi16(in[14], 6);
3424 in15 = _mm_srai_epi16(in15, 6); 3655 in[15] = _mm_srai_epi16(in[15], 6);
3425 in16 = _mm_srai_epi16(in16, 6); 3656 in[16] = _mm_srai_epi16(in[16], 6);
3426 in17 = _mm_srai_epi16(in17, 6); 3657 in[17] = _mm_srai_epi16(in[17], 6);
3427 in18 = _mm_srai_epi16(in18, 6); 3658 in[18] = _mm_srai_epi16(in[18], 6);
3428 in19 = _mm_srai_epi16(in19, 6); 3659 in[19] = _mm_srai_epi16(in[19], 6);
3429 in20 = _mm_srai_epi16(in20, 6); 3660 in[20] = _mm_srai_epi16(in[20], 6);
3430 in21 = _mm_srai_epi16(in21, 6); 3661 in[21] = _mm_srai_epi16(in[21], 6);
3431 in22 = _mm_srai_epi16(in22, 6); 3662 in[22] = _mm_srai_epi16(in[22], 6);
3432 in23 = _mm_srai_epi16(in23, 6); 3663 in[23] = _mm_srai_epi16(in[23], 6);
3433 in24 = _mm_srai_epi16(in24, 6); 3664 in[24] = _mm_srai_epi16(in[24], 6);
3434 in25 = _mm_srai_epi16(in25, 6); 3665 in[25] = _mm_srai_epi16(in[25], 6);
3435 in26 = _mm_srai_epi16(in26, 6); 3666 in[26] = _mm_srai_epi16(in[26], 6);
3436 in27 = _mm_srai_epi16(in27, 6); 3667 in[27] = _mm_srai_epi16(in[27], 6);
3437 in28 = _mm_srai_epi16(in28, 6); 3668 in[28] = _mm_srai_epi16(in[28], 6);
3438 in29 = _mm_srai_epi16(in29, 6); 3669 in[29] = _mm_srai_epi16(in[29], 6);
3439 in30 = _mm_srai_epi16(in30, 6); 3670 in[30] = _mm_srai_epi16(in[30], 6);
3440 in31 = _mm_srai_epi16(in31, 6); 3671 in[31] = _mm_srai_epi16(in[31], 6);
3441 3672
3442 RECON_AND_STORE(dest, in0); 3673 RECON_AND_STORE(dest, in[0]);
3443 RECON_AND_STORE(dest, in1); 3674 RECON_AND_STORE(dest, in[1]);
3444 RECON_AND_STORE(dest, in2); 3675 RECON_AND_STORE(dest, in[2]);
3445 RECON_AND_STORE(dest, in3); 3676 RECON_AND_STORE(dest, in[3]);
3446 RECON_AND_STORE(dest, in4); 3677 RECON_AND_STORE(dest, in[4]);
3447 RECON_AND_STORE(dest, in5); 3678 RECON_AND_STORE(dest, in[5]);
3448 RECON_AND_STORE(dest, in6); 3679 RECON_AND_STORE(dest, in[6]);
3449 RECON_AND_STORE(dest, in7); 3680 RECON_AND_STORE(dest, in[7]);
3450 RECON_AND_STORE(dest, in8); 3681 RECON_AND_STORE(dest, in[8]);
3451 RECON_AND_STORE(dest, in9); 3682 RECON_AND_STORE(dest, in[9]);
3452 RECON_AND_STORE(dest, in10); 3683 RECON_AND_STORE(dest, in[10]);
3453 RECON_AND_STORE(dest, in11); 3684 RECON_AND_STORE(dest, in[11]);
3454 RECON_AND_STORE(dest, in12); 3685 RECON_AND_STORE(dest, in[12]);
3455 RECON_AND_STORE(dest, in13); 3686 RECON_AND_STORE(dest, in[13]);
3456 RECON_AND_STORE(dest, in14); 3687 RECON_AND_STORE(dest, in[14]);
3457 RECON_AND_STORE(dest, in15); 3688 RECON_AND_STORE(dest, in[15]);
3458 RECON_AND_STORE(dest, in16); 3689 RECON_AND_STORE(dest, in[16]);
3459 RECON_AND_STORE(dest, in17); 3690 RECON_AND_STORE(dest, in[17]);
3460 RECON_AND_STORE(dest, in18); 3691 RECON_AND_STORE(dest, in[18]);
3461 RECON_AND_STORE(dest, in19); 3692 RECON_AND_STORE(dest, in[19]);
3462 RECON_AND_STORE(dest, in20); 3693 RECON_AND_STORE(dest, in[20]);
3463 RECON_AND_STORE(dest, in21); 3694 RECON_AND_STORE(dest, in[21]);
3464 RECON_AND_STORE(dest, in22); 3695 RECON_AND_STORE(dest, in[22]);
3465 RECON_AND_STORE(dest, in23); 3696 RECON_AND_STORE(dest, in[23]);
3466 RECON_AND_STORE(dest, in24); 3697 RECON_AND_STORE(dest, in[24]);
3467 RECON_AND_STORE(dest, in25); 3698 RECON_AND_STORE(dest, in[25]);
3468 RECON_AND_STORE(dest, in26); 3699 RECON_AND_STORE(dest, in[26]);
3469 RECON_AND_STORE(dest, in27); 3700 RECON_AND_STORE(dest, in[27]);
3470 RECON_AND_STORE(dest, in28); 3701 RECON_AND_STORE(dest, in[28]);
3471 RECON_AND_STORE(dest, in29); 3702 RECON_AND_STORE(dest, in[29]);
3472 RECON_AND_STORE(dest, in30); 3703 RECON_AND_STORE(dest, in[30]);
3473 RECON_AND_STORE(dest, in31); 3704 RECON_AND_STORE(dest, in[31]);
3474 3705
3475 dest += 8 - (stride * 32); 3706 dest += 8 - (stride * 32);
3476 } 3707 }
3477 } 3708 }
3478 }
3479 3709
3480 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, 3710 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
3481 int stride) { 3711 int stride) {
3482 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3712 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3483 const __m128i final_rounding = _mm_set1_epi16(1<<5); 3713 const __m128i final_rounding = _mm_set1_epi16(1<<5);
3484 3714
3485 // idct constants for each stage 3715 // idct constants for each stage
3486 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3716 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3487 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3717 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3488 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 3718 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
(...skipping 34 matching lines...)
3523 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 3753 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3524 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 3754 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3525 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 3755 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3526 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 3756 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3527 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 3757 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3528 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 3758 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3529 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 3759 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3530 3760
3531 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 3761 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3532 3762
3533 __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, 3763 __m128i in[32], col[128], zero_idx[16];
3534 in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
3535 in24, in25, in26, in27, in28, in29, in30, in31;
3536 __m128i col[128];
3537 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 3764 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3538 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 3765 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3539 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 3766 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3540 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 3767 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3541 stp1_30, stp1_31; 3768 stp1_30, stp1_31;
3542 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 3769 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3543 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 3770 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3544 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 3771 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3545 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 3772 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3546 stp2_30, stp2_31; 3773 stp2_30, stp2_31;
3547 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3774 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3548 int i, j, i32; 3775 int i, j, i32;
3549 __m128i zero_idx[16];
3550 int zero_flag[2]; 3776 int zero_flag[2];
3551 3777
3552 // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. 3778 for (i = 0; i < 4; i++) {
3553 for (i = 0; i < 8; i++) {
3554 i32 = (i << 5); 3779 i32 = (i << 5);
3555 if (i < 4) {
3556 // First 1-D idct 3780 // First 1-D idct
3557 // Load input data. 3781 // Load input data.
3558 LOAD_DQCOEFF(in0, input); 3782 LOAD_DQCOEFF(in[0], input);
3559 LOAD_DQCOEFF(in8, input); 3783 LOAD_DQCOEFF(in[8], input);
3560 LOAD_DQCOEFF(in16, input); 3784 LOAD_DQCOEFF(in[16], input);
3561 LOAD_DQCOEFF(in24, input); 3785 LOAD_DQCOEFF(in[24], input);
3562 LOAD_DQCOEFF(in1, input); 3786 LOAD_DQCOEFF(in[1], input);
3563 LOAD_DQCOEFF(in9, input); 3787 LOAD_DQCOEFF(in[9], input);
3564 LOAD_DQCOEFF(in17, input); 3788 LOAD_DQCOEFF(in[17], input);
3565 LOAD_DQCOEFF(in25, input); 3789 LOAD_DQCOEFF(in[25], input);
3566 LOAD_DQCOEFF(in2, input); 3790 LOAD_DQCOEFF(in[2], input);
3567 LOAD_DQCOEFF(in10, input); 3791 LOAD_DQCOEFF(in[10], input);
3568 LOAD_DQCOEFF(in18, input); 3792 LOAD_DQCOEFF(in[18], input);
3569 LOAD_DQCOEFF(in26, input); 3793 LOAD_DQCOEFF(in[26], input);
3570 LOAD_DQCOEFF(in3, input); 3794 LOAD_DQCOEFF(in[3], input);
3571 LOAD_DQCOEFF(in11, input); 3795 LOAD_DQCOEFF(in[11], input);
3572 LOAD_DQCOEFF(in19, input); 3796 LOAD_DQCOEFF(in[19], input);
3573 LOAD_DQCOEFF(in27, input); 3797 LOAD_DQCOEFF(in[27], input);
3574 3798
3575 LOAD_DQCOEFF(in4, input); 3799 LOAD_DQCOEFF(in[4], input);
3576 LOAD_DQCOEFF(in12, input); 3800 LOAD_DQCOEFF(in[12], input);
3577 LOAD_DQCOEFF(in20, input); 3801 LOAD_DQCOEFF(in[20], input);
3578 LOAD_DQCOEFF(in28, input); 3802 LOAD_DQCOEFF(in[28], input);
3579 LOAD_DQCOEFF(in5, input); 3803 LOAD_DQCOEFF(in[5], input);
3580 LOAD_DQCOEFF(in13, input); 3804 LOAD_DQCOEFF(in[13], input);
3581 LOAD_DQCOEFF(in21, input); 3805 LOAD_DQCOEFF(in[21], input);
3582 LOAD_DQCOEFF(in29, input); 3806 LOAD_DQCOEFF(in[29], input);
3583 LOAD_DQCOEFF(in6, input); 3807 LOAD_DQCOEFF(in[6], input);
3584 LOAD_DQCOEFF(in14, input); 3808 LOAD_DQCOEFF(in[14], input);
3585 LOAD_DQCOEFF(in22, input); 3809 LOAD_DQCOEFF(in[22], input);
3586 LOAD_DQCOEFF(in30, input); 3810 LOAD_DQCOEFF(in[30], input);
3587 LOAD_DQCOEFF(in7, input); 3811 LOAD_DQCOEFF(in[7], input);
3588 LOAD_DQCOEFF(in15, input); 3812 LOAD_DQCOEFF(in[15], input);
3589 LOAD_DQCOEFF(in23, input); 3813 LOAD_DQCOEFF(in[23], input);
3590 LOAD_DQCOEFF(in31, input); 3814 LOAD_DQCOEFF(in[31], input);
3591 3815
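LOAD_DQCOEFF is defined earlier in this file; plausibly it is just an aligned eight-coefficient load that advances the cursor, which would explain the interleaved 0, 8, 16, 24, 1, 9, ... destination order above: the 32 loads walk the 8x32 strip in memory order while depositing each row into block-grouped registers, so that in[0..7] holds the 8x8 sub-block of columns 0-7, in[8..15] columns 8-15, and so on; these are exactly the slices the array_transpose_8x8 calls below operate on. A sketch under that assumption:

#define LOAD_DQCOEFF(reg, input)                    \
  {                                                 \
    reg = _mm_load_si128((const __m128i *)(input)); \
    input += 8;                                     \
  }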
3592 // checking if all entries are zero 3816 // checking if all entries are zero
3593 zero_idx[0] = _mm_or_si128(in0, in1); 3817 zero_idx[0] = _mm_or_si128(in[0], in[1]);
3594 zero_idx[1] = _mm_or_si128(in2, in3); 3818 zero_idx[1] = _mm_or_si128(in[2], in[3]);
3595 zero_idx[2] = _mm_or_si128(in4, in5); 3819 zero_idx[2] = _mm_or_si128(in[4], in[5]);
3596 zero_idx[3] = _mm_or_si128(in6, in7); 3820 zero_idx[3] = _mm_or_si128(in[6], in[7]);
3597 zero_idx[4] = _mm_or_si128(in8, in9); 3821 zero_idx[4] = _mm_or_si128(in[8], in[9]);
3598 zero_idx[5] = _mm_or_si128(in10, in11); 3822 zero_idx[5] = _mm_or_si128(in[10], in[11]);
3599 zero_idx[6] = _mm_or_si128(in12, in13); 3823 zero_idx[6] = _mm_or_si128(in[12], in[13]);
3600 zero_idx[7] = _mm_or_si128(in14, in15); 3824 zero_idx[7] = _mm_or_si128(in[14], in[15]);
3601 zero_idx[8] = _mm_or_si128(in16, in17); 3825 zero_idx[8] = _mm_or_si128(in[16], in[17]);
3602 zero_idx[9] = _mm_or_si128(in18, in19); 3826 zero_idx[9] = _mm_or_si128(in[18], in[19]);
3603 zero_idx[10] = _mm_or_si128(in20, in21); 3827 zero_idx[10] = _mm_or_si128(in[20], in[21]);
3604 zero_idx[11] = _mm_or_si128(in22, in23); 3828 zero_idx[11] = _mm_or_si128(in[22], in[23]);
3605 zero_idx[12] = _mm_or_si128(in24, in25); 3829 zero_idx[12] = _mm_or_si128(in[24], in[25]);
3606 zero_idx[13] = _mm_or_si128(in26, in27); 3830 zero_idx[13] = _mm_or_si128(in[26], in[27]);
3607 zero_idx[14] = _mm_or_si128(in28, in29); 3831 zero_idx[14] = _mm_or_si128(in[28], in[29]);
3608 zero_idx[15] = _mm_or_si128(in30, in31); 3832 zero_idx[15] = _mm_or_si128(in[30], in[31]);
3609 3833
3610 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); 3834 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3611 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); 3835 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3612 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); 3836 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3613 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); 3837 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3614 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); 3838 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3615 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); 3839 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3616 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); 3840 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3617 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); 3841 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3618 3842
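The OR tree above folds the 32 input registers down to eight; the lines the viewer skips next fold those into the scalar zero_flag values tested before the early-out. One illustrative way to finish such a reduction in SSE2 (the helper name and single-flag form are this sketch's, not the source's; the real code keeps two flags):

static int block_is_all_zero(const __m128i *zero_idx) {
  __m128i acc = _mm_or_si128(zero_idx[0], zero_idx[1]);
  acc = _mm_or_si128(acc, _mm_or_si128(zero_idx[2], zero_idx[3]));
  acc = _mm_or_si128(acc, _mm_or_si128(zero_idx[4], zero_idx[5]));
  acc = _mm_or_si128(acc, _mm_or_si128(zero_idx[6], zero_idx[7]));
  /* 0xFFFF means all 16 bytes compared equal to zero. */
  return _mm_movemask_epi8(_mm_cmpeq_epi8(acc, _mm_setzero_si128())) == 0xFFFF;
}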
(...skipping 41 matching lines...)
3660 col[i32 + 26] = _mm_setzero_si128(); 3884 col[i32 + 26] = _mm_setzero_si128();
3661 col[i32 + 27] = _mm_setzero_si128(); 3885 col[i32 + 27] = _mm_setzero_si128();
3662 col[i32 + 28] = _mm_setzero_si128(); 3886 col[i32 + 28] = _mm_setzero_si128();
3663 col[i32 + 29] = _mm_setzero_si128(); 3887 col[i32 + 29] = _mm_setzero_si128();
3664 col[i32 + 30] = _mm_setzero_si128(); 3888 col[i32 + 30] = _mm_setzero_si128();
3665 col[i32 + 31] = _mm_setzero_si128(); 3889 col[i32 + 31] = _mm_setzero_si128();
3666 continue; 3890 continue;
3667 } 3891 }
3668 3892
3669 // Transpose 32x8 block to 8x32 block 3893 // Transpose 32x8 block to 8x32 block
3670 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 3894 array_transpose_8x8(in, in);
3671 in4, in5, in6, in7); 3895 array_transpose_8x8(in+8, in+8);
3672 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, 3896 array_transpose_8x8(in+16, in+16);
3673 in10, in11, in12, in13, in14, in15); 3897 array_transpose_8x8(in+24, in+24);
3674 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
3675 in18, in19, in20, in21, in22, in23);
3676 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
3677 in26, in27, in28, in29, in30, in31);
3678 } else {
3679 // Second 1-D idct
3680 j = i - 4;
3681 3898
3682 // Transpose 32x8 block to 8x32 block 3899 IDCT32_1D
3683 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3684 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3685 col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
3686 in5, in6, in7);
3687 j += 4;
3688 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3689 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3690 col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
3691 in11, in12, in13, in14, in15);
3692 j += 4;
3693 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3694 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3695 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
3696 in19, in20, in21, in22, in23);
3697 j += 4;
3698 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3699 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3700 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
3701 in28, in29, in30, in31);
3702 }
3703 3900
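The rewrite replaces four 16-argument TRANSPOSE_8X8 invocations with array_transpose_8x8 over 8-register slices. An illustrative stand-alone version of such an 8x8 int16 transpose, built from the usual three rounds of unpacks (the real helper lives elsewhere in this file; note all inputs are read before any output is written, so in == out is safe, matching the in-place calls above):

static void transpose_8x8_s16(const __m128i *in, __m128i *out) {
  /* Round 1: interleave 16-bit lanes of adjacent row pairs. */
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
  /* Round 2: interleave 32-bit pairs. */
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
  const __m128i b4 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b5 = _mm_unpacklo_epi32(a6, a7);
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
  /* Round 3: interleave 64-bit halves; out[k] is column k of in. */
  out[0] = _mm_unpacklo_epi64(b0, b1);
  out[1] = _mm_unpackhi_epi64(b0, b1);
  out[2] = _mm_unpacklo_epi64(b2, b3);
  out[3] = _mm_unpackhi_epi64(b2, b3);
  out[4] = _mm_unpacklo_epi64(b4, b5);
  out[5] = _mm_unpackhi_epi64(b4, b5);
  out[6] = _mm_unpacklo_epi64(b6, b7);
  out[7] = _mm_unpackhi_epi64(b6, b7);
}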
3704 IDCT32_1D
3705
3706 // final stage
3707 if (i < 4) {
3708 // 1_D: Store 32 intermediate results for each 8x32 block. 3901 // 1_D: Store 32 intermediate results for each 8x32 block.
3709 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); 3902 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3710 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); 3903 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3711 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); 3904 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3712 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); 3905 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3713 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); 3906 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3714 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); 3907 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3715 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); 3908 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3716 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); 3909 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3717 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); 3910 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
(...skipping 13 matching lines...)
3731 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); 3924 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3732 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); 3925 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3733 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); 3926 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3734 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); 3927 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3735 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); 3928 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3736 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); 3929 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3737 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); 3930 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3738 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); 3931 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3739 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); 3932 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3740 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); 3933 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3741 } else { 3934 }
3935 for (i = 0; i < 4; i++) {
3742 const __m128i zero = _mm_setzero_si128(); 3936 const __m128i zero = _mm_setzero_si128();
3937 // Second 1-D idct
3938 j = i << 3;
3939
3940 // Transpose 32x8 block to 8x32 block
3941 array_transpose_8x8(col+j, in);
3942 array_transpose_8x8(col+j+32, in+8);
3943 array_transpose_8x8(col+j+64, in+16);
3944 array_transpose_8x8(col+j+96, in+24);
3945
3946 IDCT32_1D
3743 3947
3744 // 2_D: Calculate the results and store them to destination. 3948 // 2_D: Calculate the results and store them to destination.
3745 in0 = _mm_add_epi16(stp1_0, stp1_31); 3949 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3746 in1 = _mm_add_epi16(stp1_1, stp1_30); 3950 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3747 in2 = _mm_add_epi16(stp1_2, stp1_29); 3951 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3748 in3 = _mm_add_epi16(stp1_3, stp1_28); 3952 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3749 in4 = _mm_add_epi16(stp1_4, stp1_27); 3953 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3750 in5 = _mm_add_epi16(stp1_5, stp1_26); 3954 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3751 in6 = _mm_add_epi16(stp1_6, stp1_25); 3955 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3752 in7 = _mm_add_epi16(stp1_7, stp1_24); 3956 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3753 in8 = _mm_add_epi16(stp1_8, stp1_23); 3957 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3754 in9 = _mm_add_epi16(stp1_9, stp1_22); 3958 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3755 in10 = _mm_add_epi16(stp1_10, stp1_21); 3959 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3756 in11 = _mm_add_epi16(stp1_11, stp1_20); 3960 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3757 in12 = _mm_add_epi16(stp1_12, stp1_19); 3961 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3758 in13 = _mm_add_epi16(stp1_13, stp1_18); 3962 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3759 in14 = _mm_add_epi16(stp1_14, stp1_17); 3963 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3760 in15 = _mm_add_epi16(stp1_15, stp1_16); 3964 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3761 in16 = _mm_sub_epi16(stp1_15, stp1_16); 3965 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3762 in17 = _mm_sub_epi16(stp1_14, stp1_17); 3966 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3763 in18 = _mm_sub_epi16(stp1_13, stp1_18); 3967 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3764 in19 = _mm_sub_epi16(stp1_12, stp1_19); 3968 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3765 in20 = _mm_sub_epi16(stp1_11, stp1_20); 3969 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3766 in21 = _mm_sub_epi16(stp1_10, stp1_21); 3970 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3767 in22 = _mm_sub_epi16(stp1_9, stp1_22); 3971 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3768 in23 = _mm_sub_epi16(stp1_8, stp1_23); 3972 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3769 in24 = _mm_sub_epi16(stp1_7, stp1_24); 3973 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3770 in25 = _mm_sub_epi16(stp1_6, stp1_25); 3974 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3771 in26 = _mm_sub_epi16(stp1_5, stp1_26); 3975 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3772 in27 = _mm_sub_epi16(stp1_4, stp1_27); 3976 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3773 in28 = _mm_sub_epi16(stp1_3, stp1_28); 3977 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3774 in29 = _mm_sub_epi16(stp1_2, stp1_29); 3978 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3775 in30 = _mm_sub_epi16(stp1_1, stp1_30); 3979 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3776 in31 = _mm_sub_epi16(stp1_0, stp1_31); 3980 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3777 3981
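The 32 adds and subtracts above are the final butterfly of the 1-D transform: output k pairs with output 31 - k. In loop form (illustrative only; stp1 is treated as an array here, whereas the source keeps each stp1_* in a named register):

for (k = 0; k < 16; ++k) {
  in[k]      = _mm_add_epi16(stp1[k], stp1[31 - k]);
  in[31 - k] = _mm_sub_epi16(stp1[k], stp1[31 - k]);
}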
3778 // Final rounding and shift 3982 // Final rounding and shift
3779 in0 = _mm_adds_epi16(in0, final_rounding); 3983 in[0] = _mm_adds_epi16(in[0], final_rounding);
3780 in1 = _mm_adds_epi16(in1, final_rounding); 3984 in[1] = _mm_adds_epi16(in[1], final_rounding);
3781 in2 = _mm_adds_epi16(in2, final_rounding); 3985 in[2] = _mm_adds_epi16(in[2], final_rounding);
3782 in3 = _mm_adds_epi16(in3, final_rounding); 3986 in[3] = _mm_adds_epi16(in[3], final_rounding);
3783 in4 = _mm_adds_epi16(in4, final_rounding); 3987 in[4] = _mm_adds_epi16(in[4], final_rounding);
3784 in5 = _mm_adds_epi16(in5, final_rounding); 3988 in[5] = _mm_adds_epi16(in[5], final_rounding);
3785 in6 = _mm_adds_epi16(in6, final_rounding); 3989 in[6] = _mm_adds_epi16(in[6], final_rounding);
3786 in7 = _mm_adds_epi16(in7, final_rounding); 3990 in[7] = _mm_adds_epi16(in[7], final_rounding);
3787 in8 = _mm_adds_epi16(in8, final_rounding); 3991 in[8] = _mm_adds_epi16(in[8], final_rounding);
3788 in9 = _mm_adds_epi16(in9, final_rounding); 3992 in[9] = _mm_adds_epi16(in[9], final_rounding);
3789 in10 = _mm_adds_epi16(in10, final_rounding); 3993 in[10] = _mm_adds_epi16(in[10], final_rounding);
3790 in11 = _mm_adds_epi16(in11, final_rounding); 3994 in[11] = _mm_adds_epi16(in[11], final_rounding);
3791 in12 = _mm_adds_epi16(in12, final_rounding); 3995 in[12] = _mm_adds_epi16(in[12], final_rounding);
3792 in13 = _mm_adds_epi16(in13, final_rounding); 3996 in[13] = _mm_adds_epi16(in[13], final_rounding);
3793 in14 = _mm_adds_epi16(in14, final_rounding); 3997 in[14] = _mm_adds_epi16(in[14], final_rounding);
3794 in15 = _mm_adds_epi16(in15, final_rounding); 3998 in[15] = _mm_adds_epi16(in[15], final_rounding);
3795 in16 = _mm_adds_epi16(in16, final_rounding); 3999 in[16] = _mm_adds_epi16(in[16], final_rounding);
3796 in17 = _mm_adds_epi16(in17, final_rounding); 4000 in[17] = _mm_adds_epi16(in[17], final_rounding);
3797 in18 = _mm_adds_epi16(in18, final_rounding); 4001 in[18] = _mm_adds_epi16(in[18], final_rounding);
3798 in19 = _mm_adds_epi16(in19, final_rounding); 4002 in[19] = _mm_adds_epi16(in[19], final_rounding);
3799 in20 = _mm_adds_epi16(in20, final_rounding); 4003 in[20] = _mm_adds_epi16(in[20], final_rounding);
3800 in21 = _mm_adds_epi16(in21, final_rounding); 4004 in[21] = _mm_adds_epi16(in[21], final_rounding);
3801 in22 = _mm_adds_epi16(in22, final_rounding); 4005 in[22] = _mm_adds_epi16(in[22], final_rounding);
3802 in23 = _mm_adds_epi16(in23, final_rounding); 4006 in[23] = _mm_adds_epi16(in[23], final_rounding);
3803 in24 = _mm_adds_epi16(in24, final_rounding); 4007 in[24] = _mm_adds_epi16(in[24], final_rounding);
3804 in25 = _mm_adds_epi16(in25, final_rounding); 4008 in[25] = _mm_adds_epi16(in[25], final_rounding);
3805 in26 = _mm_adds_epi16(in26, final_rounding); 4009 in[26] = _mm_adds_epi16(in[26], final_rounding);
3806 in27 = _mm_adds_epi16(in27, final_rounding); 4010 in[27] = _mm_adds_epi16(in[27], final_rounding);
3807 in28 = _mm_adds_epi16(in28, final_rounding); 4011 in[28] = _mm_adds_epi16(in[28], final_rounding);
3808 in29 = _mm_adds_epi16(in29, final_rounding); 4012 in[29] = _mm_adds_epi16(in[29], final_rounding);
3809 in30 = _mm_adds_epi16(in30, final_rounding); 4013 in[30] = _mm_adds_epi16(in[30], final_rounding);
3810 in31 = _mm_adds_epi16(in31, final_rounding); 4014 in[31] = _mm_adds_epi16(in[31], final_rounding);
3811 4015
3812 in0 = _mm_srai_epi16(in0, 6); 4016 in[0] = _mm_srai_epi16(in[0], 6);
3813 in1 = _mm_srai_epi16(in1, 6); 4017 in[1] = _mm_srai_epi16(in[1], 6);
3814 in2 = _mm_srai_epi16(in2, 6); 4018 in[2] = _mm_srai_epi16(in[2], 6);
3815 in3 = _mm_srai_epi16(in3, 6); 4019 in[3] = _mm_srai_epi16(in[3], 6);
3816 in4 = _mm_srai_epi16(in4, 6); 4020 in[4] = _mm_srai_epi16(in[4], 6);
3817 in5 = _mm_srai_epi16(in5, 6); 4021 in[5] = _mm_srai_epi16(in[5], 6);
3818 in6 = _mm_srai_epi16(in6, 6); 4022 in[6] = _mm_srai_epi16(in[6], 6);
3819 in7 = _mm_srai_epi16(in7, 6); 4023 in[7] = _mm_srai_epi16(in[7], 6);
3820 in8 = _mm_srai_epi16(in8, 6); 4024 in[8] = _mm_srai_epi16(in[8], 6);
3821 in9 = _mm_srai_epi16(in9, 6); 4025 in[9] = _mm_srai_epi16(in[9], 6);
3822 in10 = _mm_srai_epi16(in10, 6); 4026 in[10] = _mm_srai_epi16(in[10], 6);
3823 in11 = _mm_srai_epi16(in11, 6); 4027 in[11] = _mm_srai_epi16(in[11], 6);
3824 in12 = _mm_srai_epi16(in12, 6); 4028 in[12] = _mm_srai_epi16(in[12], 6);
3825 in13 = _mm_srai_epi16(in13, 6); 4029 in[13] = _mm_srai_epi16(in[13], 6);
3826 in14 = _mm_srai_epi16(in14, 6); 4030 in[14] = _mm_srai_epi16(in[14], 6);
3827 in15 = _mm_srai_epi16(in15, 6); 4031 in[15] = _mm_srai_epi16(in[15], 6);
3828 in16 = _mm_srai_epi16(in16, 6); 4032 in[16] = _mm_srai_epi16(in[16], 6);
3829 in17 = _mm_srai_epi16(in17, 6); 4033 in[17] = _mm_srai_epi16(in[17], 6);
3830 in18 = _mm_srai_epi16(in18, 6); 4034 in[18] = _mm_srai_epi16(in[18], 6);
3831 in19 = _mm_srai_epi16(in19, 6); 4035 in[19] = _mm_srai_epi16(in[19], 6);
3832 in20 = _mm_srai_epi16(in20, 6); 4036 in[20] = _mm_srai_epi16(in[20], 6);
3833 in21 = _mm_srai_epi16(in21, 6); 4037 in[21] = _mm_srai_epi16(in[21], 6);
3834 in22 = _mm_srai_epi16(in22, 6); 4038 in[22] = _mm_srai_epi16(in[22], 6);
3835 in23 = _mm_srai_epi16(in23, 6); 4039 in[23] = _mm_srai_epi16(in[23], 6);
3836 in24 = _mm_srai_epi16(in24, 6); 4040 in[24] = _mm_srai_epi16(in[24], 6);
3837 in25 = _mm_srai_epi16(in25, 6); 4041 in[25] = _mm_srai_epi16(in[25], 6);
3838 in26 = _mm_srai_epi16(in26, 6); 4042 in[26] = _mm_srai_epi16(in[26], 6);
3839 in27 = _mm_srai_epi16(in27, 6); 4043 in[27] = _mm_srai_epi16(in[27], 6);
3840 in28 = _mm_srai_epi16(in28, 6); 4044 in[28] = _mm_srai_epi16(in[28], 6);
3841 in29 = _mm_srai_epi16(in29, 6); 4045 in[29] = _mm_srai_epi16(in[29], 6);
3842 in30 = _mm_srai_epi16(in30, 6); 4046 in[30] = _mm_srai_epi16(in[30], 6);
3843 in31 = _mm_srai_epi16(in31, 6); 4047 in[31] = _mm_srai_epi16(in[31], 6);
3844 4048
3845 RECON_AND_STORE(dest, in0); 4049 RECON_AND_STORE(dest, in[0]);
3846 RECON_AND_STORE(dest, in1); 4050 RECON_AND_STORE(dest, in[1]);
3847 RECON_AND_STORE(dest, in2); 4051 RECON_AND_STORE(dest, in[2]);
3848 RECON_AND_STORE(dest, in3); 4052 RECON_AND_STORE(dest, in[3]);
3849 RECON_AND_STORE(dest, in4); 4053 RECON_AND_STORE(dest, in[4]);
3850 RECON_AND_STORE(dest, in5); 4054 RECON_AND_STORE(dest, in[5]);
3851 RECON_AND_STORE(dest, in6); 4055 RECON_AND_STORE(dest, in[6]);
3852 RECON_AND_STORE(dest, in7); 4056 RECON_AND_STORE(dest, in[7]);
3853 RECON_AND_STORE(dest, in8); 4057 RECON_AND_STORE(dest, in[8]);
3854 RECON_AND_STORE(dest, in9); 4058 RECON_AND_STORE(dest, in[9]);
3855 RECON_AND_STORE(dest, in10); 4059 RECON_AND_STORE(dest, in[10]);
3856 RECON_AND_STORE(dest, in11); 4060 RECON_AND_STORE(dest, in[11]);
3857 RECON_AND_STORE(dest, in12); 4061 RECON_AND_STORE(dest, in[12]);
3858 RECON_AND_STORE(dest, in13); 4062 RECON_AND_STORE(dest, in[13]);
3859 RECON_AND_STORE(dest, in14); 4063 RECON_AND_STORE(dest, in[14]);
3860 RECON_AND_STORE(dest, in15); 4064 RECON_AND_STORE(dest, in[15]);
3861 RECON_AND_STORE(dest, in16); 4065 RECON_AND_STORE(dest, in[16]);
3862 RECON_AND_STORE(dest, in17); 4066 RECON_AND_STORE(dest, in[17]);
3863 RECON_AND_STORE(dest, in18); 4067 RECON_AND_STORE(dest, in[18]);
3864 RECON_AND_STORE(dest, in19); 4068 RECON_AND_STORE(dest, in[19]);
3865 RECON_AND_STORE(dest, in20); 4069 RECON_AND_STORE(dest, in[20]);
3866 RECON_AND_STORE(dest, in21); 4070 RECON_AND_STORE(dest, in[21]);
3867 RECON_AND_STORE(dest, in22); 4071 RECON_AND_STORE(dest, in[22]);
3868 RECON_AND_STORE(dest, in23); 4072 RECON_AND_STORE(dest, in[23]);
3869 RECON_AND_STORE(dest, in24); 4073 RECON_AND_STORE(dest, in[24]);
3870 RECON_AND_STORE(dest, in25); 4074 RECON_AND_STORE(dest, in[25]);
3871 RECON_AND_STORE(dest, in26); 4075 RECON_AND_STORE(dest, in[26]);
3872 RECON_AND_STORE(dest, in27); 4076 RECON_AND_STORE(dest, in[27]);
3873 RECON_AND_STORE(dest, in28); 4077 RECON_AND_STORE(dest, in[28]);
3874 RECON_AND_STORE(dest, in29); 4078 RECON_AND_STORE(dest, in[29]);
3875 RECON_AND_STORE(dest, in30); 4079 RECON_AND_STORE(dest, in[30]);
3876 RECON_AND_STORE(dest, in31); 4080 RECON_AND_STORE(dest, in[31]);
3877 4081
3878 dest += 8 - (stride * 32); 4082 dest += 8 - (stride * 32);
3879 } 4083 }
3880 }
3881 } //NOLINT 4084 } //NOLINT
3882 4085
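Taken together, the restructuring turns the old single loop of eight iterations with an i < 4 branch into two symmetric four-iteration passes: the first transforms each 8x32 strip of coefficients (with the all-zero early-out) and parks the results in col[128]; the second transposes the col[] slices back into in[], runs IDCT32_1D again, then rounds, shifts right by six, and reconstructs into dest. This is the standard separable form of the 2-D transform, with the transpose between passes converting the row transform into a column transform.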
3883 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 4086 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
3884 __m128i dc_value; 4087 __m128i dc_value;
3885 const __m128i zero = _mm_setzero_si128(); 4088 const __m128i zero = _mm_setzero_si128();
3886 int a, i; 4089 int a, i;
3887 4090
3888 a = dct_const_round_shift(input[0] * cospi_16_64); 4091 a = dct_const_round_shift(input[0] * cospi_16_64);
3889 a = dct_const_round_shift(a * cospi_16_64); 4092 a = dct_const_round_shift(a * cospi_16_64);
3890 a = ROUND_POWER_OF_TWO(a, 6); 4093 a = ROUND_POWER_OF_TWO(a, 6);
(...skipping 29 matching lines...)
3920 RECON_AND_STORE(dest, dc_value); 4123 RECON_AND_STORE(dest, dc_value);
3921 RECON_AND_STORE(dest, dc_value); 4124 RECON_AND_STORE(dest, dc_value);
3922 RECON_AND_STORE(dest, dc_value); 4125 RECON_AND_STORE(dest, dc_value);
3923 RECON_AND_STORE(dest, dc_value); 4126 RECON_AND_STORE(dest, dc_value);
3924 RECON_AND_STORE(dest, dc_value); 4127 RECON_AND_STORE(dest, dc_value);
3925 RECON_AND_STORE(dest, dc_value); 4128 RECON_AND_STORE(dest, dc_value);
3926 RECON_AND_STORE(dest, dc_value); 4129 RECON_AND_STORE(dest, dc_value);
3927 dest += 8 - (stride * 32); 4130 dest += 8 - (stride * 32);
3928 } 4131 }
3929 } 4132 }
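The DC-only variant above exploits the fact that when only coefficient 0 is nonzero, the whole 32x32 inverse transform collapses to adding one rounded constant to every pixel: cospi_16_64 is applied once per 1-D pass, then the usual shift by six. A scalar sketch of the same computation, assuming clip_pixel, dct_const_round_shift, and ROUND_POWER_OF_TWO from the vp9 headers included at the top of this file:

static void idct32x32_1_add_sketch(const int16_t *input, uint8_t *dest,
                                   int stride) {
  int r, c;
  int a = dct_const_round_shift(input[0] * cospi_16_64);  /* row pass    */
  a = dct_const_round_shift(a * cospi_16_64);             /* column pass */
  a = ROUND_POWER_OF_TWO(a, 6);                           /* final shift */
  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c)
      dest[c] = clip_pixel(dest[c] + a);
    dest += stride;
  }
}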