Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: source/libvpx/vp9/common/mips/msa/vp9_idct_msa.h

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #ifndef VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_
12 #define VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_
13
14 #include <assert.h>
15
16 #include "vpx_ports/mem.h"
17 #include "vp9/common/vp9_idct.h"
18 #include "vp9/common/mips/msa/vp9_macros_msa.h"
19
/*
 * VP9_DOTP_CONST_PAIR: combine two halfword vectors (reg0, reg1) with a pair
 * of scalar constants (cnst0, cnst1) and write two v8i16 results.
 *
 * cnst0 and cnst1 are broadcast with __msa_fill_h and interleaved into k0_m.
 * The pairs (-reg1, reg0) and (reg0, reg1) are interleaved, run through
 * DOTP_SH2_SW against k0_m, rounded by SRARI_W2_SW with DCT_CONST_BITS, and
 * the even halfwords of the 32-bit results are packed into out0 and out1.
 *
 * NOTE(review): this looks like out0 = reg0 * cnst0 - reg1 * cnst1 and
 * out1 = reg1 * cnst0 + reg0 * cnst1 (both rounded) -- confirm against the
 * helper macros in vp9_macros_msa.h.
 *
 * reg0 and reg1 are each expanded twice, so pass side-effect-free operands.
 */
20 #define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
21 v8i16 k0_m = __msa_fill_h(cnst0); \
22 v4i32 s0_m, s1_m, s2_m, s3_m; \
23 \
24 s0_m = (v4i32)__msa_fill_h(cnst1); \
25 k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
26 \
27 ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
28 ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
29 DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
30 SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
31 out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
32 \
33 DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
34 SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
35 out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
36 }
37
/*
 * VP9_DOT_ADD_SUB_SRARI_PCK: dot-product, butterfly, round and pack.
 *
 * in0..in3 are the data vectors and in4..in7 the constant vectors handed to
 * DOTP_SH4_SW (in0/in1 are paired with in4 and in5, in2/in3 with in6 and
 * in7). The eight 32-bit results go through two BUTTERFLY_4 steps, are
 * rounded by SRARI_W4_SW with DCT_CONST_BITS and packed by PCKEV_H4_SH into
 * dst0..dst3.
 *
 * Several temporaries (tp7_m, tp5_m, tp4_m, tp2_m, ...) are used both as
 * inputs and outputs of the BUTTERFLY_4 calls, so the statement order
 * matters.
 * NOTE(review): the name suggests the butterflies form sums and differences
 * of the dot products -- confirm against BUTTERFLY_4 in vp9_macros_msa.h.
 */
38 #define VP9_DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \
39 dst0, dst1, dst2, dst3) { \
40 v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \
41 v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \
42 \
43 DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \
44 tp0_m, tp2_m, tp3_m, tp4_m); \
45 DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \
46 tp5_m, tp6_m, tp7_m, tp8_m); \
47 BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \
48 BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \
49 SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \
50 SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \
51 PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \
52 dst0, dst1, dst2, dst3); \
53 }
54
/*
 * VP9_DOT_SHIFT_RIGHT_PCK_H: expression macro (GNU statement expression)
 * that yields a v8i16.
 *
 * in0 and in1 are each passed through DOTP_SH2_SW against the same constant
 * vector in2, the two 32-bit results are rounded by SRARI_W2_SW with
 * DCT_CONST_BITS, and their even halfwords are packed with __msa_pckev_h
 * into the returned value dst_m.
 *
 * in2 is expanded twice; pass a side-effect-free operand.
 */
55 #define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \
56 v8i16 dst_m; \
57 v4i32 tp0_m, tp1_m; \
58 \
59 DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \
60 SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \
61 dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
62 \
63 dst_m; \
64 })
65
/*
 * VP9_ADST8: eight-input / eight-output transform built from the cospi_*
 * constant tables coeff0_m and coeff1_m.
 * NOTE(review): presumably one pass of the 8-point inverse ADST -- confirm
 * against the C reference implementation.
 *
 * Outline, as written below:
 *   - Build interleaved constant pairs from coeff0_m and feed (in0, in7,
 *     in4, in3) and then (in2, in5, in6, in1) through
 *     VP9_DOT_ADD_SUB_SRARI_PCK.
 *   - BUTTERFLY_4 on the intermediate results produces out0 and out7
 *     (out7 is negated).
 *   - Constant pairs from coeff1_m drive a further
 *     VP9_DOT_ADD_SUB_SRARI_PCK (giving out1, out6 and scratch s0_m, s1_m)
 *     and four VP9_DOT_SHIFT_RIGHT_PCK_H calls (giving out2..out5).
 *   - out1, out3 and out5 are negated at the end.
 *
 * All of in0..in7 are overwritten: they are used as destinations of the
 * intermediate steps, so callers must not rely on their values afterwards.
 */
66 #define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \
67 out0, out1, out2, out3, out4, out5, out6, out7) { \
68 v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
69 v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
70 v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
71 cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
72 v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, \
73 -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 }; \
74 \
75 SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
76 cnst2_m = -cnst0_m; \
77 ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
78 SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \
79 cnst4_m = -cnst2_m; \
80 ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
81 \
82 ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
83 ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
84 VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
85 cnst1_m, cnst2_m, cnst3_m, in7, in0, \
86 in4, in3); \
87 \
88 SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
89 cnst2_m = -cnst0_m; \
90 ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
91 SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \
92 cnst4_m = -cnst2_m; \
93 ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
94 \
95 ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
96 ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
97 \
98 VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
99 cnst1_m, cnst2_m, cnst3_m, in5, in2, \
100 in6, in1); \
101 BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
102 out7 = -s0_m; \
103 out0 = s1_m; \
104 \
105 SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, \
106 cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
107 \
108 ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
109 cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
110 cnst1_m = cnst0_m; \
111 \
112 ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
113 ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
114 VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \
115 cnst2_m, cnst3_m, cnst1_m, out1, out6, \
116 s0_m, s1_m); \
117 \
118 SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
119 cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
120 \
121 ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
122 ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
123 out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
124 out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
125 out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
126 out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
127 \
128 out1 = -out1; \
129 out3 = -out3; \
130 out5 = -out5; \
131 }
132
/*
 * VP9_MADD_SHORT: interleave the halfword vectors m1 and m0, take dot
 * products of both interleaved halves against the constant vectors c0 and
 * c1 (DOTP_SH4_SW), round by SRARI_W4_SW with DCT_CONST_BITS, and pack the
 * results into res0 (from c0) and res1 (from c1).
 *
 * res0/res1 may alias m0/m1: the inputs are consumed by ILVRL_H2_SH before
 * the outputs are written.
 */
133 #define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) { \
134 v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
135 v8i16 madd_s0_m, madd_s1_m; \
136 \
137 ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \
138 DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \
139 c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \
140 SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \
141 PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \
142 }
143
/*
 * VP9_MADD_BF: dot-product followed by a butterfly, done twice.
 *
 * (inp1, inp0) and (inp3, inp2) are interleaved once. The first pass takes
 * dot products against cst0 (first pair) and cst2 (second pair), combines
 * them with BUTTERFLY_4, rounds by SRARI_W4_SW with DCT_CONST_BITS and
 * packs into out0 and out1. The second pass repeats the same sequence with
 * cst1 and cst3 and packs into out2 and out3.
 *
 * tmp2_m and tmp3_m are both inputs and outputs of BUTTERFLY_4, so the
 * statement order matters.
 */
144 #define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \
145 out0, out1, out2, out3) { \
146 v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
147 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \
148 \
149 ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \
150 ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \
151 DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
152 cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
153 BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
154 m4_m, m5_m, tmp3_m, tmp2_m); \
155 SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
156 PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
157 DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
158 cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
159 BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
160 m4_m, m5_m, tmp3_m, tmp2_m); \
161 SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
162 PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
163 }
164
/*
 * VP9_SET_COSPI_PAIR: expression macro (GNU statement expression) that
 * yields a v8i16 built from two scalar constants.
 *
 * c0_h and c1_h are each broadcast with __msa_fill_h and the two vectors
 * are combined with __msa_ilvev_h(r1_m, r0_m).
 * NOTE(review): the result is presumably { c0, c1, c0, c1, ... } -- confirm
 * the lane order against the MSA ilvev.h definition.
 */
165 #define VP9_SET_COSPI_PAIR(c0_h, c1_h) ({ \
166 v8i16 out0_m, r0_m, r1_m; \
167 \
168 r0_m = __msa_fill_h(c0_h); \
169 r1_m = __msa_fill_h(c1_h); \
170 out0_m = __msa_ilvev_h(r1_m, r0_m); \
171 \
172 out0_m; \
173 })
174
/*
 * VP9_ADDBLK_ST8x4_UB: add four halfword vectors (in0..in3) to four rows
 * read from dst and store the result back to dst.
 *
 * Four rows are loaded with LD_UB4 at a pitch of dst_stride, interleaved
 * with a zero vector (ILVR_B4_SH) into halfword vectors, summed with
 * in0..in3, clamped by CLIP_SH4_0_255, packed back to bytes and written
 * with ST8x4_UB.
 * NOTE(review): the names suggest an 8-pixel-wide by 4-row block of 8-bit
 * samples -- confirm against LD_UB4 / ST8x4_UB in vp9_macros_msa.h.
 *
 * dst is evaluated once (copied into dst_m); dst_stride is expanded twice.
 */
175 #define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) { \
176 uint8_t *dst_m = (uint8_t *) (dst); \
177 v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
178 v16i8 tmp0_m, tmp1_m; \
179 v16i8 zero_m = { 0 }; \
180 v8i16 res0_m, res1_m, res2_m, res3_m; \
181 \
182 LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \
183 ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, \
184 zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m); \
185 ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, \
186 res0_m, res1_m, res2_m, res3_m); \
187 CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
188 PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
189 ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
190 }
191
/*
 * VP9_IDCT4x4: four-input / four-output transform step using the
 * cospi_16_64, cospi_24_64 and cospi_8_64 constants.
 * NOTE(review): presumably one 1-D pass of the 4x4 inverse DCT -- confirm
 * against the C reference implementation.
 *
 * in0/in2 are interleaved (right halves only, via __msa_ilvr_h) and
 * multiplied against the (c16, c16) and (c16, -c16) pairs; in1/in3 are
 * handled likewise against (c24, -c8) and (c8, c24). The four 32-bit
 * results are rounded by SRARI_W4_SW with DCT_CONST_BITS, packed, split
 * with SLDI_B2_0_SW, and combined by BUTTERFLY_4 into out0..out3.
 *
 * Only the right (low) half of each input vector is read by __msa_ilvr_h.
 */
192 #define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \
193 v8i16 c0_m, c1_m, c2_m, c3_m; \
194 v8i16 step0_m, step1_m; \
195 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
196 \
197 c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
198 c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
199 step0_m = __msa_ilvr_h(in2, in0); \
200 DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \
201 \
202 c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
203 c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
204 step1_m = __msa_ilvr_h(in3, in1); \
205 DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \
206 SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
207 \
208 PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \
209 SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \
210 BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, \
211 (v8i16)tmp2_m, (v8i16)tmp3_m, \
212 out0, out1, out2, out3); \
213 }
214
/*
 * VP9_IADST4x4: four-input / four-output transform step driven by the
 * sinpi_1_9 .. sinpi_4_9 constants held (with their negations) in mask_m.
 * NOTE(review): presumably one 1-D pass of the 4x4 inverse ADST -- confirm
 * against the C reference implementation.
 *
 * Four 32-bit accumulators int0_m..int3_m are built from dot products of
 * interleaved input pairs against constant pairs splatted out of mask_m:
 *   - int0_m, int1_m and int2_m are each the sum of two dot products;
 *   - int3_m reuses tmp2_m from the int0_m step and then adds three more
 *     dot products, so the int0_m computation must come first.
 * All four are rounded by SRARI_W4_SW with DCT_CONST_BITS and packed into
 * out0..out3 (each PCKEV packs an accumulator with itself).
 *
 * Inputs are read only through the "right" interleaves (ILVR_H2_SH /
 * __msa_ilvr_h) and are not written.
 */
215 #define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \
216 v8i16 res0_m, res1_m, c0_m, c1_m; \
217 v8i16 k1_m, k2_m, k3_m, k4_m; \
218 v8i16 zero_m = { 0 }; \
219 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
220 v4i32 int0_m, int1_m, int2_m, int3_m; \
221 v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, \
222 sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, \
223 -sinpi_4_9 }; \
224 \
225 SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \
226 ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \
227 ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
228 DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \
229 int0_m = tmp2_m + tmp1_m; \
230 \
231 SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \
232 ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \
233 DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
234 int1_m = tmp0_m + tmp1_m; \
235 \
236 c0_m = __msa_splati_h(mask_m, 6); \
237 ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \
238 ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
239 DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
240 int2_m = tmp0_m + tmp1_m; \
241 \
242 c0_m = __msa_splati_h(mask_m, 6); \
243 c0_m = __msa_ilvev_h(c0_m, k1_m); \
244 \
245 res0_m = __msa_ilvr_h((in1), (in3)); \
246 tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \
247 int3_m = tmp2_m + tmp0_m; \
248 \
249 res0_m = __msa_ilvr_h((in2), (in3)); \
250 c1_m = __msa_ilvev_h(k4_m, k3_m); \
251 \
252 tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \
253 res1_m = __msa_ilvr_h((in0), (in2)); \
254 c1_m = __msa_ilvev_h(k1_m, zero_m); \
255 \
256 tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \
257 int3_m += tmp2_m; \
258 int3_m += tmp3_m; \
259 \
260 SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \
261 PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \
262 PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \
263 }
264
/*
 * VP9_SET_CONST_PAIR: expression macro (GNU statement expression) that
 * yields a v8i16 built from two lanes of an existing constant vector.
 *
 * Lanes idx1_h and idx2_h of mask_h are splatted with SPLATI_H2_SH and the
 * two splats are combined with __msa_ilvev_h(c1_m, c0_m).
 * NOTE(review): the result is presumably { mask_h[idx1_h], mask_h[idx2_h],
 * ... } repeated -- confirm the lane order against the MSA definitions.
 */
265 #define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({ \
266 v8i16 c0_m, c1_m; \
267 \
268 SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
269 c0_m = __msa_ilvev_h(c1_m, c0_m); \
270 \
271 c0_m; \
272 })
273
/*
 * VP9_MADD: two independent dot-product / round / pack sequences.
 *
 * (inp1, inp0) is interleaved and multiplied against cst0 and cst1 to give
 * out0 and out1; (inp3, inp2) is interleaved and multiplied against cst2
 * and cst3 to give out2 and out3. Each set of 32-bit results is rounded by
 * SRARI_W4_SW with DCT_CONST_BITS before being packed to halfwords.
 *
 * Both interleaves happen before any output is written, so the outputs may
 * alias the inputs (VP9_IDCT8x8_1D in this file relies on that).
 */
274 /* multiply and add macro */
275 #define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \
276 out0, out1, out2, out3) { \
277 v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
278 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
279 \
280 ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \
281 ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \
282 DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, \
283 cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
284 SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
285 PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \
286 DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, \
287 cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
288 SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
289 PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \
290 }
291
/*
 * VP9_IDCT8x8_1D: eight-input / eight-output transform pass.
 * NOTE(review): per the existing comment this is the 8x8 inverse DCT;
 * presumably a single 1-D pass over eight vectors -- confirm with callers.
 *
 * Outline, as written below:
 *   - Odd inputs (in1, in7, in3, in5) go through VP9_MADD with constant
 *     pairs taken from mask_m, are differenced (SUB2), rotated by the
 *     cospi_16_64 pairs, and produce tp4_m..tp7_m.
 *   - Even inputs (in0, in4, in2, in6) go through VP9_MADD and BUTTERFLY_4
 *     to produce tp0_m..tp3_m.
 *   - BUTTERFLY_8 combines tp0_m..tp7_m into out0..out7.
 *
 * All of in0..in7 are overwritten: they are passed as the output operands
 * of the two VP9_MADD calls, so callers must not rely on their values
 * afterwards.
 */
292 /* idct 8x8 macro */
293 #define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
294 out0, out1, out2, out3, out4, out5, out6, out7) { \
295 v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \
296 v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \
297 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
298 v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \
299 cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \
300 \
301 k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \
302 k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \
303 k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \
304 k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \
305 VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
306 SUB2(in1, in3, in7, in5, res0_m, res1_m); \
307 k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \
308 k1_m = __msa_splati_h(mask_m, 4); \
309 \
310 ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \
311 DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \
312 tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
313 SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
314 tp4_m = in1 + in3; \
315 PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \
316 tp7_m = in7 + in5; \
317 k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
318 k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
319 VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, \
320 in0, in4, in2, in6); \
321 BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \
322 BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, \
323 out0, out1, out2, out3, out4, out5, out6, out7); \
324 }
325
/*
 * VP9_IADST8x8_1D: eight-input / eight-output transform pass driven by the
 * cospi_* constant tables mask1_m, mask2_m and mask3_m.
 * NOTE(review): presumably a single 1-D pass of the 8x8 inverse ADST --
 * confirm against the C reference implementation.
 *
 * Outline, as written below:
 *   - Pairs (in1, in0) and (in5, in4), then (in3, in2) and (in7, in6), are
 *     interleaved and multiplied against constant pairs; sums (ADD4) and
 *     differences (SUB4) of the dot products are rounded by SRARI_W4_SW
 *     with DCT_CONST_BITS and packed into res0_m..res3_m, t0_m/t1_m and
 *     r2_m/r3_m.
 *   - BUTTERFLY_4 on res0_m..res3_m yields out0 and scratch in7, in4, in3.
 *   - Further dot-product / add / sub / round / pack steps produce out6,
 *     out4, out2 and scratch in1, in2, in3, in5.
 *   - out1, out3, out5 and out7 are the negations of in1, in3, in5, in7.
 *
 * in1, in2, in3, in4, in5 and in7 are overwritten as scratch; in0 and in6
 * are only read. The r*_m / m*_m temporaries are reused heavily, so the
 * statement order matters.
 */
326 #define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \
327 out0, out1, out2, out3, out4, out5, out6, out7) { \
328 v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \
329 v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \
330 v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \
331 v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, \
332 cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \
333 v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, \
334 cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \
335 v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64, \
336 -cospi_16_64, 0, 0, 0, 0 }; \
337 \
338 k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \
339 k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \
340 ILVRL_H2_SH(in1, in0, in_s1, in_s0); \
341 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
342 r0_m, r1_m, r2_m, r3_m); \
343 k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \
344 k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \
345 ILVRL_H2_SH(in5, in4, in_s1, in_s0); \
346 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
347 r4_m, r5_m, r6_m, r7_m); \
348 ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
349 m0_m, m1_m, m2_m, m3_m); \
350 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
351 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \
352 SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
353 m0_m, m1_m, m2_m, m3_m); \
354 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
355 PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \
356 k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \
357 k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \
358 ILVRL_H2_SH(in3, in2, in_s1, in_s0); \
359 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
360 r0_m, r1_m, r2_m, r3_m); \
361 k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \
362 k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \
363 ILVRL_H2_SH(in7, in6, in_s1, in_s0); \
364 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
365 r4_m, r5_m, r6_m, r7_m); \
366 ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
367 m0_m, m1_m, m2_m, m3_m); \
368 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
369 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \
370 SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \
371 m0_m, m1_m, m2_m, m3_m); \
372 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
373 PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \
374 ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \
375 BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \
376 k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \
377 k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \
378 ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \
379 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
380 r0_m, r1_m, r2_m, r3_m); \
381 k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \
382 DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \
383 r4_m, r5_m, r6_m, r7_m); \
384 ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \
385 m0_m, m1_m, m2_m, m3_m); \
386 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
387 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \
388 SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \
389 m0_m, m1_m, m2_m, m3_m); \
390 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
391 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \
392 k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \
393 k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \
394 ILVRL_H2_SH(in4, in3, in_s1, in_s0); \
395 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \
396 m0_m, m1_m, m2_m, m3_m); \
397 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
398 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \
399 ILVRL_H2_SW(in5, in2, m2_m, m3_m); \
400 DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \
401 m0_m, m1_m, m2_m, m3_m); \
402 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
403 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \
404 \
405 out1 = -in1; \
406 out3 = -in3; \
407 out5 = -in5; \
408 out7 = -in7; \
409 }
410
/*
 * VP9_IADST8x16_1D: sixteen-input (r0..r15) / sixteen-output (out0..out15)
 * transform pass, organised in the four stages marked below.
 * NOTE(review): presumably a single 1-D pass of the 16-point inverse ADST
 * over 8-lane vectors -- confirm against the C reference implementation.
 *
 *   stage 1: four VP9_MADD_BF calls on input pairs with odd-indexed cospi
 *            constants, producing g0_m..g15_m.
 *   stage 2: two VP9_MADD_BF calls (cospi_4/28 and cospi_12/20 pairs)
 *            produce h0_m..h7_m; BUTTERFLY_4 writes out8..out11 and
 *            BUTTERFLY_8 produces h8_m..h11_m while overwriting h6_m,
 *            h4_m, h2_m and h0_m.
 *   stage 3: BUTTERFLY_4 writes out0 and out1; two VP9_MADD_BF calls
 *            (cospi_8/24 pairs) write out4..out7 and out12..out15.
 *   stage 4: VP9_MADD_SHORT with cospi_16_64 pairs writes out2 and out3 and
 *            updates out6/out7, out10/out11 and out14/out15 in place.
 *
 * The r0..r15 inputs are only read. Several outputs are written in an
 * earlier stage and then rewritten in stage 4, so the outputs must be
 * distinct lvalues from one another.
 */
411 #define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, \
412 r9, r10, r11, r12, r13, r14, r15, \
413 out0, out1, out2, out3, out4, out5, \
414 out6, out7, out8, out9, out10, out11, \
415 out12, out13, out14, out15) { \
416 v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
417 v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
418 v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
419 v8i16 h8_m, h9_m, h10_m, h11_m; \
420 v8i16 k0_m, k1_m, k2_m, k3_m; \
421 \
422 /* stage 1 */ \
423 k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
424 k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
425 k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
426 k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
427 VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \
428 g0_m, g1_m, g2_m, g3_m); \
429 k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
430 k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
431 k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
432 k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
433 VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \
434 g4_m, g5_m, g6_m, g7_m); \
435 k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
436 k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
437 k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
438 k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
439 VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \
440 g8_m, g9_m, g10_m, g11_m); \
441 k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
442 k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
443 k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
444 k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
445 VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \
446 g12_m, g13_m, g14_m, g15_m); \
447 \
448 /* stage 2 */ \
449 k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
450 k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
451 k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
452 VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \
453 h0_m, h1_m, h2_m, h3_m); \
454 k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
455 k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
456 k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
457 VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \
458 h4_m, h5_m, h6_m, h7_m); \
459 BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
460 BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \
461 h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
462 \
463 /* stage 3 */ \
464 BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
465 k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
466 k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
467 k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
468 VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \
469 out4, out6, out5, out7); \
470 VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \
471 out12, out14, out13, out15); \
472 \
473 /* stage 4 */ \
474 k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
475 k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
476 k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
477 k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
478 VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
479 VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
480 VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
481 VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
482 }
483 #endif /* VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_ */
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c ('k') | source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698