Chromium Code Reviews

Unified Diff: source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c

Issue 1162573005: libvpx: Pull from upstream (Closed)
Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
 /*
  * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
  * tree. An additional intellectual property rights grant can be found
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */

-#include <assert.h>
-
-#include "vpx_ports/mem.h"
-#include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
-
-#define SET_COSPI_PAIR(c0_h, c1_h) ({ \
-  v8i16 out0, r0_m, r1_m; \
-  \
-  r0_m = __msa_fill_h(c0_h); \
-  r1_m = __msa_fill_h(c1_h); \
-  out0 = __msa_ilvev_h(r1_m, r0_m); \
-  \
-  out0; \
-})
-
-#define DOTP_CONST_PAIR(reg0, reg1, const0, const1, out0, out1) { \
-  v8i16 k0_m = __msa_fill_h(const0); \
-  v8i16 s0_m, s1_m, s2_m, s3_m; \
-  \
-  s0_m = __msa_fill_h(const1); \
-  k0_m = __msa_ilvev_h(s0_m, k0_m); \
-  \
-  s0_m = __msa_ilvl_h(-reg1, reg0); \
-  s1_m = __msa_ilvr_h(-reg1, reg0); \
-  s2_m = __msa_ilvl_h(reg0, reg1); \
-  s3_m = __msa_ilvr_h(reg0, reg1); \
-  s1_m = (v8i16)__msa_dotp_s_w(s1_m, k0_m); \
-  s0_m = (v8i16)__msa_dotp_s_w(s0_m, k0_m); \
-  s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \
-  s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \
-  out0 = __msa_pckev_h(s0_m, s1_m); \
-  \
-  s1_m = (v8i16)__msa_dotp_s_w(s3_m, k0_m); \
-  s0_m = (v8i16)__msa_dotp_s_w(s2_m, k0_m); \
-  s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \
-  s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \
-  out1 = __msa_pckev_h(s0_m, s1_m); \
-}
-
-#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) { \
-  v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
-  v8i16 madd_s0_m, madd_s1_m; \
-  \
-  ILV_H_LR_SH(m0, m1, madd_s1_m, madd_s0_m); \
-  \
-  DOTP_S_W_4VECS_SW(madd_s0_m, c0, madd_s1_m, c0, \
-                    madd_s0_m, c1, madd_s1_m, c1, \
-                    madd0_m, madd1_m, madd2_m, madd3_m); \
-  \
-  SRARI_W_4VECS_SW(madd0_m, madd1_m, madd2_m, madd3_m, \
-                   madd0_m, madd1_m, madd2_m, madd3_m, \
-                   DCT_CONST_BITS); \
-  \
-  PCKEV_H_2VECS_SH(madd1_m, madd0_m, madd3_m, madd2_m, \
-                   res0, res1); \
-}
-
-#define VP9_MADD_BF(inp0, inp1, inp2, inp3, \
-                    cst0, cst1, cst2, cst3, \
-                    out0, out1, out2, out3) { \
-  v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
-  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
-  v4i32 m4_m, m5_m; \
-  \
-  ILV_H_LRLR_SH(inp0, inp1, inp2, inp3, \
-                madd_s1_m, madd_s0_m, madd_s3_m, madd_s2_m); \
-  \
-  DOTP_S_W_4VECS_SW(madd_s0_m, cst0, madd_s1_m, cst0, \
-                    madd_s2_m, cst2, madd_s3_m, cst2, \
-                    tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
-  \
-  m4_m = tmp0_m + tmp2_m; \
-  m5_m = tmp1_m + tmp3_m; \
-  tmp3_m = tmp1_m - tmp3_m; \
-  tmp2_m = tmp0_m - tmp2_m; \
-  \
-  SRARI_W_4VECS_SW(m4_m, m5_m, tmp2_m, tmp3_m, \
-                   m4_m, m5_m, tmp2_m, tmp3_m, \
-                   DCT_CONST_BITS); \
-  \
-  PCKEV_H_2VECS_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
-  \
-  DOTP_S_W_4VECS_SW(madd_s0_m, cst1, madd_s1_m, cst1, \
-                    madd_s2_m, cst3, madd_s3_m, cst3, \
-                    tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
-  \
-  m4_m = tmp0_m + tmp2_m; \
-  m5_m = tmp1_m + tmp3_m; \
-  tmp3_m = tmp1_m - tmp3_m; \
-  tmp2_m = tmp0_m - tmp2_m; \
-  \
-  SRARI_W_4VECS_SW(m4_m, m5_m, tmp2_m, tmp3_m, \
-                   m4_m, m5_m, tmp2_m, tmp3_m, \
-                   DCT_CONST_BITS); \
-  \
-  PCKEV_H_2VECS_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
-}
-
-#define TRANSPOSE8x8_H1(in0, in1, in2, in3, \
-                        in4, in5, in6, in7, \
-                        out0, out1, out2, out3, \
-                        out4, out5, out6, out7) { \
-  v8i16 loc0_m, loc1_m; \
-  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
-  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
-  \
-  loc0_m = __msa_ilvr_h((in6), (in4)); \
-  loc1_m = __msa_ilvr_h((in7), (in5)); \
-  tmp0_m = __msa_ilvr_h(loc1_m, loc0_m); \
-  tmp1_m = __msa_ilvl_h(loc1_m, loc0_m); \
-  \
-  loc0_m = __msa_ilvl_h((in6), (in4)); \
-  loc1_m = __msa_ilvl_h((in7), (in5)); \
-  tmp2_m = __msa_ilvr_h(loc1_m, loc0_m); \
-  tmp3_m = __msa_ilvl_h(loc1_m, loc0_m); \
-  \
-  loc0_m = __msa_ilvr_h((in2), (in0)); \
-  loc1_m = __msa_ilvr_h((in3), (in1)); \
-  tmp4_m = __msa_ilvr_h(loc1_m, loc0_m); \
-  tmp5_m = __msa_ilvl_h(loc1_m, loc0_m); \
-  \
-  loc0_m = __msa_ilvl_h((in2), (in0)); \
-  loc1_m = __msa_ilvl_h((in3), (in1)); \
-  tmp6_m = __msa_ilvr_h(loc1_m, loc0_m); \
-  tmp7_m = __msa_ilvl_h(loc1_m, loc0_m); \
-  \
-  out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
-  out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
-  out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
-  out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
-  out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
-  out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
-  out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
-  out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
-}
-
-#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, \
-                         r8, r9, r10, r11, r12, r13, r14, r15, \
-                         out0, out1, out2, out3, out4, out5, out6, out7, \
-                         out8, out9, out10, out11, \
-                         out12, out13, out14, out15) { \
-  v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
-  v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
-  v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
-  v8i16 h8_m, h9_m, h10_m, h11_m; \
-  v8i16 k0_m, k1_m, k2_m, k3_m; \
-  \
-  /* stage 1 */ \
-  k0_m = SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
-  k1_m = SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
-  k2_m = SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
-  k3_m = SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
-  VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \
-              g0_m, g1_m, g2_m, g3_m); \
-  \
-  k0_m = SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
-  k1_m = SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
-  k2_m = SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
-  k3_m = SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
-  VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \
-              g4_m, g5_m, g6_m, g7_m); \
-  \
-  k0_m = SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
-  k1_m = SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
-  k2_m = SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
-  k3_m = SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
-  VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \
-              g8_m, g9_m, g10_m, g11_m); \
-  \
-  k0_m = SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
-  k1_m = SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
-  k2_m = SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
-  k3_m = SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
-  VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \
-              g12_m, g13_m, g14_m, g15_m); \
-  \
-  /* stage 2 */ \
-  k0_m = SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
-  k1_m = SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
-  k2_m = SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
-  VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \
-              h0_m, h1_m, h2_m, h3_m); \
-  \
-  k0_m = SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
-  k1_m = SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
-  k2_m = SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
-  VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \
-              h4_m, h5_m, h6_m, h7_m); \
-  \
-  BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
-  \
-  BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \
-              h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
-  \
-  /* stage 3 */ \
-  BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
-  \
-  k0_m = SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
-  k1_m = SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
-  k2_m = SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
-  VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \
-              out4, out6, out5, out7); \
-  VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \
-              out12, out14, out13, out15); \
-  \
-  /* stage 4 */ \
-  k0_m = SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
-  k1_m = SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
-  k2_m = SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
-  k3_m = SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
-  VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
-  VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
-  VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
-  VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
-}
-
-#define VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride, \
-                                            in0, in1, in2, in3) { \
-  uint64_t out0_m, out1_m, out2_m, out3_m; \
-  v8i16 res0_m, res1_m, res2_m, res3_m; \
-  v16u8 dest0_m, dest1_m, dest2_m, dest3_m; \
-  v16i8 tmp0_m, tmp1_m; \
-  v16i8 zero_m = { 0 }; \
-  uint8_t *dst_m = (uint8_t *)(dest); \
-  \
-  LOAD_4VECS_UB(dst_m, (dest_stride), \
-                dest0_m, dest1_m, dest2_m, dest3_m); \
-  \
-  res0_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest0_m); \
-  res1_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest1_m); \
-  res2_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest2_m); \
-  res3_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest3_m); \
-  \
-  res0_m += (v8i16)(in0); \
-  res1_m += (v8i16)(in1); \
-  res2_m += (v8i16)(in2); \
-  res3_m += (v8i16)(in3); \
-  \
-  res0_m = CLIP_UNSIGNED_CHAR_H(res0_m); \
-  res1_m = CLIP_UNSIGNED_CHAR_H(res1_m); \
-  res2_m = CLIP_UNSIGNED_CHAR_H(res2_m); \
-  res3_m = CLIP_UNSIGNED_CHAR_H(res3_m); \
-  \
-  tmp0_m = __msa_pckev_b((v16i8)res1_m, (v16i8)res0_m); \
-  tmp1_m = __msa_pckev_b((v16i8)res3_m, (v16i8)res2_m); \
-  \
-  out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
-  out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
-  out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
-  out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
-  \
-  STORE_DWORD(dst_m, out0_m); \
-  dst_m += (dest_stride); \
-  STORE_DWORD(dst_m, out1_m); \
-  dst_m += (dest_stride); \
-  STORE_DWORD(dst_m, out2_m); \
-  dst_m += (dest_stride); \
-  STORE_DWORD(dst_m, out3_m); \
-}
+#include "vp9/common/mips/msa/vp9_idct_msa.h"

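Note: the macros deleted above now come from vp9/common/mips/msa/vp9_idct_msa.h (directly or via its includes), as the new call sites below show; some are renamed with a VP9_ prefix (SET_COSPI_PAIR -> VP9_SET_COSPI_PAIR, DOTP_CONST_PAIR -> VP9_DOTP_CONST_PAIR), others are replaced by generic helpers (TRANSPOSE8x8_H1 -> TRANSPOSE8x8_SH_SH, LOAD_8VECS_SH -> LD_SH8). As a reading aid, a scalar model of what DOTP_CONST_PAIR computes per lane, written here purely for illustration (cospi_* constants and DCT_CONST_BITS come from vp9_idct.h; dotp_const_pair_scalar is a hypothetical helper, not part of the patch):

/* Illustration only: one lane of DOTP_CONST_PAIR / VP9_DOTP_CONST_PAIR,
 * i.e. the standard VP9 butterfly rotation with rounding. */
#include <stdint.h>

#define DCT_CONST_BITS 14  /* as in vp9_idct.h */

static int16_t dct_round_shift(int32_t in) {
  return (int16_t)((in + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

static void dotp_const_pair_scalar(int16_t in0, int16_t in1,
                                   int16_t c0, int16_t c1,
                                   int16_t *out0, int16_t *out1) {
  *out0 = dct_round_shift(in0 * c0 - in1 * c1);  /* from the ilv(-reg1, reg0) dot products */
  *out1 = dct_round_shift(in1 * c0 + in0 * c1);  /* from the ilv(reg0, reg1) dot products  */
}

The MSA macro evaluates this for eight lanes at once: the ilvr/ilvl interleaves build (in0, -in1) and (in1, in0) pairs, __msa_dotp_s_w forms the products, and __msa_srari_w performs the rounded shift.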
 void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
   v8i16 loc0, loc1, loc2, loc3;
   v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
   v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
   v8i16 tmp5, tmp6, tmp7;

-  /* load left top 8x8 */
-  LOAD_8VECS_SH(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-
-  /* load right top 8x8 */
-  LOAD_8VECS_SH((input + 8), 16,
-                reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
-
-  /* transpose block */
-  TRANSPOSE8x8_H1(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
-                  reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-
-  /* transpose block */
-  TRANSPOSE8x8_H1(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15,
-                  reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
-
-  DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
-  DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
-
-  loc0 = reg2 + reg10;
-  reg2 = reg2 - reg10;
-  loc1 = reg14 + reg6;
-  reg14 = reg14 - reg6;
-
-  DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
-  DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
-  DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
-
-  reg14 = reg8 - reg12;
-  reg2 = reg8 + reg12;
-  reg10 = reg0 - reg4;
-  reg6 = reg0 + reg4;
-
-  reg0 = reg2 - loc1;
-  reg2 = reg2 + loc1;
-  reg12 = reg14 - loc0;
-  reg14 = reg14 + loc0;
-  reg4 = reg6 - loc3;
-  reg6 = reg6 + loc3;
-  reg8 = reg10 - loc2;
-  reg10 = reg10 + loc2;
+  LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  input += 8;
+  LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+  TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+                     reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15,
+                     reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+  VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+  VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+  BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+  VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+  VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+  VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+  BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
+  SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4,
+       reg8);
+  ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6,
+       reg10);

   /* stage 2 */
-  DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
-  DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+  VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+  VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);

   reg9 = reg1 - loc2;
   reg1 = reg1 + loc2;
   reg7 = reg15 - loc3;
   reg15 = reg15 + loc3;

-  DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
-  DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
-
-  reg13 = loc0 + reg5;
-  reg5 = loc0 - reg5;
-  reg3 = loc1 + reg11;
-  reg11 = loc1 - reg11;
+  VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+  VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+  BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);

   loc1 = reg15 + reg3;
   reg3 = reg15 - reg3;
   loc2 = reg2 + loc1;
   reg15 = reg2 - loc1;

   loc1 = reg1 + reg13;
   reg13 = reg1 - reg13;
   loc0 = reg0 + loc1;
   loc1 = reg0 - loc1;
   tmp6 = loc0;
   tmp7 = loc1;
   reg0 = loc2;

-  DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
-  DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+  VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+  VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);

   loc0 = reg9 + reg5;
   reg5 = reg9 - reg5;
   reg2 = reg6 + loc0;
   reg1 = reg6 - loc0;

   loc0 = reg7 + reg11;
   reg11 = reg7 - reg11;
   loc1 = reg4 + loc0;
   loc2 = reg4 - loc0;
   tmp5 = loc1;

-  DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+  VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+  BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);

-  loc0 = reg8 + reg5;
-  loc1 = reg8 - reg5;
-  reg4 = reg10 + reg11;
-  reg9 = reg10 - reg11;
   reg10 = loc0;
   reg11 = loc1;

-  DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+  VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+  BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);

-  reg8 = reg12 + reg3;
-  reg5 = reg12 - reg3;
-  reg6 = reg14 + reg13;
-  reg7 = reg14 - reg13;
   reg13 = loc2;

   /* Transpose and store the output */
   reg12 = tmp5;
   reg14 = tmp6;
   reg3 = tmp7;

   /* transpose block */
-  TRANSPOSE8x8_H1(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
-                  reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
-
-  STORE_8VECS_SH(output, 16, reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+  TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
+                     reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+  ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16);

   /* transpose block */
-  TRANSPOSE8x8_H1(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
-                  reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
-
-  STORE_8VECS_SH((output + 8), 16,
-                 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+  TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
+                     reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+  ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16);
 }

-void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest,
-                                      int32_t dest_stride) {
+void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                      int32_t dst_stride) {
   v8i16 loc0, loc1, loc2, loc3;
   v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
   v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
   v8i16 tmp5, tmp6, tmp7;

   /* load up 8x8 */
-  LOAD_8VECS_SH(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-
-  /* load bottom 8x8 */
-  LOAD_8VECS_SH((input + 8 * 16), 16,
-                reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
-
-  DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
-  DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
-
-  loc0 = reg2 + reg10;
-  reg2 = reg2 - reg10;
-  loc1 = reg14 + reg6;
-  reg14 = reg14 - reg6;
-
-  DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
-  DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
-  DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
-
-  reg14 = reg8 - reg12;
-  reg2 = reg8 + reg12;
-  reg10 = reg0 - reg4;
-  reg6 = reg0 + reg4;
+  LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+  input += 8 * 16;
+  /* load bottom 8x8 */
+  LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+  VP9_DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+  VP9_DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+  BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
+  VP9_DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+  VP9_DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+  VP9_DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+  BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);

   reg0 = reg2 - loc1;
   reg2 = reg2 + loc1;
   reg12 = reg14 - loc0;
   reg14 = reg14 + loc0;
   reg4 = reg6 - loc3;
   reg6 = reg6 + loc3;
   reg8 = reg10 - loc2;
   reg10 = reg10 + loc2;

   /* stage 2 */
-  DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
-  DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+  VP9_DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+  VP9_DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);

   reg9 = reg1 - loc2;
   reg1 = reg1 + loc2;
   reg7 = reg15 - loc3;
   reg15 = reg15 + loc3;

-  DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
-  DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
-
-  reg13 = loc0 + reg5;
-  reg5 = loc0 - reg5;
-  reg3 = loc1 + reg11;
-  reg11 = loc1 - reg11;
+  VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+  VP9_DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+  BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);

   loc1 = reg15 + reg3;
   reg3 = reg15 - reg3;
   loc2 = reg2 + loc1;
   reg15 = reg2 - loc1;

   loc1 = reg1 + reg13;
   reg13 = reg1 - reg13;
   loc0 = reg0 + loc1;
   loc1 = reg0 - loc1;
   tmp6 = loc0;
   tmp7 = loc1;
   reg0 = loc2;

-  DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
-  DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+  VP9_DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+  VP9_DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);

   loc0 = reg9 + reg5;
   reg5 = reg9 - reg5;
   reg2 = reg6 + loc0;
   reg1 = reg6 - loc0;

   loc0 = reg7 + reg11;
   reg11 = reg7 - reg11;
   loc1 = reg4 + loc0;
   loc2 = reg4 - loc0;
   tmp5 = loc1;

-  DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+  VP9_DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+  BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);

-  loc0 = reg8 + reg5;
-  loc1 = reg8 - reg5;
-  reg4 = reg10 + reg11;
-  reg9 = reg10 - reg11;
   reg10 = loc0;
   reg11 = loc1;

-  DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+  VP9_DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+  BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);

-  reg8 = reg12 + reg3;
-  reg5 = reg12 - reg3;
-  reg6 = reg14 + reg13;
-  reg7 = reg14 - reg13;
   reg13 = loc2;

   /* Transpose and store the output */
   reg12 = tmp5;
   reg14 = tmp6;
   reg3 = tmp7;

-  SRARI_H_4VECS_SH(reg0, reg2, reg4, reg6, reg0, reg2, reg4, reg6, 6);
-  VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride,
-                                      reg0, reg2, reg4, reg6);
-  SRARI_H_4VECS_SH(reg8, reg10, reg12, reg14, reg8, reg10, reg12, reg14, 6);
-  VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4((dest + (4 * dest_stride)),
-                                      dest_stride, reg8, reg10, reg12, reg14);
-  SRARI_H_4VECS_SH(reg3, reg13, reg11, reg5, reg3, reg13, reg11, reg5, 6);
-  VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4((dest + (8 * dest_stride)),
-                                      dest_stride, reg3, reg13, reg11, reg5);
-  SRARI_H_4VECS_SH(reg7, reg9, reg1, reg15, reg7, reg9, reg1, reg15, 6);
-  VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4((dest + (12 * dest_stride)),
-                                      dest_stride, reg7, reg9, reg1, reg15);
+  SRARI_H4_SH(reg0, reg2, reg4, reg6, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
+  dst += (4 * dst_stride);
+  SRARI_H4_SH(reg8, reg10, reg12, reg14, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
+  dst += (4 * dst_stride);
+  SRARI_H4_SH(reg3, reg13, reg11, reg5, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
+  dst += (4 * dst_stride);
+  SRARI_H4_SH(reg7, reg9, reg1, reg15, 6);
+  VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
 }

-void vp9_idct16x16_256_add_msa(const int16_t *input, uint8_t *dest,
-                               int32_t dest_stride) {
+void vp9_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst,
+                               int32_t dst_stride) {
   int32_t i;
   DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
   int16_t *out = out_arr;

   /* transform rows */
   for (i = 0; i < 2; ++i) {
     /* process 16 * 8 block */
     vp9_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7)));
   }

   /* transform columns */
   for (i = 0; i < 2; ++i) {
     /* process 8 * 16 block */
-    vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dest + (i << 3)),
-                                     dest_stride);
+    vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+                                     dst_stride);
   }
 }

-void vp9_idct16x16_10_add_msa(const int16_t *input, uint8_t *dest,
-                              int32_t dest_stride) {
+void vp9_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
+                              int32_t dst_stride) {
   uint8_t i;
   DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
   int16_t *out = out_arr;

   /* process 16 * 8 block */
   vp9_idct16_1d_rows_msa(input, out);

   /* short case just considers top 4 rows as valid output */
   out += 4 * 16;
   for (i = 12; i--;) {
(...skipping 12 matching lines...)
     );

     out += 16;
   }

   out = out_arr;

   /* transform columns */
   for (i = 0; i < 2; ++i) {
     /* process 8 * 16 block */
-    vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dest + (i << 3)),
-                                     dest_stride);
+    vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
+                                     dst_stride);
   }
 }

-void vp9_idct16x16_1_add_msa(const int16_t *input, uint8_t *dest,
-                             int32_t dest_stride) {
+void vp9_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst,
+                             int32_t dst_stride) {
   uint8_t i;
-  int32_t const1;
   int16_t out;
-  v8i16 const2, res0, res1, res2, res3, res4, res5, res6, res7;
-  v16u8 dest0, dest1, dest2, dest3;
-  v16u8 tmp0, tmp1, tmp2, tmp3;
-  v16i8 zero = { 0 };
+  v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7;
+  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;

-  out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  const1 = ROUND_POWER_OF_TWO(out, 6);
+  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO(out, 6);

-  const2 = __msa_fill_h(const1);
+  vec = __msa_fill_h(out);

-  for (i = 0; i < 4; ++i) {
-    LOAD_4VECS_UB(dest, dest_stride, dest0, dest1, dest2, dest3);
-
-    res0 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest0);
-    res1 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest1);
-    res2 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest2);
-    res3 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest3);
-    res4 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest0);
-    res5 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest1);
-    res6 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest2);
-    res7 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest3);
-
-    res0 += const2;
-    res1 += const2;
-    res2 += const2;
-    res3 += const2;
-    res4 += const2;
-    res5 += const2;
-    res6 += const2;
-    res7 += const2;
-
-    res0 = CLIP_UNSIGNED_CHAR_H(res0);
-    res1 = CLIP_UNSIGNED_CHAR_H(res1);
-    res2 = CLIP_UNSIGNED_CHAR_H(res2);
-    res3 = CLIP_UNSIGNED_CHAR_H(res3);
-    res4 = CLIP_UNSIGNED_CHAR_H(res4);
-    res5 = CLIP_UNSIGNED_CHAR_H(res5);
-    res6 = CLIP_UNSIGNED_CHAR_H(res6);
-    res7 = CLIP_UNSIGNED_CHAR_H(res7);
-
-    tmp0 = (v16u8)__msa_pckev_b((v16i8)res4, (v16i8)res0);
-    tmp1 = (v16u8)__msa_pckev_b((v16i8)res5, (v16i8)res1);
-    tmp2 = (v16u8)__msa_pckev_b((v16i8)res6, (v16i8)res2);
-    tmp3 = (v16u8)__msa_pckev_b((v16i8)res7, (v16i8)res3);
-
-    STORE_4VECS_UB(dest, dest_stride, tmp0, tmp1, tmp2, tmp3);
-    dest += (4 * dest_stride);
+  for (i = 4; i--;) {
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    UNPCK_UB_SH(dst0, res0, res4);
+    UNPCK_UB_SH(dst1, res1, res5);
+    UNPCK_UB_SH(dst2, res2, res6);
+    UNPCK_UB_SH(dst3, res3, res7);
+    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+    ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    CLIP_SH4_0_255(res4, res5, res6, res7);
+    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
+                tmp0, tmp1, tmp2, tmp3);
+    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    dst += (4 * dst_stride);
   }
 }
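Note on the rewritten DC-only path above: both sides compute one constant from input[0] (two rounded multiplies by cospi_16_64, then a final round by 2^6) and add it to all 256 destination pixels with clipping; the new code merely spells the rounding with ROUND_POWER_OF_TWO instead of dct_const_round_shift, which is the same operation. A scalar sketch of the whole function, for illustration only (cospi_16_64 = 11585 and DCT_CONST_BITS = 14 as defined in vp9_idct.h; idct16x16_1_add_scalar is a hypothetical name):

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static const int16_t cospi_16_64 = 11585;  /* per vp9_idct.h */

static void idct16x16_1_add_scalar(const int16_t *input, uint8_t *dst,
                                   int32_t dst_stride) {
  int32_t r, c;
  int16_t out = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out * cospi_16_64, DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out, 6);  /* final inverse-transform rounding */

  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      int32_t val = dst[r * dst_stride + c] + out;  /* add DC, then clip */
      dst[r * dst_stride + c] = (uint8_t)(val < 0 ? 0 : val > 255 ? 255 : val);
    }
  }
}

The MSA version does the same work 16 pixels at a time: __msa_fill_h splats the constant, UNPCK/ADD4/CLIP_SH4_0_255 perform the widen-add-clip, and PCKEV/ST_UB4 repack and store.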

 static void vp9_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
   v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
   v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;

   /* load input data */
-  LOAD_16VECS_SH(input, 8,
-                 l0, l8, l1, l9, l2, l10, l3, l11,
-                 l4, l12, l5, l13, l6, l14, l7, l15);
-
-  TRANSPOSE8x8_H_SH(l0, l1, l2, l3, l4, l5, l6, l7,
-                    l0, l1, l2, l3, l4, l5, l6, l7);
-
-  TRANSPOSE8x8_H_SH(l8, l9, l10, l11, l12, l13, l14, l15,
-                    l8, l9, l10, l11, l12, l13, l14, l15);
+  LD_SH16(input, 8,
+          l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     l0, l1, l2, l3, l4, l5, l6, l7);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     l8, l9, l10, l11, l12, l13, l14, l15);

   /* ADST in horizontal */
   VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7,
                    l8, l9, l10, l11, l12, l13, l14, l15,
                    r0, r1, r2, r3, r4, r5, r6, r7,
                    r8, r9, r10, r11, r12, r13, r14, r15);

   l1 = -r8;
   l3 = -r4;
   l13 = -r13;
   l15 = -r1;

-  TRANSPOSE8x8_H_SH(r0, l1, r12, l3, r6, r14, r10, r2,
-                    l0, l1, l2, l3, l4, l5, l6, l7);
-
-  STORE_8VECS_SH(output, 16, l0, l1, l2, l3, l4, l5, l6, l7);
-
-  TRANSPOSE8x8_H_SH(r3, r11, r15, r7, r5, l13, r9, l15,
-                    l8, l9, l10, l11, l12, l13, l14, l15);
-
-  STORE_8VECS_SH((output + 8), 16, l8, l9, l10, l11, l12, l13, l14, l15);
+  TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2,
+                     l0, l1, l2, l3, l4, l5, l6, l7);
+  ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
+  TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15,
+                     l8, l9, l10, l11, l12, l13, l14, l15);
+  ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
 }

-static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest,
-                                              int32_t dest_stride) {
+static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+                                              int32_t dst_stride) {
   v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
   v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
   v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
   v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
   v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
   v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
   v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
   v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
-  v16u8 dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7;
-  v16u8 dest8, dest9, dest10, dest11, dest12, dest13, dest14, dest15;
+  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
   v16i8 zero = { 0 };

-  r0 = LOAD_SH(input + 0 * 16);
-  r3 = LOAD_SH(input + 3 * 16);
-  r4 = LOAD_SH(input + 4 * 16);
-  r7 = LOAD_SH(input + 7 * 16);
-  r8 = LOAD_SH(input + 8 * 16);
-  r11 = LOAD_SH(input + 11 * 16);
-  r12 = LOAD_SH(input + 12 * 16);
-  r15 = LOAD_SH(input + 15 * 16);
+  r0 = LD_SH(input + 0 * 16);
+  r3 = LD_SH(input + 3 * 16);
+  r4 = LD_SH(input + 4 * 16);
+  r7 = LD_SH(input + 7 * 16);
+  r8 = LD_SH(input + 8 * 16);
+  r11 = LD_SH(input + 11 * 16);
+  r12 = LD_SH(input + 12 * 16);
+  r15 = LD_SH(input + 15 * 16);

   /* stage 1 */
-  k0 = SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
-  k1 = SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
-  k2 = SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
-  k3 = SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
+  k0 = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
   VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
-
-  k0 = SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
-  k1 = SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
-  k2 = SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
-  k3 = SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
+  k0 = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
   VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
-
   BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
-
-  k0 = SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
-  k1 = SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
-  k2 = SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
+  k0 = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+  k2 = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
   VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);

-  r1 = LOAD_SH(input + 1 * 16);
-  r2 = LOAD_SH(input + 2 * 16);
-  r5 = LOAD_SH(input + 5 * 16);
-  r6 = LOAD_SH(input + 6 * 16);
-  r9 = LOAD_SH(input + 9 * 16);
-  r10 = LOAD_SH(input + 10 * 16);
-  r13 = LOAD_SH(input + 13 * 16);
-  r14 = LOAD_SH(input + 14 * 16);
+  r1 = LD_SH(input + 1 * 16);
+  r2 = LD_SH(input + 2 * 16);
+  r5 = LD_SH(input + 5 * 16);
+  r6 = LD_SH(input + 6 * 16);
+  r9 = LD_SH(input + 9 * 16);
+  r10 = LD_SH(input + 10 * 16);
+  r13 = LD_SH(input + 13 * 16);
+  r14 = LD_SH(input + 14 * 16);

-  k0 = SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
-  k1 = SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
-  k2 = SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
-  k3 = SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
+  k0 = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
   VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
-
-  k0 = SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
-  k1 = SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
-  k2 = SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
-  k3 = SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
+  k0 = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
+  k3 = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
   VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
-
   BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
-
   BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
   out1 = -out1;
-  out0 = __msa_srari_h(out0, 6);
-  out1 = __msa_srari_h(out1, 6);
-  dest0 = LOAD_UB(dest + 0 * dest_stride);
-  dest1 = LOAD_UB(dest + 15 * dest_stride);
-  res0 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest0);
-  res1 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest1);
-  res0 += out0;
-  res1 += out1;
-  res0 = CLIP_UNSIGNED_CHAR_H(res0);
-  res1 = CLIP_UNSIGNED_CHAR_H(res1);
-  res0 = (v8i16)__msa_pckev_b((v16i8)res0, (v16i8)res0);
-  res1 = (v8i16)__msa_pckev_b((v16i8)res1, (v16i8)res1);
-  STORE_DWORD(dest, __msa_copy_u_d((v2i64)res0, 0));
-  STORE_DWORD(dest + 15 * dest_stride, __msa_copy_u_d((v2i64)res1, 0));
+  SRARI_H2_SH(out0, out1, 6);
+  dst0 = LD_UB(dst + 0 * dst_stride);
+  dst1 = LD_UB(dst + 15 * dst_stride);
+  ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1);
+  ADD2(res0, out0, res1, out1, res0, res1);
+  CLIP_SH2_0_255(res0, res1);
+  PCKEV_B2_SH(res0, res0, res1, res1, res0, res1);
+  ST8x1_UB(res0, dst);
+  ST8x1_UB(res1, dst + 15 * dst_stride);

-  k0 = SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
-  k1 = SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
-  k2 = SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
+  k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+  k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
   VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
-
   BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
   out8 = -out8;

-  out8 = __msa_srari_h(out8, 6);
-  out9 = __msa_srari_h(out9, 6);
-  dest8 = LOAD_UB(dest + 1 * dest_stride);
-  dest9 = LOAD_UB(dest + 14 * dest_stride);
-  res8 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest8);
-  res9 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest9);
-  res8 += out8;
-  res9 += out9;
-  res8 = CLIP_UNSIGNED_CHAR_H(res8);
-  res9 = CLIP_UNSIGNED_CHAR_H(res9);
-  res8 = (v8i16)__msa_pckev_b((v16i8)res8, (v16i8)res8);
-  res9 = (v8i16)__msa_pckev_b((v16i8)res9, (v16i8)res9);
-  STORE_DWORD(dest + dest_stride, __msa_copy_u_d((v2i64)res8, 0));
-  STORE_DWORD(dest + 14 * dest_stride, __msa_copy_u_d((v2i64)res9, 0));
+  SRARI_H2_SH(out8, out9, 6);
+  dst8 = LD_UB(dst + 1 * dst_stride);
+  dst9 = LD_UB(dst + 14 * dst_stride);
+  ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9);
+  ADD2(res8, out8, res9, out9, res8, res9);
+  CLIP_SH2_0_255(res8, res9);
+  PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
+  ST8x1_UB(res8, dst + dst_stride);
+  ST8x1_UB(res9, dst + 14 * dst_stride);

-  k0 = SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
-  k1 = SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
-  k2 = SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
+  k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+  k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+  k2 = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
   VP9_MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
   out4 = -out4;
-  out4 = __msa_srari_h(out4, 6);
-  out5 = __msa_srari_h(out5, 6);
-  dest4 = LOAD_UB(dest + 3 * dest_stride);
-  dest5 = LOAD_UB(dest + 12 * dest_stride);
-  res4 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest4);
-  res5 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest5);
-  res4 += out4;
-  res5 += out5;
-  res4 = CLIP_UNSIGNED_CHAR_H(res4);
-  res5 = CLIP_UNSIGNED_CHAR_H(res5);
-  res4 = (v8i16)__msa_pckev_b((v16i8)res4, (v16i8)res4);
-  res5 = (v8i16)__msa_pckev_b((v16i8)res5, (v16i8)res5);
-  STORE_DWORD(dest + 3 * dest_stride, __msa_copy_u_d((v2i64)res4, 0));
-  STORE_DWORD(dest + 12 * dest_stride, __msa_copy_u_d((v2i64)res5, 0));
+  SRARI_H2_SH(out4, out5, 6);
+  dst4 = LD_UB(dst + 3 * dst_stride);
+  dst5 = LD_UB(dst + 12 * dst_stride);
+  ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5);
+  ADD2(res4, out4, res5, out5, res4, res5);
+  CLIP_SH2_0_255(res4, res5);
+  PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
+  ST8x1_UB(res4, dst + 3 * dst_stride);
+  ST8x1_UB(res5, dst + 12 * dst_stride);

   VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
   out13 = -out13;
-  out12 = __msa_srari_h(out12, 6);
-  out13 = __msa_srari_h(out13, 6);
-  dest12 = LOAD_UB(dest + 2 * dest_stride);
-  dest13 = LOAD_UB(dest + 13 * dest_stride);
-  res12 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest12);
-  res13 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest13);
-  res12 += out12;
-  res13 += out13;
-  res12 = CLIP_UNSIGNED_CHAR_H(res12);
-  res13 = CLIP_UNSIGNED_CHAR_H(res13);
-  res12 = (v8i16)__msa_pckev_b((v16i8)res12, (v16i8)res12);
-  res13 = (v8i16)__msa_pckev_b((v16i8)res13, (v16i8)res13);
-  STORE_DWORD(dest + 2 * dest_stride, __msa_copy_u_d((v2i64)res12, 0));
-  STORE_DWORD(dest + 13 * dest_stride, __msa_copy_u_d((v2i64)res13, 0));
+  SRARI_H2_SH(out12, out13, 6);
+  dst12 = LD_UB(dst + 2 * dst_stride);
+  dst13 = LD_UB(dst + 13 * dst_stride);
+  ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13);
+  ADD2(res12, out12, res13, out13, res12, res13);
+  CLIP_SH2_0_255(res12, res13);
+  PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
+  ST8x1_UB(res12, dst + 2 * dst_stride);
+  ST8x1_UB(res13, dst + 13 * dst_stride);

-  k0 = SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
-  k3 = SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+  k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+  k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
   VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7);
-  out6 = __msa_srari_h(out6, 6);
-  out7 = __msa_srari_h(out7, 6);
-  dest6 = LOAD_UB(dest + 4 * dest_stride);
-  dest7 = LOAD_UB(dest + 11 * dest_stride);
-  res6 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest6);
-  res7 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest7);
-  res6 += out6;
-  res7 += out7;
-  res6 = CLIP_UNSIGNED_CHAR_H(res6);
-  res7 = CLIP_UNSIGNED_CHAR_H(res7);
-  res6 = (v8i16)__msa_pckev_b((v16i8)res6, (v16i8)res6);
-  res7 = (v8i16)__msa_pckev_b((v16i8)res7, (v16i8)res7);
-  STORE_DWORD(dest + 4 * dest_stride, __msa_copy_u_d((v2i64)res6, 0));
-  STORE_DWORD(dest + 11 * dest_stride, __msa_copy_u_d((v2i64)res7, 0));
+  SRARI_H2_SH(out6, out7, 6);
+  dst6 = LD_UB(dst + 4 * dst_stride);
+  dst7 = LD_UB(dst + 11 * dst_stride);
+  ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7);
+  ADD2(res6, out6, res7, out7, res6, res7);
+  CLIP_SH2_0_255(res6, res7);
+  PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
+  ST8x1_UB(res6, dst + 4 * dst_stride);
+  ST8x1_UB(res7, dst + 11 * dst_stride);

   VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
-  out10 = __msa_srari_h(out10, 6);
-  out11 = __msa_srari_h(out11, 6);
-  dest10 = LOAD_UB(dest + 6 * dest_stride);
-  dest11 = LOAD_UB(dest + 9 * dest_stride);
-  res10 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest10);
-  res11 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest11);
-  res10 += out10;
-  res11 += out11;
-  res10 = CLIP_UNSIGNED_CHAR_H(res10);
-  res11 = CLIP_UNSIGNED_CHAR_H(res11);
-  res10 = (v8i16)__msa_pckev_b((v16i8)res10, (v16i8)res10);
-  res11 = (v8i16)__msa_pckev_b((v16i8)res11, (v16i8)res11);
-  STORE_DWORD(dest + 6 * dest_stride, __msa_copy_u_d((v2i64)res10, 0));
-  STORE_DWORD(dest + 9 * dest_stride, __msa_copy_u_d((v2i64)res11, 0));
+  SRARI_H2_SH(out10, out11, 6);
+  dst10 = LD_UB(dst + 6 * dst_stride);
+  dst11 = LD_UB(dst + 9 * dst_stride);
+  ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11);
+  ADD2(res10, out10, res11, out11, res10, res11);
+  CLIP_SH2_0_255(res10, res11);
+  PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
+  ST8x1_UB(res10, dst + 6 * dst_stride);
+  ST8x1_UB(res11, dst + 9 * dst_stride);

-  k1 = SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
-  k2 = SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+  k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
+  k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
   VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3);
-  out2 = __msa_srari_h(out2, 6);
-  out3 = __msa_srari_h(out3, 6);
-  dest2 = LOAD_UB(dest + 7 * dest_stride);
-  dest3 = LOAD_UB(dest + 8 * dest_stride);
-  res2 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest2);
-  res3 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest3);
-  res2 += out2;
-  res3 += out3;
-  res2 = CLIP_UNSIGNED_CHAR_H(res2);
-  res3 = CLIP_UNSIGNED_CHAR_H(res3);
-  res2 = (v8i16)__msa_pckev_b((v16i8)res2, (v16i8)res2);
-  res3 = (v8i16)__msa_pckev_b((v16i8)res3, (v16i8)res3);
-  STORE_DWORD(dest + 7 * dest_stride, __msa_copy_u_d((v2i64)res2, 0));
-  STORE_DWORD(dest + 8 * dest_stride, __msa_copy_u_d((v2i64)res3, 0));
+  SRARI_H2_SH(out2, out3, 6);
+  dst2 = LD_UB(dst + 7 * dst_stride);
+  dst3 = LD_UB(dst + 8 * dst_stride);
+  ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3);
+  ADD2(res2, out2, res3, out3, res2, res3);
+  CLIP_SH2_0_255(res2, res3);
+  PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
+  ST8x1_UB(res2, dst + 7 * dst_stride);
+  ST8x1_UB(res3, dst + 8 * dst_stride);

   VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
-  out14 = __msa_srari_h(out14, 6);
-  out15 = __msa_srari_h(out15, 6);
-  dest14 = LOAD_UB(dest + 5 * dest_stride);
-  dest15 = LOAD_UB(dest + 10 * dest_stride);
-  res14 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest14);
-  res15 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest15);
-  res14 += out14;
-  res15 += out15;
-  res14 = CLIP_UNSIGNED_CHAR_H(res14);
-  res15 = CLIP_UNSIGNED_CHAR_H(res15);
-  res14 = (v8i16)__msa_pckev_b((v16i8)res14, (v16i8)res14);
-  res15 = (v8i16)__msa_pckev_b((v16i8)res15, (v16i8)res15);
-  STORE_DWORD(dest + 5 * dest_stride, __msa_copy_u_d((v2i64)res14, 0));
-  STORE_DWORD(dest + 10 * dest_stride, __msa_copy_u_d((v2i64)res15, 0));
+  SRARI_H2_SH(out14, out15, 6);
+  dst14 = LD_UB(dst + 5 * dst_stride);
+  dst15 = LD_UB(dst + 10 * dst_stride);
+  ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15);
+  ADD2(res14, out14, res15, out15, res14, res15);
+  CLIP_SH2_0_255(res14, res15);
+  PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
+  ST8x1_UB(res14, dst + 5 * dst_stride);
+  ST8x1_UB(res15, dst + 10 * dst_stride);
 }

-void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dest,
-                              int32_t dest_stride, int32_t tx_type) {
+void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dst,
+                              int32_t dst_stride, int32_t tx_type) {
   int32_t i;
   DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
   int16_t *out_ptr = &out[0];

   switch (tx_type) {
     case DCT_DCT:
       /* transform rows */
       for (i = 0; i < 2; ++i) {
         /* process 16 * 8 block */
         vp9_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
       }

       /* transform columns */
       for (i = 0; i < 2; ++i) {
         /* process 8 * 16 block */
-        vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)),
-                                         (dest + (i << 3)), dest_stride);
+        vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
       }
       break;
     case ADST_DCT:
       /* transform rows */
       for (i = 0; i < 2; ++i) {
         /* process 16 * 8 block */
         vp9_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
       }

       /* transform columns */
       for (i = 0; i < 2; ++i) {
-        vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
-                                          (dest + (i << 3)), dest_stride);
+        vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                          (dst + (i << 3)), dst_stride);
       }
       break;
     case DCT_ADST:
       /* transform rows */
       for (i = 0; i < 2; ++i) {
         /* process 16 * 8 block */
         vp9_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
       }

       /* transform columns */
       for (i = 0; i < 2; ++i) {
         /* process 8 * 16 block */
-        vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)),
-                                         (dest + (i << 3)), dest_stride);
+        vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+                                         dst_stride);
       }
       break;
     case ADST_ADST:
       /* transform rows */
       for (i = 0; i < 2; ++i) {
         /* process 16 * 8 block */
         vp9_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
       }

       /* transform columns */
       for (i = 0; i < 2; ++i) {
-        vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
-                                          (dest + (i << 3)), dest_stride);
+        vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+                                          (dst + (i << 3)), dst_stride);
       }
       break;
     default:
       assert(0);
       break;
   }
 }
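For context, these are the MSA entry points the decoder reaches through the vp9_rtcd() dispatch table; they are not normally called directly. A hypothetical driver, mirroring the eob-based selection done in vp9/common/vp9_idct.c (illustration only; function and buffer names below are ours, not part of the patch):

/* Hypothetical caller: 256 dequantized coefficients in, reconstruction
 * added into an 8-bit destination block. The eob thresholds follow the
 * generic vp9_idct16x16_add wrapper. */
void reconstruct_16x16(const int16_t coeffs[16 * 16], uint8_t *dst,
                       int32_t dst_stride, int32_t eob) {
  if (eob == 1)  /* only the DC coefficient is non-zero */
    vp9_idct16x16_1_add_msa(coeffs, dst, dst_stride);
  else if (eob <= 10)  /* non-zero coefficients confined to the top-left corner */
    vp9_idct16x16_10_add_msa(coeffs, dst, dst_stride);
  else  /* full 256-coefficient inverse transform */
    vp9_idct16x16_256_add_msa(coeffs, dst, dst_stride);
}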