OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "vpx_dsp/mips/inv_txfm_msa.h" | 11 #include "vpx_dsp/mips/inv_txfm_msa.h" |
12 | 12 |
13 void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { | 13 void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { |
14 v8i16 loc0, loc1, loc2, loc3; | 14 v8i16 loc0, loc1, loc2, loc3; |
15 v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; | 15 v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; |
16 v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; | 16 v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; |
17 v8i16 tmp5, tmp6, tmp7; | 17 v8i16 tmp5, tmp6, tmp7; |
18 | 18 |
19 LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); | 19 LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); |
20 input += 8; | 20 input += 8; |
21 LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); | 21 LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); |
22 | 22 |
23 TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, | 23 TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, |
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
96 TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, | 96 TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, |
97 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); | 97 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); |
98 ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); | 98 ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); |
99 | 99 |
100 /* transpose block */ | 100 /* transpose block */ |
101 TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, | 101 TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, |
102 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); | 102 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); |
103 ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); | 103 ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); |
104 } | 104 } |
105 | 105 |
106 void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, | 106 void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, |
107 int32_t dst_stride) { | 107 int32_t dst_stride) { |
108 v8i16 loc0, loc1, loc2, loc3; | 108 v8i16 loc0, loc1, loc2, loc3; |
109 v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; | 109 v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; |
110 v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; | 110 v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; |
111 v8i16 tmp5, tmp6, tmp7; | 111 v8i16 tmp5, tmp6, tmp7; |
112 | 112 |
113 /* load up 8x8 */ | 113 /* load up 8x8 */ |
114 LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); | 114 LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); |
115 input += 8 * 16; | 115 input += 8 * 16; |
116 /* load bottom 8x8 */ | 116 /* load bottom 8x8 */ |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
194 SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); | 194 SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); |
195 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); | 195 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); |
196 dst += (4 * dst_stride); | 196 dst += (4 * dst_stride); |
197 SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); | 197 SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); |
198 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); | 198 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); |
199 dst += (4 * dst_stride); | 199 dst += (4 * dst_stride); |
200 SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); | 200 SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); |
201 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); | 201 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); |
202 } | 202 } |
203 | 203 |
204 void vp9_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst, | 204 void vpx_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst, |
205 int32_t dst_stride) { | 205 int32_t dst_stride) { |
206 int32_t i; | 206 int32_t i; |
207 DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); | 207 DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); |
208 int16_t *out = out_arr; | 208 int16_t *out = out_arr; |
209 | 209 |
210 /* transform rows */ | 210 /* transform rows */ |
211 for (i = 0; i < 2; ++i) { | 211 for (i = 0; i < 2; ++i) { |
212 /* process 16 * 8 block */ | 212 /* process 16 * 8 block */ |
213 vp9_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7))); | 213 vpx_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7))); |
214 } | 214 } |
215 | 215 |
216 /* transform columns */ | 216 /* transform columns */ |
217 for (i = 0; i < 2; ++i) { | 217 for (i = 0; i < 2; ++i) { |
218 /* process 8 * 16 block */ | 218 /* process 8 * 16 block */ |
219 vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), | 219 vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), |
220 dst_stride); | 220 dst_stride); |
221 } | 221 } |
222 } | 222 } |
223 | 223 |
224 void vp9_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, | 224 void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, |
225 int32_t dst_stride) { | 225 int32_t dst_stride) { |
226 uint8_t i; | 226 uint8_t i; |
227 DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); | 227 DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); |
228 int16_t *out = out_arr; | 228 int16_t *out = out_arr; |
229 | 229 |
230 /* process 16 * 8 block */ | 230 /* process 16 * 8 block */ |
231 vp9_idct16_1d_rows_msa(input, out); | 231 vpx_idct16_1d_rows_msa(input, out); |
232 | 232 |
233 /* short case just considers top 4 rows as valid output */ | 233 /* short case just considers top 4 rows as valid output */ |
234 out += 4 * 16; | 234 out += 4 * 16; |
235 for (i = 12; i--;) { | 235 for (i = 12; i--;) { |
236 __asm__ __volatile__ ( | 236 __asm__ __volatile__ ( |
237 "sw $zero, 0(%[out]) \n\t" | 237 "sw $zero, 0(%[out]) \n\t" |
238 "sw $zero, 4(%[out]) \n\t" | 238 "sw $zero, 4(%[out]) \n\t" |
239 "sw $zero, 8(%[out]) \n\t" | 239 "sw $zero, 8(%[out]) \n\t" |
240 "sw $zero, 12(%[out]) \n\t" | 240 "sw $zero, 12(%[out]) \n\t" |
241 "sw $zero, 16(%[out]) \n\t" | 241 "sw $zero, 16(%[out]) \n\t" |
242 "sw $zero, 20(%[out]) \n\t" | 242 "sw $zero, 20(%[out]) \n\t" |
243 "sw $zero, 24(%[out]) \n\t" | 243 "sw $zero, 24(%[out]) \n\t" |
244 "sw $zero, 28(%[out]) \n\t" | 244 "sw $zero, 28(%[out]) \n\t" |
245 | 245 |
246 : | 246 : |
247 : [out] "r" (out) | 247 : [out] "r" (out) |
248 ); | 248 ); |
249 | 249 |
250 out += 16; | 250 out += 16; |
251 } | 251 } |
252 | 252 |
253 out = out_arr; | 253 out = out_arr; |
254 | 254 |
255 /* transform columns */ | 255 /* transform columns */ |
256 for (i = 0; i < 2; ++i) { | 256 for (i = 0; i < 2; ++i) { |
257 /* process 8 * 16 block */ | 257 /* process 8 * 16 block */ |
258 vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), | 258 vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), |
259 dst_stride); | 259 dst_stride); |
260 } | 260 } |
261 } | 261 } |
262 | 262 |
263 void vp9_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, | 263 void vpx_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, |
264 int32_t dst_stride) { | 264 int32_t dst_stride) { |
265 uint8_t i; | 265 uint8_t i; |
266 int16_t out; | 266 int16_t out; |
267 v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; | 267 v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; |
268 v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; | 268 v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; |
269 | 269 |
270 out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); | 270 out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); |
271 out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); | 271 out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); |
272 out = ROUND_POWER_OF_TWO(out, 6); | 272 out = ROUND_POWER_OF_TWO(out, 6); |
273 | 273 |
274 vec = __msa_fill_h(out); | 274 vec = __msa_fill_h(out); |
275 | 275 |
276 for (i = 4; i--;) { | 276 for (i = 4; i--;) { |
277 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 277 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
278 UNPCK_UB_SH(dst0, res0, res4); | 278 UNPCK_UB_SH(dst0, res0, res4); |
279 UNPCK_UB_SH(dst1, res1, res5); | 279 UNPCK_UB_SH(dst1, res1, res5); |
280 UNPCK_UB_SH(dst2, res2, res6); | 280 UNPCK_UB_SH(dst2, res2, res6); |
281 UNPCK_UB_SH(dst3, res3, res7); | 281 UNPCK_UB_SH(dst3, res3, res7); |
282 ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); | 282 ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); |
283 ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); | 283 ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); |
284 CLIP_SH4_0_255(res0, res1, res2, res3); | 284 CLIP_SH4_0_255(res0, res1, res2, res3); |
285 CLIP_SH4_0_255(res4, res5, res6, res7); | 285 CLIP_SH4_0_255(res4, res5, res6, res7); |
286 PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, | 286 PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, |
287 tmp0, tmp1, tmp2, tmp3); | 287 tmp0, tmp1, tmp2, tmp3); |
288 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); | 288 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); |
289 dst += (4 * dst_stride); | 289 dst += (4 * dst_stride); |
290 } | 290 } |
291 } | 291 } |
292 | 292 |
293 void vp9_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { | 293 void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { |
294 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; | 294 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; |
295 v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; | 295 v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; |
296 | 296 |
297 /* load input data */ | 297 /* load input data */ |
298 LD_SH16(input, 8, | 298 LD_SH16(input, 8, |
299 l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15); | 299 l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15); |
300 TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, | 300 TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, |
301 l0, l1, l2, l3, l4, l5, l6, l7); | 301 l0, l1, l2, l3, l4, l5, l6, l7); |
302 TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, | 302 TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, |
303 l8, l9, l10, l11, l12, l13, l14, l15); | 303 l8, l9, l10, l11, l12, l13, l14, l15); |
(...skipping 10 matching lines...) Expand all Loading... |
314 l15 = -r1; | 314 l15 = -r1; |
315 | 315 |
316 TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, | 316 TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, |
317 l0, l1, l2, l3, l4, l5, l6, l7); | 317 l0, l1, l2, l3, l4, l5, l6, l7); |
318 ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); | 318 ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); |
319 TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, | 319 TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, |
320 l8, l9, l10, l11, l12, l13, l14, l15); | 320 l8, l9, l10, l11, l12, l13, l14, l15); |
321 ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); | 321 ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); |
322 } | 322 } |
323 | 323 |
324 void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, | 324 void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, |
325 int32_t dst_stride) { | 325 int32_t dst_stride) { |
326 v8i16 v0, v2, v4, v6, k0, k1, k2, k3; | 326 v8i16 v0, v2, v4, v6, k0, k1, k2, k3; |
327 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; | 327 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; |
328 v8i16 out0, out1, out2, out3, out4, out5, out6, out7; | 328 v8i16 out0, out1, out2, out3, out4, out5, out6, out7; |
329 v8i16 out8, out9, out10, out11, out12, out13, out14, out15; | 329 v8i16 out8, out9, out10, out11, out12, out13, out14, out15; |
330 v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; | 330 v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; |
331 v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; | 331 v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; |
332 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; | 332 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; |
333 v8i16 res8, res9, res10, res11, res12, res13, res14, res15; | 333 v8i16 res8, res9, res10, res11, res12, res13, res14, res15; |
334 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | 334 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; |
(...skipping 143 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
478 SRARI_H2_SH(out14, out15, 6); | 478 SRARI_H2_SH(out14, out15, 6); |
479 dst14 = LD_UB(dst + 5 * dst_stride); | 479 dst14 = LD_UB(dst + 5 * dst_stride); |
480 dst15 = LD_UB(dst + 10 * dst_stride); | 480 dst15 = LD_UB(dst + 10 * dst_stride); |
481 ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); | 481 ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); |
482 ADD2(res14, out14, res15, out15, res14, res15); | 482 ADD2(res14, out14, res15, out15, res14, res15); |
483 CLIP_SH2_0_255(res14, res15); | 483 CLIP_SH2_0_255(res14, res15); |
484 PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); | 484 PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); |
485 ST8x1_UB(res14, dst + 5 * dst_stride); | 485 ST8x1_UB(res14, dst + 5 * dst_stride); |
486 ST8x1_UB(res15, dst + 10 * dst_stride); | 486 ST8x1_UB(res15, dst + 10 * dst_stride); |
487 } | 487 } |
OLD | NEW |