| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include "vpx_dsp/mips/inv_txfm_msa.h" | 11 #include "vpx_dsp/mips/inv_txfm_msa.h" |
| 12 | 12 |
| 13 void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { | 13 void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { |
| 14 v8i16 loc0, loc1, loc2, loc3; | 14 v8i16 loc0, loc1, loc2, loc3; |
| 15 v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; | 15 v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; |
| 16 v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; | 16 v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; |
| 17 v8i16 tmp5, tmp6, tmp7; | 17 v8i16 tmp5, tmp6, tmp7; |
| 18 | 18 |
| 19 LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); | 19 LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); |
| 20 input += 8; | 20 input += 8; |
| 21 LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); | 21 LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); |
| 22 | 22 |
| 23 TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, | 23 TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, |
| (...skipping 72 matching lines...) |
| 96 TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, | 96 TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, |
| 97 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); | 97 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); |
| 98 ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); | 98 ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); |
| 99 | 99 |
| 100 /* transpose block */ | 100 /* transpose block */ |
| 101 TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, | 101 TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, |
| 102 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); | 102 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); |
| 103 ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); | 103 ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); |
| 104 } | 104 } |
| 105 | 105 |
| 106 void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, | 106 void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, |
| 107 int32_t dst_stride) { | 107 int32_t dst_stride) { |
| 108 v8i16 loc0, loc1, loc2, loc3; | 108 v8i16 loc0, loc1, loc2, loc3; |
| 109 v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; | 109 v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; |
| 110 v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; | 110 v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; |
| 111 v8i16 tmp5, tmp6, tmp7; | 111 v8i16 tmp5, tmp6, tmp7; |
| 112 | 112 |
| 113 /* load up 8x8 */ | 113 /* load up 8x8 */ |
| 114 LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); | 114 LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); |
| 115 input += 8 * 16; | 115 input += 8 * 16; |
| 116 /* load bottom 8x8 */ | 116 /* load bottom 8x8 */ |
| (...skipping 77 matching lines...) |
| 194 SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); | 194 SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); |
| 195 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); | 195 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); |
| 196 dst += (4 * dst_stride); | 196 dst += (4 * dst_stride); |
| 197 SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); | 197 SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); |
| 198 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); | 198 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); |
| 199 dst += (4 * dst_stride); | 199 dst += (4 * dst_stride); |
| 200 SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); | 200 SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); |
| 201 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); | 201 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); |
| 202 } | 202 } |
| 203 | 203 |
| 204 void vp9_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst, | 204 void vpx_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst, |
| 205 int32_t dst_stride) { | 205 int32_t dst_stride) { |
| 206 int32_t i; | 206 int32_t i; |
| 207 DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); | 207 DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); |
| 208 int16_t *out = out_arr; | 208 int16_t *out = out_arr; |
| 209 | 209 |
| 210 /* transform rows */ | 210 /* transform rows */ |
| 211 for (i = 0; i < 2; ++i) { | 211 for (i = 0; i < 2; ++i) { |
| 212 /* process 16 * 8 block */ | 212 /* process 16 * 8 block */ |
| 213 vp9_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7))); | 213 vpx_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7))); |
| 214 } | 214 } |
| 215 | 215 |
| 216 /* transform columns */ | 216 /* transform columns */ |
| 217 for (i = 0; i < 2; ++i) { | 217 for (i = 0; i < 2; ++i) { |
| 218 /* process 8 * 16 block */ | 218 /* process 8 * 16 block */ |
| 219 vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), | 219 vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), |
| 220 dst_stride); | 220 dst_stride); |
| 221 } | 221 } |
| 222 } | 222 } |
| 223 | 223 |
| 224 void vp9_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, | 224 void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, |
| 225 int32_t dst_stride) { | 225 int32_t dst_stride) { |
| 226 uint8_t i; | 226 uint8_t i; |
| 227 DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); | 227 DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); |
| 228 int16_t *out = out_arr; | 228 int16_t *out = out_arr; |
| 229 | 229 |
| 230 /* process 16 * 8 block */ | 230 /* process 16 * 8 block */ |
| 231 vp9_idct16_1d_rows_msa(input, out); | 231 vpx_idct16_1d_rows_msa(input, out); |
| 232 | 232 |
| 233 /* short case just considers top 4 rows as valid output */ | 233 /* short case just considers top 4 rows as valid output */ |
| 234 out += 4 * 16; | 234 out += 4 * 16; |
| 235 for (i = 12; i--;) { | 235 for (i = 12; i--;) { |
| 236 __asm__ __volatile__ ( | 236 __asm__ __volatile__ ( |
| 237 "sw $zero, 0(%[out]) \n\t" | 237 "sw $zero, 0(%[out]) \n\t" |
| 238 "sw $zero, 4(%[out]) \n\t" | 238 "sw $zero, 4(%[out]) \n\t" |
| 239 "sw $zero, 8(%[out]) \n\t" | 239 "sw $zero, 8(%[out]) \n\t" |
| 240 "sw $zero, 12(%[out]) \n\t" | 240 "sw $zero, 12(%[out]) \n\t" |
| 241 "sw $zero, 16(%[out]) \n\t" | 241 "sw $zero, 16(%[out]) \n\t" |
| 242 "sw $zero, 20(%[out]) \n\t" | 242 "sw $zero, 20(%[out]) \n\t" |
| 243 "sw $zero, 24(%[out]) \n\t" | 243 "sw $zero, 24(%[out]) \n\t" |
| 244 "sw $zero, 28(%[out]) \n\t" | 244 "sw $zero, 28(%[out]) \n\t" |
| 245 | 245 |
| 246 : | 246 : |
| 247 : [out] "r" (out) | 247 : [out] "r" (out) |
| 248 ); | 248 ); |
| 249 | 249 |
| 250 out += 16; | 250 out += 16; |
| 251 } | 251 } |
| 252 | 252 |
| 253 out = out_arr; | 253 out = out_arr; |
| 254 | 254 |
| 255 /* transform columns */ | 255 /* transform columns */ |
| 256 for (i = 0; i < 2; ++i) { | 256 for (i = 0; i < 2; ++i) { |
| 257 /* process 8 * 16 block */ | 257 /* process 8 * 16 block */ |
| 258 vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), | 258 vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), |
| 259 dst_stride); | 259 dst_stride); |
| 260 } | 260 } |
| 261 } | 261 } |
| 262 | 262 |
| 263 void vp9_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, | 263 void vpx_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, |
| 264 int32_t dst_stride) { | 264 int32_t dst_stride) { |
| 265 uint8_t i; | 265 uint8_t i; |
| 266 int16_t out; | 266 int16_t out; |
| 267 v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; | 267 v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; |
| 268 v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; | 268 v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; |
| 269 | 269 |
| 270 out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); | 270 out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); |
| 271 out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); | 271 out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); |
| 272 out = ROUND_POWER_OF_TWO(out, 6); | 272 out = ROUND_POWER_OF_TWO(out, 6); |
| 273 | 273 |
| 274 vec = __msa_fill_h(out); | 274 vec = __msa_fill_h(out); |
| 275 | 275 |
| 276 for (i = 4; i--;) { | 276 for (i = 4; i--;) { |
| 277 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 277 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
| 278 UNPCK_UB_SH(dst0, res0, res4); | 278 UNPCK_UB_SH(dst0, res0, res4); |
| 279 UNPCK_UB_SH(dst1, res1, res5); | 279 UNPCK_UB_SH(dst1, res1, res5); |
| 280 UNPCK_UB_SH(dst2, res2, res6); | 280 UNPCK_UB_SH(dst2, res2, res6); |
| 281 UNPCK_UB_SH(dst3, res3, res7); | 281 UNPCK_UB_SH(dst3, res3, res7); |
| 282 ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); | 282 ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); |
| 283 ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); | 283 ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); |
| 284 CLIP_SH4_0_255(res0, res1, res2, res3); | 284 CLIP_SH4_0_255(res0, res1, res2, res3); |
| 285 CLIP_SH4_0_255(res4, res5, res6, res7); | 285 CLIP_SH4_0_255(res4, res5, res6, res7); |
| 286 PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, | 286 PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, |
| 287 tmp0, tmp1, tmp2, tmp3); | 287 tmp0, tmp1, tmp2, tmp3); |
| 288 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); | 288 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); |
| 289 dst += (4 * dst_stride); | 289 dst += (4 * dst_stride); |
| 290 } | 290 } |
| 291 } | 291 } |
| 292 | 292 |
| 293 void vp9_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { | 293 void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { |
| 294 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; | 294 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; |
| 295 v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; | 295 v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; |
| 296 | 296 |
| 297 /* load input data */ | 297 /* load input data */ |
| 298 LD_SH16(input, 8, | 298 LD_SH16(input, 8, |
| 299 l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15); | 299 l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15); |
| 300 TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, | 300 TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, |
| 301 l0, l1, l2, l3, l4, l5, l6, l7); | 301 l0, l1, l2, l3, l4, l5, l6, l7); |
| 302 TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, | 302 TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, |
| 303 l8, l9, l10, l11, l12, l13, l14, l15); | 303 l8, l9, l10, l11, l12, l13, l14, l15); |
| (...skipping 10 matching lines...) |
| 314 l15 = -r1; | 314 l15 = -r1; |
| 315 | 315 |
| 316 TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, | 316 TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, |
| 317 l0, l1, l2, l3, l4, l5, l6, l7); | 317 l0, l1, l2, l3, l4, l5, l6, l7); |
| 318 ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); | 318 ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); |
| 319 TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, | 319 TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, |
| 320 l8, l9, l10, l11, l12, l13, l14, l15); | 320 l8, l9, l10, l11, l12, l13, l14, l15); |
| 321 ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); | 321 ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); |
| 322 } | 322 } |
| 323 | 323 |
| 324 void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, | 324 void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, |
| 325 int32_t dst_stride) { | 325 int32_t dst_stride) { |
| 326 v8i16 v0, v2, v4, v6, k0, k1, k2, k3; | 326 v8i16 v0, v2, v4, v6, k0, k1, k2, k3; |
| 327 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; | 327 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; |
| 328 v8i16 out0, out1, out2, out3, out4, out5, out6, out7; | 328 v8i16 out0, out1, out2, out3, out4, out5, out6, out7; |
| 329 v8i16 out8, out9, out10, out11, out12, out13, out14, out15; | 329 v8i16 out8, out9, out10, out11, out12, out13, out14, out15; |
| 330 v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; | 330 v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; |
| 331 v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; | 331 v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; |
| 332 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; | 332 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; |
| 333 v8i16 res8, res9, res10, res11, res12, res13, res14, res15; | 333 v8i16 res8, res9, res10, res11, res12, res13, res14, res15; |
| 334 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | 334 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; |
| (...skipping 143 matching lines...) |
| 478 SRARI_H2_SH(out14, out15, 6); | 478 SRARI_H2_SH(out14, out15, 6); |
| 479 dst14 = LD_UB(dst + 5 * dst_stride); | 479 dst14 = LD_UB(dst + 5 * dst_stride); |
| 480 dst15 = LD_UB(dst + 10 * dst_stride); | 480 dst15 = LD_UB(dst + 10 * dst_stride); |
| 481 ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); | 481 ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); |
| 482 ADD2(res14, out14, res15, out15, res14, res15); | 482 ADD2(res14, out14, res15, out15, res14, res15); |
| 483 CLIP_SH2_0_255(res14, res15); | 483 CLIP_SH2_0_255(res14, res15); |
| 484 PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); | 484 PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); |
| 485 ST8x1_UB(res14, dst + 5 * dst_stride); | 485 ST8x1_UB(res14, dst + 5 * dst_stride); |
| 486 ST8x1_UB(res15, dst + 10 * dst_stride); | 486 ST8x1_UB(res15, dst + 10 * dst_stride); |
| 487 } | 487 } |
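
For reference, a minimal scalar sketch of the DC-only shortcut that vpx_idct16x16_1_add_msa (lines 263-291 above) vectorizes: two 1-D DC passes (multiply by cospi_16_64, round by DCT_CONST_BITS), a final round by 6, then a clipped add over the whole 16x16 block. Constants are assumed to match vpx_dsp's txfm_common.h; the function name and clip helper are hypothetical, not part of this change.

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
    static const int16_t cospi_16_64 = 11585;  /* round(16384 * cos(pi/4)) */

    static uint8_t clip_pixel(int32_t val) {
      return (val > 255) ? 255 : (val < 0) ? 0 : (uint8_t)val;
    }

    /* Hypothetical scalar equivalent of the DC-only 16x16 add path. */
    void idct16x16_1_add_scalar(const int16_t *input, uint8_t *dst,
                                int32_t dst_stride) {
      int32_t i, j;
      /* One DC butterfly per 1-D pass, then the final >> 6 rounding. */
      int16_t out = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, DCT_CONST_BITS);
      out = ROUND_POWER_OF_TWO(out * cospi_16_64, DCT_CONST_BITS);
      out = ROUND_POWER_OF_TWO(out, 6);
      /* Add the single DC value to every reconstructed pixel, with clamping. */
      for (j = 0; j < 16; ++j) {
        for (i = 0; i < 16; ++i) dst[i] = clip_pixel(dst[i] + out);
        dst += dst_stride;
      }
    }

The MSA version performs the same arithmetic but splats `out` into a v8i16 vector (__msa_fill_h) and processes four rows of unpacked pixels per loop iteration instead of one pixel at a time.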