OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "vpx_dsp/mips/inv_txfm_msa.h" | 11 #include "vpx_dsp/mips/inv_txfm_msa.h" |
12 | 12 |
/* Load a 32(wide) x 8(tall) band of coefficients from |input| (row stride 32),
 * transpose each of its four 8x8 sub-blocks in place with TRANSPOSE8x8_SH_SH,
 * and store them contiguously into |tmp_buf| with stride 8, so that the
 * following even/odd 1-D transform stages can read column data as rows. */
static void idct32x8_row_transpose_store(const int16_t *input,
                                         int16_t *tmp_buf) {
  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;

  /* 1st & 2nd 8x8 */
  LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
  LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
                     m0, n0, m1, n1, m2, n2, m3, n3);
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
                     m4, n4, m5, n5, m6, n6, m7, n7);
  /* First transposed 8x8 occupies rows 0-7 of tmp_buf, second rows 8-15. */
  ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
  ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
  ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);

  /* 3rd & 4th 8x8 */
  LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
  LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
                     m0, n0, m1, n1, m2, n2, m3, n3);
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
                     m4, n4, m5, n5, m6, n6, m7, n7);
  /* Third transposed 8x8 occupies rows 16-23, fourth rows 24-31. */
  ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
  ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
  ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
  ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8);
}
40 | 40 |
41 static void vp9_idct32x8_row_even_process_store(int16_t *tmp_buf, | 41 static void idct32x8_row_even_process_store(int16_t *tmp_buf, |
42 int16_t *tmp_eve_buf) { | 42 int16_t *tmp_eve_buf) { |
43 v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; | 43 v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; |
44 v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; | 44 v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; |
45 v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; | 45 v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; |
46 | 46 |
47 /* Even stage 1 */ | 47 /* Even stage 1 */ |
48 LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); | 48 LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); |
49 | 49 |
50 DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); | 50 DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); |
51 DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); | 51 DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); |
52 BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); | 52 BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); |
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
115 ST_SH(loc2, (tmp_eve_buf + 10 * 8)); | 115 ST_SH(loc2, (tmp_eve_buf + 10 * 8)); |
116 ST_SH(loc3, (tmp_eve_buf + 5 * 8)); | 116 ST_SH(loc3, (tmp_eve_buf + 5 * 8)); |
117 | 117 |
118 BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); | 118 BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); |
119 ST_SH(loc0, (tmp_eve_buf + 9 * 8)); | 119 ST_SH(loc0, (tmp_eve_buf + 9 * 8)); |
120 ST_SH(loc1, (tmp_eve_buf + 6 * 8)); | 120 ST_SH(loc1, (tmp_eve_buf + 6 * 8)); |
121 ST_SH(loc2, (tmp_eve_buf + 8 * 8)); | 121 ST_SH(loc2, (tmp_eve_buf + 8 * 8)); |
122 ST_SH(loc3, (tmp_eve_buf + 7 * 8)); | 122 ST_SH(loc3, (tmp_eve_buf + 7 * 8)); |
123 } | 123 } |
124 | 124 |
125 static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf, | 125 static void idct32x8_row_odd_process_store(int16_t *tmp_buf, |
126 int16_t *tmp_odd_buf) { | 126 int16_t *tmp_odd_buf) { |
127 v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; | 127 v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; |
128 v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; | 128 v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; |
129 | 129 |
130 /* Odd stage 1 */ | 130 /* Odd stage 1 */ |
131 reg0 = LD_SH(tmp_buf + 8); | 131 reg0 = LD_SH(tmp_buf + 8); |
132 reg1 = LD_SH(tmp_buf + 7 * 8); | 132 reg1 = LD_SH(tmp_buf + 7 * 8); |
133 reg2 = LD_SH(tmp_buf + 9 * 8); | 133 reg2 = LD_SH(tmp_buf + 9 * 8); |
134 reg3 = LD_SH(tmp_buf + 15 * 8); | 134 reg3 = LD_SH(tmp_buf + 15 * 8); |
135 reg4 = LD_SH(tmp_buf + 17 * 8); | 135 reg4 = LD_SH(tmp_buf + 17 * 8); |
136 reg5 = LD_SH(tmp_buf + 23 * 8); | 136 reg5 = LD_SH(tmp_buf + 23 * 8); |
(...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
233 ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); | 233 ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); |
234 | 234 |
235 SUB2(reg0, reg4, reg3, reg7, vec0, vec1); | 235 SUB2(reg0, reg4, reg3, reg7, vec0, vec1); |
236 DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); | 236 DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); |
237 | 237 |
238 SUB2(reg1, reg5, reg2, reg6, vec0, vec1); | 238 SUB2(reg1, reg5, reg2, reg6, vec0, vec1); |
239 DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); | 239 DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); |
240 ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); | 240 ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); |
241 } | 241 } |
242 | 242 |
243 static void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf, | 243 static void idct_butterfly_transpose_store(int16_t *tmp_buf, |
244 int16_t *tmp_eve_buf, | 244 int16_t *tmp_eve_buf, |
245 int16_t *tmp_odd_buf, | 245 int16_t *tmp_odd_buf, |
246 int16_t *dst) { | 246 int16_t *dst) { |
247 v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; | 247 v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; |
248 v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; | 248 v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; |
249 | 249 |
250 /* FINAL BUTTERFLY : Dependency on Even & Odd */ | 250 /* FINAL BUTTERFLY : Dependency on Even & Odd */ |
251 vec0 = LD_SH(tmp_odd_buf); | 251 vec0 = LD_SH(tmp_odd_buf); |
252 vec1 = LD_SH(tmp_odd_buf + 9 * 8); | 252 vec1 = LD_SH(tmp_odd_buf + 9 * 8); |
253 vec2 = LD_SH(tmp_odd_buf + 14 * 8); | 253 vec2 = LD_SH(tmp_odd_buf + 14 * 8); |
254 vec3 = LD_SH(tmp_odd_buf + 6 * 8); | 254 vec3 = LD_SH(tmp_odd_buf + 6 * 8); |
255 loc0 = LD_SH(tmp_eve_buf); | 255 loc0 = LD_SH(tmp_eve_buf); |
256 loc1 = LD_SH(tmp_eve_buf + 8 * 8); | 256 loc1 = LD_SH(tmp_eve_buf + 8 * 8); |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
334 m0, n0, m1, n1, m2, n2, m3, n3); | 334 m0, n0, m1, n1, m2, n2, m3, n3); |
335 ST_SH4(m0, n0, m1, n1, (dst + 16), 32); | 335 ST_SH4(m0, n0, m1, n1, (dst + 16), 32); |
336 ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); | 336 ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); |
337 | 337 |
338 TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, | 338 TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, |
339 m4, n4, m5, n5, m6, n6, m7, n7); | 339 m4, n4, m5, n5, m6, n6, m7, n7); |
340 ST_SH4(m4, n4, m5, n5, (dst + 24), 32); | 340 ST_SH4(m4, n4, m5, n5, (dst + 24), 32); |
341 ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); | 341 ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); |
342 } | 342 } |
343 | 343 |
/* 1-D inverse transform over one 32x8 band of rows: transpose the band into
 * a scratch buffer, run the even- and odd-coefficient stages into separate
 * scratch buffers, then combine them with the final butterfly and transpose
 * the result back out to |output|. */
static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
  /* 32-byte alignment required by the MSA vector load/store macros. */
  DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);     /* transposed input   */
  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); /* odd-stage outputs  */
  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); /* even-stage outputs */

  idct32x8_row_transpose_store(input, &tmp_buf[0]);
  idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
  idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
  idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0],
                                 &tmp_odd_buf[0], output);
}
355 | 355 |
356 static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf, | 356 static void idct8x32_column_even_process_store(int16_t *tmp_buf, |
357 int16_t *tmp_eve_buf) { | 357 int16_t *tmp_eve_buf) { |
358 v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; | 358 v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; |
359 v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; | 359 v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; |
360 v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; | 360 v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; |
361 | 361 |
362 /* Even stage 1 */ | 362 /* Even stage 1 */ |
363 LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); | 363 LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); |
364 tmp_buf += (2 * 32); | 364 tmp_buf += (2 * 32); |
365 | 365 |
366 DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); | 366 DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); |
367 DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); | 367 DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); |
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
426 /* Store 8 */ | 426 /* Store 8 */ |
427 BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); | 427 BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); |
428 ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); | 428 ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); |
429 ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); | 429 ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); |
430 | 430 |
431 BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); | 431 BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); |
432 ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); | 432 ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); |
433 ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); | 433 ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); |
434 } | 434 } |
435 | 435 |
436 static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf, | 436 static void idct8x32_column_odd_process_store(int16_t *tmp_buf, |
437 int16_t *tmp_odd_buf) { | 437 int16_t *tmp_odd_buf) { |
438 v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; | 438 v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; |
439 v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; | 439 v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; |
440 | 440 |
441 /* Odd stage 1 */ | 441 /* Odd stage 1 */ |
442 reg0 = LD_SH(tmp_buf + 32); | 442 reg0 = LD_SH(tmp_buf + 32); |
443 reg1 = LD_SH(tmp_buf + 7 * 32); | 443 reg1 = LD_SH(tmp_buf + 7 * 32); |
444 reg2 = LD_SH(tmp_buf + 9 * 32); | 444 reg2 = LD_SH(tmp_buf + 9 * 32); |
445 reg3 = LD_SH(tmp_buf + 15 * 32); | 445 reg3 = LD_SH(tmp_buf + 15 * 32); |
446 reg4 = LD_SH(tmp_buf + 17 * 32); | 446 reg4 = LD_SH(tmp_buf + 17 * 32); |
447 reg5 = LD_SH(tmp_buf + 23 * 32); | 447 reg5 = LD_SH(tmp_buf + 23 * 32); |
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
533 ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); | 533 ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); |
534 | 534 |
535 SUB2(reg0, reg4, reg3, reg7, vec0, vec1); | 535 SUB2(reg0, reg4, reg3, reg7, vec0, vec1); |
536 DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); | 536 DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); |
537 | 537 |
538 SUB2(reg1, reg5, reg2, reg6, vec0, vec1); | 538 SUB2(reg1, reg5, reg2, reg6, vec0, vec1); |
539 DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); | 539 DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); |
540 ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); | 540 ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); |
541 } | 541 } |
542 | 542 |
543 static void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, | 543 static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, |
544 int16_t *tmp_odd_buf, | 544 int16_t *tmp_odd_buf, |
545 uint8_t *dst, | 545 uint8_t *dst, |
546 int32_t dst_stride) { | 546 int32_t dst_stride) { |
547 v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; | 547 v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; |
548 v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; | 548 v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; |
549 | 549 |
550 /* FINAL BUTTERFLY : Dependency on Even & Odd */ | 550 /* FINAL BUTTERFLY : Dependency on Even & Odd */ |
551 vec0 = LD_SH(tmp_odd_buf); | 551 vec0 = LD_SH(tmp_odd_buf); |
552 vec1 = LD_SH(tmp_odd_buf + 9 * 8); | 552 vec1 = LD_SH(tmp_odd_buf + 9 * 8); |
553 vec2 = LD_SH(tmp_odd_buf + 14 * 8); | 553 vec2 = LD_SH(tmp_odd_buf + 14 * 8); |
554 vec3 = LD_SH(tmp_odd_buf + 6 * 8); | 554 vec3 = LD_SH(tmp_odd_buf + 6 * 8); |
555 loc0 = LD_SH(tmp_eve_buf); | 555 loc0 = LD_SH(tmp_eve_buf); |
556 loc1 = LD_SH(tmp_eve_buf + 8 * 8); | 556 loc1 = LD_SH(tmp_eve_buf + 8 * 8); |
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
620 SRARI_H4_SH(n1, n3, n5, n7, 6); | 620 SRARI_H4_SH(n1, n3, n5, n7, 6); |
621 VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), | 621 VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), |
622 n1, n3, n5, n7); | 622 n1, n3, n5, n7); |
623 | 623 |
624 SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); | 624 SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); |
625 SRARI_H4_SH(n1, n3, n5, n7, 6); | 625 SRARI_H4_SH(n1, n3, n5, n7, 6); |
626 VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), | 626 VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), |
627 n1, n3, n5, n7); | 627 n1, n3, n5, n7); |
628 } | 628 } |
629 | 629 |
/* 1-D inverse transform over one 8x32 band of columns of |input| (stride 32):
 * run the even- and odd-coefficient stages into scratch buffers, then the
 * final butterfly stage, which accumulates the reconstructed values into the
 * destination pixels at |dst| / |dst_stride|. */
static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
                                           int32_t dst_stride) {
  /* 32-byte alignment required by the MSA vector load/store macros. */
  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); /* odd-stage outputs  */
  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); /* even-stage outputs */

  idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
  idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
  idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0],
                                   dst, dst_stride);
}
640 | 640 |
/* Full 32x32 inverse transform + reconstruction for the all-coefficients
 * (eob up to 1024) case: rows are processed in four 32x8 bands into an
 * intermediate buffer, then columns in four 8x32 bands whose results are
 * added into the destination frame at |dst| / |dst_stride|. */
void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
                                int32_t dst_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
  int16_t *out_ptr = out_arr;

  /* transform rows */
  for (i = 0; i < 4; ++i) {
    /* process 32 * 8 block; i << 8 skips i * 8 rows of 32 coefficients */
    idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
  }

  /* transform columns */
  for (i = 0; i < 4; ++i) {
    /* process 8 * 32 block; i << 3 selects the band's first column */
    idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
                                   dst_stride);
  }
}
660 | 660 |
661 void vp9_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, | 661 void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, |
662 int32_t dst_stride) { | 662 int32_t dst_stride) { |
663 int32_t i; | 663 int32_t i; |
664 DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); | 664 DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); |
665 int16_t *out_ptr = out_arr; | 665 int16_t *out_ptr = out_arr; |
666 | 666 |
667 for (i = 32; i--;) { | 667 for (i = 32; i--;) { |
668 __asm__ __volatile__ ( | 668 __asm__ __volatile__ ( |
669 "sw $zero, 0(%[out_ptr]) \n\t" | 669 "sw $zero, 0(%[out_ptr]) \n\t" |
670 "sw $zero, 4(%[out_ptr]) \n\t" | 670 "sw $zero, 4(%[out_ptr]) \n\t" |
671 "sw $zero, 8(%[out_ptr]) \n\t" | 671 "sw $zero, 8(%[out_ptr]) \n\t" |
(...skipping 14 matching lines...) Expand all Loading... |
686 : | 686 : |
687 : [out_ptr] "r" (out_ptr) | 687 : [out_ptr] "r" (out_ptr) |
688 ); | 688 ); |
689 | 689 |
690 out_ptr += 32; | 690 out_ptr += 32; |
691 } | 691 } |
692 | 692 |
693 out_ptr = out_arr; | 693 out_ptr = out_arr; |
694 | 694 |
695 /* rows: only upper-left 8x8 has non-zero coeff */ | 695 /* rows: only upper-left 8x8 has non-zero coeff */ |
696 vp9_idct32x8_1d_rows_msa(input, out_ptr); | 696 idct32x8_1d_rows_msa(input, out_ptr); |
697 | 697 |
698 /* transform columns */ | 698 /* transform columns */ |
699 for (i = 0; i < 4; ++i) { | 699 for (i = 0; i < 4; ++i) { |
700 /* process 8 * 32 block */ | 700 /* process 8 * 32 block */ |
701 vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), | 701 idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), |
702 dst_stride); | 702 dst_stride); |
703 } | 703 } |
704 } | 704 } |
705 | 705 |
706 void vp9_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, | 706 void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, |
707 int32_t dst_stride) { | 707 int32_t dst_stride) { |
708 int32_t i; | 708 int32_t i; |
709 int16_t out; | 709 int16_t out; |
710 v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; | 710 v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; |
711 v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; | 711 v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; |
712 | 712 |
713 out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); | 713 out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); |
714 out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); | 714 out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); |
715 out = ROUND_POWER_OF_TWO(out, 6); | 715 out = ROUND_POWER_OF_TWO(out, 6); |
716 | 716 |
(...skipping 13 matching lines...) Expand all Loading... |
730 CLIP_SH4_0_255(res4, res5, res6, res7); | 730 CLIP_SH4_0_255(res4, res5, res6, res7); |
731 PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, | 731 PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, |
732 tmp0, tmp1, tmp2, tmp3); | 732 tmp0, tmp1, tmp2, tmp3); |
733 | 733 |
734 ST_UB2(tmp0, tmp1, dst, 16); | 734 ST_UB2(tmp0, tmp1, dst, 16); |
735 dst += dst_stride; | 735 dst += dst_stride; |
736 ST_UB2(tmp2, tmp3, dst, 16); | 736 ST_UB2(tmp2, tmp3, dst, 16); |
737 dst += dst_stride; | 737 dst += dst_stride; |
738 } | 738 } |
739 } | 739 } |
OLD | NEW |