Chromium Code Reviews

Unified diff: source/libvpx/vpx_dsp/mips/idct32x32_msa.c

Issue 1302353004: libvpx: Pull from upstream (Closed)
Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago

/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_dsp/mips/inv_txfm_msa.h"

-static void vp9_idct32x8_row_transpose_store(const int16_t *input,
+static void idct32x8_row_transpose_store(const int16_t *input,
                                          int16_t *tmp_buf) {
  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;

  /* 1st & 2nd 8x8 */
  LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
  LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
                     m0, n0, m1, n1, m2, n2, m3, n3);
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
                     m4, n4, m5, n5, m6, n6, m7, n7);
  ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
  ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
  ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);

  /* 3rd & 4th 8x8 */
  LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
  LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3,
                     m0, n0, m1, n1, m2, n2, m3, n3);
  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
                     m4, n4, m5, n5, m6, n6, m7, n7);
  ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
  ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
  ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
  ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8);
}
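
For orientation, a scalar model of what this transpose-store amounts to (illustrative only; the function below is hypothetical and not part of the patch): each 8-row by 32-column band of input coefficients is stored transposed into tmp_buf, so the eight values of any one coefficient column become contiguous and can later be loaded as a single v8i16 vector.

#include <stdint.h>

/* Scalar model: tmp_buf[c * 8 + r] = input[r * 32 + c] for an 8x32 band. */
static void idct32x8_row_transpose_store_scalar(const int16_t *input,
                                                int16_t *tmp_buf) {
  int r, c;
  for (r = 0; r < 8; ++r) {      /* 8 rows of the band       */
    for (c = 0; c < 32; ++c) {   /* 32 coefficients per row  */
      tmp_buf[c * 8 + r] = input[r * 32 + c];
    }
  }
}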

-static void vp9_idct32x8_row_even_process_store(int16_t *tmp_buf,
+static void idct32x8_row_even_process_store(int16_t *tmp_buf,
                                             int16_t *tmp_eve_buf) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;

  /* Even stage 1 */
  LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);

  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
  BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
(...skipping 62 matching lines...)
  ST_SH(loc2, (tmp_eve_buf + 10 * 8));
  ST_SH(loc3, (tmp_eve_buf + 5 * 8));

  BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
  ST_SH(loc0, (tmp_eve_buf + 9 * 8));
  ST_SH(loc1, (tmp_eve_buf + 6 * 8));
  ST_SH(loc2, (tmp_eve_buf + 8 * 8));
  ST_SH(loc3, (tmp_eve_buf + 7 * 8));
}
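
The even half leans on DOTP_CONST_PAIR for its paired cosine rotations. Assuming the macro implements the usual dct_const_round_shift rotation from the reference C code (an assumption; its definition lives in inv_txfm_msa.h and is not shown in this diff), a scalar model of one such rotation is:

#include <stdint.h>

#define DCT_CONST_BITS 14  /* precision used by the vpx transform constants */

static int16_t dct_const_round_shift_scalar(int32_t v) {
  return (int16_t)((v + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* Assumed scalar equivalent of DOTP_CONST_PAIR(in0, in1, c0, c1, out0, out1). */
static void dotp_const_pair_scalar(int16_t in0, int16_t in1,
                                   int32_t c0, int32_t c1,
                                   int16_t *out0, int16_t *out1) {
  *out0 = dct_const_round_shift_scalar(in0 * c0 - in1 * c1);
  *out1 = dct_const_round_shift_scalar(in0 * c1 + in1 * c0);
}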

-static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf,
+static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
                                            int16_t *tmp_odd_buf) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;

  /* Odd stage 1 */
  reg0 = LD_SH(tmp_buf + 8);
  reg1 = LD_SH(tmp_buf + 7 * 8);
  reg2 = LD_SH(tmp_buf + 9 * 8);
  reg3 = LD_SH(tmp_buf + 15 * 8);
  reg4 = LD_SH(tmp_buf + 17 * 8);
  reg5 = LD_SH(tmp_buf + 23 * 8);
(...skipping 96 matching lines...)
  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);

  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);

  SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
}

-static void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf,
+static void idct_butterfly_transpose_store(int16_t *tmp_buf,
                                            int16_t *tmp_eve_buf,
                                            int16_t *tmp_odd_buf,
                                            int16_t *dst) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;

  /* FINAL BUTTERFLY : Dependency on Even & Odd */
  vec0 = LD_SH(tmp_odd_buf);
  vec1 = LD_SH(tmp_odd_buf + 9 * 8);
  vec2 = LD_SH(tmp_odd_buf + 14 * 8);
  vec3 = LD_SH(tmp_odd_buf + 6 * 8);
  loc0 = LD_SH(tmp_eve_buf);
  loc1 = LD_SH(tmp_eve_buf + 8 * 8);
(...skipping 77 matching lines...)
                     m0, n0, m1, n1, m2, n2, m3, n3);
  ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
  ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);

  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7,
                     m4, n4, m5, n5, m6, n6, m7, n7);
  ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
  ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
}

-static void vp9_idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
+static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
  DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);

-  vp9_idct32x8_row_transpose_store(input, &tmp_buf[0]);
-  vp9_idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
-  vp9_idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
-  vp9_idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0],
+  idct32x8_row_transpose_store(input, &tmp_buf[0]);
+  idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
+  idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
+  idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0],
                                 &tmp_odd_buf[0], output);
}
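
idct32x8_1d_rows_msa chains the helpers above: transpose one 8x32 band, compute the even and odd halves of the 32-point transform into tmp_eve_buf and tmp_odd_buf, then recombine and transpose back into output. The recombination corresponds to the last stage of the reference idct32_c; a scalar sketch of that stage (an illustration of the assumed structure, not the MSA code):

#include <stdint.h>

/* step[0..15]: even half, step[16..31]: odd half after the earlier stages;
 * out[0..31]: one fully reconstructed 1-D 32-point transform. */
static void idct32_final_stage_scalar(const int16_t *step, int16_t *out) {
  int k;
  for (k = 0; k < 16; ++k) {
    out[k]      = (int16_t)(step[k] + step[31 - k]);
    out[31 - k] = (int16_t)(step[k] - step[31 - k]);
  }
}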

-static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf,
+static void idct8x32_column_even_process_store(int16_t *tmp_buf,
                                                int16_t *tmp_eve_buf) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;

  /* Even stage 1 */
  LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
  tmp_buf += (2 * 32);

  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
(...skipping 58 matching lines...)
  /* Store 8 */
  BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
  ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
  ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);

  BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
  ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
  ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
}

-static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
+static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
                                               int16_t *tmp_odd_buf) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;

  /* Odd stage 1 */
  reg0 = LD_SH(tmp_buf + 32);
  reg1 = LD_SH(tmp_buf + 7 * 32);
  reg2 = LD_SH(tmp_buf + 9 * 32);
  reg3 = LD_SH(tmp_buf + 15 * 32);
  reg4 = LD_SH(tmp_buf + 17 * 32);
  reg5 = LD_SH(tmp_buf + 23 * 32);
(...skipping 85 matching lines...)
  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);

  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);

  SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
}

-static void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
                                              int16_t *tmp_odd_buf,
                                              uint8_t *dst,
                                              int32_t dst_stride) {
  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;

  /* FINAL BUTTERFLY : Dependency on Even & Odd */
  vec0 = LD_SH(tmp_odd_buf);
  vec1 = LD_SH(tmp_odd_buf + 9 * 8);
  vec2 = LD_SH(tmp_odd_buf + 14 * 8);
  vec3 = LD_SH(tmp_odd_buf + 6 * 8);
  loc0 = LD_SH(tmp_eve_buf);
  loc1 = LD_SH(tmp_eve_buf + 8 * 8);
(...skipping 63 matching lines...)
  SRARI_H4_SH(n1, n3, n5, n7, 6);
  VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride),
                      n1, n3, n5, n7);

  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
  SRARI_H4_SH(n1, n3, n5, n7, 6);
  VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride),
                      n1, n3, n5, n7);
}

-static void vp9_idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
+static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
                                           int32_t dst_stride) {
  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);

-  vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
-  vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
-  vp9_idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0],
+  idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+  idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+  idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0],
                                   dst, dst_stride);
}

-void vp9_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
+void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
                                int32_t dst_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
  int16_t *out_ptr = out_arr;

  /* transform rows */
  for (i = 0; i < 4; ++i) {
    /* process 32 * 8 block */
-    vp9_idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
+    idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
  }

  /* transform columns */
  for (i = 0; i < 4; ++i) {
    /* process 8 * 32 block */
-    vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+    idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
                                   dst_stride);
  }
}
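
The shifts in the driver above are just tiling arithmetic: the 32x32 coefficient block is processed as four bands of 8 rows (row pass) and four strips of 8 columns (column pass), so (i << 8) == i * 8 * 32 is the first element of row band i and (i << 3) == i * 8 is the first column of strip i. A trivial self-contained check of that arithmetic:

#include <assert.h>

int main(void) {
  int i;
  for (i = 0; i < 4; ++i) {
    assert((i << 8) == i * 8 * 32);  /* start of 8-row band i (row-major) */
    assert((i << 3) == i * 8);       /* first column of 8-column strip i  */
  }
  return 0;
}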

-void vp9_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
+void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
                              int32_t dst_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
  int16_t *out_ptr = out_arr;

  for (i = 32; i--;) {
    __asm__ __volatile__ (
        "sw     $zero,      0(%[out_ptr])     \n\t"
        "sw     $zero,      4(%[out_ptr])     \n\t"
        "sw     $zero,      8(%[out_ptr])     \n\t"
(...skipping 14 matching lines...)
        :
        : [out_ptr] "r" (out_ptr)
    );

    out_ptr += 32;
  }

  out_ptr = out_arr;

  /* rows: only upper-left 8x8 has non-zero coeff */
-  vp9_idct32x8_1d_rows_msa(input, out_ptr);
+  idct32x8_1d_rows_msa(input, out_ptr);

  /* transform columns */
  for (i = 0; i < 4; ++i) {
    /* process 8 * 32 block */
-    vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
+    idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
                                   dst_stride);
  }
}
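
The inline assembly loop above only zero-fills out_arr (each iteration stores zeros and advances out_ptr by 32 int16_t elements). The zeroing is needed because, with at most 34 non-zero coefficients, everything lives in the upper-left 8x8: the single row pass writes only the first 8 rows of the intermediate buffer, while the column pass still reads all 32. A plain-C equivalent of the zero-fill (illustrative only):

#include <stdint.h>
#include <string.h>

/* Clear the whole 32x32 int16_t scratch buffer (2048 bytes). */
static void clear_out_arr(int16_t out_arr[32 * 32]) {
  memset(out_arr, 0, 32 * 32 * sizeof(int16_t));
}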

-void vp9_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
+void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
                             int32_t dst_stride) {
  int32_t i;
  int16_t out;
  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
  v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec;

  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out, 6);

(...skipping 13 matching lines...)
    CLIP_SH4_0_255(res4, res5, res6, res7);
    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
                tmp0, tmp1, tmp2, tmp3);

    ST_UB2(tmp0, tmp1, dst, 16);
    dst += dst_stride;
    ST_UB2(tmp2, tmp3, dst, 16);
    dst += dst_stride;
  }
}
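
For the DC-only case the whole block reduces to one constant: the DC coefficient is scaled by cospi_16_64 once per 1-D pass, rounded by the final shift of 6, and added to every pixel with clipping. A scalar equivalent (illustrative; the constant values below are the usual vpx ones, reproduced here as an assumption rather than taken from this diff):

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static const int32_t cospi_16_64 = 11585;  /* round(16384 * cos(pi/4)) */

static uint8_t clip_pixel(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void idct32x32_1_add_scalar(const int16_t *input, uint8_t *dst,
                                   int32_t dst_stride) {
  int32_t r, c;
  int16_t out;

  out = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out * cospi_16_64, DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out, 6);

  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) {
      dst[r * dst_stride + c] = clip_pixel(dst[r * dst_stride + c] + out);
    }
  }
}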