Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(884)

Side by Side Diff: source/libvpx/vpx_dsp/mips/idct16x16_msa.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
#include <string.h>

#include "vpx_dsp/mips/inv_txfm_msa.h"
12 12
13 void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { 13 void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
14 v8i16 loc0, loc1, loc2, loc3; 14 v8i16 loc0, loc1, loc2, loc3;
15 v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; 15 v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
16 v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; 16 v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
17 v8i16 tmp5, tmp6, tmp7; 17 v8i16 tmp5, tmp6, tmp7;
18 18
19 LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); 19 LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
20 input += 8; 20 input += 8;
21 LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); 21 LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
22 22
23 TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, 23 TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after
96 TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, 96 TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
97 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); 97 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
98 ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); 98 ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16);
99 99
100 /* transpose block */ 100 /* transpose block */
101 TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, 101 TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
102 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); 102 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
103 ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); 103 ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16);
104 } 104 }
105 105
106 void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, 106 void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
107 int32_t dst_stride) { 107 int32_t dst_stride) {
108 v8i16 loc0, loc1, loc2, loc3; 108 v8i16 loc0, loc1, loc2, loc3;
109 v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; 109 v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
110 v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; 110 v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
111 v8i16 tmp5, tmp6, tmp7; 111 v8i16 tmp5, tmp6, tmp7;
112 112
113 /* load up 8x8 */ 113 /* load up 8x8 */
114 LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); 114 LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
115 input += 8 * 16; 115 input += 8 * 16;
116 /* load bottom 8x8 */ 116 /* load bottom 8x8 */
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after
194 SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); 194 SRARI_H4_SH(reg8, reg10, reg12, reg14, 6);
195 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); 195 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
196 dst += (4 * dst_stride); 196 dst += (4 * dst_stride);
197 SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); 197 SRARI_H4_SH(reg3, reg13, reg11, reg5, 6);
198 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); 198 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
199 dst += (4 * dst_stride); 199 dst += (4 * dst_stride);
200 SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); 200 SRARI_H4_SH(reg7, reg9, reg1, reg15, 6);
201 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); 201 VP9_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
202 } 202 }
203 203
204 void vp9_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst, 204 void vpx_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst,
205 int32_t dst_stride) { 205 int32_t dst_stride) {
206 int32_t i; 206 int32_t i;
207 DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); 207 DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
208 int16_t *out = out_arr; 208 int16_t *out = out_arr;
209 209
210 /* transform rows */ 210 /* transform rows */
211 for (i = 0; i < 2; ++i) { 211 for (i = 0; i < 2; ++i) {
212 /* process 16 * 8 block */ 212 /* process 16 * 8 block */
213 vp9_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7))); 213 vpx_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7)));
214 } 214 }
215 215
216 /* transform columns */ 216 /* transform columns */
217 for (i = 0; i < 2; ++i) { 217 for (i = 0; i < 2; ++i) {
218 /* process 8 * 16 block */ 218 /* process 8 * 16 block */
219 vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), 219 vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
220 dst_stride); 220 dst_stride);
221 } 221 }
222 } 222 }
223 223
224 void vp9_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, 224 void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
225 int32_t dst_stride) { 225 int32_t dst_stride) {
226 uint8_t i; 226 uint8_t i;
227 DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); 227 DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
228 int16_t *out = out_arr; 228 int16_t *out = out_arr;
229 229
230 /* process 16 * 8 block */ 230 /* process 16 * 8 block */
231 vp9_idct16_1d_rows_msa(input, out); 231 vpx_idct16_1d_rows_msa(input, out);
232 232
233 /* short case just considers top 4 rows as valid output */ 233 /* short case just considers top 4 rows as valid output */
234 out += 4 * 16; 234 out += 4 * 16;
235 for (i = 12; i--;) { 235 for (i = 12; i--;) {
236 __asm__ __volatile__ ( 236 __asm__ __volatile__ (
237 "sw $zero, 0(%[out]) \n\t" 237 "sw $zero, 0(%[out]) \n\t"
238 "sw $zero, 4(%[out]) \n\t" 238 "sw $zero, 4(%[out]) \n\t"
239 "sw $zero, 8(%[out]) \n\t" 239 "sw $zero, 8(%[out]) \n\t"
240 "sw $zero, 12(%[out]) \n\t" 240 "sw $zero, 12(%[out]) \n\t"
241 "sw $zero, 16(%[out]) \n\t" 241 "sw $zero, 16(%[out]) \n\t"
242 "sw $zero, 20(%[out]) \n\t" 242 "sw $zero, 20(%[out]) \n\t"
243 "sw $zero, 24(%[out]) \n\t" 243 "sw $zero, 24(%[out]) \n\t"
244 "sw $zero, 28(%[out]) \n\t" 244 "sw $zero, 28(%[out]) \n\t"
245 245
246 : 246 :
247 : [out] "r" (out) 247 : [out] "r" (out)
248 ); 248 );
249 249
250 out += 16; 250 out += 16;
251 } 251 }
252 252
253 out = out_arr; 253 out = out_arr;
254 254
255 /* transform columns */ 255 /* transform columns */
256 for (i = 0; i < 2; ++i) { 256 for (i = 0; i < 2; ++i) {
257 /* process 8 * 16 block */ 257 /* process 8 * 16 block */
258 vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), 258 vpx_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
259 dst_stride); 259 dst_stride);
260 } 260 }
261 } 261 }
262 262
263 void vp9_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, 263 void vpx_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst,
264 int32_t dst_stride) { 264 int32_t dst_stride) {
265 uint8_t i; 265 uint8_t i;
266 int16_t out; 266 int16_t out;
267 v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; 267 v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7;
268 v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; 268 v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
269 269
270 out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); 270 out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
271 out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); 271 out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
272 out = ROUND_POWER_OF_TWO(out, 6); 272 out = ROUND_POWER_OF_TWO(out, 6);
273 273
274 vec = __msa_fill_h(out); 274 vec = __msa_fill_h(out);
275 275
276 for (i = 4; i--;) { 276 for (i = 4; i--;) {
277 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); 277 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
278 UNPCK_UB_SH(dst0, res0, res4); 278 UNPCK_UB_SH(dst0, res0, res4);
279 UNPCK_UB_SH(dst1, res1, res5); 279 UNPCK_UB_SH(dst1, res1, res5);
280 UNPCK_UB_SH(dst2, res2, res6); 280 UNPCK_UB_SH(dst2, res2, res6);
281 UNPCK_UB_SH(dst3, res3, res7); 281 UNPCK_UB_SH(dst3, res3, res7);
282 ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); 282 ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
283 ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); 283 ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
284 CLIP_SH4_0_255(res0, res1, res2, res3); 284 CLIP_SH4_0_255(res0, res1, res2, res3);
285 CLIP_SH4_0_255(res4, res5, res6, res7); 285 CLIP_SH4_0_255(res4, res5, res6, res7);
286 PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, 286 PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3,
287 tmp0, tmp1, tmp2, tmp3); 287 tmp0, tmp1, tmp2, tmp3);
288 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); 288 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
289 dst += (4 * dst_stride); 289 dst += (4 * dst_stride);
290 } 290 }
291 } 291 }
292 292
293 void vp9_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { 293 void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
294 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; 294 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
295 v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; 295 v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
296 296
297 /* load input data */ 297 /* load input data */
298 LD_SH16(input, 8, 298 LD_SH16(input, 8,
299 l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15); 299 l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15);
300 TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, 300 TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
301 l0, l1, l2, l3, l4, l5, l6, l7); 301 l0, l1, l2, l3, l4, l5, l6, l7);
302 TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, 302 TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
303 l8, l9, l10, l11, l12, l13, l14, l15); 303 l8, l9, l10, l11, l12, l13, l14, l15);
(...skipping 10 matching lines...) Expand all
314 l15 = -r1; 314 l15 = -r1;
315 315
316 TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, 316 TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2,
317 l0, l1, l2, l3, l4, l5, l6, l7); 317 l0, l1, l2, l3, l4, l5, l6, l7);
318 ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); 318 ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
319 TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, 319 TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15,
320 l8, l9, l10, l11, l12, l13, l14, l15); 320 l8, l9, l10, l11, l12, l13, l14, l15);
321 ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); 321 ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
322 } 322 }
323 323
324 void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, 324 void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
325 int32_t dst_stride) { 325 int32_t dst_stride) {
326 v8i16 v0, v2, v4, v6, k0, k1, k2, k3; 326 v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
327 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; 327 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
328 v8i16 out0, out1, out2, out3, out4, out5, out6, out7; 328 v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
329 v8i16 out8, out9, out10, out11, out12, out13, out14, out15; 329 v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
330 v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; 330 v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
331 v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; 331 v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
332 v8i16 res0, res1, res2, res3, res4, res5, res6, res7; 332 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
333 v8i16 res8, res9, res10, res11, res12, res13, res14, res15; 333 v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
334 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; 334 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
(...skipping 143 matching lines...) Expand 10 before | Expand all | Expand 10 after
478 SRARI_H2_SH(out14, out15, 6); 478 SRARI_H2_SH(out14, out15, 6);
479 dst14 = LD_UB(dst + 5 * dst_stride); 479 dst14 = LD_UB(dst + 5 * dst_stride);
480 dst15 = LD_UB(dst + 10 * dst_stride); 480 dst15 = LD_UB(dst + 10 * dst_stride);
481 ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); 481 ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15);
482 ADD2(res14, out14, res15, out15, res14, res15); 482 ADD2(res14, out14, res15, out15, res14, res15);
483 CLIP_SH2_0_255(res14, res15); 483 CLIP_SH2_0_255(res14, res15);
484 PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); 484 PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
485 ST8x1_UB(res14, dst + 5 * dst_stride); 485 ST8x1_UB(res14, dst + 5 * dst_stride);
486 ST8x1_UB(res15, dst + 10 * dst_stride); 486 ST8x1_UB(res15, dst + 10 * dst_stride);
487 } 487 }
OLDNEW
« no previous file with comments | « source/libvpx/vpx_dsp/mips/convolve_common_dspr2.h ('k') | source/libvpx/vpx_dsp/mips/idct32x32_msa.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698