| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <assert.h> | |
| 12 #include <stdio.h> | |
| 13 | |
| 14 #include "./vpx_config.h" | 11 #include "./vpx_config.h" |
| 15 #include "./vp9_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
| 16 #include "vp9/common/vp9_common.h" | 13 #include "vpx_dsp/mips/inv_txfm_dspr2.h" |
| 17 #include "vp9/common/vp9_blockd.h" | |
| 18 #include "vp9/common/vp9_idct.h" | |
| 19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | |
| 20 #include "vpx_dsp/txfm_common.h" | 14 #include "vpx_dsp/txfm_common.h" |
| 21 #include "vpx_ports/mem.h" | |
| 22 | 15 |
| 23 #if HAVE_DSPR2 | 16 #if HAVE_DSPR2 |
| 24 static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t *output) { | 17 void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { |
| 25 int16_t step_0, step_1, step_2, step_3; | 18 int16_t step_0, step_1, step_2, step_3; |
| 26 int Temp0, Temp1, Temp2, Temp3; | 19 int Temp0, Temp1, Temp2, Temp3; |
| 27 const int const_2_power_13 = 8192; | 20 const int const_2_power_13 = 8192; |
| 28 int i; | 21 int i; |
| 29 | 22 |
| 30 for (i = 4; i--; ) { | 23 for (i = 4; i--; ) { |
| 31 __asm__ __volatile__ ( | 24 __asm__ __volatile__ ( |
| 32 /* | 25 /* |
| 33 temp_1 = (input[0] + input[2]) * cospi_16_64; | 26 temp_1 = (input[0] + input[2]) * cospi_16_64; |
| 34 step_0 = dct_const_round_shift(temp_1); | 27 step_0 = dct_const_round_shift(temp_1); |
| (...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 99 [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), | 92 [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), |
| 100 [cospi_24_64] "r" (cospi_24_64), | 93 [cospi_24_64] "r" (cospi_24_64), |
| 101 [input] "r" (input) | 94 [input] "r" (input) |
| 102 ); | 95 ); |
| 103 | 96 |
| 104 input += 4; | 97 input += 4; |
| 105 output += 1; | 98 output += 1; |
| 106 } | 99 } |
| 107 } | 100 } |
| 108 | 101 |
| 109 static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, | 102 void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, |
| 110 int dest_stride) { | 103 int dest_stride) { |
| 111 int16_t step_0, step_1, step_2, step_3; | 104 int16_t step_0, step_1, step_2, step_3; |
| 112 int Temp0, Temp1, Temp2, Temp3; | 105 int Temp0, Temp1, Temp2, Temp3; |
| 113 const int const_2_power_13 = 8192; | 106 const int const_2_power_13 = 8192; |
| 114 int i; | 107 int i; |
| 115 uint8_t *dest_pix; | 108 uint8_t *dest_pix; |
| 116 uint8_t *cm = vpx_ff_cropTbl; | 109 uint8_t *cm = vpx_ff_cropTbl; |
| 117 | 110 |
| 118 /* prefetch vpx_ff_cropTbl */ | 111 /* prefetch vpx_ff_cropTbl */ |
| 119 prefetch_load(vpx_ff_cropTbl); | 112 prefetch_load(vpx_ff_cropTbl); |
| 120 prefetch_load(vpx_ff_cropTbl + 32); | 113 prefetch_load(vpx_ff_cropTbl + 32); |
| (...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 221 : [const_2_power_13] "r" (const_2_power_13), | 214 : [const_2_power_13] "r" (const_2_power_13), |
| 222 [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), | 215 [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), |
| 223 [cospi_24_64] "r" (cospi_24_64), | 216 [cospi_24_64] "r" (cospi_24_64), |
| 224 [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) | 217 [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) |
| 225 ); | 218 ); |
| 226 | 219 |
| 227 input += 4; | 220 input += 4; |
| 228 } | 221 } |
| 229 } | 222 } |
| 230 | 223 |
| 231 void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, | 224 void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, |
| 232 int dest_stride) { | 225 int dest_stride) { |
| 233 DECLARE_ALIGNED(32, int16_t, out[4 * 4]); | 226 DECLARE_ALIGNED(32, int16_t, out[4 * 4]); |
| 234 int16_t *outptr = out; | 227 int16_t *outptr = out; |
| 235 uint32_t pos = 45; | 228 uint32_t pos = 45; |
| 236 | 229 |
| 237 /* bit position for extract from acc */ | 230 /* bit position for extract from acc */ |
| 238 __asm__ __volatile__ ( | 231 __asm__ __volatile__ ( |
| 239 "wrdsp %[pos], 1 \n\t" | 232 "wrdsp %[pos], 1 \n\t" |
| 240 : | 233 : |
| 241 : [pos] "r" (pos) | 234 : [pos] "r" (pos) |
| 242 ); | 235 ); |
| 243 | 236 |
| 244 // Rows | 237 // Rows |
| 245 vp9_idct4_rows_dspr2(input, outptr); | 238 vpx_idct4_rows_dspr2(input, outptr); |
| 246 | 239 |
| 247 // Columns | 240 // Columns |
| 248 vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); | 241 vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); |
| 249 } | 242 } |
| 250 | 243 |
| 251 void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, | 244 void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, |
| 252 int dest_stride) { | 245 int dest_stride) { |
| 253 int a1, absa1; | 246 int a1, absa1; |
| 254 int r; | 247 int r; |
| 255 int32_t out; | 248 int32_t out; |
| 256 int t2, vector_a1, vector_a; | 249 int t2, vector_a1, vector_a; |
| 257 uint32_t pos = 45; | 250 uint32_t pos = 45; |
| 258 int16_t input_dc = input[0]; | 251 int16_t input_dc = input[0]; |
| 259 | 252 |
| 260 /* bit position for extract from acc */ | 253 /* bit position for extract from acc */ |
| 261 __asm__ __volatile__ ( | 254 __asm__ __volatile__ ( |
| (...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 314 "add %[dest], %[dest], %[dest_stride] \n\t" | 307 "add %[dest], %[dest], %[dest_stride] \n\t" |
| 315 | 308 |
| 316 : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), | 309 : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), |
| 317 [dest] "+&r" (dest) | 310 [dest] "+&r" (dest) |
| 318 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | 311 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
| 319 ); | 312 ); |
| 320 } | 313 } |
| 321 } | 314 } |
| 322 } | 315 } |
| 323 | 316 |
| 324 static void iadst4_dspr2(const int16_t *input, int16_t *output) { | 317 void iadst4_dspr2(const int16_t *input, int16_t *output) { |
| 325 int s0, s1, s2, s3, s4, s5, s6, s7; | 318 int s0, s1, s2, s3, s4, s5, s6, s7; |
| 326 int x0, x1, x2, x3; | 319 int x0, x1, x2, x3; |
| 327 | 320 |
| 328 x0 = input[0]; | 321 x0 = input[0]; |
| 329 x1 = input[1]; | 322 x1 = input[1]; |
| 330 x2 = input[2]; | 323 x2 = input[2]; |
| 331 x3 = input[3]; | 324 x3 = input[3]; |
| 332 | 325 |
| 333 if (!(x0 | x1 | x2 | x3)) { | 326 if (!(x0 | x1 | x2 | x3)) { |
| 334 output[0] = output[1] = output[2] = output[3] = 0; | 327 output[0] = output[1] = output[2] = output[3] = 0; |
| (...skipping 21 matching lines...) Expand all Loading... |
| 356 | 349 |
| 357 // 1-D transform scaling factor is sqrt(2). | 350 // 1-D transform scaling factor is sqrt(2). |
| 358 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) | 351 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) |
| 359 // + 1b (addition) = 29b. | 352 // + 1b (addition) = 29b. |
| 360 // Hence the output bit depth is 15b. | 353 // Hence the output bit depth is 15b. |
| 361 output[0] = dct_const_round_shift(s0); | 354 output[0] = dct_const_round_shift(s0); |
| 362 output[1] = dct_const_round_shift(s1); | 355 output[1] = dct_const_round_shift(s1); |
| 363 output[2] = dct_const_round_shift(s2); | 356 output[2] = dct_const_round_shift(s2); |
| 364 output[3] = dct_const_round_shift(s3); | 357 output[3] = dct_const_round_shift(s3); |
| 365 } | 358 } |
| 366 | |
| 367 void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 368 int dest_stride, int tx_type) { | |
| 369 int i, j; | |
| 370 DECLARE_ALIGNED(32, int16_t, out[4 * 4]); | |
| 371 int16_t *outptr = out; | |
| 372 int16_t temp_in[4 * 4], temp_out[4]; | |
| 373 uint32_t pos = 45; | |
| 374 | |
| 375 /* bit position for extract from acc */ | |
| 376 __asm__ __volatile__ ( | |
| 377 "wrdsp %[pos], 1 \n\t" | |
| 378 : | |
| 379 : [pos] "r" (pos) | |
| 380 ); | |
| 381 | |
| 382 switch (tx_type) { | |
| 383 case DCT_DCT: // DCT in both horizontal and vertical | |
| 384 vp9_idct4_rows_dspr2(input, outptr); | |
| 385 vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); | |
| 386 break; | |
| 387 case ADST_DCT: // ADST in vertical, DCT in horizontal | |
| 388 vp9_idct4_rows_dspr2(input, outptr); | |
| 389 | |
| 390 outptr = out; | |
| 391 | |
| 392 for (i = 0; i < 4; ++i) { | |
| 393 iadst4_dspr2(outptr, temp_out); | |
| 394 | |
| 395 for (j = 0; j < 4; ++j) | |
| 396 dest[j * dest_stride + i] = | |
| 397 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) | |
| 398 + dest[j * dest_stride + i]); | |
| 399 | |
| 400 outptr += 4; | |
| 401 } | |
| 402 break; | |
| 403 case DCT_ADST: // DCT in vertical, ADST in horizontal | |
| 404 for (i = 0; i < 4; ++i) { | |
| 405 iadst4_dspr2(input, outptr); | |
| 406 input += 4; | |
| 407 outptr += 4; | |
| 408 } | |
| 409 | |
| 410 for (i = 0; i < 4; ++i) { | |
| 411 for (j = 0; j < 4; ++j) { | |
| 412 temp_in[i * 4 + j] = out[j * 4 + i]; | |
| 413 } | |
| 414 } | |
| 415 vp9_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); | |
| 416 break; | |
| 417 case ADST_ADST: // ADST in both directions | |
| 418 for (i = 0; i < 4; ++i) { | |
| 419 iadst4_dspr2(input, outptr); | |
| 420 input += 4; | |
| 421 outptr += 4; | |
| 422 } | |
| 423 | |
| 424 for (i = 0; i < 4; ++i) { | |
| 425 for (j = 0; j < 4; ++j) | |
| 426 temp_in[j] = out[j * 4 + i]; | |
| 427 iadst4_dspr2(temp_in, temp_out); | |
| 428 | |
| 429 for (j = 0; j < 4; ++j) | |
| 430 dest[j * dest_stride + i] = | |
| 431 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) | |
| 432 + dest[j * dest_stride + i]); | |
| 433 } | |
| 434 break; | |
| 435 default: | |
| 436 printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n"); | |
| 437 break; | |
| 438 } | |
| 439 } | |
| 440 #endif // #if HAVE_DSPR2 | 359 #endif // #if HAVE_DSPR2 |
| OLD | NEW |