Chromium Code Reviews

Side by Side Diff: source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
1 /* 1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include "./vpx_dsp_rtcd.h"
11 #include "vpx_dsp/x86/inv_txfm_sse2.h" 12 #include "vpx_dsp/x86/inv_txfm_sse2.h"
12 #include "vpx_dsp/x86/txfm_common_sse2.h" 13 #include "vpx_dsp/x86/txfm_common_sse2.h"
13 14
14 #define RECON_AND_STORE4X4(dest, in_x) \ 15 #define RECON_AND_STORE4X4(dest, in_x) \
15 { \ 16 { \
16 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ 17 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
17 d0 = _mm_unpacklo_epi8(d0, zero); \ 18 d0 = _mm_unpacklo_epi8(d0, zero); \
18 d0 = _mm_add_epi16(in_x, d0); \ 19 d0 = _mm_add_epi16(in_x, d0); \
19 d0 = _mm_packus_epi16(d0, d0); \ 20 d0 = _mm_packus_epi16(d0, d0); \
20 *(int *)(dest) = _mm_cvtsi128_si32(d0); \ 21 *(int *)(dest) = _mm_cvtsi128_si32(d0); \
21 } 22 }
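
A scalar sketch of what RECON_AND_STORE4X4 does for one four-pixel row may help when reading the intrinsics: load four destination bytes, widen them to 16 bits, add the idct residual, and store back with unsigned 8-bit saturation (the unpacklo/add/packus sequence). Function and parameter names below are illustrative, not part of libvpx.

#include <stdint.h>

static void recon_and_store4x4_scalar(uint8_t *dest, const int16_t *residual) {
  int i;
  for (i = 0; i < 4; ++i) {
    int v = dest[i] + residual[i];   /* widen and add, like unpacklo + add_epi16 */
    if (v < 0) v = 0;                /* packus_epi16 saturates to [0, 255] */
    if (v > 255) v = 255;
    dest[i] = (uint8_t)v;            /* 4-byte store, like cvtsi128_si32 */
  }
}
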
22 23
23 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 24 void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
24 const __m128i zero = _mm_setzero_si128(); 25 const __m128i zero = _mm_setzero_si128();
25 const __m128i eight = _mm_set1_epi16(8); 26 const __m128i eight = _mm_set1_epi16(8);
26 const __m128i cst = _mm_setr_epi16( 27 const __m128i cst = _mm_setr_epi16(
27 (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, 28 (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
28 (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, 29 (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
29 (int16_t)cospi_8_64, (int16_t)cospi_24_64); 30 (int16_t)cospi_8_64, (int16_t)cospi_24_64);
30 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 31 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
31 __m128i input0, input1, input2, input3; 32 __m128i input0, input1, input2, input3;
32 33
33 // Rows 34 // Rows
(...skipping 109 matching lines...)
143 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); 144 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
144 // store input2 145 // store input2
145 d0 = _mm_srli_si128(d0, 4); 146 d0 = _mm_srli_si128(d0, 4);
146 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); 147 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
147 // store input3 148 // store input3
148 d0 = _mm_srli_si128(d0, 4); 149 d0 = _mm_srli_si128(d0, 4);
149 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); 150 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
150 } 151 }
151 } 152 }
152 153
153 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 154 void vpx_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
154 __m128i dc_value; 155 __m128i dc_value;
155 const __m128i zero = _mm_setzero_si128(); 156 const __m128i zero = _mm_setzero_si128();
156 int a; 157 int a;
157 158
158 a = dct_const_round_shift(input[0] * cospi_16_64); 159 a = dct_const_round_shift(input[0] * cospi_16_64);
159 a = dct_const_round_shift(a * cospi_16_64); 160 a = dct_const_round_shift(a * cospi_16_64);
160 a = ROUND_POWER_OF_TWO(a, 4); 161 a = ROUND_POWER_OF_TWO(a, 4);
161 162
162 dc_value = _mm_set1_epi16(a); 163 dc_value = _mm_set1_epi16(a);
163 164
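
The *_1_add variants handle blocks with only a DC coefficient: the value computed above is the full 2-D inverse transform of input[0] collapsed to one number, which the skipped remainder of the function adds to every pixel. A scalar sketch of the arithmetic, assuming libvpx's usual cospi_16_64 = 11585 and 14-bit DCT constants (both stated here as assumptions, for illustration only):

#include <stdint.h>

/* Sketch of the DC value above: the two multiply/round steps are
 * dct_const_round_shift() applied once per 1-D pass, and the final
 * (a + 8) >> 4 is ROUND_POWER_OF_TWO(a, 4) for the 4x4 block size. */
static int16_t idct4x4_dc(int16_t in0) {
  const int cospi_16_64 = 11585;                  /* assumed: round(2^14 * cos(pi/4)) */
  int a = (in0 * cospi_16_64 + (1 << 13)) >> 14;  /* first 1-D pass */
  a = (a * cospi_16_64 + (1 << 13)) >> 14;        /* second 1-D pass */
  return (int16_t)((a + 8) >> 4);
}
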
(...skipping 277 matching lines...)
441 out0 = _mm_adds_epi16(stp1_0, stp2_7); \ 442 out0 = _mm_adds_epi16(stp1_0, stp2_7); \
442 out1 = _mm_adds_epi16(stp1_1, stp1_6); \ 443 out1 = _mm_adds_epi16(stp1_1, stp1_6); \
443 out2 = _mm_adds_epi16(stp1_2, stp1_5); \ 444 out2 = _mm_adds_epi16(stp1_2, stp1_5); \
444 out3 = _mm_adds_epi16(stp1_3, stp2_4); \ 445 out3 = _mm_adds_epi16(stp1_3, stp2_4); \
445 out4 = _mm_subs_epi16(stp1_3, stp2_4); \ 446 out4 = _mm_subs_epi16(stp1_3, stp2_4); \
446 out5 = _mm_subs_epi16(stp1_2, stp1_5); \ 447 out5 = _mm_subs_epi16(stp1_2, stp1_5); \
447 out6 = _mm_subs_epi16(stp1_1, stp1_6); \ 448 out6 = _mm_subs_epi16(stp1_1, stp1_6); \
448 out7 = _mm_subs_epi16(stp1_0, stp2_7); \ 449 out7 = _mm_subs_epi16(stp1_0, stp2_7); \
449 } 450 }
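
The adds/subs pairs above are the final stage of the 8-point inverse DCT: outputs 0..3 are sums of a stage-3 value and its mirror, and outputs 7..4 are the corresponding differences, computed with saturating 16-bit arithmetic. A scalar sketch (the stage[] ordering below maps to stp1_0..3, stp2_4, stp1_5, stp1_6, stp2_7 and is only illustrative):

#include <stdint.h>

static void idct8_final_butterfly(const int32_t stage[8], int32_t out[8]) {
  int k;
  for (k = 0; k < 4; ++k) {
    out[k]     = stage[k] + stage[7 - k];   /* out0..out3 */
    out[7 - k] = stage[k] - stage[7 - k];   /* out7..out4 */
  }
}
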
450 451
451 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 452 void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
452 const __m128i zero = _mm_setzero_si128(); 453 const __m128i zero = _mm_setzero_si128();
453 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 454 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
454 const __m128i final_rounding = _mm_set1_epi16(1 << 4); 455 const __m128i final_rounding = _mm_set1_epi16(1 << 4);
455 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 456 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
456 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 457 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
457 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 458 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
458 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 459 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
459 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 460 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
460 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 461 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
461 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 462 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
(...skipping 10 matching lines...)
472 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); 473 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
473 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); 474 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
474 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); 475 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
475 in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); 476 in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
476 in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); 477 in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
477 in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); 478 in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
478 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); 479 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
479 480
480 // 2-D 481 // 2-D
481 for (i = 0; i < 2; i++) { 482 for (i = 0; i < 2; i++) {
482 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() 483 // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
483 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, 484 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
484 in0, in1, in2, in3, in4, in5, in6, in7); 485 in0, in1, in2, in3, in4, in5, in6, in7);
485 486
486 // 4-stage 1D idct8x8 487 // 4-stage 1D idct8x8
487 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, 488 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
488 in0, in1, in2, in3, in4, in5, in6, in7); 489 in0, in1, in2, in3, in4, in5, in6, in7);
489 } 490 }
490 491
491 // Final rounding and shift 492 // Final rounding and shift
492 in0 = _mm_adds_epi16(in0, final_rounding); 493 in0 = _mm_adds_epi16(in0, final_rounding);
(...skipping 17 matching lines...)
510 RECON_AND_STORE(dest + 0 * stride, in0); 511 RECON_AND_STORE(dest + 0 * stride, in0);
511 RECON_AND_STORE(dest + 1 * stride, in1); 512 RECON_AND_STORE(dest + 1 * stride, in1);
512 RECON_AND_STORE(dest + 2 * stride, in2); 513 RECON_AND_STORE(dest + 2 * stride, in2);
513 RECON_AND_STORE(dest + 3 * stride, in3); 514 RECON_AND_STORE(dest + 3 * stride, in3);
514 RECON_AND_STORE(dest + 4 * stride, in4); 515 RECON_AND_STORE(dest + 4 * stride, in4);
515 RECON_AND_STORE(dest + 5 * stride, in5); 516 RECON_AND_STORE(dest + 5 * stride, in5);
516 RECON_AND_STORE(dest + 6 * stride, in6); 517 RECON_AND_STORE(dest + 6 * stride, in6);
517 RECON_AND_STORE(dest + 7 * stride, in7); 518 RECON_AND_STORE(dest + 7 * stride, in7);
518 } 519 }
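
Note the relationship between final_rounding and the shift across block sizes: the 4x4 path adds 8 and shifts by 4, the 8x8 path adds 1 << 4 and shifts by 5, and the 16x16/32x32 paths further down add 1 << 5 and shift by 6, so the bias is always half the divisor, i.e. round-to-nearest. A minimal scalar helper expressing that rule (name is illustrative):

#include <stdint.h>

/* ROUND_POWER_OF_TWO-style rounding applied after the 2-D inverse transform;
 * shift is 4, 5 or 6 depending on the block size. */
static int16_t idct_final_round(int32_t v, int shift) {
  return (int16_t)((v + (1 << (shift - 1))) >> shift);
}
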
519 520
520 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 521 void vpx_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
521 __m128i dc_value; 522 __m128i dc_value;
522 const __m128i zero = _mm_setzero_si128(); 523 const __m128i zero = _mm_setzero_si128();
523 int a; 524 int a;
524 525
525 a = dct_const_round_shift(input[0] * cospi_16_64); 526 a = dct_const_round_shift(input[0] * cospi_16_64);
526 a = dct_const_round_shift(a * cospi_16_64); 527 a = dct_const_round_shift(a * cospi_16_64);
527 a = ROUND_POWER_OF_TWO(a, 5); 528 a = ROUND_POWER_OF_TWO(a, 5);
528 529
529 dc_value = _mm_set1_epi16(a); 530 dc_value = _mm_set1_epi16(a);
530 531
(...skipping 16 matching lines...)
547 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 548 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
548 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 549 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
549 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 550 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
550 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 551 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
551 552
552 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 553 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
553 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 554 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
554 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 555 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
555 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 556 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
556 557
557 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() 558 // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
558 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], 559 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
559 in0, in1, in2, in3, in4, in5, in6, in7); 560 in0, in1, in2, in3, in4, in5, in6, in7);
560 561
561 // 4-stage 1D idct8x8 562 // 4-stage 1D idct8x8
562 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, 563 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
563 in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); 564 in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
564 } 565 }
565 566
566 void iadst8_sse2(__m128i *in) { 567 void iadst8_sse2(__m128i *in) {
567 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 568 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
(...skipping 216 matching lines...)
784 in[0] = s0; 785 in[0] = s0;
785 in[1] = _mm_sub_epi16(k__const_0, s4); 786 in[1] = _mm_sub_epi16(k__const_0, s4);
786 in[2] = s6; 787 in[2] = s6;
787 in[3] = _mm_sub_epi16(k__const_0, s2); 788 in[3] = _mm_sub_epi16(k__const_0, s2);
788 in[4] = s3; 789 in[4] = s3;
789 in[5] = _mm_sub_epi16(k__const_0, s7); 790 in[5] = _mm_sub_epi16(k__const_0, s7);
790 in[6] = s5; 791 in[6] = s5;
791 in[7] = _mm_sub_epi16(k__const_0, s1); 792 in[7] = _mm_sub_epi16(k__const_0, s1);
792 } 793 }
793 794
794 void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 795 void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
795 const __m128i zero = _mm_setzero_si128(); 796 const __m128i zero = _mm_setzero_si128();
796 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 797 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
797 const __m128i final_rounding = _mm_set1_epi16(1 << 4); 798 const __m128i final_rounding = _mm_set1_epi16(1 << 4);
798 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 799 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
799 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 800 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
800 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 801 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
801 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 802 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
802 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 803 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
803 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 804 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
804 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 805 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
(...skipping 356 matching lines...)
1161 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ 1162 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1162 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 1163 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1163 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 1164 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1164 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ 1165 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1165 \ 1166 \
1166 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 1167 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1167 stg6_0, stg4_0, stg6_0, stg4_0, \ 1168 stg6_0, stg4_0, stg6_0, stg4_0, \
1168 stp2_10, stp2_13, stp2_11, stp2_12) \ 1169 stp2_10, stp2_13, stp2_11, stp2_12) \
1169 } 1170 }
1170 1171
1171 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, 1172 void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
1172 int stride) { 1173 int stride) {
1173 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 1174 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1174 const __m128i final_rounding = _mm_set1_epi16(1 << 5); 1175 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
1175 const __m128i zero = _mm_setzero_si128(); 1176 const __m128i zero = _mm_setzero_si128();
1176 1177
1177 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 1178 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1178 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 1179 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1179 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 1180 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1180 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 1181 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1181 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 1182 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
(...skipping 104 matching lines...)
1286 // Final rounding and shift 1287 // Final rounding and shift
1287 in[j] = _mm_adds_epi16(in[j], final_rounding); 1288 in[j] = _mm_adds_epi16(in[j], final_rounding);
1288 in[j] = _mm_srai_epi16(in[j], 6); 1289 in[j] = _mm_srai_epi16(in[j], 6);
1289 RECON_AND_STORE(dest + j * stride, in[j]); 1290 RECON_AND_STORE(dest + j * stride, in[j]);
1290 } 1291 }
1291 1292
1292 dest += 8; 1293 dest += 8;
1293 } 1294 }
1294 } 1295 }
1295 1296
1296 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 1297 void vpx_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
1297 __m128i dc_value; 1298 __m128i dc_value;
1298 const __m128i zero = _mm_setzero_si128(); 1299 const __m128i zero = _mm_setzero_si128();
1299 int a, i; 1300 int a, i;
1300 1301
1301 a = dct_const_round_shift(input[0] * cospi_16_64); 1302 a = dct_const_round_shift(input[0] * cospi_16_64);
1302 a = dct_const_round_shift(a * cospi_16_64); 1303 a = dct_const_round_shift(a * cospi_16_64);
1303 a = ROUND_POWER_OF_TWO(a, 6); 1304 a = ROUND_POWER_OF_TWO(a, 6);
1304 1305
1305 dc_value = _mm_set1_epi16(a); 1306 dc_value = _mm_set1_epi16(a);
1306 1307
(...skipping 837 matching lines...)
2144 idct16_8col(in0); 2145 idct16_8col(in0);
2145 idct16_8col(in1); 2146 idct16_8col(in1);
2146 } 2147 }
2147 2148
2148 void iadst16_sse2(__m128i *in0, __m128i *in1) { 2149 void iadst16_sse2(__m128i *in0, __m128i *in1) {
2149 array_transpose_16x16(in0, in1); 2150 array_transpose_16x16(in0, in1);
2150 iadst16_8col(in0); 2151 iadst16_8col(in0);
2151 iadst16_8col(in1); 2152 iadst16_8col(in1);
2152 } 2153 }
2153 2154
2154 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, 2155 void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
2155 int stride) { 2156 int stride) {
2156 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 2157 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2157 const __m128i final_rounding = _mm_set1_epi16(1 << 5); 2158 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
2158 const __m128i zero = _mm_setzero_si128(); 2159 const __m128i zero = _mm_setzero_si128();
2159 2160
2160 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 2161 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2161 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 2162 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2162 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 2163 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2163 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 2164 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2164 2165
(...skipping 856 matching lines...)
3021 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ 3022 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3022 stp1_23, stp1_24) \ 3023 stp1_23, stp1_24) \
3023 \ 3024 \
3024 stp1_28 = stp2_28; \ 3025 stp1_28 = stp2_28; \
3025 stp1_29 = stp2_29; \ 3026 stp1_29 = stp2_29; \
3026 stp1_30 = stp2_30; \ 3027 stp1_30 = stp2_30; \
3027 stp1_31 = stp2_31; \ 3028 stp1_31 = stp2_31; \
3028 } 3029 }
3029 3030
3030 // Only upper-left 8x8 has non-zero coeff 3031 // Only upper-left 8x8 has non-zero coeff
3031 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, 3032 void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
3032 int stride) { 3033 int stride) {
3033 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3034 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3034 const __m128i final_rounding = _mm_set1_epi16(1<<5); 3035 const __m128i final_rounding = _mm_set1_epi16(1<<5);
3035 3036
3036 // idct constants for each stage 3037 // idct constants for each stage
3037 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3038 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3038 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3039 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3039 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 3040 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3040 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 3041 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3041 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); 3042 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
(...skipping 138 matching lines...)
3180 // Final rounding and shift 3181 // Final rounding and shift
3181 in[j] = _mm_adds_epi16(in[j], final_rounding); 3182 in[j] = _mm_adds_epi16(in[j], final_rounding);
3182 in[j] = _mm_srai_epi16(in[j], 6); 3183 in[j] = _mm_srai_epi16(in[j], 6);
3183 RECON_AND_STORE(dest + j * stride, in[j]); 3184 RECON_AND_STORE(dest + j * stride, in[j]);
3184 } 3185 }
3185 3186
3186 dest += 8; 3187 dest += 8;
3187 } 3188 }
3188 } 3189 }
3189 3190
3190 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, 3191 void vpx_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
3191 int stride) { 3192 int stride) {
3192 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3193 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3193 const __m128i final_rounding = _mm_set1_epi16(1 << 5); 3194 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3194 const __m128i zero = _mm_setzero_si128(); 3195 const __m128i zero = _mm_setzero_si128();
3195 3196
3196 // idct constants for each stage 3197 // idct constants for each stage
3197 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3198 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3198 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3199 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3199 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 3200 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3200 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 3201 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
(...skipping 255 matching lines...)
3456 // Final rounding and shift 3457 // Final rounding and shift
3457 in[j] = _mm_adds_epi16(in[j], final_rounding); 3458 in[j] = _mm_adds_epi16(in[j], final_rounding);
3458 in[j] = _mm_srai_epi16(in[j], 6); 3459 in[j] = _mm_srai_epi16(in[j], 6);
3459 RECON_AND_STORE(dest + j * stride, in[j]); 3460 RECON_AND_STORE(dest + j * stride, in[j]);
3460 } 3461 }
3461 3462
3462 dest += 8; 3463 dest += 8;
3463 } 3464 }
3464 } 3465 }
3465 3466
3466 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 3467 void vpx_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
3467 __m128i dc_value; 3468 __m128i dc_value;
3468 const __m128i zero = _mm_setzero_si128(); 3469 const __m128i zero = _mm_setzero_si128();
3469 int a, i; 3470 int a, i;
3470 3471
3471 a = dct_const_round_shift(input[0] * cospi_16_64); 3472 a = dct_const_round_shift(input[0] * cospi_16_64);
3472 a = dct_const_round_shift(a * cospi_16_64); 3473 a = dct_const_round_shift(a * cospi_16_64);
3473 a = ROUND_POWER_OF_TWO(a, 6); 3474 a = ROUND_POWER_OF_TWO(a, 6);
3474 3475
3475 dc_value = _mm_set1_epi16(a); 3476 dc_value = _mm_set1_epi16(a);
3476 3477
(...skipping 13 matching lines...)
3490 const __m128i one = _mm_set1_epi16(1); 3491 const __m128i one = _mm_set1_epi16(1);
3491 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); 3492 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
3492 ubounded = _mm_cmpgt_epi16(value, max); 3493 ubounded = _mm_cmpgt_epi16(value, max);
3493 retval = _mm_andnot_si128(ubounded, value); 3494 retval = _mm_andnot_si128(ubounded, value);
3494 ubounded = _mm_and_si128(ubounded, max); 3495 ubounded = _mm_and_si128(ubounded, max);
3495 retval = _mm_or_si128(retval, ubounded); 3496 retval = _mm_or_si128(retval, ubounded);
3496 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); 3497 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
3497 return retval; 3498 return retval;
3498 } 3499 }
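
clamp_high_sse2() above clips eight reconstructed samples to the valid range for the given bit depth: values above (1 << bd) - 1 are replaced by that maximum and negative values by zero. A scalar equivalent for a single sample:

#include <stdint.h>

static uint16_t clamp_high_scalar(int32_t value, int bd) {
  const int32_t max = (1 << bd) - 1;   /* e.g. 1023 for bd = 10 */
  if (value < 0) return 0;
  if (value > max) return (uint16_t)max;
  return (uint16_t)value;
}
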
3499 3500
3500 void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, 3501 void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
3501 int stride, int bd) { 3502 int stride, int bd) {
3502 tran_low_t out[4 * 4]; 3503 tran_low_t out[4 * 4];
3503 tran_low_t *outptr = out; 3504 tran_low_t *outptr = out;
3504 int i, j; 3505 int i, j;
3505 __m128i inptr[4]; 3506 __m128i inptr[4];
3506 __m128i sign_bits[2]; 3507 __m128i sign_bits[2];
3507 __m128i temp_mm, min_input, max_input; 3508 __m128i temp_mm, min_input, max_input;
3508 int test; 3509 int test;
3509 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 3510 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3510 int optimised_cols = 0; 3511 int optimised_cols = 0;
(...skipping 42 matching lines...)
3553 _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); 3554 _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
3554 _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); 3555 _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
3555 _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); 3556 _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
3556 } else { 3557 } else {
3557 // Set to use the optimised transform for the column 3558 // Set to use the optimised transform for the column
3558 optimised_cols = 1; 3559 optimised_cols = 1;
3559 } 3560 }
3560 } else { 3561 } else {
3561 // Run the un-optimised row transform 3562 // Run the un-optimised row transform
3562 for (i = 0; i < 4; ++i) { 3563 for (i = 0; i < 4; ++i) {
3563 vp9_highbd_idct4_c(input, outptr, bd); 3564 vpx_highbd_idct4_c(input, outptr, bd);
3564 input += 4; 3565 input += 4;
3565 outptr += 4; 3566 outptr += 4;
3566 } 3567 }
3567 } 3568 }
3568 3569
3569 if (optimised_cols) { 3570 if (optimised_cols) {
3570 idct4_sse2(inptr); 3571 idct4_sse2(inptr);
3571 3572
3572 // Final round and shift 3573 // Final round and shift
3573 inptr[0] = _mm_add_epi16(inptr[0], eight); 3574 inptr[0] = _mm_add_epi16(inptr[0], eight);
(...skipping 23 matching lines...)
3597 d2 = _mm_srli_si128(d2, 8); 3598 d2 = _mm_srli_si128(d2, 8);
3598 _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); 3599 _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
3599 } 3600 }
3600 } else { 3601 } else {
3601 // Run the un-optimised column transform 3602 // Run the un-optimised column transform
3602 tran_low_t temp_in[4], temp_out[4]; 3603 tran_low_t temp_in[4], temp_out[4];
3603 // Columns 3604 // Columns
3604 for (i = 0; i < 4; ++i) { 3605 for (i = 0; i < 4; ++i) {
3605 for (j = 0; j < 4; ++j) 3606 for (j = 0; j < 4; ++j)
3606 temp_in[j] = out[j * 4 + i]; 3607 temp_in[j] = out[j * 4 + i];
3607 vp9_highbd_idct4_c(temp_in, temp_out, bd); 3608 vpx_highbd_idct4_c(temp_in, temp_out, bd);
3608 for (j = 0; j < 4; ++j) { 3609 for (j = 0; j < 4; ++j) {
3609 dest[j * stride + i] = highbd_clip_pixel_add( 3610 dest[j * stride + i] = highbd_clip_pixel_add(
3610 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); 3611 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
3611 } 3612 }
3612 } 3613 }
3613 } 3614 }
3614 } 3615 }
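
The highbd wrappers in this file all follow the same shape as the 4x4 function above: speculatively run the 16-bit SSE2 row transform, check (in the lines elided from this diff) whether any intermediate could have exceeded the 16-bit range, and fall back to the exact C transform for the rows and/or columns if so. A control-flow sketch with placeholder comments; the helper comments and the tran_low_t typedef below are assumptions for illustration, not libvpx declarations:

#include <stdint.h>

typedef int32_t tran_low_t;   /* assumption: the high-bit-depth coefficient type */

static void highbd_idct_add_pattern(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  int optimised_cols = 0;
  int rows_fit_in_16_bits = 1;   /* stands in for the elided overflow test */
  (void)input; (void)dest; (void)stride; (void)bd;

  if (rows_fit_in_16_bits) {
    /* SSE2 row transform; if a column pass could overflow 16 bits, transpose
       the result back out for the C column path, else set optimised_cols = 1. */
    optimised_cols = 1;
  } else {
    /* un-optimised C row transform (vpx_highbd_idct*_c) */
  }

  if (optimised_cols) {
    /* SSE2 column transform, round/shift, clamp to [0, (1 << bd) - 1], store */
  } else {
    /* C column transform per column, then highbd_clip_pixel_add() per pixel */
  }
}
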
3615 3616
3616 void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, 3617 void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
3617 int stride, int bd) { 3618 int stride, int bd) {
3618 tran_low_t out[8 * 8]; 3619 tran_low_t out[8 * 8];
3619 tran_low_t *outptr = out; 3620 tran_low_t *outptr = out;
3620 int i, j, test; 3621 int i, j, test;
3621 __m128i inptr[8]; 3622 __m128i inptr[8];
3622 __m128i min_input, max_input, temp1, temp2, sign_bits; 3623 __m128i min_input, max_input, temp1, temp2, sign_bits;
3623 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 3624 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3624 const __m128i zero = _mm_set1_epi16(0); 3625 const __m128i zero = _mm_set1_epi16(0);
3625 const __m128i sixteen = _mm_set1_epi16(16); 3626 const __m128i sixteen = _mm_set1_epi16(16);
3626 const __m128i max = _mm_set1_epi16(6201); 3627 const __m128i max = _mm_set1_epi16(6201);
(...skipping 44 matching lines...)
3671 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); 3672 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3672 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); 3673 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3673 } 3674 }
3674 } else { 3675 } else {
3675 // Set to use the optimised transform for the column 3676 // Set to use the optimised transform for the column
3676 optimised_cols = 1; 3677 optimised_cols = 1;
3677 } 3678 }
3678 } else { 3679 } else {
3679 // Run the un-optimised row transform 3680 // Run the un-optimised row transform
3680 for (i = 0; i < 8; ++i) { 3681 for (i = 0; i < 8; ++i) {
3681 vp9_highbd_idct8_c(input, outptr, bd); 3682 vpx_highbd_idct8_c(input, outptr, bd);
3682 input += 8; 3683 input += 8;
3683 outptr += 8; 3684 outptr += 8;
3684 } 3685 }
3685 } 3686 }
3686 3687
3687 if (optimised_cols) { 3688 if (optimised_cols) {
3688 idct8_sse2(inptr); 3689 idct8_sse2(inptr);
3689 3690
3690 // Final round & shift and Reconstruction and Store 3691 // Final round & shift and Reconstruction and Store
3691 { 3692 {
3692 __m128i d[8]; 3693 __m128i d[8];
3693 for (i = 0; i < 8; i++) { 3694 for (i = 0; i < 8; i++) {
3694 inptr[i] = _mm_add_epi16(inptr[i], sixteen); 3695 inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3695 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); 3696 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3696 inptr[i] = _mm_srai_epi16(inptr[i], 5); 3697 inptr[i] = _mm_srai_epi16(inptr[i], 5);
3697 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); 3698 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3698 // Store 3699 // Store
3699 _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); 3700 _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
3700 } 3701 }
3701 } 3702 }
3702 } else { 3703 } else {
3703 // Run the un-optimised column transform 3704 // Run the un-optimised column transform
3704 tran_low_t temp_in[8], temp_out[8]; 3705 tran_low_t temp_in[8], temp_out[8];
3705 for (i = 0; i < 8; ++i) { 3706 for (i = 0; i < 8; ++i) {
3706 for (j = 0; j < 8; ++j) 3707 for (j = 0; j < 8; ++j)
3707 temp_in[j] = out[j * 8 + i]; 3708 temp_in[j] = out[j * 8 + i];
3708 vp9_highbd_idct8_c(temp_in, temp_out, bd); 3709 vpx_highbd_idct8_c(temp_in, temp_out, bd);
3709 for (j = 0; j < 8; ++j) { 3710 for (j = 0; j < 8; ++j) {
3710 dest[j * stride + i] = highbd_clip_pixel_add( 3711 dest[j * stride + i] = highbd_clip_pixel_add(
3711 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 3712 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3712 } 3713 }
3713 } 3714 }
3714 } 3715 }
3715 } 3716 }
3716 3717
3717 void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, 3718 void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3718 int stride, int bd) { 3719 int stride, int bd) {
3719 tran_low_t out[8 * 8] = { 0 }; 3720 tran_low_t out[8 * 8] = { 0 };
3720 tran_low_t *outptr = out; 3721 tran_low_t *outptr = out;
3721 int i, j, test; 3722 int i, j, test;
3722 __m128i inptr[8]; 3723 __m128i inptr[8];
3723 __m128i min_input, max_input, temp1, temp2, sign_bits; 3724 __m128i min_input, max_input, temp1, temp2, sign_bits;
3724 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 3725 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3725 const __m128i zero = _mm_set1_epi16(0); 3726 const __m128i zero = _mm_set1_epi16(0);
3726 const __m128i sixteen = _mm_set1_epi16(16); 3727 const __m128i sixteen = _mm_set1_epi16(16);
3727 const __m128i max = _mm_set1_epi16(6201); 3728 const __m128i max = _mm_set1_epi16(6201);
(...skipping 47 matching lines...)
3775 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); 3776 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3776 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); 3777 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3777 } 3778 }
3778 } else { 3779 } else {
3779 // Set to use the optimised transform for the column 3780 // Set to use the optimised transform for the column
3780 optimised_cols = 1; 3781 optimised_cols = 1;
3781 } 3782 }
3782 } else { 3783 } else {
3783 // Run the un-optimised row transform 3784 // Run the un-optimised row transform
3784 for (i = 0; i < 4; ++i) { 3785 for (i = 0; i < 4; ++i) {
3785 vp9_highbd_idct8_c(input, outptr, bd); 3786 vpx_highbd_idct8_c(input, outptr, bd);
3786 input += 8; 3787 input += 8;
3787 outptr += 8; 3788 outptr += 8;
3788 } 3789 }
3789 } 3790 }
3790 3791
3791 if (optimised_cols) { 3792 if (optimised_cols) {
3792 idct8_sse2(inptr); 3793 idct8_sse2(inptr);
3793 3794
3794 // Final round & shift and Reconstruction and Store 3795 // Final round & shift and Reconstruction and Store
3795 { 3796 {
3796 __m128i d[8]; 3797 __m128i d[8];
3797 for (i = 0; i < 8; i++) { 3798 for (i = 0; i < 8; i++) {
3798 inptr[i] = _mm_add_epi16(inptr[i], sixteen); 3799 inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3799 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); 3800 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3800 inptr[i] = _mm_srai_epi16(inptr[i], 5); 3801 inptr[i] = _mm_srai_epi16(inptr[i], 5);
3801 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); 3802 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3802 // Store 3803 // Store
3803 _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); 3804 _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
3804 } 3805 }
3805 } 3806 }
3806 } else { 3807 } else {
3807 // Run the un-optimised column transform 3808 // Run the un-optimised column transform
3808 tran_low_t temp_in[8], temp_out[8]; 3809 tran_low_t temp_in[8], temp_out[8];
3809 for (i = 0; i < 8; ++i) { 3810 for (i = 0; i < 8; ++i) {
3810 for (j = 0; j < 8; ++j) 3811 for (j = 0; j < 8; ++j)
3811 temp_in[j] = out[j * 8 + i]; 3812 temp_in[j] = out[j * 8 + i];
3812 vp9_highbd_idct8_c(temp_in, temp_out, bd); 3813 vpx_highbd_idct8_c(temp_in, temp_out, bd);
3813 for (j = 0; j < 8; ++j) { 3814 for (j = 0; j < 8; ++j) {
3814 dest[j * stride + i] = highbd_clip_pixel_add( 3815 dest[j * stride + i] = highbd_clip_pixel_add(
3815 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 3816 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3816 } 3817 }
3817 } 3818 }
3818 } 3819 }
3819 } 3820 }
3820 3821
3821 void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, 3822 void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
3822 int stride, int bd) { 3823 int stride, int bd) {
3823 tran_low_t out[16 * 16]; 3824 tran_low_t out[16 * 16];
3824 tran_low_t *outptr = out; 3825 tran_low_t *outptr = out;
3825 int i, j, test; 3826 int i, j, test;
3826 __m128i inptr[32]; 3827 __m128i inptr[32];
3827 __m128i min_input, max_input, temp1, temp2, sign_bits; 3828 __m128i min_input, max_input, temp1, temp2, sign_bits;
3828 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 3829 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3829 const __m128i zero = _mm_set1_epi16(0); 3830 const __m128i zero = _mm_set1_epi16(0);
3830 const __m128i rounding = _mm_set1_epi16(32); 3831 const __m128i rounding = _mm_set1_epi16(32);
3831 const __m128i max = _mm_set1_epi16(3155); 3832 const __m128i max = _mm_set1_epi16(3155);
(...skipping 52 matching lines...)
3884 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); 3885 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
3885 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); 3886 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
3886 } 3887 }
3887 } else { 3888 } else {
3888 // Set to use the optimised transform for the column 3889 // Set to use the optimised transform for the column
3889 optimised_cols = 1; 3890 optimised_cols = 1;
3890 } 3891 }
3891 } else { 3892 } else {
3892 // Run the un-optimised row transform 3893 // Run the un-optimised row transform
3893 for (i = 0; i < 16; ++i) { 3894 for (i = 0; i < 16; ++i) {
3894 vp9_highbd_idct16_c(input, outptr, bd); 3895 vpx_highbd_idct16_c(input, outptr, bd);
3895 input += 16; 3896 input += 16;
3896 outptr += 16; 3897 outptr += 16;
3897 } 3898 }
3898 } 3899 }
3899 3900
3900 if (optimised_cols) { 3901 if (optimised_cols) {
3901 idct16_sse2(inptr, inptr + 16); 3902 idct16_sse2(inptr, inptr + 16);
3902 3903
3903 // Final round & shift and Reconstruction and Store 3904 // Final round & shift and Reconstruction and Store
3904 { 3905 {
(...skipping 11 matching lines...)
3916 _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); 3917 _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
3917 _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); 3918 _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
3918 } 3919 }
3919 } 3920 }
3920 } else { 3921 } else {
3921 // Run the un-optimised column transform 3922 // Run the un-optimised column transform
3922 tran_low_t temp_in[16], temp_out[16]; 3923 tran_low_t temp_in[16], temp_out[16];
3923 for (i = 0; i < 16; ++i) { 3924 for (i = 0; i < 16; ++i) {
3924 for (j = 0; j < 16; ++j) 3925 for (j = 0; j < 16; ++j)
3925 temp_in[j] = out[j * 16 + i]; 3926 temp_in[j] = out[j * 16 + i];
3926 vp9_highbd_idct16_c(temp_in, temp_out, bd); 3927 vpx_highbd_idct16_c(temp_in, temp_out, bd);
3927 for (j = 0; j < 16; ++j) { 3928 for (j = 0; j < 16; ++j) {
3928 dest[j * stride + i] = highbd_clip_pixel_add( 3929 dest[j * stride + i] = highbd_clip_pixel_add(
3929 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 3930 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
3930 } 3931 }
3931 } 3932 }
3932 } 3933 }
3933 } 3934 }
3934 3935
3935 void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, 3936 void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3936 int stride, int bd) { 3937 int stride, int bd) {
3937 tran_low_t out[16 * 16] = { 0 }; 3938 tran_low_t out[16 * 16] = { 0 };
3938 tran_low_t *outptr = out; 3939 tran_low_t *outptr = out;
3939 int i, j, test; 3940 int i, j, test;
3940 __m128i inptr[32]; 3941 __m128i inptr[32];
3941 __m128i min_input, max_input, temp1, temp2, sign_bits; 3942 __m128i min_input, max_input, temp1, temp2, sign_bits;
3942 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 3943 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3943 const __m128i zero = _mm_set1_epi16(0); 3944 const __m128i zero = _mm_set1_epi16(0);
3944 const __m128i rounding = _mm_set1_epi16(32); 3945 const __m128i rounding = _mm_set1_epi16(32);
3945 const __m128i max = _mm_set1_epi16(3155); 3946 const __m128i max = _mm_set1_epi16(3155);
(...skipping 57 matching lines...)
4003 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); 4004 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
4004 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); 4005 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
4005 } 4006 }
4006 } else { 4007 } else {
4007 // Set to use the optimised transform for the column 4008 // Set to use the optimised transform for the column
4008 optimised_cols = 1; 4009 optimised_cols = 1;
4009 } 4010 }
4010 } else { 4011 } else {
4011 // Run the un-optimised row transform 4012 // Run the un-optimised row transform
4012 for (i = 0; i < 4; ++i) { 4013 for (i = 0; i < 4; ++i) {
4013 vp9_highbd_idct16_c(input, outptr, bd); 4014 vpx_highbd_idct16_c(input, outptr, bd);
4014 input += 16; 4015 input += 16;
4015 outptr += 16; 4016 outptr += 16;
4016 } 4017 }
4017 } 4018 }
4018 4019
4019 if (optimised_cols) { 4020 if (optimised_cols) {
4020 idct16_sse2(inptr, inptr + 16); 4021 idct16_sse2(inptr, inptr + 16);
4021 4022
4022 // Final round & shift and Reconstruction and Store 4023 // Final round & shift and Reconstruction and Store
4023 { 4024 {
(...skipping 11 matching lines...)
4035 _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); 4036 _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
4036 _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); 4037 _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
4037 } 4038 }
4038 } 4039 }
4039 } else { 4040 } else {
4040 // Run the un-optimised column transform 4041 // Run the un-optimised column transform
4041 tran_low_t temp_in[16], temp_out[16]; 4042 tran_low_t temp_in[16], temp_out[16];
4042 for (i = 0; i < 16; ++i) { 4043 for (i = 0; i < 16; ++i) {
4043 for (j = 0; j < 16; ++j) 4044 for (j = 0; j < 16; ++j)
4044 temp_in[j] = out[j * 16 + i]; 4045 temp_in[j] = out[j * 16 + i];
4045 vp9_highbd_idct16_c(temp_in, temp_out, bd); 4046 vpx_highbd_idct16_c(temp_in, temp_out, bd);
4046 for (j = 0; j < 16; ++j) { 4047 for (j = 0; j < 16; ++j) {
4047 dest[j * stride + i] = highbd_clip_pixel_add( 4048 dest[j * stride + i] = highbd_clip_pixel_add(
4048 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 4049 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
4049 } 4050 }
4050 } 4051 }
4051 } 4052 }
4052 } 4053 }
4053 #endif // CONFIG_VP9_HIGHBITDEPTH 4054 #endif // CONFIG_VP9_HIGHBITDEPTH