| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <assert.h> | |
| 12 #include <stdio.h> | |
| 13 | |
| 14 #include "./vpx_config.h" | 11 #include "./vpx_config.h" |
| 15 #include "./vp9_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
| 16 #include "vp9/common/vp9_common.h" | 13 #include "vpx_dsp/mips/inv_txfm_dspr2.h" |
| 17 #include "vp9/common/vp9_blockd.h" | |
| 18 #include "vp9/common/vp9_idct.h" | |
| 19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | |
| 20 #include "vpx_dsp/txfm_common.h" | 14 #include "vpx_dsp/txfm_common.h" |
| 21 #include "vpx_ports/mem.h" | |
| 22 | 15 |
| 23 #if HAVE_DSPR2 | 16 #if HAVE_DSPR2 |
| 24 static void idct16_rows_dspr2(const int16_t *input, int16_t *output, | 17 void idct16_rows_dspr2(const int16_t *input, int16_t *output, |
| 25 uint32_t no_rows) { | 18 uint32_t no_rows) { |
| 26 int i; | 19 int i; |
| 27 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | 20 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
| 28 int step1_10, step1_11, step1_12, step1_13; | 21 int step1_10, step1_11, step1_12, step1_13; |
| 29 int step2_0, step2_1, step2_2, step2_3; | 22 int step2_0, step2_1, step2_2, step2_3; |
| 30 int step2_8, step2_9, step2_10, step2_11; | 23 int step2_8, step2_9, step2_10, step2_11; |
| 31 int step2_12, step2_13, step2_14, step2_15; | 24 int step2_12, step2_13, step2_14, step2_15; |
| 32 int load1, load2, load3, load4, load5, load6, load7, load8; | 25 int load1, load2, load3, load4, load5, load6, load7, load8; |
| 33 int result1, result2, result3, result4; | 26 int result1, result2, result3, result4; |
| 34 const int const_2_power_13 = 8192; | 27 const int const_2_power_13 = 8192; |
| 35 | 28 |
| (...skipping 363 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 399 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), | 392 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), |
| 400 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), | 393 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), |
| 401 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) | 394 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) |
| 402 ); | 395 ); |
| 403 | 396 |
| 404 input += 16; | 397 input += 16; |
| 405 output += 1; | 398 output += 1; |
| 406 } | 399 } |
| 407 } | 400 } |
| 408 | 401 |
| 409 static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, | 402 void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, |
| 410 int dest_stride) { | 403 int dest_stride) { |
| 411 int i; | 404 int i; |
| 412 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | 405 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
| 413 int step1_8, step1_9, step1_10, step1_11; | 406 int step1_8, step1_9, step1_10, step1_11; |
| 414 int step1_12, step1_13, step1_14, step1_15; | 407 int step1_12, step1_13, step1_14, step1_15; |
| 415 int step2_0, step2_1, step2_2, step2_3; | 408 int step2_0, step2_1, step2_2, step2_3; |
| 416 int step2_8, step2_9, step2_10, step2_11; | 409 int step2_8, step2_9, step2_10, step2_11; |
| 417 int step2_12, step2_13, step2_14, step2_15; | 410 int step2_12, step2_13, step2_14, step2_15; |
| 418 int load1, load2, load3, load4, load5, load6, load7, load8; | 411 int load1, load2, load3, load4, load5, load6, load7, load8; |
| 419 int result1, result2, result3, result4; | 412 int result1, result2, result3, result4; |
| 420 const int const_2_power_13 = 8192; | 413 const int const_2_power_13 = 8192; |
| (...skipping 466 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 887 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), | 880 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), |
| 888 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), | 881 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), |
| 889 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), | 882 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), |
| 890 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) | 883 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) |
| 891 ); | 884 ); |
| 892 | 885 |
| 893 input += 16; | 886 input += 16; |
| 894 } | 887 } |
| 895 } | 888 } |
| 896 | 889 |
| 897 void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, | 890 void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
| 898 int dest_stride) { | 891 int dest_stride) { |
| 899 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | 892 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
| 900 uint32_t pos = 45; | 893 uint32_t pos = 45; |
| 901 | 894 |
| 902 /* bit position for extract from acc */ | 895 /* bit position for extract from acc */ |
| 903 __asm__ __volatile__ ( | 896 __asm__ __volatile__ ( |
| 904 "wrdsp %[pos], 1 \n\t" | 897 "wrdsp %[pos], 1 \n\t" |
| 905 : | 898 : |
| 906 : [pos] "r" (pos) | 899 : [pos] "r" (pos) |
| 907 ); | 900 ); |
| 908 | 901 |
| 909 // First transform rows | 902 // First transform rows |
| 910 idct16_rows_dspr2(input, out, 16); | 903 idct16_rows_dspr2(input, out, 16); |
| 911 | 904 |
| 912 // Then transform columns and add to dest | 905 // Then transform columns and add to dest |
| 913 idct16_cols_add_blk_dspr2(out, dest, dest_stride); | 906 idct16_cols_add_blk_dspr2(out, dest, dest_stride); |
| 914 } | 907 } |
| 915 | 908 |
| 916 static void iadst16(const int16_t *input, int16_t *output) { | 909 void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, |
| 910 int dest_stride) { |
| 911 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
| 912 int16_t *outptr = out; |
| 913 uint32_t i; |
| 914 uint32_t pos = 45; |
| 915 |
| 916 /* bit position for extract from acc */ |
| 917 __asm__ __volatile__ ( |
| 918 "wrdsp %[pos], 1 \n\t" |
| 919 : |
| 920 : [pos] "r" (pos) |
| 921 ); |
| 922 |
| 923 // First transform rows. Since all non-zero dct coefficients are in |
| 924 // upper-left 4x4 area, we only need to calculate first 4 rows here. |
| 925 idct16_rows_dspr2(input, outptr, 4); |
| 926 |
| 927 outptr += 4; |
| 928 for (i = 0; i < 6; ++i) { |
| 929 __asm__ __volatile__ ( |
| 930 "sw $zero, 0(%[outptr]) \n\t" |
| 931 "sw $zero, 32(%[outptr]) \n\t" |
| 932 "sw $zero, 64(%[outptr]) \n\t" |
| 933 "sw $zero, 96(%[outptr]) \n\t" |
| 934 "sw $zero, 128(%[outptr]) \n\t" |
| 935 "sw $zero, 160(%[outptr]) \n\t" |
| 936 "sw $zero, 192(%[outptr]) \n\t" |
| 937 "sw $zero, 224(%[outptr]) \n\t" |
| 938 "sw $zero, 256(%[outptr]) \n\t" |
| 939 "sw $zero, 288(%[outptr]) \n\t" |
| 940 "sw $zero, 320(%[outptr]) \n\t" |
| 941 "sw $zero, 352(%[outptr]) \n\t" |
| 942 "sw $zero, 384(%[outptr]) \n\t" |
| 943 "sw $zero, 416(%[outptr]) \n\t" |
| 944 "sw $zero, 448(%[outptr]) \n\t" |
| 945 "sw $zero, 480(%[outptr]) \n\t" |
| 946 |
| 947 : |
| 948 : [outptr] "r" (outptr) |
| 949 ); |
| 950 |
| 951 outptr += 2; |
| 952 } |
| 953 |
| 954 // Then transform columns |
| 955 idct16_cols_add_blk_dspr2(out, dest, dest_stride); |
| 956 } |
| 957 |
| 958 void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, |
| 959 int dest_stride) { |
| 960 uint32_t pos = 45; |
| 961 int32_t out; |
| 962 int32_t r; |
| 963 int32_t a1, absa1; |
| 964 int32_t vector_a1; |
| 965 int32_t t1, t2, t3, t4; |
| 966 int32_t vector_1, vector_2, vector_3, vector_4; |
| 967 |
| 968 /* bit position for extract from acc */ |
| 969 __asm__ __volatile__ ( |
| 970 "wrdsp %[pos], 1 \n\t" |
| 971 |
| 972 : |
| 973 : [pos] "r" (pos) |
| 974 ); |
| 975 |
| 976 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); |
| 977 __asm__ __volatile__ ( |
| 978 "addi %[out], %[out], 32 \n\t" |
| 979 "sra %[a1], %[out], 6 \n\t" |
| 980 |
| 981 : [out] "+r" (out), [a1] "=r" (a1) |
| 982 : |
| 983 ); |
| 984 |
| 985 if (a1 < 0) { |
| 986 /* use quad-byte |
| 987 * input and output memory are four byte aligned */ |
| 988 __asm__ __volatile__ ( |
| 989 "abs %[absa1], %[a1] \n\t" |
| 990 "replv.qb %[vector_a1], %[absa1] \n\t" |
| 991 |
| 992 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) |
| 993 : [a1] "r" (a1) |
| 994 ); |
| 995 |
| 996 for (r = 16; r--;) { |
| 997 __asm__ __volatile__ ( |
| 998 "lw %[t1], 0(%[dest]) \n\t" |
| 999 "lw %[t2], 4(%[dest]) \n\t" |
| 1000 "lw %[t3], 8(%[dest]) \n\t" |
| 1001 "lw %[t4], 12(%[dest]) \n\t" |
| 1002 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" |
| 1003 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" |
| 1004 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" |
| 1005 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" |
| 1006 "sw %[vector_1], 0(%[dest]) \n\t" |
| 1007 "sw %[vector_2], 4(%[dest]) \n\t" |
| 1008 "sw %[vector_3], 8(%[dest]) \n\t" |
| 1009 "sw %[vector_4], 12(%[dest]) \n\t" |
| 1010 "add %[dest], %[dest], %[dest_stride] \n\t" |
| 1011 |
| 1012 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), |
| 1013 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), |
| 1014 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), |
| 1015 [dest] "+&r" (dest) |
| 1016 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
| 1017 ); |
| 1018 } |
| 1019 } else { |
| 1020 /* use quad-byte |
| 1021 * input and output memory are four byte aligned */ |
| 1022 __asm__ __volatile__ ( |
| 1023 "replv.qb %[vector_a1], %[a1] \n\t" |
| 1024 |
| 1025 : [vector_a1] "=r" (vector_a1) |
| 1026 : [a1] "r" (a1) |
| 1027 ); |
| 1028 |
| 1029 for (r = 16; r--;) { |
| 1030 __asm__ __volatile__ ( |
| 1031 "lw %[t1], 0(%[dest]) \n\t" |
| 1032 "lw %[t2], 4(%[dest]) \n\t" |
| 1033 "lw %[t3], 8(%[dest]) \n\t" |
| 1034 "lw %[t4], 12(%[dest]) \n\t" |
| 1035 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" |
| 1036 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" |
| 1037 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" |
| 1038 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" |
| 1039 "sw %[vector_1], 0(%[dest]) \n\t" |
| 1040 "sw %[vector_2], 4(%[dest]) \n\t" |
| 1041 "sw %[vector_3], 8(%[dest]) \n\t" |
| 1042 "sw %[vector_4], 12(%[dest]) \n\t" |
| 1043 "add %[dest], %[dest], %[dest_stride] \n\t" |
| 1044 |
| 1045 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), |
| 1046 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), |
| 1047 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), |
| 1048 [dest] "+&r" (dest) |
| 1049 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
| 1050 ); |
| 1051 } |
| 1052 } |
| 1053 } |
| 1054 |
| 1055 void iadst16_dspr2(const int16_t *input, int16_t *output) { |
| 917 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | 1056 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; |
| 918 | 1057 |
| 919 int x0 = input[15]; | 1058 int x0 = input[15]; |
| 920 int x1 = input[0]; | 1059 int x1 = input[0]; |
| 921 int x2 = input[13]; | 1060 int x2 = input[13]; |
| 922 int x3 = input[2]; | 1061 int x3 = input[2]; |
| 923 int x4 = input[11]; | 1062 int x4 = input[11]; |
| 924 int x5 = input[4]; | 1063 int x5 = input[4]; |
| 925 int x6 = input[9]; | 1064 int x6 = input[9]; |
| 926 int x7 = input[6]; | 1065 int x7 = input[6]; |
| (...skipping 150 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1077 output[8] = x3; | 1216 output[8] = x3; |
| 1078 output[9] = x11; | 1217 output[9] = x11; |
| 1079 output[10] = x15; | 1218 output[10] = x15; |
| 1080 output[11] = x7; | 1219 output[11] = x7; |
| 1081 output[12] = x5; | 1220 output[12] = x5; |
| 1082 output[13] = -x13; | 1221 output[13] = -x13; |
| 1083 output[14] = x9; | 1222 output[14] = x9; |
| 1084 output[15] = -x1; | 1223 output[15] = -x1; |
| 1085 } | 1224 } |
| 1086 | 1225 |
| 1087 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 1088 int pitch, int tx_type) { | |
| 1089 int i, j; | |
| 1090 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | |
| 1091 int16_t *outptr = out; | |
| 1092 int16_t temp_out[16]; | |
| 1093 uint32_t pos = 45; | |
| 1094 | 1226 |
| 1095 /* bit positon for extract from acc */ | 1227 #endif // HAVE_DSPR2 |
| 1096 __asm__ __volatile__ ( | |
| 1097 "wrdsp %[pos], 1 \n\t" | |
| 1098 : | |
| 1099 : [pos] "r" (pos) | |
| 1100 ); | |
| 1101 | |
| 1102 switch (tx_type) { | |
| 1103 case DCT_DCT: // DCT in both horizontal and vertical | |
| 1104 idct16_rows_dspr2(input, outptr, 16); | |
| 1105 idct16_cols_add_blk_dspr2(out, dest, pitch); | |
| 1106 break; | |
| 1107 case ADST_DCT: // ADST in vertical, DCT in horizontal | |
| 1108 idct16_rows_dspr2(input, outptr, 16); | |
| 1109 | |
| 1110 outptr = out; | |
| 1111 | |
| 1112 for (i = 0; i < 16; ++i) { | |
| 1113 iadst16(outptr, temp_out); | |
| 1114 | |
| 1115 for (j = 0; j < 16; ++j) | |
| 1116 dest[j * pitch + i] = | |
| 1117 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | |
| 1118 + dest[j * pitch + i]); | |
| 1119 outptr += 16; | |
| 1120 } | |
| 1121 break; | |
| 1122 case DCT_ADST: // DCT in vertical, ADST in horizontal | |
| 1123 { | |
| 1124 int16_t temp_in[16 * 16]; | |
| 1125 | |
| 1126 for (i = 0; i < 16; ++i) { | |
| 1127 /* prefetch row */ | |
| 1128 prefetch_load((const uint8_t *)(input + 16)); | |
| 1129 | |
| 1130 iadst16(input, outptr); | |
| 1131 input += 16; | |
| 1132 outptr += 16; | |
| 1133 } | |
| 1134 | |
| 1135 for (i = 0; i < 16; ++i) | |
| 1136 for (j = 0; j < 16; ++j) | |
| 1137 temp_in[j * 16 + i] = out[i * 16 + j]; | |
| 1138 | |
| 1139 idct16_cols_add_blk_dspr2(temp_in, dest, pitch); | |
| 1140 } | |
| 1141 break; | |
| 1142 case ADST_ADST: // ADST in both directions | |
| 1143 { | |
| 1144 int16_t temp_in[16]; | |
| 1145 | |
| 1146 for (i = 0; i < 16; ++i) { | |
| 1147 /* prefetch row */ | |
| 1148 prefetch_load((const uint8_t *)(input + 16)); | |
| 1149 | |
| 1150 iadst16(input, outptr); | |
| 1151 input += 16; | |
| 1152 outptr += 16; | |
| 1153 } | |
| 1154 | |
| 1155 for (i = 0; i < 16; ++i) { | |
| 1156 for (j = 0; j < 16; ++j) | |
| 1157 temp_in[j] = out[j * 16 + i]; | |
| 1158 iadst16(temp_in, temp_out); | |
| 1159 for (j = 0; j < 16; ++j) | |
| 1160 dest[j * pitch + i] = | |
| 1161 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | |
| 1162 + dest[j * pitch + i]); | |
| 1163 } | |
| 1164 } | |
| 1165 break; | |
| 1166 default: | |
| 1167 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); | |
| 1168 break; | |
| 1169 } | |
| 1170 } | |
| 1171 | |
| 1172 void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 1173 int dest_stride) { | |
| 1174 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | |
| 1175 int16_t *outptr = out; | |
| 1176 uint32_t i; | |
| 1177 uint32_t pos = 45; | |
| 1178 | |
| 1179 /* bit position for extract from acc */ | |
| 1180 __asm__ __volatile__ ( | |
| 1181 "wrdsp %[pos], 1 \n\t" | |
| 1182 : | |
| 1183 : [pos] "r" (pos) | |
| 1184 ); | |
| 1185 | |
| 1186 // First transform rows. Since all non-zero dct coefficients are in | |
| 1187 // upper-left 4x4 area, we only need to calculate first 4 rows here. | |
| 1188 idct16_rows_dspr2(input, outptr, 4); | |
| 1189 | |
| 1190 outptr += 4; | |
| 1191 for (i = 0; i < 6; ++i) { | |
| 1192 __asm__ __volatile__ ( | |
| 1193 "sw $zero, 0(%[outptr]) \n\t" | |
| 1194 "sw $zero, 32(%[outptr]) \n\t" | |
| 1195 "sw $zero, 64(%[outptr]) \n\t" | |
| 1196 "sw $zero, 96(%[outptr]) \n\t" | |
| 1197 "sw $zero, 128(%[outptr]) \n\t" | |
| 1198 "sw $zero, 160(%[outptr]) \n\t" | |
| 1199 "sw $zero, 192(%[outptr]) \n\t" | |
| 1200 "sw $zero, 224(%[outptr]) \n\t" | |
| 1201 "sw $zero, 256(%[outptr]) \n\t" | |
| 1202 "sw $zero, 288(%[outptr]) \n\t" | |
| 1203 "sw $zero, 320(%[outptr]) \n\t" | |
| 1204 "sw $zero, 352(%[outptr]) \n\t" | |
| 1205 "sw $zero, 384(%[outptr]) \n\t" | |
| 1206 "sw $zero, 416(%[outptr]) \n\t" | |
| 1207 "sw $zero, 448(%[outptr]) \n\t" | |
| 1208 "sw $zero, 480(%[outptr]) \n\t" | |
| 1209 | |
| 1210 : | |
| 1211 : [outptr] "r" (outptr) | |
| 1212 ); | |
| 1213 | |
| 1214 outptr += 2; | |
| 1215 } | |
| 1216 | |
| 1217 // Then transform columns | |
| 1218 idct16_cols_add_blk_dspr2(out, dest, dest_stride); | |
| 1219 } | |
| 1220 | |
| 1221 void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 1222 int dest_stride) { | |
| 1223 uint32_t pos = 45; | |
| 1224 int32_t out; | |
| 1225 int32_t r; | |
| 1226 int32_t a1, absa1; | |
| 1227 int32_t vector_a1; | |
| 1228 int32_t t1, t2, t3, t4; | |
| 1229 int32_t vector_1, vector_2, vector_3, vector_4; | |
| 1230 | |
| 1231 /* bit position for extract from acc */ | |
| 1232 __asm__ __volatile__ ( | |
| 1233 "wrdsp %[pos], 1 \n\t" | |
| 1234 | |
| 1235 : | |
| 1236 : [pos] "r" (pos) | |
| 1237 ); | |
| 1238 | |
| 1239 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); | |
| 1240 __asm__ __volatile__ ( | |
| 1241 "addi %[out], %[out], 32 \n\t" | |
| 1242 "sra %[a1], %[out], 6 \n\t" | |
| 1243 | |
| 1244 : [out] "+r" (out), [a1] "=r" (a1) | |
| 1245 : | |
| 1246 ); | |
| 1247 | |
| 1248 if (a1 < 0) { | |
| 1249 /* use quad-byte | |
| 1250 * input and output memory are four byte aligned */ | |
| 1251 __asm__ __volatile__ ( | |
| 1252 "abs %[absa1], %[a1] \n\t" | |
| 1253 "replv.qb %[vector_a1], %[absa1] \n\t" | |
| 1254 | |
| 1255 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) | |
| 1256 : [a1] "r" (a1) | |
| 1257 ); | |
| 1258 | |
| 1259 for (r = 16; r--;) { | |
| 1260 __asm__ __volatile__ ( | |
| 1261 "lw %[t1], 0(%[dest]) \n\t" | |
| 1262 "lw %[t2], 4(%[dest]) \n\t" | |
| 1263 "lw %[t3], 8(%[dest]) \n\t" | |
| 1264 "lw %[t4], 12(%[dest]) \n\t" | |
| 1265 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
| 1266 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
| 1267 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" | |
| 1268 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" | |
| 1269 "sw %[vector_1], 0(%[dest]) \n\t" | |
| 1270 "sw %[vector_2], 4(%[dest]) \n\t" | |
| 1271 "sw %[vector_3], 8(%[dest]) \n\t" | |
| 1272 "sw %[vector_4], 12(%[dest]) \n\t" | |
| 1273 "add %[dest], %[dest], %[dest_stride] \n\t" | |
| 1274 | |
| 1275 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), | |
| 1276 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | |
| 1277 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), | |
| 1278 [dest] "+&r" (dest) | |
| 1279 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | |
| 1280 ); | |
| 1281 } | |
| 1282 } else { | |
| 1283 /* use quad-byte | |
| 1284 * input and output memory are four byte aligned */ | |
| 1285 __asm__ __volatile__ ( | |
| 1286 "replv.qb %[vector_a1], %[a1] \n\t" | |
| 1287 | |
| 1288 : [vector_a1] "=r" (vector_a1) | |
| 1289 : [a1] "r" (a1) | |
| 1290 ); | |
| 1291 | |
| 1292 for (r = 16; r--;) { | |
| 1293 __asm__ __volatile__ ( | |
| 1294 "lw %[t1], 0(%[dest]) \n\t" | |
| 1295 "lw %[t2], 4(%[dest]) \n\t" | |
| 1296 "lw %[t3], 8(%[dest]) \n\t" | |
| 1297 "lw %[t4], 12(%[dest]) \n\t" | |
| 1298 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
| 1299 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
| 1300 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" | |
| 1301 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" | |
| 1302 "sw %[vector_1], 0(%[dest]) \n\t" | |
| 1303 "sw %[vector_2], 4(%[dest]) \n\t" | |
| 1304 "sw %[vector_3], 8(%[dest]) \n\t" | |
| 1305 "sw %[vector_4], 12(%[dest]) \n\t" | |
| 1306 "add %[dest], %[dest], %[dest_stride] \n\t" | |
| 1307 | |
| 1308 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), | |
| 1309 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | |
| 1310 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), | |
| 1311 [dest] "+&r" (dest) | |
| 1312 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | |
| 1313 ); | |
| 1314 } | |
| 1315 } | |
| 1316 } | |
| 1317 #endif // #if HAVE_DSPR2 | |
| OLD | NEW |