OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include <assert.h> |
12 #include <stdio.h> | 12 #include <stdio.h> |
13 | 13 |
14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
15 #include "./vp9_rtcd.h" | 15 #include "./vp9_rtcd.h" |
16 #include "vp9/common/vp9_common.h" | 16 #include "vp9/common/vp9_common.h" |
17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
18 #include "vp9/common/vp9_idct.h" | 18 #include "vp9/common/vp9_idct.h" |
19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | 19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" |
20 | 20 |
21 #if HAVE_DSPR2 | 21 #if HAVE_DSPR2 |
22 static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, | 22 static void idct16_rows_dspr2(const int16_t *input, int16_t *output, |
23 uint32_t no_rows) { | 23 uint32_t no_rows) { |
24 int i; | 24 int i; |
25 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | 25 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
26 int step1_10, step1_11, step1_12, step1_13; | 26 int step1_10, step1_11, step1_12, step1_13; |
27 int step2_0, step2_1, step2_2, step2_3; | 27 int step2_0, step2_1, step2_2, step2_3; |
28 int step2_8, step2_9, step2_10, step2_11; | 28 int step2_8, step2_9, step2_10, step2_11; |
29 int step2_12, step2_13, step2_14, step2_15; | 29 int step2_12, step2_13, step2_14, step2_15; |
30 int load1, load2, load3, load4, load5, load6, load7, load8; | 30 int load1, load2, load3, load4, load5, load6, load7, load8; |
31 int result1, result2, result3, result4; | 31 int result1, result2, result3, result4; |
32 const int const_2_power_13 = 8192; | 32 const int const_2_power_13 = 8192; |
33 | 33 |
(...skipping 363 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
397 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), | 397 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), |
398 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), | 398 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), |
399 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) | 399 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) |
400 ); | 400 ); |
401 | 401 |
402 input += 16; | 402 input += 16; |
403 output += 1; | 403 output += 1; |
404 } | 404 } |
405 } | 405 } |
406 | 406 |
407 static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, | 407 static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, |
408 int dest_stride) { | 408 int dest_stride) { |
409 int i; | 409 int i; |
410 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | 410 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
411 int step1_8, step1_9, step1_10, step1_11; | 411 int step1_8, step1_9, step1_10, step1_11; |
412 int step1_12, step1_13, step1_14, step1_15; | 412 int step1_12, step1_13, step1_14, step1_15; |
413 int step2_0, step2_1, step2_2, step2_3; | 413 int step2_0, step2_1, step2_2, step2_3; |
414 int step2_8, step2_9, step2_10, step2_11; | 414 int step2_8, step2_9, step2_10, step2_11; |
415 int step2_12, step2_13, step2_14, step2_15; | 415 int step2_12, step2_13, step2_14, step2_15; |
416 int load1, load2, load3, load4, load5, load6, load7, load8; | 416 int load1, load2, load3, load4, load5, load6, load7, load8; |
417 int result1, result2, result3, result4; | 417 int result1, result2, result3, result4; |
418 const int const_2_power_13 = 8192; | 418 const int const_2_power_13 = 8192; |
(...skipping 479 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
898 uint32_t pos = 45; | 898 uint32_t pos = 45; |
899 | 899 |
900 /* bit position for extract from acc */ | 900 /* bit position for extract from acc */ |
901 __asm__ __volatile__ ( | 901 __asm__ __volatile__ ( |
902 "wrdsp %[pos], 1 \n\t" | 902 "wrdsp %[pos], 1 \n\t" |
903 : | 903 : |
904 : [pos] "r" (pos) | 904 : [pos] "r" (pos) |
905 ); | 905 ); |
906 | 906 |
907 // First transform rows | 907 // First transform rows |
908 idct16_1d_rows_dspr2(input, out, 16); | 908 idct16_rows_dspr2(input, out, 16); |
909 | 909 |
910 // Then transform columns and add to dest | 910 // Then transform columns and add to dest |
911 idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); | 911 idct16_cols_add_blk_dspr2(out, dest, dest_stride); |
912 } | 912 } |
913 | 913 |
914 static void iadst16_1d(const int16_t *input, int16_t *output) { | 914 static void iadst16(const int16_t *input, int16_t *output) { |
915 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | 915 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; |
916 | 916 |
917 int x0 = input[15]; | 917 int x0 = input[15]; |
918 int x1 = input[0]; | 918 int x1 = input[0]; |
919 int x2 = input[13]; | 919 int x2 = input[13]; |
920 int x3 = input[2]; | 920 int x3 = input[2]; |
921 int x4 = input[11]; | 921 int x4 = input[11]; |
922 int x5 = input[4]; | 922 int x5 = input[4]; |
923 int x6 = input[9]; | 923 int x6 = input[9]; |
924 int x7 = input[6]; | 924 int x7 = input[6]; |
(...skipping 167 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1092 | 1092 |
1093 /* bit position for extract from acc */ | 1093 /* bit position for extract from acc */ |
1094 __asm__ __volatile__ ( | 1094 __asm__ __volatile__ ( |
1095 "wrdsp %[pos], 1 \n\t" | 1095 "wrdsp %[pos], 1 \n\t" |
1096 : | 1096 : |
1097 : [pos] "r" (pos) | 1097 : [pos] "r" (pos) |
1098 ); | 1098 ); |
1099 | 1099 |
1100 switch (tx_type) { | 1100 switch (tx_type) { |
1101 case DCT_DCT: // DCT in both horizontal and vertical | 1101 case DCT_DCT: // DCT in both horizontal and vertical |
1102 idct16_1d_rows_dspr2(input, outptr, 16); | 1102 idct16_rows_dspr2(input, outptr, 16); |
1103 idct16_1d_cols_add_blk_dspr2(out, dest, pitch); | 1103 idct16_cols_add_blk_dspr2(out, dest, pitch); |
1104 break; | 1104 break; |
1105 case ADST_DCT: // ADST in vertical, DCT in horizontal | 1105 case ADST_DCT: // ADST in vertical, DCT in horizontal |
1106 idct16_1d_rows_dspr2(input, outptr, 16); | 1106 idct16_rows_dspr2(input, outptr, 16); |
1107 | 1107 |
1108 outptr = out; | 1108 outptr = out; |
1109 | 1109 |
1110 for (i = 0; i < 16; ++i) { | 1110 for (i = 0; i < 16; ++i) { |
1111 iadst16_1d(outptr, temp_out); | 1111 iadst16(outptr, temp_out); |
1112 | 1112 |
1113 for (j = 0; j < 16; ++j) | 1113 for (j = 0; j < 16; ++j) |
1114 dest[j * pitch + i] = | 1114 dest[j * pitch + i] = |
1115 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 1115 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
1116 + dest[j * pitch + i]); | 1116 + dest[j * pitch + i]); |
1117 outptr += 16; | 1117 outptr += 16; |
1118 } | 1118 } |
1119 break; | 1119 break; |
1120 case DCT_ADST: // DCT in vertical, ADST in horizontal | 1120 case DCT_ADST: // DCT in vertical, ADST in horizontal |
1121 { | 1121 { |
1122 int16_t temp_in[16 * 16]; | 1122 int16_t temp_in[16 * 16]; |
1123 | 1123 |
1124 for (i = 0; i < 16; ++i) { | 1124 for (i = 0; i < 16; ++i) { |
1125 /* prefetch row */ | 1125 /* prefetch row */ |
1126 vp9_prefetch_load((const uint8_t *)(input + 16)); | 1126 vp9_prefetch_load((const uint8_t *)(input + 16)); |
1127 | 1127 |
1128 iadst16_1d(input, outptr); | 1128 iadst16(input, outptr); |
1129 input += 16; | 1129 input += 16; |
1130 outptr += 16; | 1130 outptr += 16; |
1131 } | 1131 } |
1132 | 1132 |
1133 for (i = 0; i < 16; ++i) | 1133 for (i = 0; i < 16; ++i) |
1134 for (j = 0; j < 16; ++j) | 1134 for (j = 0; j < 16; ++j) |
1135 temp_in[j * 16 + i] = out[i * 16 + j]; | 1135 temp_in[j * 16 + i] = out[i * 16 + j]; |
1136 | 1136 |
1137 idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch); | 1137 idct16_cols_add_blk_dspr2(temp_in, dest, pitch); |
1138 } | 1138 } |
1139 break; | 1139 break; |
1140 case ADST_ADST: // ADST in both directions | 1140 case ADST_ADST: // ADST in both directions |
1141 { | 1141 { |
1142 int16_t temp_in[16]; | 1142 int16_t temp_in[16]; |
1143 | 1143 |
1144 for (i = 0; i < 16; ++i) { | 1144 for (i = 0; i < 16; ++i) { |
1145 /* prefetch row */ | 1145 /* prefetch row */ |
1146 vp9_prefetch_load((const uint8_t *)(input + 16)); | 1146 vp9_prefetch_load((const uint8_t *)(input + 16)); |
1147 | 1147 |
1148 iadst16_1d(input, outptr); | 1148 iadst16(input, outptr); |
1149 input += 16; | 1149 input += 16; |
1150 outptr += 16; | 1150 outptr += 16; |
1151 } | 1151 } |
1152 | 1152 |
1153 for (i = 0; i < 16; ++i) { | 1153 for (i = 0; i < 16; ++i) { |
1154 for (j = 0; j < 16; ++j) | 1154 for (j = 0; j < 16; ++j) |
1155 temp_in[j] = out[j * 16 + i]; | 1155 temp_in[j] = out[j * 16 + i]; |
1156 iadst16_1d(temp_in, temp_out); | 1156 iadst16(temp_in, temp_out); |
1157 for (j = 0; j < 16; ++j) | 1157 for (j = 0; j < 16; ++j) |
1158 dest[j * pitch + i] = | 1158 dest[j * pitch + i] = |
1159 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 1159 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
1160 + dest[j * pitch + i]); | 1160 + dest[j * pitch + i]); |
1161 } | 1161 } |
1162 } | 1162 } |
1163 break; | 1163 break; |
1164 default: | 1164 default: |
1165 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); | 1165 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); |
1166 break; | 1166 break; |
1167 } | 1167 } |
1168 } | 1168 } |
1169 | 1169 |
/*
 * vp9_idct16x16_10_add_dspr2: inverse 16x16 DCT for blocks whose non-zero
 * coefficients all lie in the upper-left 4x4 area (see the row-pass comment
 * below). The column helper, per its name, adds the result into dest.
 *
 * input       - coefficient block; only its first 4 rows of 16 entries are
 *               run through the row transform.
 * dest        - destination buffer, forwarded to the column pass.
 * dest_stride - stride of dest, forwarded to the column pass.
 *
 * out is a 16x16 int16_t scratch buffer declared with 32-byte alignment.
 */
1170 void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, | 1170 void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, |
1171 int dest_stride) { | 1171 int dest_stride) { |
1172 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | 1172 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
1173 int16_t *outptr = out; | 1173 int16_t *outptr = out; |
1174 uint32_t i; | 1174 uint32_t i; |
1175 uint32_t pos = 45; | 1175 uint32_t pos = 45; |
1176 | 1176 |
1177 /* bit position for extract from acc */ | 1177 /* bit position for extract from acc */ |
1178 __asm__ __volatile__ ( | 1178 __asm__ __volatile__ ( |
1179 "wrdsp %[pos], 1 \n\t" | 1179 "wrdsp %[pos], 1 \n\t" |
1180 : | 1180 : |
1181 : [pos] "r" (pos) | 1181 : [pos] "r" (pos) |
1182 ); | 1182 ); |
1183 | 1183 |
1184 // First transform rows. Since all non-zero dct coefficients are in | 1184 // First transform rows. Since all non-zero dct coefficients are in |
1185 // upper-left 4x4 area, we only need to calculate first 4 rows here. | 1185 // upper-left 4x4 area, we only need to calculate first 4 rows here. |
1186 idct16_1d_rows_dspr2(input, outptr, 4); | 1186 idct16_rows_dspr2(input, outptr, 4); |
1187 | 1187 |
/*
 * Zero-fill the part of out that the 4-row pass did not write. The row
 * helper advances its output pointer by one int16_t per input row, so 4 rows
 * presumably populate only entries 0..3 of each 16-entry line of out
 * (NOTE(review): confirm against the elided body of the row helper).
 * Starting at entry 4, each asm block below issues 16 word stores spaced
 * 32 bytes (one 16-entry line) apart, clearing 2 entries in every line;
 * 6 passes stepping outptr by 2 therefore clear entries 4..15.
 * NOTE(review): these stores go through an input-only operand and the asm
 * declares no memory clobber; verify that the compiler cannot reorder or
 * cache accesses to out around this statement.
 */
1188 outptr += 4; | 1188 outptr += 4; |
1189 for (i = 0; i < 6; ++i) { | 1189 for (i = 0; i < 6; ++i) { |
1190 __asm__ __volatile__ ( | 1190 __asm__ __volatile__ ( |
1191 "sw $zero, 0(%[outptr]) \n\t" | 1191 "sw $zero, 0(%[outptr]) \n\t" |
1192 "sw $zero, 32(%[outptr]) \n\t" | 1192 "sw $zero, 32(%[outptr]) \n\t" |
1193 "sw $zero, 64(%[outptr]) \n\t" | 1193 "sw $zero, 64(%[outptr]) \n\t" |
1194 "sw $zero, 96(%[outptr]) \n\t" | 1194 "sw $zero, 96(%[outptr]) \n\t" |
1195 "sw $zero, 128(%[outptr]) \n\t" | 1195 "sw $zero, 128(%[outptr]) \n\t" |
1196 "sw $zero, 160(%[outptr]) \n\t" | 1196 "sw $zero, 160(%[outptr]) \n\t" |
1197 "sw $zero, 192(%[outptr]) \n\t" | 1197 "sw $zero, 192(%[outptr]) \n\t" |
1198 "sw $zero, 224(%[outptr]) \n\t" | 1198 "sw $zero, 224(%[outptr]) \n\t" |
1199 "sw $zero, 256(%[outptr]) \n\t" | 1199 "sw $zero, 256(%[outptr]) \n\t" |
1200 "sw $zero, 288(%[outptr]) \n\t" | 1200 "sw $zero, 288(%[outptr]) \n\t" |
1201 "sw $zero, 320(%[outptr]) \n\t" | 1201 "sw $zero, 320(%[outptr]) \n\t" |
1202 "sw $zero, 352(%[outptr]) \n\t" | 1202 "sw $zero, 352(%[outptr]) \n\t" |
1203 "sw $zero, 384(%[outptr]) \n\t" | 1203 "sw $zero, 384(%[outptr]) \n\t" |
1204 "sw $zero, 416(%[outptr]) \n\t" | 1204 "sw $zero, 416(%[outptr]) \n\t" |
1205 "sw $zero, 448(%[outptr]) \n\t" | 1205 "sw $zero, 448(%[outptr]) \n\t" |
1206 "sw $zero, 480(%[outptr]) \n\t" | 1206 "sw $zero, 480(%[outptr]) \n\t" |
1207 | 1207 |
1208 : | 1208 : |
1209 : [outptr] "r" (outptr) | 1209 : [outptr] "r" (outptr) |
1210 ); | 1210 ); |
1211 | 1211 |
1212 outptr += 2; | 1212 outptr += 2; |
1213 } | 1213 } |
1214 | 1214 |
1215 // Then transform columns | 1215 // Then transform columns |
1216 idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); | 1216 idct16_cols_add_blk_dspr2(out, dest, dest_stride); |
1217 } | 1217 } |
1218 | 1218 |
1219 void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, | 1219 void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, |
1220 int dest_stride) { | 1220 int dest_stride) { |
1221 uint32_t pos = 45; | 1221 uint32_t pos = 45; |
1222 int32_t out; | 1222 int32_t out; |
1223 int32_t r; | 1223 int32_t r; |
1224 int32_t a1, absa1; | 1224 int32_t a1, absa1; |
1225 int32_t vector_a1; | 1225 int32_t vector_a1; |
1226 int32_t t1, t2, t3, t4; | 1226 int32_t t1, t2, t3, t4; |
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1306 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), | 1306 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), |
1307 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | 1307 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), |
1308 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), | 1308 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), |
1309 [dest] "+&r" (dest) | 1309 [dest] "+&r" (dest) |
1310 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | 1310 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) |
1311 ); | 1311 ); |
1312 } | 1312 } |
1313 } | 1313 } |
1314 } | 1314 } |
1315 #endif // #if HAVE_DSPR2 | 1315 #endif // #if HAVE_DSPR2 |
OLD | NEW |