OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | |
12 #include <stdio.h> | |
13 | |
14 #include "./vpx_config.h" | 11 #include "./vpx_config.h" |
15 #include "./vp9_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
16 #include "vp9/common/vp9_common.h" | 13 #include "vpx_dsp/mips/inv_txfm_dspr2.h" |
17 #include "vp9/common/vp9_blockd.h" | |
18 #include "vp9/common/vp9_idct.h" | |
19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | |
20 #include "vpx_dsp/txfm_common.h" | 14 #include "vpx_dsp/txfm_common.h" |
21 #include "vpx_ports/mem.h" | |
22 | 15 |
23 #if HAVE_DSPR2 | 16 #if HAVE_DSPR2 |
24 static void idct16_rows_dspr2(const int16_t *input, int16_t *output, | 17 void idct16_rows_dspr2(const int16_t *input, int16_t *output, |
25 uint32_t no_rows) { | 18 uint32_t no_rows) { |
26 int i; | 19 int i; |
27 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | 20 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
28 int step1_10, step1_11, step1_12, step1_13; | 21 int step1_10, step1_11, step1_12, step1_13; |
29 int step2_0, step2_1, step2_2, step2_3; | 22 int step2_0, step2_1, step2_2, step2_3; |
30 int step2_8, step2_9, step2_10, step2_11; | 23 int step2_8, step2_9, step2_10, step2_11; |
31 int step2_12, step2_13, step2_14, step2_15; | 24 int step2_12, step2_13, step2_14, step2_15; |
32 int load1, load2, load3, load4, load5, load6, load7, load8; | 25 int load1, load2, load3, load4, load5, load6, load7, load8; |
33 int result1, result2, result3, result4; | 26 int result1, result2, result3, result4; |
34 const int const_2_power_13 = 8192; | 27 const int const_2_power_13 = 8192; |
35 | 28 |
(...skipping 363 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
399 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), | 392 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), |
400 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), | 393 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), |
401 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) | 394 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) |
402 ); | 395 ); |
403 | 396 |
404 input += 16; | 397 input += 16; |
405 output += 1; | 398 output += 1; |
406 } | 399 } |
407 } | 400 } |
408 | 401 |
409 static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, | 402 void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, |
410 int dest_stride) { | 403 int dest_stride) { |
411 int i; | 404 int i; |
412 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | 405 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
413 int step1_8, step1_9, step1_10, step1_11; | 406 int step1_8, step1_9, step1_10, step1_11; |
414 int step1_12, step1_13, step1_14, step1_15; | 407 int step1_12, step1_13, step1_14, step1_15; |
415 int step2_0, step2_1, step2_2, step2_3; | 408 int step2_0, step2_1, step2_2, step2_3; |
416 int step2_8, step2_9, step2_10, step2_11; | 409 int step2_8, step2_9, step2_10, step2_11; |
417 int step2_12, step2_13, step2_14, step2_15; | 410 int step2_12, step2_13, step2_14, step2_15; |
418 int load1, load2, load3, load4, load5, load6, load7, load8; | 411 int load1, load2, load3, load4, load5, load6, load7, load8; |
419 int result1, result2, result3, result4; | 412 int result1, result2, result3, result4; |
420 const int const_2_power_13 = 8192; | 413 const int const_2_power_13 = 8192; |
(...skipping 466 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
887 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), | 880 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), |
888 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), | 881 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), |
889 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), | 882 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), |
890 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) | 883 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) |
891 ); | 884 ); |
892 | 885 |
893 input += 16; | 886 input += 16; |
894 } | 887 } |
895 } | 888 } |
896 | 889 |
897 void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, | 890 void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
898 int dest_stride) { | 891 int dest_stride) { |
899 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | 892 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
900 uint32_t pos = 45; | 893 uint32_t pos = 45; |
901 | 894 |
902 /* bit positon for extract from acc */ | 895 /* bit positon for extract from acc */ |
903 __asm__ __volatile__ ( | 896 __asm__ __volatile__ ( |
904 "wrdsp %[pos], 1 \n\t" | 897 "wrdsp %[pos], 1 \n\t" |
905 : | 898 : |
906 : [pos] "r" (pos) | 899 : [pos] "r" (pos) |
907 ); | 900 ); |
908 | 901 |
909 // First transform rows | 902 // First transform rows |
910 idct16_rows_dspr2(input, out, 16); | 903 idct16_rows_dspr2(input, out, 16); |
911 | 904 |
912 // Then transform columns and add to dest | 905 // Then transform columns and add to dest |
913 idct16_cols_add_blk_dspr2(out, dest, dest_stride); | 906 idct16_cols_add_blk_dspr2(out, dest, dest_stride); |
914 } | 907 } |
915 | 908 |
/* Partial 16x16 inverse DCT for sparse blocks (eob <= 10): every
 * non-zero coefficient lies in the upper-left 4x4 corner, so only the
 * first four rows need a row transform; the rest of the intermediate
 * buffer is zero-filled before the column pass. */
void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
                                int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);  /* row-pass intermediate */
  int16_t *outptr = out;
  uint32_t i;
  uint32_t pos = 45;

  /* bit position for extract from acc (DSP control register setup) */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  idct16_rows_dspr2(input, outptr, 4);

  // Zero the lanes the 4-row pass did not write.  Each "sw" of $zero
  // clears two adjacent int16 values; successive offsets step by
  // 32 bytes (16 int16, i.e. one full row of the 16x16 buffer), so one
  // asm block clears the same pair of lanes in all 16 rows.  Six
  // iterations with outptr += 2 cover the remaining 12 lanes (4..15).
  outptr += 4;
  for (i = 0; i < 6; ++i) {
    __asm__ __volatile__ (
      "sw     $zero,      0(%[outptr])     \n\t"
      "sw     $zero,     32(%[outptr])     \n\t"
      "sw     $zero,     64(%[outptr])     \n\t"
      "sw     $zero,     96(%[outptr])     \n\t"
      "sw     $zero,    128(%[outptr])     \n\t"
      "sw     $zero,    160(%[outptr])     \n\t"
      "sw     $zero,    192(%[outptr])     \n\t"
      "sw     $zero,    224(%[outptr])     \n\t"
      "sw     $zero,    256(%[outptr])     \n\t"
      "sw     $zero,    288(%[outptr])     \n\t"
      "sw     $zero,    320(%[outptr])     \n\t"
      "sw     $zero,    352(%[outptr])     \n\t"
      "sw     $zero,    384(%[outptr])     \n\t"
      "sw     $zero,    416(%[outptr])     \n\t"
      "sw     $zero,    448(%[outptr])     \n\t"
      "sw     $zero,    480(%[outptr])     \n\t"

      :
      : [outptr] "r" (outptr)
    );

    outptr += 2;
  }

  // Then transform columns
  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
}
| 957 |
/* DC-only 16x16 inverse transform: input[0] is the sole non-zero
 * coefficient.  The rounded DC value is added (with per-byte
 * saturation) to every pixel of the 16x16 destination block. */
void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t vector_a1;  /* |a1| replicated into all four byte lanes */
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit position for extract from acc (DSP control register setup) */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"

    :
    : [pos] "r" (pos)
  );

  /* Apply the row- and column-pass cospi_16_64 scaling to the DC term. */
  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  /* a1 = ROUND_POWER_OF_TWO(out, 6): add 32 then arithmetic shift by 6. */
  __asm__ __volatile__ (
    "addi     %[out],     %[out],    32      \n\t"
    "sra      %[a1],      %[out],    6       \n\t"

    : [out] "+r" (out), [a1] "=r" (a1)
    :
  );

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    /* Negative DC: replicate |a1| across byte lanes and use saturating
     * per-byte subtraction (subu_s.qb clamps each byte at 0). */
    __asm__ __volatile__ (
      "abs        %[absa1],       %[a1]       \n\t"
      "replv.qb   %[vector_a1],   %[absa1]    \n\t"

      : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    /* 16 rows of 16 pixels, processed as four 32-bit words per row. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
        "lw             %[t1],          0(%[dest])                      \n\t"
        "lw             %[t2],          4(%[dest])                      \n\t"
        "lw             %[t3],          8(%[dest])                      \n\t"
        "lw             %[t4],          12(%[dest])                     \n\t"
        "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
        "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
        "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
        "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
        "sw             %[vector_1],    0(%[dest])                      \n\t"
        "sw             %[vector_2],    4(%[dest])                      \n\t"
        "sw             %[vector_3],    8(%[dest])                      \n\t"
        "sw             %[vector_4],    12(%[dest])                     \n\t"
        "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    /* Non-negative DC: replicate a1 and use saturating per-byte
     * addition (addu_s.qb clamps each byte at 255). */
    __asm__ __volatile__ (
      "replv.qb   %[vector_a1],   %[a1]   \n\t"

      : [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    /* 16 rows of 16 pixels, processed as four 32-bit words per row. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
        "lw             %[t1],          0(%[dest])                      \n\t"
        "lw             %[t2],          4(%[dest])                      \n\t"
        "lw             %[t3],          8(%[dest])                      \n\t"
        "lw             %[t4],          12(%[dest])                     \n\t"
        "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
        "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
        "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
        "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
        "sw             %[vector_1],    0(%[dest])                      \n\t"
        "sw             %[vector_2],    4(%[dest])                      \n\t"
        "sw             %[vector_3],    8(%[dest])                      \n\t"
        "sw             %[vector_4],    12(%[dest])                     \n\t"
        "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  }
}
| 1054 |
| 1055 void iadst16_dspr2(const int16_t *input, int16_t *output) { |
917 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | 1056 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; |
918 | 1057 |
919 int x0 = input[15]; | 1058 int x0 = input[15]; |
920 int x1 = input[0]; | 1059 int x1 = input[0]; |
921 int x2 = input[13]; | 1060 int x2 = input[13]; |
922 int x3 = input[2]; | 1061 int x3 = input[2]; |
923 int x4 = input[11]; | 1062 int x4 = input[11]; |
924 int x5 = input[4]; | 1063 int x5 = input[4]; |
925 int x6 = input[9]; | 1064 int x6 = input[9]; |
926 int x7 = input[6]; | 1065 int x7 = input[6]; |
(...skipping 150 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1077 output[8] = x3; | 1216 output[8] = x3; |
1078 output[9] = x11; | 1217 output[9] = x11; |
1079 output[10] = x15; | 1218 output[10] = x15; |
1080 output[11] = x7; | 1219 output[11] = x7; |
1081 output[12] = x5; | 1220 output[12] = x5; |
1082 output[13] = -x13; | 1221 output[13] = -x13; |
1083 output[14] = x9; | 1222 output[14] = x9; |
1084 output[15] = -x1; | 1223 output[15] = -x1; |
1085 } | 1224 } |
1086 | 1225 |
1087 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, | |
1088 int pitch, int tx_type) { | |
1089 int i, j; | |
1090 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | |
1091 int16_t *outptr = out; | |
1092 int16_t temp_out[16]; | |
1093 uint32_t pos = 45; | |
1094 | 1226 |
1095 /* bit positon for extract from acc */ | 1227 #endif // HAVE_DSPR2 |
1096 __asm__ __volatile__ ( | |
1097 "wrdsp %[pos], 1 \n\t" | |
1098 : | |
1099 : [pos] "r" (pos) | |
1100 ); | |
1101 | |
1102 switch (tx_type) { | |
1103 case DCT_DCT: // DCT in both horizontal and vertical | |
1104 idct16_rows_dspr2(input, outptr, 16); | |
1105 idct16_cols_add_blk_dspr2(out, dest, pitch); | |
1106 break; | |
1107 case ADST_DCT: // ADST in vertical, DCT in horizontal | |
1108 idct16_rows_dspr2(input, outptr, 16); | |
1109 | |
1110 outptr = out; | |
1111 | |
1112 for (i = 0; i < 16; ++i) { | |
1113 iadst16(outptr, temp_out); | |
1114 | |
1115 for (j = 0; j < 16; ++j) | |
1116 dest[j * pitch + i] = | |
1117 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | |
1118 + dest[j * pitch + i]); | |
1119 outptr += 16; | |
1120 } | |
1121 break; | |
1122 case DCT_ADST: // DCT in vertical, ADST in horizontal | |
1123 { | |
1124 int16_t temp_in[16 * 16]; | |
1125 | |
1126 for (i = 0; i < 16; ++i) { | |
1127 /* prefetch row */ | |
1128 prefetch_load((const uint8_t *)(input + 16)); | |
1129 | |
1130 iadst16(input, outptr); | |
1131 input += 16; | |
1132 outptr += 16; | |
1133 } | |
1134 | |
1135 for (i = 0; i < 16; ++i) | |
1136 for (j = 0; j < 16; ++j) | |
1137 temp_in[j * 16 + i] = out[i * 16 + j]; | |
1138 | |
1139 idct16_cols_add_blk_dspr2(temp_in, dest, pitch); | |
1140 } | |
1141 break; | |
1142 case ADST_ADST: // ADST in both directions | |
1143 { | |
1144 int16_t temp_in[16]; | |
1145 | |
1146 for (i = 0; i < 16; ++i) { | |
1147 /* prefetch row */ | |
1148 prefetch_load((const uint8_t *)(input + 16)); | |
1149 | |
1150 iadst16(input, outptr); | |
1151 input += 16; | |
1152 outptr += 16; | |
1153 } | |
1154 | |
1155 for (i = 0; i < 16; ++i) { | |
1156 for (j = 0; j < 16; ++j) | |
1157 temp_in[j] = out[j * 16 + i]; | |
1158 iadst16(temp_in, temp_out); | |
1159 for (j = 0; j < 16; ++j) | |
1160 dest[j * pitch + i] = | |
1161 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | |
1162 + dest[j * pitch + i]); | |
1163 } | |
1164 } | |
1165 break; | |
1166 default: | |
1167 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); | |
1168 break; | |
1169 } | |
1170 } | |
1171 | |
1172 void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, | |
1173 int dest_stride) { | |
1174 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | |
1175 int16_t *outptr = out; | |
1176 uint32_t i; | |
1177 uint32_t pos = 45; | |
1178 | |
1179 /* bit positon for extract from acc */ | |
1180 __asm__ __volatile__ ( | |
1181 "wrdsp %[pos], 1 \n\t" | |
1182 : | |
1183 : [pos] "r" (pos) | |
1184 ); | |
1185 | |
1186 // First transform rows. Since all non-zero dct coefficients are in | |
1187 // upper-left 4x4 area, we only need to calculate first 4 rows here. | |
1188 idct16_rows_dspr2(input, outptr, 4); | |
1189 | |
1190 outptr += 4; | |
1191 for (i = 0; i < 6; ++i) { | |
1192 __asm__ __volatile__ ( | |
1193 "sw $zero, 0(%[outptr]) \n\t" | |
1194 "sw $zero, 32(%[outptr]) \n\t" | |
1195 "sw $zero, 64(%[outptr]) \n\t" | |
1196 "sw $zero, 96(%[outptr]) \n\t" | |
1197 "sw $zero, 128(%[outptr]) \n\t" | |
1198 "sw $zero, 160(%[outptr]) \n\t" | |
1199 "sw $zero, 192(%[outptr]) \n\t" | |
1200 "sw $zero, 224(%[outptr]) \n\t" | |
1201 "sw $zero, 256(%[outptr]) \n\t" | |
1202 "sw $zero, 288(%[outptr]) \n\t" | |
1203 "sw $zero, 320(%[outptr]) \n\t" | |
1204 "sw $zero, 352(%[outptr]) \n\t" | |
1205 "sw $zero, 384(%[outptr]) \n\t" | |
1206 "sw $zero, 416(%[outptr]) \n\t" | |
1207 "sw $zero, 448(%[outptr]) \n\t" | |
1208 "sw $zero, 480(%[outptr]) \n\t" | |
1209 | |
1210 : | |
1211 : [outptr] "r" (outptr) | |
1212 ); | |
1213 | |
1214 outptr += 2; | |
1215 } | |
1216 | |
1217 // Then transform columns | |
1218 idct16_cols_add_blk_dspr2(out, dest, dest_stride); | |
1219 } | |
1220 | |
1221 void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, | |
1222 int dest_stride) { | |
1223 uint32_t pos = 45; | |
1224 int32_t out; | |
1225 int32_t r; | |
1226 int32_t a1, absa1; | |
1227 int32_t vector_a1; | |
1228 int32_t t1, t2, t3, t4; | |
1229 int32_t vector_1, vector_2, vector_3, vector_4; | |
1230 | |
1231 /* bit positon for extract from acc */ | |
1232 __asm__ __volatile__ ( | |
1233 "wrdsp %[pos], 1 \n\t" | |
1234 | |
1235 : | |
1236 : [pos] "r" (pos) | |
1237 ); | |
1238 | |
1239 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); | |
1240 __asm__ __volatile__ ( | |
1241 "addi %[out], %[out], 32 \n\t" | |
1242 "sra %[a1], %[out], 6 \n\t" | |
1243 | |
1244 : [out] "+r" (out), [a1] "=r" (a1) | |
1245 : | |
1246 ); | |
1247 | |
1248 if (a1 < 0) { | |
1249 /* use quad-byte | |
1250 * input and output memory are four byte aligned */ | |
1251 __asm__ __volatile__ ( | |
1252 "abs %[absa1], %[a1] \n\t" | |
1253 "replv.qb %[vector_a1], %[absa1] \n\t" | |
1254 | |
1255 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) | |
1256 : [a1] "r" (a1) | |
1257 ); | |
1258 | |
1259 for (r = 16; r--;) { | |
1260 __asm__ __volatile__ ( | |
1261 "lw %[t1], 0(%[dest]) \n\t" | |
1262 "lw %[t2], 4(%[dest]) \n\t" | |
1263 "lw %[t3], 8(%[dest]) \n\t" | |
1264 "lw %[t4], 12(%[dest]) \n\t" | |
1265 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
1266 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
1267 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" | |
1268 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" | |
1269 "sw %[vector_1], 0(%[dest]) \n\t" | |
1270 "sw %[vector_2], 4(%[dest]) \n\t" | |
1271 "sw %[vector_3], 8(%[dest]) \n\t" | |
1272 "sw %[vector_4], 12(%[dest]) \n\t" | |
1273 "add %[dest], %[dest], %[dest_stride] \n\t" | |
1274 | |
1275 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), | |
1276 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | |
1277 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), | |
1278 [dest] "+&r" (dest) | |
1279 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | |
1280 ); | |
1281 } | |
1282 } else { | |
1283 /* use quad-byte | |
1284 * input and output memory are four byte aligned */ | |
1285 __asm__ __volatile__ ( | |
1286 "replv.qb %[vector_a1], %[a1] \n\t" | |
1287 | |
1288 : [vector_a1] "=r" (vector_a1) | |
1289 : [a1] "r" (a1) | |
1290 ); | |
1291 | |
1292 for (r = 16; r--;) { | |
1293 __asm__ __volatile__ ( | |
1294 "lw %[t1], 0(%[dest]) \n\t" | |
1295 "lw %[t2], 4(%[dest]) \n\t" | |
1296 "lw %[t3], 8(%[dest]) \n\t" | |
1297 "lw %[t4], 12(%[dest]) \n\t" | |
1298 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
1299 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
1300 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" | |
1301 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" | |
1302 "sw %[vector_1], 0(%[dest]) \n\t" | |
1303 "sw %[vector_2], 4(%[dest]) \n\t" | |
1304 "sw %[vector_3], 8(%[dest]) \n\t" | |
1305 "sw %[vector_4], 12(%[dest]) \n\t" | |
1306 "add %[dest], %[dest], %[dest_stride] \n\t" | |
1307 | |
1308 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), | |
1309 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | |
1310 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), | |
1311 [dest] "+&r" (dest) | |
1312 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | |
1313 ); | |
1314 } | |
1315 } | |
1316 } | |
1317 #endif // #if HAVE_DSPR2 | |
OLD | NEW |