| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license | |
| 5 * that can be found in the LICENSE file in the root of the source | |
| 6 * tree. An additional intellectual property rights grant can be found | |
| 7 * in the file PATENTS. All contributing project authors may | |
| 8 * be found in the AUTHORS file in the root of the source tree. | |
| 9 */ | |
| 10 | |
| 11 #include <assert.h> | |
| 12 #include <stdio.h> | |
| 13 | |
| 14 #include "./vpx_config.h" | |
| 15 #include "./vp9_rtcd.h" | |
| 16 #include "vp9/common/vp9_common.h" | |
| 17 #include "vp9/common/vp9_blockd.h" | |
| 18 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | |
| 19 #include "vpx_dsp/txfm_common.h" | |
| 20 | |
| 21 #if HAVE_DSPR2 | |
| 22 static void idct32_rows_dspr2(const int16_t *input, int16_t *output, | |
| 23 uint32_t no_rows) { | |
| 24 int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; | |
| 25 int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; | |
| 26 int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; | |
| 27 int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; | |
| 28 int16_t step1_28, step1_29, step1_30, step1_31; | |
| 29 int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; | |
| 30 int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; | |
| 31 int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; | |
| 32 int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; | |
| 33 int16_t step2_28, step2_29, step2_30, step2_31; | |
| 34 int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; | |
| 35 int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; | |
| 36 int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; | |
| 37 int16_t step3_29, step3_30, step3_31; | |
| 38 int temp0, temp1, temp2, temp3; | |
| 39 int load1, load2, load3, load4; | |
| 40 int result1, result2; | |
| 41 int temp21; | |
| 42 int i; | |
| 43 const int const_2_power_13 = 8192; | |
| 44 const int32_t *input_int; | |
| 45 | |
| 46 for (i = no_rows; i--; ) { | |
| 47 input_int = (const int32_t *)input; | |
| 48 | |
| 49 if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | | |
| 50 input_int[4] | input_int[5] | input_int[6] | input_int[7] | | |
| 51 input_int[8] | input_int[9] | input_int[10] | input_int[11] | | |
| 52 input_int[12] | input_int[13] | input_int[14] | input_int[15])) { | |
| 53 input += 32; | |
| 54 | |
| 55 __asm__ __volatile__ ( | |
| 56 "sh $zero, 0(%[output]) \n\t" | |
| 57 "sh $zero, 64(%[output]) \n\t" | |
| 58 "sh $zero, 128(%[output]) \n\t" | |
| 59 "sh $zero, 192(%[output]) \n\t" | |
| 60 "sh $zero, 256(%[output]) \n\t" | |
| 61 "sh $zero, 320(%[output]) \n\t" | |
| 62 "sh $zero, 384(%[output]) \n\t" | |
| 63 "sh $zero, 448(%[output]) \n\t" | |
| 64 "sh $zero, 512(%[output]) \n\t" | |
| 65 "sh $zero, 576(%[output]) \n\t" | |
| 66 "sh $zero, 640(%[output]) \n\t" | |
| 67 "sh $zero, 704(%[output]) \n\t" | |
| 68 "sh $zero, 768(%[output]) \n\t" | |
| 69 "sh $zero, 832(%[output]) \n\t" | |
| 70 "sh $zero, 896(%[output]) \n\t" | |
| 71 "sh $zero, 960(%[output]) \n\t" | |
| 72 "sh $zero, 1024(%[output]) \n\t" | |
| 73 "sh $zero, 1088(%[output]) \n\t" | |
| 74 "sh $zero, 1152(%[output]) \n\t" | |
| 75 "sh $zero, 1216(%[output]) \n\t" | |
| 76 "sh $zero, 1280(%[output]) \n\t" | |
| 77 "sh $zero, 1344(%[output]) \n\t" | |
| 78 "sh $zero, 1408(%[output]) \n\t" | |
| 79 "sh $zero, 1472(%[output]) \n\t" | |
| 80 "sh $zero, 1536(%[output]) \n\t" | |
| 81 "sh $zero, 1600(%[output]) \n\t" | |
| 82 "sh $zero, 1664(%[output]) \n\t" | |
| 83 "sh $zero, 1728(%[output]) \n\t" | |
| 84 "sh $zero, 1792(%[output]) \n\t" | |
| 85 "sh $zero, 1856(%[output]) \n\t" | |
| 86 "sh $zero, 1920(%[output]) \n\t" | |
| 87 "sh $zero, 1984(%[output]) \n\t" | |
| 88 | |
| 89 : | |
| 90 : [output] "r" (output) | |
| 91 ); | |
| 92 | |
| 93 output += 1; | |
| 94 | |
| 95 continue; | |
| 96 } | |
| 97 | |
| 98 /* prefetch row */ | |
| 99 prefetch_load((const uint8_t *)(input + 32)); | |
| 100 prefetch_load((const uint8_t *)(input + 48)); | |
| 101 | |
| 102 __asm__ __volatile__ ( | |
| 103 "lh %[load1], 2(%[input]) \n\t" | |
| 104 "lh %[load2], 62(%[input]) \n\t" | |
| 105 "lh %[load3], 34(%[input]) \n\t" | |
| 106 "lh %[load4], 30(%[input]) \n\t" | |
| 107 | |
| 108 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 109 "mthi $zero, $ac1 \n\t" | |
| 110 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 111 "mthi $zero, $ac3 \n\t" | |
| 112 | |
| 113 "madd $ac1, %[load1], %[cospi_31_64] \n\t" | |
| 114 "msub $ac1, %[load2], %[cospi_1_64] \n\t" | |
| 115 "extp %[temp0], $ac1, 31 \n\t" | |
| 116 | |
| 117 "madd $ac3, %[load1], %[cospi_1_64] \n\t" | |
| 118 "madd $ac3, %[load2], %[cospi_31_64] \n\t" | |
| 119 "extp %[temp3], $ac3, 31 \n\t" | |
| 120 | |
| 121 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 122 "mthi $zero, $ac1 \n\t" | |
| 123 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 124 "mthi $zero, $ac2 \n\t" | |
| 125 | |
| 126 "madd $ac2, %[load3], %[cospi_15_64] \n\t" | |
| 127 "msub $ac2, %[load4], %[cospi_17_64] \n\t" | |
| 128 "extp %[temp1], $ac2, 31 \n\t" | |
| 129 | |
| 130 "madd $ac1, %[load3], %[cospi_17_64] \n\t" | |
| 131 "madd $ac1, %[load4], %[cospi_15_64] \n\t" | |
| 132 "extp %[temp2], $ac1, 31 \n\t" | |
| 133 | |
| 134 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 135 "mthi $zero, $ac1 \n\t" | |
| 136 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 137 "mthi $zero, $ac3 \n\t" | |
| 138 | |
| 139 "sub %[load1], %[temp3], %[temp2] \n\t" | |
| 140 "sub %[load2], %[temp0], %[temp1] \n\t" | |
| 141 | |
| 142 "madd $ac1, %[load1], %[cospi_28_64] \n\t" | |
| 143 "msub $ac1, %[load2], %[cospi_4_64] \n\t" | |
| 144 "madd $ac3, %[load1], %[cospi_4_64] \n\t" | |
| 145 "madd $ac3, %[load2], %[cospi_28_64] \n\t" | |
| 146 | |
| 147 "extp %[step1_17], $ac1, 31 \n\t" | |
| 148 "extp %[step1_30], $ac3, 31 \n\t" | |
| 149 "add %[step1_16], %[temp0], %[temp1] \n\t" | |
| 150 "add %[step1_31], %[temp2], %[temp3] \n\t" | |
| 151 | |
| 152 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
| 153 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
| 154 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
| 155 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
| 156 [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), | |
| 157 [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) | |
| 158 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 159 [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), | |
| 160 [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), | |
| 161 [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) | |
| 162 ); | |
| 163 | |
| 164 __asm__ __volatile__ ( | |
| 165 "lh %[load1], 18(%[input]) \n\t" | |
| 166 "lh %[load2], 46(%[input]) \n\t" | |
| 167 "lh %[load3], 50(%[input]) \n\t" | |
| 168 "lh %[load4], 14(%[input]) \n\t" | |
| 169 | |
| 170 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 171 "mthi $zero, $ac1 \n\t" | |
| 172 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 173 "mthi $zero, $ac3 \n\t" | |
| 174 | |
| 175 "madd $ac1, %[load1], %[cospi_23_64] \n\t" | |
| 176 "msub $ac1, %[load2], %[cospi_9_64] \n\t" | |
| 177 "extp %[temp0], $ac1, 31 \n\t" | |
| 178 | |
| 179 "madd $ac3, %[load1], %[cospi_9_64] \n\t" | |
| 180 "madd $ac3, %[load2], %[cospi_23_64] \n\t" | |
| 181 "extp %[temp3], $ac3, 31 \n\t" | |
| 182 | |
| 183 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 184 "mthi $zero, $ac1 \n\t" | |
| 185 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 186 "mthi $zero, $ac2 \n\t" | |
| 187 | |
| 188 "madd $ac2, %[load3], %[cospi_7_64] \n\t" | |
| 189 "msub $ac2, %[load4], %[cospi_25_64] \n\t" | |
| 190 "extp %[temp1], $ac2, 31 \n\t" | |
| 191 | |
| 192 "madd $ac1, %[load3], %[cospi_25_64] \n\t" | |
| 193 "madd $ac1, %[load4], %[cospi_7_64] \n\t" | |
| 194 "extp %[temp2], $ac1, 31 \n\t" | |
| 195 | |
| 196 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 197 "mthi $zero, $ac1 \n\t" | |
| 198 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 199 "mthi $zero, $ac3 \n\t" | |
| 200 | |
| 201 "sub %[load1], %[temp1], %[temp0] \n\t" | |
| 202 "sub %[load2], %[temp2], %[temp3] \n\t" | |
| 203 | |
| 204 "msub $ac1, %[load1], %[cospi_28_64] \n\t" | |
| 205 "msub $ac1, %[load2], %[cospi_4_64] \n\t" | |
| 206 "msub $ac3, %[load1], %[cospi_4_64] \n\t" | |
| 207 "madd $ac3, %[load2], %[cospi_28_64] \n\t" | |
| 208 | |
| 209 "extp %[step1_18], $ac1, 31 \n\t" | |
| 210 "extp %[step1_29], $ac3, 31 \n\t" | |
| 211 "add %[step1_19], %[temp0], %[temp1] \n\t" | |
| 212 "add %[step1_28], %[temp2], %[temp3] \n\t" | |
| 213 | |
| 214 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
| 215 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
| 216 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
| 217 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
| 218 [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), | |
| 219 [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) | |
| 220 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 221 [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), | |
| 222 [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), | |
| 223 [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) | |
| 224 ); | |
| 225 | |
| 226 __asm__ __volatile__ ( | |
| 227 "lh %[load1], 10(%[input]) \n\t" | |
| 228 "lh %[load2], 54(%[input]) \n\t" | |
| 229 "lh %[load3], 42(%[input]) \n\t" | |
| 230 "lh %[load4], 22(%[input]) \n\t" | |
| 231 | |
| 232 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 233 "mthi $zero, $ac1 \n\t" | |
| 234 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 235 "mthi $zero, $ac3 \n\t" | |
| 236 | |
| 237 "madd $ac1, %[load1], %[cospi_27_64] \n\t" | |
| 238 "msub $ac1, %[load2], %[cospi_5_64] \n\t" | |
| 239 "extp %[temp0], $ac1, 31 \n\t" | |
| 240 | |
| 241 "madd $ac3, %[load1], %[cospi_5_64] \n\t" | |
| 242 "madd $ac3, %[load2], %[cospi_27_64] \n\t" | |
| 243 "extp %[temp3], $ac3, 31 \n\t" | |
| 244 | |
| 245 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 246 "mthi $zero, $ac1 \n\t" | |
| 247 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 248 "mthi $zero, $ac2 \n\t" | |
| 249 | |
| 250 "madd $ac2, %[load3], %[cospi_11_64] \n\t" | |
| 251 "msub $ac2, %[load4], %[cospi_21_64] \n\t" | |
| 252 "extp %[temp1], $ac2, 31 \n\t" | |
| 253 | |
| 254 "madd $ac1, %[load3], %[cospi_21_64] \n\t" | |
| 255 "madd $ac1, %[load4], %[cospi_11_64] \n\t" | |
| 256 "extp %[temp2], $ac1, 31 \n\t" | |
| 257 | |
| 258 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 259 "mthi $zero, $ac1 \n\t" | |
| 260 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 261 "mthi $zero, $ac3 \n\t" | |
| 262 | |
| 263 "sub %[load1], %[temp0], %[temp1] \n\t" | |
| 264 "sub %[load2], %[temp3], %[temp2] \n\t" | |
| 265 | |
| 266 "madd $ac1, %[load2], %[cospi_12_64] \n\t" | |
| 267 "msub $ac1, %[load1], %[cospi_20_64] \n\t" | |
| 268 "madd $ac3, %[load1], %[cospi_12_64] \n\t" | |
| 269 "madd $ac3, %[load2], %[cospi_20_64] \n\t" | |
| 270 | |
| 271 "extp %[step1_21], $ac1, 31 \n\t" | |
| 272 "extp %[step1_26], $ac3, 31 \n\t" | |
| 273 "add %[step1_20], %[temp0], %[temp1] \n\t" | |
| 274 "add %[step1_27], %[temp2], %[temp3] \n\t" | |
| 275 | |
| 276 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
| 277 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
| 278 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
| 279 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
| 280 [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), | |
| 281 [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) | |
| 282 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 283 [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), | |
| 284 [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), | |
| 285 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) | |
| 286 ); | |
| 287 | |
| 288 __asm__ __volatile__ ( | |
| 289 "lh %[load1], 26(%[input]) \n\t" | |
| 290 "lh %[load2], 38(%[input]) \n\t" | |
| 291 "lh %[load3], 58(%[input]) \n\t" | |
| 292 "lh %[load4], 6(%[input]) \n\t" | |
| 293 | |
| 294 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 295 "mthi $zero, $ac1 \n\t" | |
| 296 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 297 "mthi $zero, $ac3 \n\t" | |
| 298 | |
| 299 "madd $ac1, %[load1], %[cospi_19_64] \n\t" | |
| 300 "msub $ac1, %[load2], %[cospi_13_64] \n\t" | |
| 301 "extp %[temp0], $ac1, 31 \n\t" | |
| 302 | |
| 303 "madd $ac3, %[load1], %[cospi_13_64] \n\t" | |
| 304 "madd $ac3, %[load2], %[cospi_19_64] \n\t" | |
| 305 "extp %[temp3], $ac3, 31 \n\t" | |
| 306 | |
| 307 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 308 "mthi $zero, $ac1 \n\t" | |
| 309 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 310 "mthi $zero, $ac2 \n\t" | |
| 311 | |
| 312 "madd $ac2, %[load3], %[cospi_3_64] \n\t" | |
| 313 "msub $ac2, %[load4], %[cospi_29_64] \n\t" | |
| 314 "extp %[temp1], $ac2, 31 \n\t" | |
| 315 | |
| 316 "madd $ac1, %[load3], %[cospi_29_64] \n\t" | |
| 317 "madd $ac1, %[load4], %[cospi_3_64] \n\t" | |
| 318 "extp %[temp2], $ac1, 31 \n\t" | |
| 319 | |
| 320 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 321 "mthi $zero, $ac1 \n\t" | |
| 322 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 323 "mthi $zero, $ac3 \n\t" | |
| 324 | |
| 325 "sub %[load1], %[temp1], %[temp0] \n\t" | |
| 326 "sub %[load2], %[temp2], %[temp3] \n\t" | |
| 327 | |
| 328 "msub $ac1, %[load1], %[cospi_12_64] \n\t" | |
| 329 "msub $ac1, %[load2], %[cospi_20_64] \n\t" | |
| 330 "msub $ac3, %[load1], %[cospi_20_64] \n\t" | |
| 331 "madd $ac3, %[load2], %[cospi_12_64] \n\t" | |
| 332 | |
| 333 "extp %[step1_22], $ac1, 31 \n\t" | |
| 334 "extp %[step1_25], $ac3, 31 \n\t" | |
| 335 "add %[step1_23], %[temp0], %[temp1] \n\t" | |
| 336 "add %[step1_24], %[temp2], %[temp3] \n\t" | |
| 337 | |
| 338 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
| 339 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
| 340 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
| 341 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
| 342 [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), | |
| 343 [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) | |
| 344 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 345 [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), | |
| 346 [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), | |
| 347 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) | |
| 348 ); | |
| 349 | |
| 350 __asm__ __volatile__ ( | |
| 351 "lh %[load1], 4(%[input]) \n\t" | |
| 352 "lh %[load2], 60(%[input]) \n\t" | |
| 353 "lh %[load3], 36(%[input]) \n\t" | |
| 354 "lh %[load4], 28(%[input]) \n\t" | |
| 355 | |
| 356 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 357 "mthi $zero, $ac1 \n\t" | |
| 358 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 359 "mthi $zero, $ac3 \n\t" | |
| 360 | |
| 361 "madd $ac1, %[load1], %[cospi_30_64] \n\t" | |
| 362 "msub $ac1, %[load2], %[cospi_2_64] \n\t" | |
| 363 "extp %[temp0], $ac1, 31 \n\t" | |
| 364 | |
| 365 "madd $ac3, %[load1], %[cospi_2_64] \n\t" | |
| 366 "madd $ac3, %[load2], %[cospi_30_64] \n\t" | |
| 367 "extp %[temp3], $ac3, 31 \n\t" | |
| 368 | |
| 369 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 370 "mthi $zero, $ac1 \n\t" | |
| 371 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 372 "mthi $zero, $ac2 \n\t" | |
| 373 | |
| 374 "madd $ac2, %[load3], %[cospi_14_64] \n\t" | |
| 375 "msub $ac2, %[load4], %[cospi_18_64] \n\t" | |
| 376 "extp %[temp1], $ac2, 31 \n\t" | |
| 377 | |
| 378 "madd $ac1, %[load3], %[cospi_18_64] \n\t" | |
| 379 "madd $ac1, %[load4], %[cospi_14_64] \n\t" | |
| 380 "extp %[temp2], $ac1, 31 \n\t" | |
| 381 | |
| 382 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 383 "mthi $zero, $ac1 \n\t" | |
| 384 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 385 "mthi $zero, $ac3 \n\t" | |
| 386 | |
| 387 "sub %[load1], %[temp0], %[temp1] \n\t" | |
| 388 "sub %[load2], %[temp3], %[temp2] \n\t" | |
| 389 | |
| 390 "msub $ac1, %[load1], %[cospi_8_64] \n\t" | |
| 391 "madd $ac1, %[load2], %[cospi_24_64] \n\t" | |
| 392 "madd $ac3, %[load1], %[cospi_24_64] \n\t" | |
| 393 "madd $ac3, %[load2], %[cospi_8_64] \n\t" | |
| 394 | |
| 395 "extp %[step2_9], $ac1, 31 \n\t" | |
| 396 "extp %[step2_14], $ac3, 31 \n\t" | |
| 397 "add %[step2_8], %[temp0], %[temp1] \n\t" | |
| 398 "add %[step2_15], %[temp2], %[temp3] \n\t" | |
| 399 | |
| 400 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
| 401 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
| 402 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
| 403 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
| 404 [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), | |
| 405 [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) | |
| 406 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 407 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), | |
| 408 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), | |
| 409 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) | |
| 410 ); | |
| 411 | |
| 412 __asm__ __volatile__ ( | |
| 413 "lh %[load1], 20(%[input]) \n\t" | |
| 414 "lh %[load2], 44(%[input]) \n\t" | |
| 415 "lh %[load3], 52(%[input]) \n\t" | |
| 416 "lh %[load4], 12(%[input]) \n\t" | |
| 417 | |
| 418 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 419 "mthi $zero, $ac1 \n\t" | |
| 420 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 421 "mthi $zero, $ac3 \n\t" | |
| 422 | |
| 423 "madd $ac1, %[load1], %[cospi_22_64] \n\t" | |
| 424 "msub $ac1, %[load2], %[cospi_10_64] \n\t" | |
| 425 "extp %[temp0], $ac1, 31 \n\t" | |
| 426 | |
| 427 "madd $ac3, %[load1], %[cospi_10_64] \n\t" | |
| 428 "madd $ac3, %[load2], %[cospi_22_64] \n\t" | |
| 429 "extp %[temp3], $ac3, 31 \n\t" | |
| 430 | |
| 431 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 432 "mthi $zero, $ac1 \n\t" | |
| 433 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 434 "mthi $zero, $ac2 \n\t" | |
| 435 | |
| 436 "madd $ac2, %[load3], %[cospi_6_64] \n\t" | |
| 437 "msub $ac2, %[load4], %[cospi_26_64] \n\t" | |
| 438 "extp %[temp1], $ac2, 31 \n\t" | |
| 439 | |
| 440 "madd $ac1, %[load3], %[cospi_26_64] \n\t" | |
| 441 "madd $ac1, %[load4], %[cospi_6_64] \n\t" | |
| 442 "extp %[temp2], $ac1, 31 \n\t" | |
| 443 | |
| 444 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 445 "mthi $zero, $ac1 \n\t" | |
| 446 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 447 "mthi $zero, $ac3 \n\t" | |
| 448 | |
| 449 "sub %[load1], %[temp1], %[temp0] \n\t" | |
| 450 "sub %[load2], %[temp2], %[temp3] \n\t" | |
| 451 | |
| 452 "msub $ac1, %[load1], %[cospi_24_64] \n\t" | |
| 453 "msub $ac1, %[load2], %[cospi_8_64] \n\t" | |
| 454 "madd $ac3, %[load2], %[cospi_24_64] \n\t" | |
| 455 "msub $ac3, %[load1], %[cospi_8_64] \n\t" | |
| 456 | |
| 457 "extp %[step2_10], $ac1, 31 \n\t" | |
| 458 "extp %[step2_13], $ac3, 31 \n\t" | |
| 459 "add %[step2_11], %[temp0], %[temp1] \n\t" | |
| 460 "add %[step2_12], %[temp2], %[temp3] \n\t" | |
| 461 | |
| 462 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
| 463 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
| 464 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
| 465 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
| 466 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), | |
| 467 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) | |
| 468 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 469 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), | |
| 470 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), | |
| 471 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) | |
| 472 ); | |
| 473 | |
| 474 __asm__ __volatile__ ( | |
| 475 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 476 "mthi $zero, $ac0 \n\t" | |
| 477 "sub %[temp0], %[step2_14], %[step2_13] \n\t" | |
| 478 "sub %[temp0], %[temp0], %[step2_9] \n\t" | |
| 479 "add %[temp0], %[temp0], %[step2_10] \n\t" | |
| 480 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" | |
| 481 | |
| 482 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 483 "mthi $zero, $ac1 \n\t" | |
| 484 "sub %[temp1], %[step2_14], %[step2_13] \n\t" | |
| 485 "add %[temp1], %[temp1], %[step2_9] \n\t" | |
| 486 "sub %[temp1], %[temp1], %[step2_10] \n\t" | |
| 487 "madd $ac1, %[temp1], %[cospi_16_64] \n\t" | |
| 488 | |
| 489 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 490 "mthi $zero, $ac2 \n\t" | |
| 491 "sub %[temp0], %[step2_15], %[step2_12] \n\t" | |
| 492 "sub %[temp0], %[temp0], %[step2_8] \n\t" | |
| 493 "add %[temp0], %[temp0], %[step2_11] \n\t" | |
| 494 "madd $ac2, %[temp0], %[cospi_16_64] \n\t" | |
| 495 | |
| 496 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 497 "mthi $zero, $ac3 \n\t" | |
| 498 "sub %[temp1], %[step2_15], %[step2_12] \n\t" | |
| 499 "add %[temp1], %[temp1], %[step2_8] \n\t" | |
| 500 "sub %[temp1], %[temp1], %[step2_11] \n\t" | |
| 501 "madd $ac3, %[temp1], %[cospi_16_64] \n\t" | |
| 502 | |
| 503 "add %[step3_8], %[step2_8], %[step2_11] \n\t" | |
| 504 "add %[step3_9], %[step2_9], %[step2_10] \n\t" | |
| 505 "add %[step3_14], %[step2_13], %[step2_14] \n\t" | |
| 506 "add %[step3_15], %[step2_12], %[step2_15] \n\t" | |
| 507 | |
| 508 "extp %[step3_10], $ac0, 31 \n\t" | |
| 509 "extp %[step3_13], $ac1, 31 \n\t" | |
| 510 "extp %[step3_11], $ac2, 31 \n\t" | |
| 511 "extp %[step3_12], $ac3, 31 \n\t" | |
| 512 | |
| 513 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
| 514 [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), | |
| 515 [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), | |
| 516 [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), | |
| 517 [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) | |
| 518 : [const_2_power_13] "r" (const_2_power_13), | |
| 519 [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), | |
| 520 [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), | |
| 521 [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), | |
| 522 [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), | |
| 523 [cospi_16_64] "r" (cospi_16_64) | |
| 524 ); | |
| 525 | |
| 526 step2_18 = step1_17 - step1_18; | |
| 527 step2_29 = step1_30 - step1_29; | |
| 528 | |
| 529 __asm__ __volatile__ ( | |
| 530 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 531 "mthi $zero, $ac0 \n\t" | |
| 532 "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" | |
| 533 "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" | |
| 534 "extp %[step3_18], $ac0, 31 \n\t" | |
| 535 | |
| 536 : [step3_18] "=r" (step3_18) | |
| 537 : [const_2_power_13] "r" (const_2_power_13), | |
| 538 [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), | |
| 539 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
| 540 ); | |
| 541 | |
| 542 temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; | |
| 543 step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
| 544 | |
| 545 step2_19 = step1_16 - step1_19; | |
| 546 step2_28 = step1_31 - step1_28; | |
| 547 | |
| 548 __asm__ __volatile__ ( | |
| 549 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 550 "mthi $zero, $ac0 \n\t" | |
| 551 "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" | |
| 552 "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" | |
| 553 "extp %[step3_19], $ac0, 31 \n\t" | |
| 554 | |
| 555 : [step3_19] "=r" (step3_19) | |
| 556 : [const_2_power_13] "r" (const_2_power_13), | |
| 557 [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), | |
| 558 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
| 559 ); | |
| 560 | |
| 561 temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; | |
| 562 step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
| 563 | |
| 564 step3_16 = step1_16 + step1_19; | |
| 565 step3_17 = step1_17 + step1_18; | |
| 566 step3_30 = step1_29 + step1_30; | |
| 567 step3_31 = step1_28 + step1_31; | |
| 568 | |
| 569 step2_20 = step1_23 - step1_20; | |
| 570 step2_27 = step1_24 - step1_27; | |
| 571 | |
| 572 __asm__ __volatile__ ( | |
| 573 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 574 "mthi $zero, $ac0 \n\t" | |
| 575 "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" | |
| 576 "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" | |
| 577 "extp %[step3_20], $ac0, 31 \n\t" | |
| 578 | |
| 579 : [step3_20] "=r" (step3_20) | |
| 580 : [const_2_power_13] "r" (const_2_power_13), | |
| 581 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), | |
| 582 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
| 583 ); | |
| 584 | |
| 585 temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; | |
| 586 step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
| 587 | |
| 588 step2_21 = step1_22 - step1_21; | |
| 589 step2_26 = step1_25 - step1_26; | |
| 590 | |
| 591 __asm__ __volatile__ ( | |
| 592 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 593 "mthi $zero, $ac1 \n\t" | |
| 594 "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" | |
| 595 "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" | |
| 596 "extp %[step3_21], $ac1, 31 \n\t" | |
| 597 | |
| 598 : [step3_21] "=r" (step3_21) | |
| 599 : [const_2_power_13] "r" (const_2_power_13), | |
| 600 [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), | |
| 601 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
| 602 ); | |
| 603 | |
| 604 temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; | |
| 605 step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
| 606 | |
| 607 step3_22 = step1_21 + step1_22; | |
| 608 step3_23 = step1_20 + step1_23; | |
| 609 step3_24 = step1_24 + step1_27; | |
| 610 step3_25 = step1_25 + step1_26; | |
| 611 | |
| 612 step2_16 = step3_16 + step3_23; | |
| 613 step2_17 = step3_17 + step3_22; | |
| 614 step2_18 = step3_18 + step3_21; | |
| 615 step2_19 = step3_19 + step3_20; | |
| 616 step2_20 = step3_19 - step3_20; | |
| 617 step2_21 = step3_18 - step3_21; | |
| 618 step2_22 = step3_17 - step3_22; | |
| 619 step2_23 = step3_16 - step3_23; | |
| 620 | |
| 621 step2_24 = step3_31 - step3_24; | |
| 622 step2_25 = step3_30 - step3_25; | |
| 623 step2_26 = step3_29 - step3_26; | |
| 624 step2_27 = step3_28 - step3_27; | |
| 625 step2_28 = step3_28 + step3_27; | |
| 626 step2_29 = step3_29 + step3_26; | |
| 627 step2_30 = step3_30 + step3_25; | |
| 628 step2_31 = step3_31 + step3_24; | |
| 629 | |
| 630 __asm__ __volatile__ ( | |
| 631 "lh %[load1], 0(%[input]) \n\t" | |
| 632 "lh %[load2], 32(%[input]) \n\t" | |
| 633 "lh %[load3], 16(%[input]) \n\t" | |
| 634 "lh %[load4], 48(%[input]) \n\t" | |
| 635 | |
| 636 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 637 "mthi $zero, $ac1 \n\t" | |
| 638 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 639 "mthi $zero, $ac2 \n\t" | |
| 640 "add %[result1], %[load1], %[load2] \n\t" | |
| 641 "sub %[result2], %[load1], %[load2] \n\t" | |
| 642 "madd $ac1, %[result1], %[cospi_16_64] \n\t" | |
| 643 "madd $ac2, %[result2], %[cospi_16_64] \n\t" | |
| 644 "extp %[temp0], $ac1, 31 \n\t" | |
| 645 "extp %[temp1], $ac2, 31 \n\t" | |
| 646 | |
| 647 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 648 "mthi $zero, $ac3 \n\t" | |
| 649 "madd $ac3, %[load3], %[cospi_24_64] \n\t" | |
| 650 "msub $ac3, %[load4], %[cospi_8_64] \n\t" | |
| 651 "extp %[temp2], $ac3, 31 \n\t" | |
| 652 | |
| 653 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 654 "mthi $zero, $ac1 \n\t" | |
| 655 "madd $ac1, %[load3], %[cospi_8_64] \n\t" | |
| 656 "madd $ac1, %[load4], %[cospi_24_64] \n\t" | |
| 657 "extp %[temp3], $ac1, 31 \n\t" | |
| 658 | |
| 659 "add %[step1_0], %[temp0], %[temp3] \n\t" | |
| 660 "add %[step1_1], %[temp1], %[temp2] \n\t" | |
| 661 "sub %[step1_2], %[temp1], %[temp2] \n\t" | |
| 662 "sub %[step1_3], %[temp0], %[temp3] \n\t" | |
| 663 | |
| 664 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
| 665 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
| 666 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
| 667 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
| 668 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
| 669 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), | |
| 670 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) | |
| 671 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 672 [cospi_16_64] "r" (cospi_16_64), | |
| 673 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
| 674 | |
| 675 ); | |
| 676 | |
| 677 __asm__ __volatile__ ( | |
| 678 "lh %[load1], 8(%[input]) \n\t" | |
| 679 "lh %[load2], 56(%[input]) \n\t" | |
| 680 "lh %[load3], 40(%[input]) \n\t" | |
| 681 "lh %[load4], 24(%[input]) \n\t" | |
| 682 | |
| 683 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 684 "mthi $zero, $ac1 \n\t" | |
| 685 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 686 "mthi $zero, $ac3 \n\t" | |
| 687 | |
| 688 "madd $ac1, %[load1], %[cospi_28_64] \n\t" | |
| 689 "msub $ac1, %[load2], %[cospi_4_64] \n\t" | |
| 690 "extp %[temp0], $ac1, 31 \n\t" | |
| 691 | |
| 692 "madd $ac3, %[load1], %[cospi_4_64] \n\t" | |
| 693 "madd $ac3, %[load2], %[cospi_28_64] \n\t" | |
| 694 "extp %[temp3], $ac3, 31 \n\t" | |
| 695 | |
| 696 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 697 "mthi $zero, $ac1 \n\t" | |
| 698 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 699 "mthi $zero, $ac2 \n\t" | |
| 700 | |
| 701 "madd $ac2, %[load3], %[cospi_12_64] \n\t" | |
| 702 "msub $ac2, %[load4], %[cospi_20_64] \n\t" | |
| 703 "extp %[temp1], $ac2, 31 \n\t" | |
| 704 | |
| 705 "madd $ac1, %[load3], %[cospi_20_64] \n\t" | |
| 706 "madd $ac1, %[load4], %[cospi_12_64] \n\t" | |
| 707 "extp %[temp2], $ac1, 31 \n\t" | |
| 708 | |
| 709 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 710 "mthi $zero, $ac1 \n\t" | |
| 711 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 712 "mthi $zero, $ac3 \n\t" | |
| 713 | |
| 714 "sub %[load1], %[temp3], %[temp2] \n\t" | |
| 715 "sub %[load1], %[load1], %[temp0] \n\t" | |
| 716 "add %[load1], %[load1], %[temp1] \n\t" | |
| 717 | |
| 718 "sub %[load2], %[temp0], %[temp1] \n\t" | |
| 719 "sub %[load2], %[load2], %[temp2] \n\t" | |
| 720 "add %[load2], %[load2], %[temp3] \n\t" | |
| 721 | |
| 722 "madd $ac1, %[load1], %[cospi_16_64] \n\t" | |
| 723 "madd $ac3, %[load2], %[cospi_16_64] \n\t" | |
| 724 | |
| 725 "extp %[step1_5], $ac1, 31 \n\t" | |
| 726 "extp %[step1_6], $ac3, 31 \n\t" | |
| 727 "add %[step1_4], %[temp0], %[temp1] \n\t" | |
| 728 "add %[step1_7], %[temp3], %[temp2] \n\t" | |
| 729 | |
| 730 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
| 731 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
| 732 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), | |
| 733 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), | |
| 734 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), | |
| 735 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) | |
| 736 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 737 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), | |
| 738 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), | |
| 739 [cospi_16_64] "r" (cospi_16_64) | |
| 740 ); | |
| 741 | |
| 742 step2_0 = step1_0 + step1_7; | |
| 743 step2_1 = step1_1 + step1_6; | |
| 744 step2_2 = step1_2 + step1_5; | |
| 745 step2_3 = step1_3 + step1_4; | |
| 746 step2_4 = step1_3 - step1_4; | |
| 747 step2_5 = step1_2 - step1_5; | |
| 748 step2_6 = step1_1 - step1_6; | |
| 749 step2_7 = step1_0 - step1_7; | |
| 750 | |
| 751 step1_0 = step2_0 + step3_15; | |
| 752 step1_1 = step2_1 + step3_14; | |
| 753 step1_2 = step2_2 + step3_13; | |
| 754 step1_3 = step2_3 + step3_12; | |
| 755 step1_4 = step2_4 + step3_11; | |
| 756 step1_5 = step2_5 + step3_10; | |
| 757 step1_6 = step2_6 + step3_9; | |
| 758 step1_7 = step2_7 + step3_8; | |
| 759 step1_8 = step2_7 - step3_8; | |
| 760 step1_9 = step2_6 - step3_9; | |
| 761 step1_10 = step2_5 - step3_10; | |
| 762 step1_11 = step2_4 - step3_11; | |
| 763 step1_12 = step2_3 - step3_12; | |
| 764 step1_13 = step2_2 - step3_13; | |
| 765 step1_14 = step2_1 - step3_14; | |
| 766 step1_15 = step2_0 - step3_15; | |
| 767 | |
| 768 __asm__ __volatile__ ( | |
| 769 "sub %[temp0], %[step2_27], %[step2_20] \n\t" | |
| 770 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 771 "mthi $zero, $ac0 \n\t" | |
| 772 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" | |
| 773 "extp %[step1_20], $ac0, 31 \n\t" | |
| 774 | |
| 775 : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) | |
| 776 : [const_2_power_13] "r" (const_2_power_13), | |
| 777 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), | |
| 778 [cospi_16_64] "r" (cospi_16_64) | |
| 779 ); | |
| 780 | |
| 781 temp21 = (step2_20 + step2_27) * cospi_16_64; | |
| 782 step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
| 783 | |
| 784 __asm__ __volatile__ ( | |
| 785 "sub %[temp0], %[step2_26], %[step2_21] \n\t" | |
| 786 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 787 "mthi $zero, $ac0 \n\t" | |
| 788 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" | |
| 789 "extp %[step1_21], $ac0, 31 \n\t" | |
| 790 | |
| 791 : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) | |
| 792 : [const_2_power_13] "r" (const_2_power_13), | |
| 793 [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), | |
| 794 [cospi_16_64] "r" (cospi_16_64) | |
| 795 ); | |
| 796 | |
| 797 temp21 = (step2_21 + step2_26) * cospi_16_64; | |
| 798 step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
| 799 | |
| 800 __asm__ __volatile__ ( | |
| 801 "sub %[temp0], %[step2_25], %[step2_22] \n\t" | |
| 802 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 803 "mthi $zero, $ac0 \n\t" | |
| 804 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" | |
| 805 "extp %[step1_22], $ac0, 31 \n\t" | |
| 806 | |
| 807 : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) | |
| 808 : [const_2_power_13] "r" (const_2_power_13), | |
| 809 [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), | |
| 810 [cospi_16_64] "r" (cospi_16_64) | |
| 811 ); | |
| 812 | |
| 813 temp21 = (step2_22 + step2_25) * cospi_16_64; | |
| 814 step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
| 815 | |
| 816 __asm__ __volatile__ ( | |
| 817 "sub %[temp0], %[step2_24], %[step2_23] \n\t" | |
| 818 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 819 "mthi $zero, $ac0 \n\t" | |
| 820 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" | |
| 821 "extp %[step1_23], $ac0, 31 \n\t" | |
| 822 | |
| 823 : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) | |
| 824 : [const_2_power_13] "r" (const_2_power_13), | |
| 825 [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), | |
| 826 [cospi_16_64] "r" (cospi_16_64) | |
| 827 ); | |
| 828 | |
| 829 temp21 = (step2_23 + step2_24) * cospi_16_64; | |
| 830 step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; | |
| 831 | |
| 832 // final stage | |
| 833 output[0 * 32] = step1_0 + step2_31; | |
| 834 output[1 * 32] = step1_1 + step2_30; | |
| 835 output[2 * 32] = step1_2 + step2_29; | |
| 836 output[3 * 32] = step1_3 + step2_28; | |
| 837 output[4 * 32] = step1_4 + step1_27; | |
| 838 output[5 * 32] = step1_5 + step1_26; | |
| 839 output[6 * 32] = step1_6 + step1_25; | |
| 840 output[7 * 32] = step1_7 + step1_24; | |
| 841 output[8 * 32] = step1_8 + step1_23; | |
| 842 output[9 * 32] = step1_9 + step1_22; | |
| 843 output[10 * 32] = step1_10 + step1_21; | |
| 844 output[11 * 32] = step1_11 + step1_20; | |
| 845 output[12 * 32] = step1_12 + step2_19; | |
| 846 output[13 * 32] = step1_13 + step2_18; | |
| 847 output[14 * 32] = step1_14 + step2_17; | |
| 848 output[15 * 32] = step1_15 + step2_16; | |
| 849 output[16 * 32] = step1_15 - step2_16; | |
| 850 output[17 * 32] = step1_14 - step2_17; | |
| 851 output[18 * 32] = step1_13 - step2_18; | |
| 852 output[19 * 32] = step1_12 - step2_19; | |
| 853 output[20 * 32] = step1_11 - step1_20; | |
| 854 output[21 * 32] = step1_10 - step1_21; | |
| 855 output[22 * 32] = step1_9 - step1_22; | |
| 856 output[23 * 32] = step1_8 - step1_23; | |
| 857 output[24 * 32] = step1_7 - step1_24; | |
| 858 output[25 * 32] = step1_6 - step1_25; | |
| 859 output[26 * 32] = step1_5 - step1_26; | |
| 860 output[27 * 32] = step1_4 - step1_27; | |
| 861 output[28 * 32] = step1_3 - step2_28; | |
| 862 output[29 * 32] = step1_2 - step2_29; | |
| 863 output[30 * 32] = step1_1 - step2_30; | |
| 864 output[31 * 32] = step1_0 - step2_31; | |
| 865 | |
| 866 input += 32; | |
| 867 output += 1; | |
| 868 } | |
| 869 } | |
| 870 /* 32x32 inverse transform entry point: runs idct32_rows_dspr2 over 32 rows of input into the local out buffer, then passes out, dest and dest_stride to vp9_idct32_cols_add_blk_dspr2 (defined elsewhere; presumably the column pass that adds into dest - confirm). */ | |
| 871 void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 872 int dest_stride) { | |
| 873 DECLARE_ALIGNED(32, int16_t, out[32 * 32]); | |
| 874 int16_t *outptr = out; | |
| 875 uint32_t pos = 45; | |
| 876 | |
| 877 /* bit position for extract from acc: pos (45) is written to the DSP control register via wrdsp with mask 1 */ | |
| 878 __asm__ __volatile__ ( | |
| 879 "wrdsp %[pos], 1 \n\t" | |
| 880 : | |
| 881 : [pos] "r" (pos) | |
| 882 ); | |
| 883 | |
| 884 // Rows | |
| 885 idct32_rows_dspr2(input, outptr, 32); | |
| 886 | |
| 887 // Columns | |
| 888 vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride); | |
| 889 } | |
| 890 /* 32x32 inverse transform variant that runs the row pass on only the first 8 rows of input, zero-fills the part of out the row pass did not write, then calls vp9_idct32_cols_add_blk_dspr2 (defined elsewhere). NOTE(review): the 34 in the name presumably refers to the number of non-zero coefficients - confirm. */ | |
| 891 void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 892 int stride) { | |
| 893 DECLARE_ALIGNED(32, int16_t, out[32 * 32]); | |
| 894 int16_t *outptr = out; | |
| 895 uint32_t i; | |
| 896 uint32_t pos = 45; | |
| 897 | |
| 898 /* bit position for extract from acc: pos (45) is written to the DSP control register via wrdsp with mask 1 */ | |
| 899 __asm__ __volatile__ ( | |
| 900 "wrdsp %[pos], 1 \n\t" | |
| 901 : | |
| 902 : [pos] "r" (pos) | |
| 903 ); | |
| 904 | |
| 905 // Rows | |
| 906 idct32_rows_dspr2(input, outptr, 8); | |
| 907 /* Each row-pass iteration stores output[k * 32] for k = 0..31 and then advances output by 1, so (assuming it iterates no_rows = 8 times) only int16 columns 0..7 of out are written. The twelve word stores below clear columns 8..31 (24 int16) of row 0. */ | |
| 908 outptr += 8; | |
| 909 __asm__ __volatile__ ( | |
| 910 "sw $zero, 0(%[outptr]) \n\t" | |
| 911 "sw $zero, 4(%[outptr]) \n\t" | |
| 912 "sw $zero, 8(%[outptr]) \n\t" | |
| 913 "sw $zero, 12(%[outptr]) \n\t" | |
| 914 "sw $zero, 16(%[outptr]) \n\t" | |
| 915 "sw $zero, 20(%[outptr]) \n\t" | |
| 916 "sw $zero, 24(%[outptr]) \n\t" | |
| 917 "sw $zero, 28(%[outptr]) \n\t" | |
| 918 "sw $zero, 32(%[outptr]) \n\t" | |
| 919 "sw $zero, 36(%[outptr]) \n\t" | |
| 920 "sw $zero, 40(%[outptr]) \n\t" | |
| 921 "sw $zero, 44(%[outptr]) \n\t" | |
| 922 | |
| 923 : | |
| 924 : [outptr] "r" (outptr) | |
| 925 ); | |
| 926 /* Clear columns 8..31 of the remaining 31 rows (outptr advances 32 int16 per row). NOTE(review): these asm statements store to out without a memory clobber or output operand - confirm the compiler cannot reorder or drop accesses around them. */ | |
| 927 for (i = 0; i < 31; ++i) { | |
| 928 outptr += 32; | |
| 929 | |
| 930 __asm__ __volatile__ ( | |
| 931 "sw $zero, 0(%[outptr]) \n\t" | |
| 932 "sw $zero, 4(%[outptr]) \n\t" | |
| 933 "sw $zero, 8(%[outptr]) \n\t" | |
| 934 "sw $zero, 12(%[outptr]) \n\t" | |
| 935 "sw $zero, 16(%[outptr]) \n\t" | |
| 936 "sw $zero, 20(%[outptr]) \n\t" | |
| 937 "sw $zero, 24(%[outptr]) \n\t" | |
| 938 "sw $zero, 28(%[outptr]) \n\t" | |
| 939 "sw $zero, 32(%[outptr]) \n\t" | |
| 940 "sw $zero, 36(%[outptr]) \n\t" | |
| 941 "sw $zero, 40(%[outptr]) \n\t" | |
| 942 "sw $zero, 44(%[outptr]) \n\t" | |
| 943 | |
| 944 : | |
| 945 : [outptr] "r" (outptr) | |
| 946 ); | |
| 947 } | |
| 948 | |
| 949 // Columns | |
| 950 vp9_idct32_cols_add_blk_dspr2(out, dest, stride); | |
| 951 } | |
| 952 /* DC-only 32x32 path: only input[0] is read. A rounded value a1 is derived from it and applied to 32 rows of 32 bytes starting at dest (row step = stride): added with addu_s.qb when a1 >= 0, or abs(a1) subtracted with subu_s.qb when a1 < 0. */ | |
| 953 void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 954 int stride) { | |
| 955 int r, out; | |
| 956 int32_t a1, absa1; | |
| 957 int32_t vector_a1; | |
| 958 int32_t t1, t2, t3, t4; | |
| 959 int32_t vector_1, vector_2, vector_3, vector_4; | |
| 960 uint32_t pos = 45; | |
| 961 | |
| 962 /* bit position for extract from acc: pos (45) is written to the DSP control register via wrdsp with mask 1 */ | |
| 963 __asm__ __volatile__ ( | |
| 964 "wrdsp %[pos], 1 \n\t" | |
| 965 | |
| 966 : | |
| 967 : [pos] "r" (pos) | |
| 968 ); | |
| 969 /* out comes from DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64 (macro defined elsewhere); the asm below then computes a1 = (out + 32) >> 6 using an arithmetic shift */ | |
| 970 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); | |
| 971 __asm__ __volatile__ ( | |
| 972 "addi %[out], %[out], 32 \n\t" | |
| 973 "sra %[a1], %[out], 6 \n\t" | |
| 974 | |
| 975 : [out] "+r" (out), [a1] "=r" (a1) | |
| 976 : | |
| 977 ); | |
| 978 | |
| 979 if (a1 < 0) { | |
| 980 /* use quad-byte | |
| 981 * input and output memory are four byte aligned. Negative a1: abs(a1) is replicated with replv.qb and subtracted from the destination bytes. */ | |
| 982 __asm__ __volatile__ ( | |
| 983 "abs %[absa1], %[a1] \n\t" | |
| 984 "replv.qb %[vector_a1], %[absa1] \n\t" | |
| 985 | |
| 986 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) | |
| 987 : [a1] "r" (a1) | |
| 988 ); | |
| 989 /* 32 rows; each iteration loads, updates and stores 8 words (32 bytes) at dest, then advances dest by stride */ | |
| 990 for (r = 32; r--;) { | |
| 991 __asm__ __volatile__ ( | |
| 992 "lw %[t1], 0(%[dest]) \n\t" | |
| 993 "lw %[t2], 4(%[dest]) \n\t" | |
| 994 "lw %[t3], 8(%[dest]) \n\t" | |
| 995 "lw %[t4], 12(%[dest]) \n\t" | |
| 996 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
| 997 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
| 998 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" | |
| 999 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" | |
| 1000 "sw %[vector_1], 0(%[dest]) \n\t" | |
| 1001 "sw %[vector_2], 4(%[dest]) \n\t" | |
| 1002 "sw %[vector_3], 8(%[dest]) \n\t" | |
| 1003 "sw %[vector_4], 12(%[dest]) \n\t" | |
| 1004 | |
| 1005 "lw %[t1], 16(%[dest]) \n\t" | |
| 1006 "lw %[t2], 20(%[dest]) \n\t" | |
| 1007 "lw %[t3], 24(%[dest]) \n\t" | |
| 1008 "lw %[t4], 28(%[dest]) \n\t" | |
| 1009 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
| 1010 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
| 1011 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" | |
| 1012 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" | |
| 1013 "sw %[vector_1], 16(%[dest]) \n\t" | |
| 1014 "sw %[vector_2], 20(%[dest]) \n\t" | |
| 1015 "sw %[vector_3], 24(%[dest]) \n\t" | |
| 1016 "sw %[vector_4], 28(%[dest]) \n\t" | |
| 1017 | |
| 1018 "add %[dest], %[dest], %[stride] \n\t" | |
| 1019 | |
| 1020 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), | |
| 1021 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | |
| 1022 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), | |
| 1023 [dest] "+&r" (dest) | |
| 1024 : [stride] "r" (stride), [vector_a1] "r" (vector_a1) | |
| 1025 ); | |
| 1026 } | |
| 1027 } else { | |
| 1028 /* use quad-byte | |
| 1029 * input and output memory are four byte aligned. Non-negative a1 is replicated with replv.qb and added to the destination bytes. NOTE(review): replv.qb presumably replicates only the low byte of its source, so a1 above 255 (or abs(a1) above 255 in the negative branch) would be truncated - confirm the range of a1. */ | |
| 1030 __asm__ __volatile__ ( | |
| 1031 "replv.qb %[vector_a1], %[a1] \n\t" | |
| 1032 | |
| 1033 : [vector_a1] "=r" (vector_a1) | |
| 1034 : [a1] "r" (a1) | |
| 1035 ); | |
| 1036 /* 32 rows; each iteration loads, updates and stores 8 words (32 bytes) at dest, then advances dest by stride */ | |
| 1037 for (r = 32; r--;) { | |
| 1038 __asm__ __volatile__ ( | |
| 1039 "lw %[t1], 0(%[dest]) \n\t" | |
| 1040 "lw %[t2], 4(%[dest]) \n\t" | |
| 1041 "lw %[t3], 8(%[dest]) \n\t" | |
| 1042 "lw %[t4], 12(%[dest]) \n\t" | |
| 1043 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
| 1044 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
| 1045 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" | |
| 1046 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" | |
| 1047 "sw %[vector_1], 0(%[dest]) \n\t" | |
| 1048 "sw %[vector_2], 4(%[dest]) \n\t" | |
| 1049 "sw %[vector_3], 8(%[dest]) \n\t" | |
| 1050 "sw %[vector_4], 12(%[dest]) \n\t" | |
| 1051 | |
| 1052 "lw %[t1], 16(%[dest]) \n\t" | |
| 1053 "lw %[t2], 20(%[dest]) \n\t" | |
| 1054 "lw %[t3], 24(%[dest]) \n\t" | |
| 1055 "lw %[t4], 28(%[dest]) \n\t" | |
| 1056 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
| 1057 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
| 1058 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" | |
| 1059 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" | |
| 1060 "sw %[vector_1], 16(%[dest]) \n\t" | |
| 1061 "sw %[vector_2], 20(%[dest]) \n\t" | |
| 1062 "sw %[vector_3], 24(%[dest]) \n\t" | |
| 1063 "sw %[vector_4], 28(%[dest]) \n\t" | |
| 1064 | |
| 1065 "add %[dest], %[dest], %[stride] \n\t" | |
| 1066 | |
| 1067 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), | |
| 1068 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | |
| 1069 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), | |
| 1070 [dest] "+&r" (dest) | |
| 1071 : [stride] "r" (stride), [vector_a1] "r" (vector_a1) | |
| 1072 ); | |
| 1073 } | |
| 1074 } | |
| 1075 } | |
| 1076 #endif // #if HAVE_DSPR2 | |
| OLD | NEW |