OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include <assert.h> |
| 12 #include <stdio.h> |
| 13 |
| 14 #include "./vpx_config.h" |
| 15 #include "./vp9_rtcd.h" |
| 16 #include "vp9/common/vp9_common.h" |
| 17 #include "vp9/common/vp9_blockd.h" |
| 18 #include "vp9/common/vp9_idct.h" |
| 19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" |
| 20 |
| 21 #if HAVE_DSPR2 |
| 22 static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) { |
| 23 int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; |
| 24 int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; |
| 25 int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; |
| 26 int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; |
| 27 int16_t step1_28, step1_29, step1_30, step1_31; |
| 28 int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; |
| 29 int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; |
| 30 int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; |
| 31 int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; |
| 32 int16_t step2_28, step2_29, step2_30, step2_31; |
| 33 int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; |
| 34 int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; |
| 35 int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; |
| 36 int16_t step3_29, step3_30, step3_31; |
| 37 int temp0, temp1, temp2, temp3; |
| 38 int load1, load2, load3, load4; |
| 39 int result1, result2; |
| 40 int temp21; |
| 41 int i; |
| 42 const int const_2_power_13 = 8192; |
| 43 const int32_t *input_int; |
| 44 |
| 45 for (i = 32; i--; ) { |
| 46 input_int = (const int32_t *)input; |
| 47 |
| 48 if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | |
| 49 input_int[4] | input_int[5] | input_int[6] | input_int[7] | |
| 50 input_int[8] | input_int[9] | input_int[10] | input_int[11] | |
| 51 input_int[12] | input_int[13] | input_int[14] | input_int[15])) { |
| 52 input += 32; |
| 53 |
| 54 __asm__ __volatile__ ( |
| 55 "sh $zero, 0(%[output]) \n\t" |
| 56 "sh $zero, 64(%[output]) \n\t" |
| 57 "sh $zero, 128(%[output]) \n\t" |
| 58 "sh $zero, 192(%[output]) \n\t" |
| 59 "sh $zero, 256(%[output]) \n\t" |
| 60 "sh $zero, 320(%[output]) \n\t" |
| 61 "sh $zero, 384(%[output]) \n\t" |
| 62 "sh $zero, 448(%[output]) \n\t" |
| 63 "sh $zero, 512(%[output]) \n\t" |
| 64 "sh $zero, 576(%[output]) \n\t" |
| 65 "sh $zero, 640(%[output]) \n\t" |
| 66 "sh $zero, 704(%[output]) \n\t" |
| 67 "sh $zero, 768(%[output]) \n\t" |
| 68 "sh $zero, 832(%[output]) \n\t" |
| 69 "sh $zero, 896(%[output]) \n\t" |
| 70 "sh $zero, 960(%[output]) \n\t" |
| 71 "sh $zero, 1024(%[output]) \n\t" |
| 72 "sh $zero, 1088(%[output]) \n\t" |
| 73 "sh $zero, 1152(%[output]) \n\t" |
| 74 "sh $zero, 1216(%[output]) \n\t" |
| 75 "sh $zero, 1280(%[output]) \n\t" |
| 76 "sh $zero, 1344(%[output]) \n\t" |
| 77 "sh $zero, 1408(%[output]) \n\t" |
| 78 "sh $zero, 1472(%[output]) \n\t" |
| 79 "sh $zero, 1536(%[output]) \n\t" |
| 80 "sh $zero, 1600(%[output]) \n\t" |
| 81 "sh $zero, 1664(%[output]) \n\t" |
| 82 "sh $zero, 1728(%[output]) \n\t" |
| 83 "sh $zero, 1792(%[output]) \n\t" |
| 84 "sh $zero, 1856(%[output]) \n\t" |
| 85 "sh $zero, 1920(%[output]) \n\t" |
| 86 "sh $zero, 1984(%[output]) \n\t" |
| 87 |
| 88 : |
| 89 : [output] "r" (output) |
| 90 ); |
| 91 |
| 92 output += 1; |
| 93 |
| 94 continue; |
| 95 } |
| 96 |
| 97 /* prefetch row */ |
| 98 vp9_prefetch_load((const uint8_t *)(input + 32)); |
| 99 vp9_prefetch_load((const uint8_t *)(input + 48)); |
| 100 |
| 101 __asm__ __volatile__ ( |
| 102 "lh %[load1], 2(%[input]) \n\t" |
| 103 "lh %[load2], 62(%[input]) \n\t" |
| 104 "lh %[load3], 34(%[input]) \n\t" |
| 105 "lh %[load4], 30(%[input]) \n\t" |
| 106 |
| 107 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 108 "mthi $zero, $ac1 \n\t" |
| 109 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 110 "mthi $zero, $ac3 \n\t" |
| 111 |
| 112 "madd $ac1, %[load1], %[cospi_31_64] \n\t" |
| 113 "msub $ac1, %[load2], %[cospi_1_64] \n\t" |
| 114 "extp %[temp0], $ac1, 31 \n\t" |
| 115 |
| 116 "madd $ac3, %[load1], %[cospi_1_64] \n\t" |
| 117 "madd $ac3, %[load2], %[cospi_31_64] \n\t" |
| 118 "extp %[temp3], $ac3, 31 \n\t" |
| 119 |
| 120 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 121 "mthi $zero, $ac1 \n\t" |
| 122 "mtlo %[const_2_power_13], $ac2 \n\t" |
| 123 "mthi $zero, $ac2 \n\t" |
| 124 |
| 125 "madd $ac2, %[load3], %[cospi_15_64] \n\t" |
| 126 "msub $ac2, %[load4], %[cospi_17_64] \n\t" |
| 127 "extp %[temp1], $ac2, 31 \n\t" |
| 128 |
| 129 "madd $ac1, %[load3], %[cospi_17_64] \n\t" |
| 130 "madd $ac1, %[load4], %[cospi_15_64] \n\t" |
| 131 "extp %[temp2], $ac1, 31 \n\t" |
| 132 |
| 133 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 134 "mthi $zero, $ac1 \n\t" |
| 135 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 136 "mthi $zero, $ac3 \n\t" |
| 137 |
| 138 "sub %[load1], %[temp3], %[temp2] \n\t" |
| 139 "sub %[load2], %[temp0], %[temp1] \n\t" |
| 140 |
| 141 "madd $ac1, %[load1], %[cospi_28_64] \n\t" |
| 142 "msub $ac1, %[load2], %[cospi_4_64] \n\t" |
| 143 "madd $ac3, %[load1], %[cospi_4_64] \n\t" |
| 144 "madd $ac3, %[load2], %[cospi_28_64] \n\t" |
| 145 |
| 146 "extp %[step1_17], $ac1, 31 \n\t" |
| 147 "extp %[step1_30], $ac3, 31 \n\t" |
| 148 "add %[step1_16], %[temp0], %[temp1] \n\t" |
| 149 "add %[step1_31], %[temp2], %[temp3] \n\t" |
| 150 |
| 151 : [load1] "=&r" (load1), [load2] "=&r" (load2), |
| 152 [load3] "=&r" (load3), [load4] "=&r" (load4), |
| 153 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), |
| 154 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), |
| 155 [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), |
| 156 [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) |
| 157 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), |
| 158 [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), |
| 159 [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), |
| 160 [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) |
| 161 ); |
| 162 |
| 163 __asm__ __volatile__ ( |
| 164 "lh %[load1], 18(%[input]) \n\t" |
| 165 "lh %[load2], 46(%[input]) \n\t" |
| 166 "lh %[load3], 50(%[input]) \n\t" |
| 167 "lh %[load4], 14(%[input]) \n\t" |
| 168 |
| 169 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 170 "mthi $zero, $ac1 \n\t" |
| 171 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 172 "mthi $zero, $ac3 \n\t" |
| 173 |
| 174 "madd $ac1, %[load1], %[cospi_23_64] \n\t" |
| 175 "msub $ac1, %[load2], %[cospi_9_64] \n\t" |
| 176 "extp %[temp0], $ac1, 31 \n\t" |
| 177 |
| 178 "madd $ac3, %[load1], %[cospi_9_64] \n\t" |
| 179 "madd $ac3, %[load2], %[cospi_23_64] \n\t" |
| 180 "extp %[temp3], $ac3, 31 \n\t" |
| 181 |
| 182 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 183 "mthi $zero, $ac1 \n\t" |
| 184 "mtlo %[const_2_power_13], $ac2 \n\t" |
| 185 "mthi $zero, $ac2 \n\t" |
| 186 |
| 187 "madd $ac2, %[load3], %[cospi_7_64] \n\t" |
| 188 "msub $ac2, %[load4], %[cospi_25_64] \n\t" |
| 189 "extp %[temp1], $ac2, 31 \n\t" |
| 190 |
| 191 "madd $ac1, %[load3], %[cospi_25_64] \n\t" |
| 192 "madd $ac1, %[load4], %[cospi_7_64] \n\t" |
| 193 "extp %[temp2], $ac1, 31 \n\t" |
| 194 |
| 195 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 196 "mthi $zero, $ac1 \n\t" |
| 197 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 198 "mthi $zero, $ac3 \n\t" |
| 199 |
| 200 "sub %[load1], %[temp1], %[temp0] \n\t" |
| 201 "sub %[load2], %[temp2], %[temp3] \n\t" |
| 202 |
| 203 "msub $ac1, %[load1], %[cospi_28_64] \n\t" |
| 204 "msub $ac1, %[load2], %[cospi_4_64] \n\t" |
| 205 "msub $ac3, %[load1], %[cospi_4_64] \n\t" |
| 206 "madd $ac3, %[load2], %[cospi_28_64] \n\t" |
| 207 |
| 208 "extp %[step1_18], $ac1, 31 \n\t" |
| 209 "extp %[step1_29], $ac3, 31 \n\t" |
| 210 "add %[step1_19], %[temp0], %[temp1] \n\t" |
| 211 "add %[step1_28], %[temp2], %[temp3] \n\t" |
| 212 |
| 213 : [load1] "=&r" (load1), [load2] "=&r" (load2), |
| 214 [load3] "=&r" (load3), [load4] "=&r" (load4), |
| 215 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), |
| 216 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), |
| 217 [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), |
| 218 [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) |
| 219 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), |
| 220 [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), |
| 221 [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), |
| 222 [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) |
| 223 ); |
| 224 |
| 225 __asm__ __volatile__ ( |
| 226 "lh %[load1], 10(%[input]) \n\t" |
| 227 "lh %[load2], 54(%[input]) \n\t" |
| 228 "lh %[load3], 42(%[input]) \n\t" |
| 229 "lh %[load4], 22(%[input]) \n\t" |
| 230 |
| 231 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 232 "mthi $zero, $ac1 \n\t" |
| 233 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 234 "mthi $zero, $ac3 \n\t" |
| 235 |
| 236 "madd $ac1, %[load1], %[cospi_27_64] \n\t" |
| 237 "msub $ac1, %[load2], %[cospi_5_64] \n\t" |
| 238 "extp %[temp0], $ac1, 31 \n\t" |
| 239 |
| 240 "madd $ac3, %[load1], %[cospi_5_64] \n\t" |
| 241 "madd $ac3, %[load2], %[cospi_27_64] \n\t" |
| 242 "extp %[temp3], $ac3, 31 \n\t" |
| 243 |
| 244 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 245 "mthi $zero, $ac1 \n\t" |
| 246 "mtlo %[const_2_power_13], $ac2 \n\t" |
| 247 "mthi $zero, $ac2 \n\t" |
| 248 |
| 249 "madd $ac2, %[load3], %[cospi_11_64] \n\t" |
| 250 "msub $ac2, %[load4], %[cospi_21_64] \n\t" |
| 251 "extp %[temp1], $ac2, 31 \n\t" |
| 252 |
| 253 "madd $ac1, %[load3], %[cospi_21_64] \n\t" |
| 254 "madd $ac1, %[load4], %[cospi_11_64] \n\t" |
| 255 "extp %[temp2], $ac1, 31 \n\t" |
| 256 |
| 257 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 258 "mthi $zero, $ac1 \n\t" |
| 259 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 260 "mthi $zero, $ac3 \n\t" |
| 261 |
| 262 "sub %[load1], %[temp0], %[temp1] \n\t" |
| 263 "sub %[load2], %[temp3], %[temp2] \n\t" |
| 264 |
| 265 "madd $ac1, %[load2], %[cospi_12_64] \n\t" |
| 266 "msub $ac1, %[load1], %[cospi_20_64] \n\t" |
| 267 "madd $ac3, %[load1], %[cospi_12_64] \n\t" |
| 268 "madd $ac3, %[load2], %[cospi_20_64] \n\t" |
| 269 |
| 270 "extp %[step1_21], $ac1, 31 \n\t" |
| 271 "extp %[step1_26], $ac3, 31 \n\t" |
| 272 "add %[step1_20], %[temp0], %[temp1] \n\t" |
| 273 "add %[step1_27], %[temp2], %[temp3] \n\t" |
| 274 |
| 275 : [load1] "=&r" (load1), [load2] "=&r" (load2), |
| 276 [load3] "=&r" (load3), [load4] "=&r" (load4), |
| 277 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), |
| 278 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), |
| 279 [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), |
| 280 [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) |
| 281 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), |
| 282 [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), |
| 283 [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), |
| 284 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) |
| 285 ); |
| 286 |
| 287 __asm__ __volatile__ ( |
| 288 "lh %[load1], 26(%[input]) \n\t" |
| 289 "lh %[load2], 38(%[input]) \n\t" |
| 290 "lh %[load3], 58(%[input]) \n\t" |
| 291 "lh %[load4], 6(%[input]) \n\t" |
| 292 |
| 293 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 294 "mthi $zero, $ac1 \n\t" |
| 295 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 296 "mthi $zero, $ac3 \n\t" |
| 297 |
| 298 "madd $ac1, %[load1], %[cospi_19_64] \n\t" |
| 299 "msub $ac1, %[load2], %[cospi_13_64] \n\t" |
| 300 "extp %[temp0], $ac1, 31 \n\t" |
| 301 |
| 302 "madd $ac3, %[load1], %[cospi_13_64] \n\t" |
| 303 "madd $ac3, %[load2], %[cospi_19_64] \n\t" |
| 304 "extp %[temp3], $ac3, 31 \n\t" |
| 305 |
| 306 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 307 "mthi $zero, $ac1 \n\t" |
| 308 "mtlo %[const_2_power_13], $ac2 \n\t" |
| 309 "mthi $zero, $ac2 \n\t" |
| 310 |
| 311 "madd $ac2, %[load3], %[cospi_3_64] \n\t" |
| 312 "msub $ac2, %[load4], %[cospi_29_64] \n\t" |
| 313 "extp %[temp1], $ac2, 31 \n\t" |
| 314 |
| 315 "madd $ac1, %[load3], %[cospi_29_64] \n\t" |
| 316 "madd $ac1, %[load4], %[cospi_3_64] \n\t" |
| 317 "extp %[temp2], $ac1, 31 \n\t" |
| 318 |
| 319 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 320 "mthi $zero, $ac1 \n\t" |
| 321 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 322 "mthi $zero, $ac3 \n\t" |
| 323 |
| 324 "sub %[load1], %[temp1], %[temp0] \n\t" |
| 325 "sub %[load2], %[temp2], %[temp3] \n\t" |
| 326 |
| 327 "msub $ac1, %[load1], %[cospi_12_64] \n\t" |
| 328 "msub $ac1, %[load2], %[cospi_20_64] \n\t" |
| 329 "msub $ac3, %[load1], %[cospi_20_64] \n\t" |
| 330 "madd $ac3, %[load2], %[cospi_12_64] \n\t" |
| 331 |
| 332 "extp %[step1_22], $ac1, 31 \n\t" |
| 333 "extp %[step1_25], $ac3, 31 \n\t" |
| 334 "add %[step1_23], %[temp0], %[temp1] \n\t" |
| 335 "add %[step1_24], %[temp2], %[temp3] \n\t" |
| 336 |
| 337 : [load1] "=&r" (load1), [load2] "=&r" (load2), |
| 338 [load3] "=&r" (load3), [load4] "=&r" (load4), |
| 339 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), |
| 340 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), |
| 341 [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), |
| 342 [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) |
| 343 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), |
| 344 [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), |
| 345 [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), |
| 346 [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) |
| 347 ); |
| 348 |
| 349 __asm__ __volatile__ ( |
| 350 "lh %[load1], 4(%[input]) \n\t" |
| 351 "lh %[load2], 60(%[input]) \n\t" |
| 352 "lh %[load3], 36(%[input]) \n\t" |
| 353 "lh %[load4], 28(%[input]) \n\t" |
| 354 |
| 355 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 356 "mthi $zero, $ac1 \n\t" |
| 357 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 358 "mthi $zero, $ac3 \n\t" |
| 359 |
| 360 "madd $ac1, %[load1], %[cospi_30_64] \n\t" |
| 361 "msub $ac1, %[load2], %[cospi_2_64] \n\t" |
| 362 "extp %[temp0], $ac1, 31 \n\t" |
| 363 |
| 364 "madd $ac3, %[load1], %[cospi_2_64] \n\t" |
| 365 "madd $ac3, %[load2], %[cospi_30_64] \n\t" |
| 366 "extp %[temp3], $ac3, 31 \n\t" |
| 367 |
| 368 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 369 "mthi $zero, $ac1 \n\t" |
| 370 "mtlo %[const_2_power_13], $ac2 \n\t" |
| 371 "mthi $zero, $ac2 \n\t" |
| 372 |
| 373 "madd $ac2, %[load3], %[cospi_14_64] \n\t" |
| 374 "msub $ac2, %[load4], %[cospi_18_64] \n\t" |
| 375 "extp %[temp1], $ac2, 31 \n\t" |
| 376 |
| 377 "madd $ac1, %[load3], %[cospi_18_64] \n\t" |
| 378 "madd $ac1, %[load4], %[cospi_14_64] \n\t" |
| 379 "extp %[temp2], $ac1, 31 \n\t" |
| 380 |
| 381 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 382 "mthi $zero, $ac1 \n\t" |
| 383 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 384 "mthi $zero, $ac3 \n\t" |
| 385 |
| 386 "sub %[load1], %[temp0], %[temp1] \n\t" |
| 387 "sub %[load2], %[temp3], %[temp2] \n\t" |
| 388 |
| 389 "msub $ac1, %[load1], %[cospi_8_64] \n\t" |
| 390 "madd $ac1, %[load2], %[cospi_24_64] \n\t" |
| 391 "madd $ac3, %[load1], %[cospi_24_64] \n\t" |
| 392 "madd $ac3, %[load2], %[cospi_8_64] \n\t" |
| 393 |
| 394 "extp %[step2_9], $ac1, 31 \n\t" |
| 395 "extp %[step2_14], $ac3, 31 \n\t" |
| 396 "add %[step2_8], %[temp0], %[temp1] \n\t" |
| 397 "add %[step2_15], %[temp2], %[temp3] \n\t" |
| 398 |
| 399 : [load1] "=&r" (load1), [load2] "=&r" (load2), |
| 400 [load3] "=&r" (load3), [load4] "=&r" (load4), |
| 401 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), |
| 402 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), |
| 403 [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), |
| 404 [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) |
| 405 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), |
| 406 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), |
| 407 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), |
| 408 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) |
| 409 ); |
| 410 |
| 411 __asm__ __volatile__ ( |
| 412 "lh %[load1], 20(%[input]) \n\t" |
| 413 "lh %[load2], 44(%[input]) \n\t" |
| 414 "lh %[load3], 52(%[input]) \n\t" |
| 415 "lh %[load4], 12(%[input]) \n\t" |
| 416 |
| 417 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 418 "mthi $zero, $ac1 \n\t" |
| 419 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 420 "mthi $zero, $ac3 \n\t" |
| 421 |
| 422 "madd $ac1, %[load1], %[cospi_22_64] \n\t" |
| 423 "msub $ac1, %[load2], %[cospi_10_64] \n\t" |
| 424 "extp %[temp0], $ac1, 31 \n\t" |
| 425 |
| 426 "madd $ac3, %[load1], %[cospi_10_64] \n\t" |
| 427 "madd $ac3, %[load2], %[cospi_22_64] \n\t" |
| 428 "extp %[temp3], $ac3, 31 \n\t" |
| 429 |
| 430 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 431 "mthi $zero, $ac1 \n\t" |
| 432 "mtlo %[const_2_power_13], $ac2 \n\t" |
| 433 "mthi $zero, $ac2 \n\t" |
| 434 |
| 435 "madd $ac2, %[load3], %[cospi_6_64] \n\t" |
| 436 "msub $ac2, %[load4], %[cospi_26_64] \n\t" |
| 437 "extp %[temp1], $ac2, 31 \n\t" |
| 438 |
| 439 "madd $ac1, %[load3], %[cospi_26_64] \n\t" |
| 440 "madd $ac1, %[load4], %[cospi_6_64] \n\t" |
| 441 "extp %[temp2], $ac1, 31 \n\t" |
| 442 |
| 443 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 444 "mthi $zero, $ac1 \n\t" |
| 445 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 446 "mthi $zero, $ac3 \n\t" |
| 447 |
| 448 "sub %[load1], %[temp1], %[temp0] \n\t" |
| 449 "sub %[load2], %[temp2], %[temp3] \n\t" |
| 450 |
| 451 "msub $ac1, %[load1], %[cospi_24_64] \n\t" |
| 452 "msub $ac1, %[load2], %[cospi_8_64] \n\t" |
| 453 "madd $ac3, %[load2], %[cospi_24_64] \n\t" |
| 454 "msub $ac3, %[load1], %[cospi_8_64] \n\t" |
| 455 |
| 456 "extp %[step2_10], $ac1, 31 \n\t" |
| 457 "extp %[step2_13], $ac3, 31 \n\t" |
| 458 "add %[step2_11], %[temp0], %[temp1] \n\t" |
| 459 "add %[step2_12], %[temp2], %[temp3] \n\t" |
| 460 |
| 461 : [load1] "=&r" (load1), [load2] "=&r" (load2), |
| 462 [load3] "=&r" (load3), [load4] "=&r" (load4), |
| 463 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), |
| 464 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), |
| 465 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), |
| 466 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) |
| 467 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), |
| 468 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), |
| 469 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), |
| 470 [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) |
| 471 ); |
| 472 |
| 473 __asm__ __volatile__ ( |
| 474 "mtlo %[const_2_power_13], $ac0 \n\t" |
| 475 "mthi $zero, $ac0 \n\t" |
| 476 "sub %[temp0], %[step2_14], %[step2_13] \n\t" |
| 477 "sub %[temp0], %[temp0], %[step2_9] \n\t" |
| 478 "add %[temp0], %[temp0], %[step2_10] \n\t" |
| 479 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" |
| 480 |
| 481 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 482 "mthi $zero, $ac1 \n\t" |
| 483 "sub %[temp1], %[step2_14], %[step2_13] \n\t" |
| 484 "add %[temp1], %[temp1], %[step2_9] \n\t" |
| 485 "sub %[temp1], %[temp1], %[step2_10] \n\t" |
| 486 "madd $ac1, %[temp1], %[cospi_16_64] \n\t" |
| 487 |
| 488 "mtlo %[const_2_power_13], $ac2 \n\t" |
| 489 "mthi $zero, $ac2 \n\t" |
| 490 "sub %[temp0], %[step2_15], %[step2_12] \n\t" |
| 491 "sub %[temp0], %[temp0], %[step2_8] \n\t" |
| 492 "add %[temp0], %[temp0], %[step2_11] \n\t" |
| 493 "madd $ac2, %[temp0], %[cospi_16_64] \n\t" |
| 494 |
| 495 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 496 "mthi $zero, $ac3 \n\t" |
| 497 "sub %[temp1], %[step2_15], %[step2_12] \n\t" |
| 498 "add %[temp1], %[temp1], %[step2_8] \n\t" |
| 499 "sub %[temp1], %[temp1], %[step2_11] \n\t" |
| 500 "madd $ac3, %[temp1], %[cospi_16_64] \n\t" |
| 501 |
| 502 "add %[step3_8], %[step2_8], %[step2_11] \n\t" |
| 503 "add %[step3_9], %[step2_9], %[step2_10] \n\t" |
| 504 "add %[step3_14], %[step2_13], %[step2_14] \n\t" |
| 505 "add %[step3_15], %[step2_12], %[step2_15] \n\t" |
| 506 |
| 507 "extp %[step3_10], $ac0, 31 \n\t" |
| 508 "extp %[step3_13], $ac1, 31 \n\t" |
| 509 "extp %[step3_11], $ac2, 31 \n\t" |
| 510 "extp %[step3_12], $ac3, 31 \n\t" |
| 511 |
| 512 : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), |
| 513 [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), |
| 514 [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), |
| 515 [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), |
| 516 [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) |
| 517 : [const_2_power_13] "r" (const_2_power_13), |
| 518 [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), |
| 519 [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), |
| 520 [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), |
| 521 [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), |
| 522 [cospi_16_64] "r" (cospi_16_64) |
| 523 ); |
| 524 |
| 525 step2_18 = step1_17 - step1_18; |
| 526 step2_29 = step1_30 - step1_29; |
| 527 |
| 528 __asm__ __volatile__ ( |
| 529 "mtlo %[const_2_power_13], $ac0 \n\t" |
| 530 "mthi $zero, $ac0 \n\t" |
| 531 "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" |
| 532 "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" |
| 533 "extp %[step3_18], $ac0, 31 \n\t" |
| 534 |
| 535 : [step3_18] "=r" (step3_18) |
| 536 : [const_2_power_13] "r" (const_2_power_13), |
| 537 [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), |
| 538 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) |
| 539 ); |
| 540 |
| 541 temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; |
| 542 step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; |
| 543 |
| 544 step2_19 = step1_16 - step1_19; |
| 545 step2_28 = step1_31 - step1_28; |
| 546 |
| 547 __asm__ __volatile__ ( |
| 548 "mtlo %[const_2_power_13], $ac0 \n\t" |
| 549 "mthi $zero, $ac0 \n\t" |
| 550 "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" |
| 551 "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" |
| 552 "extp %[step3_19], $ac0, 31 \n\t" |
| 553 |
| 554 : [step3_19] "=r" (step3_19) |
| 555 : [const_2_power_13] "r" (const_2_power_13), |
| 556 [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), |
| 557 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) |
| 558 ); |
| 559 |
| 560 temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; |
| 561 step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; |
| 562 |
| 563 step3_16 = step1_16 + step1_19; |
| 564 step3_17 = step1_17 + step1_18; |
| 565 step3_30 = step1_29 + step1_30; |
| 566 step3_31 = step1_28 + step1_31; |
| 567 |
| 568 step2_20 = step1_23 - step1_20; |
| 569 step2_27 = step1_24 - step1_27; |
| 570 |
| 571 __asm__ __volatile__ ( |
| 572 "mtlo %[const_2_power_13], $ac0 \n\t" |
| 573 "mthi $zero, $ac0 \n\t" |
| 574 "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" |
| 575 "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" |
| 576 "extp %[step3_20], $ac0, 31 \n\t" |
| 577 |
| 578 : [step3_20] "=r" (step3_20) |
| 579 : [const_2_power_13] "r" (const_2_power_13), |
| 580 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), |
| 581 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) |
| 582 ); |
| 583 |
| 584 temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; |
| 585 step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; |
| 586 |
| 587 step2_21 = step1_22 - step1_21; |
| 588 step2_26 = step1_25 - step1_26; |
| 589 |
| 590 __asm__ __volatile__ ( |
| 591 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 592 "mthi $zero, $ac1 \n\t" |
| 593 "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" |
| 594 "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" |
| 595 "extp %[step3_21], $ac1, 31 \n\t" |
| 596 |
| 597 : [step3_21] "=r" (step3_21) |
| 598 : [const_2_power_13] "r" (const_2_power_13), |
| 599 [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), |
| 600 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) |
| 601 ); |
| 602 |
| 603 temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; |
| 604 step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; |
| 605 |
| 606 step3_22 = step1_21 + step1_22; |
| 607 step3_23 = step1_20 + step1_23; |
| 608 step3_24 = step1_24 + step1_27; |
| 609 step3_25 = step1_25 + step1_26; |
| 610 |
| 611 step2_16 = step3_16 + step3_23; |
| 612 step2_17 = step3_17 + step3_22; |
| 613 step2_18 = step3_18 + step3_21; |
| 614 step2_19 = step3_19 + step3_20; |
| 615 step2_20 = step3_19 - step3_20; |
| 616 step2_21 = step3_18 - step3_21; |
| 617 step2_22 = step3_17 - step3_22; |
| 618 step2_23 = step3_16 - step3_23; |
| 619 |
| 620 step2_24 = step3_31 - step3_24; |
| 621 step2_25 = step3_30 - step3_25; |
| 622 step2_26 = step3_29 - step3_26; |
| 623 step2_27 = step3_28 - step3_27; |
| 624 step2_28 = step3_28 + step3_27; |
| 625 step2_29 = step3_29 + step3_26; |
| 626 step2_30 = step3_30 + step3_25; |
| 627 step2_31 = step3_31 + step3_24; |
| 628 |
| 629 __asm__ __volatile__ ( |
| 630 "lh %[load1], 0(%[input]) \n\t" |
| 631 "lh %[load2], 32(%[input]) \n\t" |
| 632 "lh %[load3], 16(%[input]) \n\t" |
| 633 "lh %[load4], 48(%[input]) \n\t" |
| 634 |
| 635 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 636 "mthi $zero, $ac1 \n\t" |
| 637 "mtlo %[const_2_power_13], $ac2 \n\t" |
| 638 "mthi $zero, $ac2 \n\t" |
| 639 "add %[result1], %[load1], %[load2] \n\t" |
| 640 "sub %[result2], %[load1], %[load2] \n\t" |
| 641 "madd $ac1, %[result1], %[cospi_16_64] \n\t" |
| 642 "madd $ac2, %[result2], %[cospi_16_64] \n\t" |
| 643 "extp %[temp0], $ac1, 31 \n\t" |
| 644 "extp %[temp1], $ac2, 31 \n\t" |
| 645 |
| 646 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 647 "mthi $zero, $ac3 \n\t" |
| 648 "madd $ac3, %[load3], %[cospi_24_64] \n\t" |
| 649 "msub $ac3, %[load4], %[cospi_8_64] \n\t" |
| 650 "extp %[temp2], $ac3, 31 \n\t" |
| 651 |
| 652 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 653 "mthi $zero, $ac1 \n\t" |
| 654 "madd $ac1, %[load3], %[cospi_8_64] \n\t" |
| 655 "madd $ac1, %[load4], %[cospi_24_64] \n\t" |
| 656 "extp %[temp3], $ac1, 31 \n\t" |
| 657 |
| 658 "add %[step1_0], %[temp0], %[temp3] \n\t" |
| 659 "add %[step1_1], %[temp1], %[temp2] \n\t" |
| 660 "sub %[step1_2], %[temp1], %[temp2] \n\t" |
| 661 "sub %[step1_3], %[temp0], %[temp3] \n\t" |
| 662 |
| 663 : [load1] "=&r" (load1), [load2] "=&r" (load2), |
| 664 [load3] "=&r" (load3), [load4] "=&r" (load4), |
| 665 [result1] "=&r" (result1), [result2] "=&r" (result2), |
| 666 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), |
| 667 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), |
| 668 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), |
| 669 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) |
| 670 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), |
| 671 [cospi_16_64] "r" (cospi_16_64), |
| 672 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) |
| 673 |
| 674 ); |
| 675 |
| 676 __asm__ __volatile__ ( |
| 677 "lh %[load1], 8(%[input]) \n\t" |
| 678 "lh %[load2], 56(%[input]) \n\t" |
| 679 "lh %[load3], 40(%[input]) \n\t" |
| 680 "lh %[load4], 24(%[input]) \n\t" |
| 681 |
| 682 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 683 "mthi $zero, $ac1 \n\t" |
| 684 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 685 "mthi $zero, $ac3 \n\t" |
| 686 |
| 687 "madd $ac1, %[load1], %[cospi_28_64] \n\t" |
| 688 "msub $ac1, %[load2], %[cospi_4_64] \n\t" |
| 689 "extp %[temp0], $ac1, 31 \n\t" |
| 690 |
| 691 "madd $ac3, %[load1], %[cospi_4_64] \n\t" |
| 692 "madd $ac3, %[load2], %[cospi_28_64] \n\t" |
| 693 "extp %[temp3], $ac3, 31 \n\t" |
| 694 |
| 695 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 696 "mthi $zero, $ac1 \n\t" |
| 697 "mtlo %[const_2_power_13], $ac2 \n\t" |
| 698 "mthi $zero, $ac2 \n\t" |
| 699 |
| 700 "madd $ac2, %[load3], %[cospi_12_64] \n\t" |
| 701 "msub $ac2, %[load4], %[cospi_20_64] \n\t" |
| 702 "extp %[temp1], $ac2, 31 \n\t" |
| 703 |
| 704 "madd $ac1, %[load3], %[cospi_20_64] \n\t" |
| 705 "madd $ac1, %[load4], %[cospi_12_64] \n\t" |
| 706 "extp %[temp2], $ac1, 31 \n\t" |
| 707 |
| 708 "mtlo %[const_2_power_13], $ac1 \n\t" |
| 709 "mthi $zero, $ac1 \n\t" |
| 710 "mtlo %[const_2_power_13], $ac3 \n\t" |
| 711 "mthi $zero, $ac3 \n\t" |
| 712 |
| 713 "sub %[load1], %[temp3], %[temp2] \n\t" |
| 714 "sub %[load1], %[load1], %[temp0] \n\t" |
| 715 "add %[load1], %[load1], %[temp1] \n\t" |
| 716 |
| 717 "sub %[load2], %[temp0], %[temp1] \n\t" |
| 718 "sub %[load2], %[load2], %[temp2] \n\t" |
| 719 "add %[load2], %[load2], %[temp3] \n\t" |
| 720 |
| 721 "madd $ac1, %[load1], %[cospi_16_64] \n\t" |
| 722 "madd $ac3, %[load2], %[cospi_16_64] \n\t" |
| 723 |
| 724 "extp %[step1_5], $ac1, 31 \n\t" |
| 725 "extp %[step1_6], $ac3, 31 \n\t" |
| 726 "add %[step1_4], %[temp0], %[temp1] \n\t" |
| 727 "add %[step1_7], %[temp3], %[temp2] \n\t" |
| 728 |
| 729 : [load1] "=&r" (load1), [load2] "=&r" (load2), |
| 730 [load3] "=&r" (load3), [load4] "=&r" (load4), |
| 731 [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), |
| 732 [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), |
| 733 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), |
| 734 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) |
| 735 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), |
| 736 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), |
| 737 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), |
| 738 [cospi_16_64] "r" (cospi_16_64) |
| 739 ); |
| 740 |
| 741 step2_0 = step1_0 + step1_7; |
| 742 step2_1 = step1_1 + step1_6; |
| 743 step2_2 = step1_2 + step1_5; |
| 744 step2_3 = step1_3 + step1_4; |
| 745 step2_4 = step1_3 - step1_4; |
| 746 step2_5 = step1_2 - step1_5; |
| 747 step2_6 = step1_1 - step1_6; |
| 748 step2_7 = step1_0 - step1_7; |
| 749 |
| 750 step1_0 = step2_0 + step3_15; |
| 751 step1_1 = step2_1 + step3_14; |
| 752 step1_2 = step2_2 + step3_13; |
| 753 step1_3 = step2_3 + step3_12; |
| 754 step1_4 = step2_4 + step3_11; |
| 755 step1_5 = step2_5 + step3_10; |
| 756 step1_6 = step2_6 + step3_9; |
| 757 step1_7 = step2_7 + step3_8; |
| 758 step1_8 = step2_7 - step3_8; |
| 759 step1_9 = step2_6 - step3_9; |
| 760 step1_10 = step2_5 - step3_10; |
| 761 step1_11 = step2_4 - step3_11; |
| 762 step1_12 = step2_3 - step3_12; |
| 763 step1_13 = step2_2 - step3_13; |
| 764 step1_14 = step2_1 - step3_14; |
| 765 step1_15 = step2_0 - step3_15; |
| 766 |
| 767 __asm__ __volatile__ ( |
| 768 "sub %[temp0], %[step2_27], %[step2_20] \n\t" |
| 769 "mtlo %[const_2_power_13], $ac0 \n\t" |
| 770 "mthi $zero, $ac0 \n\t" |
| 771 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" |
| 772 "extp %[step1_20], $ac0, 31 \n\t" |
| 773 |
| 774 : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) |
| 775 : [const_2_power_13] "r" (const_2_power_13), |
| 776 [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), |
| 777 [cospi_16_64] "r" (cospi_16_64) |
| 778 ); |
| 779 |
| 780 temp21 = (step2_20 + step2_27) * cospi_16_64; |
| 781 step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; |
| 782 |
| 783 __asm__ __volatile__ ( |
| 784 "sub %[temp0], %[step2_26], %[step2_21] \n\t" |
| 785 "mtlo %[const_2_power_13], $ac0 \n\t" |
| 786 "mthi $zero, $ac0 \n\t" |
| 787 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" |
| 788 "extp %[step1_21], $ac0, 31 \n\t" |
| 789 |
| 790 : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) |
| 791 : [const_2_power_13] "r" (const_2_power_13), |
| 792 [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), |
| 793 [cospi_16_64] "r" (cospi_16_64) |
| 794 ); |
| 795 |
| 796 temp21 = (step2_21 + step2_26) * cospi_16_64; |
| 797 step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; |
| 798 |
| 799 __asm__ __volatile__ ( |
| 800 "sub %[temp0], %[step2_25], %[step2_22] \n\t" |
| 801 "mtlo %[const_2_power_13], $ac0 \n\t" |
| 802 "mthi $zero, $ac0 \n\t" |
| 803 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" |
| 804 "extp %[step1_22], $ac0, 31 \n\t" |
| 805 |
| 806 : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) |
| 807 : [const_2_power_13] "r" (const_2_power_13), |
| 808 [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), |
| 809 [cospi_16_64] "r" (cospi_16_64) |
| 810 ); |
| 811 |
| 812 temp21 = (step2_22 + step2_25) * cospi_16_64; |
| 813 step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; |
| 814 |
| 815 __asm__ __volatile__ ( |
| 816 "sub %[temp0], %[step2_24], %[step2_23] \n\t" |
| 817 "mtlo %[const_2_power_13], $ac0 \n\t" |
| 818 "mthi $zero, $ac0 \n\t" |
| 819 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" |
| 820 "extp %[step1_23], $ac0, 31 \n\t" |
| 821 |
| 822 : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) |
| 823 : [const_2_power_13] "r" (const_2_power_13), |
| 824 [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), |
| 825 [cospi_16_64] "r" (cospi_16_64) |
| 826 ); |
| 827 |
| 828 temp21 = (step2_23 + step2_24) * cospi_16_64; |
| 829 step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; |
| 830 |
| 831 // final stage |
| 832 output[0 * 32] = step1_0 + step2_31; |
| 833 output[1 * 32] = step1_1 + step2_30; |
| 834 output[2 * 32] = step1_2 + step2_29; |
| 835 output[3 * 32] = step1_3 + step2_28; |
| 836 output[4 * 32] = step1_4 + step1_27; |
| 837 output[5 * 32] = step1_5 + step1_26; |
| 838 output[6 * 32] = step1_6 + step1_25; |
| 839 output[7 * 32] = step1_7 + step1_24; |
| 840 output[8 * 32] = step1_8 + step1_23; |
| 841 output[9 * 32] = step1_9 + step1_22; |
| 842 output[10 * 32] = step1_10 + step1_21; |
| 843 output[11 * 32] = step1_11 + step1_20; |
| 844 output[12 * 32] = step1_12 + step2_19; |
| 845 output[13 * 32] = step1_13 + step2_18; |
| 846 output[14 * 32] = step1_14 + step2_17; |
| 847 output[15 * 32] = step1_15 + step2_16; |
| 848 output[16 * 32] = step1_15 - step2_16; |
| 849 output[17 * 32] = step1_14 - step2_17; |
| 850 output[18 * 32] = step1_13 - step2_18; |
| 851 output[19 * 32] = step1_12 - step2_19; |
| 852 output[20 * 32] = step1_11 - step1_20; |
| 853 output[21 * 32] = step1_10 - step1_21; |
| 854 output[22 * 32] = step1_9 - step1_22; |
| 855 output[23 * 32] = step1_8 - step1_23; |
| 856 output[24 * 32] = step1_7 - step1_24; |
| 857 output[25 * 32] = step1_6 - step1_25; |
| 858 output[26 * 32] = step1_5 - step1_26; |
| 859 output[27 * 32] = step1_4 - step1_27; |
| 860 output[28 * 32] = step1_3 - step2_28; |
| 861 output[29 * 32] = step1_2 - step2_29; |
| 862 output[30 * 32] = step1_1 - step2_30; |
| 863 output[31 * 32] = step1_0 - step2_31; |
| 864 |
| 865 input += 32; |
| 866 output += 1; |
| 867 } |
| 868 } |
| 869 |
| 870 void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, |
| 871 int dest_stride) { |
| 872 DECLARE_ALIGNED(32, int16_t, out[32 * 32]); |
| 873 int16_t *outptr = out; |
| 874 uint32_t pos = 45; |
| 875 |
| 876 /* bit positon for extract from acc */ |
| 877 __asm__ __volatile__ ( |
| 878 "wrdsp %[pos], 1 \n\t" |
| 879 : |
| 880 : [pos] "r" (pos) |
| 881 ); |
| 882 |
| 883 // Rows |
| 884 idct32_1d_rows_dspr2(input, outptr); |
| 885 |
| 886 // Columns |
| 887 vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride); |
| 888 } |
| 889 |
/*
 * DC-only 32x32 inverse transform.
 *
 * Derives a single offset a1 from input[0] and applies it to 32 rows of
 * 32 bytes starting at `dest`, advancing `stride` bytes per row.  A negative
 * offset is applied with a saturating unsigned byte subtract, a non-negative
 * one with a saturating unsigned byte add, so every byte stays in 0..255.
 *
 * input  - coefficient block; only input[0] is read.
 * dest   - destination rows, accessed with word loads/stores (the original
 *          code notes that the memory is four byte aligned).
 * stride - distance in bytes between consecutive destination rows.
 */
void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int stride) {
  int r, out;
  int32_t a1, absa1;
  int32_t vector_a1;
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1                         \n\t"

    :
    : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

  /* a1 = (out + 32) >> 6, i.e. round-to-nearest division by 64. */
  __asm__ __volatile__ (
    "addi       %[out],     %[out],     32            \n\t"
    "sra        %[a1],      %[out],     6             \n\t"

    : [out] "+r" (out), [a1] "=r" (a1)
    :
  );

  /*
   * replv.qb replicates only the low 8 bits of its source register, so a
   * magnitude above 255 would wrap (256 -> 0) instead of saturating.  Clamp
   * to 255 first: a saturating add/sub of 255 already pins every byte to
   * 255/0, which is the same result any larger magnitude must produce.
   */
  if (a1 < 0) {
    absa1 = (a1 < -255) ? 255 : -a1;
  } else {
    absa1 = (a1 > 255) ? 255 : a1;
  }

  /* Broadcast the magnitude into all four byte lanes. */
  __asm__ __volatile__ (
    "replv.qb   %[vector_a1],   %[absa1]              \n\t"

    : [vector_a1] "=r" (vector_a1)
    : [absa1] "r" (absa1)
  );

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    for (r = 32; r--;) {
      __asm__ __volatile__ (
        "lw         %[t1],          0(%[dest])                  \n\t"
        "lw         %[t2],          4(%[dest])                  \n\t"
        "lw         %[t3],          8(%[dest])                  \n\t"
        "lw         %[t4],          12(%[dest])                 \n\t"
        "subu_s.qb  %[vector_1],    %[t1],      %[vector_a1]    \n\t"
        "subu_s.qb  %[vector_2],    %[t2],      %[vector_a1]    \n\t"
        "subu_s.qb  %[vector_3],    %[t3],      %[vector_a1]    \n\t"
        "subu_s.qb  %[vector_4],    %[t4],      %[vector_a1]    \n\t"
        "sw         %[vector_1],    0(%[dest])                  \n\t"
        "sw         %[vector_2],    4(%[dest])                  \n\t"
        "sw         %[vector_3],    8(%[dest])                  \n\t"
        "sw         %[vector_4],    12(%[dest])                 \n\t"

        "lw         %[t1],          16(%[dest])                 \n\t"
        "lw         %[t2],          20(%[dest])                 \n\t"
        "lw         %[t3],          24(%[dest])                 \n\t"
        "lw         %[t4],          28(%[dest])                 \n\t"
        "subu_s.qb  %[vector_1],    %[t1],      %[vector_a1]    \n\t"
        "subu_s.qb  %[vector_2],    %[t2],      %[vector_a1]    \n\t"
        "subu_s.qb  %[vector_3],    %[t3],      %[vector_a1]    \n\t"
        "subu_s.qb  %[vector_4],    %[t4],      %[vector_a1]    \n\t"
        "sw         %[vector_1],    16(%[dest])                 \n\t"
        "sw         %[vector_2],    20(%[dest])                 \n\t"
        "sw         %[vector_3],    24(%[dest])                 \n\t"
        "sw         %[vector_4],    28(%[dest])                 \n\t"

        "add        %[dest],        %[dest],    %[stride]       \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
          [dest] "+&r" (dest)
        : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
        /* The asm reads and writes *dest, which is not an operand. */
        : "memory"
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    for (r = 32; r--;) {
      __asm__ __volatile__ (
        "lw         %[t1],          0(%[dest])                  \n\t"
        "lw         %[t2],          4(%[dest])                  \n\t"
        "lw         %[t3],          8(%[dest])                  \n\t"
        "lw         %[t4],          12(%[dest])                 \n\t"
        "addu_s.qb  %[vector_1],    %[t1],      %[vector_a1]    \n\t"
        "addu_s.qb  %[vector_2],    %[t2],      %[vector_a1]    \n\t"
        "addu_s.qb  %[vector_3],    %[t3],      %[vector_a1]    \n\t"
        "addu_s.qb  %[vector_4],    %[t4],      %[vector_a1]    \n\t"
        "sw         %[vector_1],    0(%[dest])                  \n\t"
        "sw         %[vector_2],    4(%[dest])                  \n\t"
        "sw         %[vector_3],    8(%[dest])                  \n\t"
        "sw         %[vector_4],    12(%[dest])                 \n\t"

        "lw         %[t1],          16(%[dest])                 \n\t"
        "lw         %[t2],          20(%[dest])                 \n\t"
        "lw         %[t3],          24(%[dest])                 \n\t"
        "lw         %[t4],          28(%[dest])                 \n\t"
        "addu_s.qb  %[vector_1],    %[t1],      %[vector_a1]    \n\t"
        "addu_s.qb  %[vector_2],    %[t2],      %[vector_a1]    \n\t"
        "addu_s.qb  %[vector_3],    %[t3],      %[vector_a1]    \n\t"
        "addu_s.qb  %[vector_4],    %[t4],      %[vector_a1]    \n\t"
        "sw         %[vector_1],    16(%[dest])                 \n\t"
        "sw         %[vector_2],    20(%[dest])                 \n\t"
        "sw         %[vector_3],    24(%[dest])                 \n\t"
        "sw         %[vector_4],    28(%[dest])                 \n\t"

        "add        %[dest],        %[dest],    %[stride]       \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
          [dest] "+&r" (dest)
        : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
        /* The asm reads and writes *dest, which is not an operand. */
        : "memory"
      );
    }
  }
}
| 1013 #endif // #if HAVE_DSPR2 |
OLD | NEW |