| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <assert.h> | 11 #include <assert.h> |
| 12 #include <stdio.h> | 12 #include <stdio.h> |
| 13 | 13 |
| 14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
| 15 #include "./vp9_rtcd.h" | 15 #include "./vp9_rtcd.h" |
| 16 #include "vp9/common/vp9_common.h" | 16 #include "vp9/common/vp9_common.h" |
| 17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
| 18 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | 18 #include "vpx_dsp/mips/inv_txfm_dspr2.h" |
| 19 #include "vpx_dsp/txfm_common.h" | 19 #include "vpx_dsp/txfm_common.h" |
| 20 #include "vpx_ports/mem.h" | 20 #include "vpx_ports/mem.h" |
| 21 | 21 |
| 22 #if HAVE_DSPR2 | 22 #if HAVE_DSPR2 |
| 23 static void idct8_rows_dspr2(const int16_t *input, int16_t *output, | |
| 24 uint32_t no_rows) { | |
| 25 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | |
| 26 const int const_2_power_13 = 8192; | |
| 27 int Temp0, Temp1, Temp2, Temp3, Temp4; | |
| 28 int i; | |
| 29 | |
| 30 for (i = no_rows; i--; ) { | |
| 31 __asm__ __volatile__ ( | |
| 32 /* | |
| 33 temp_1 = (input[0] + input[4]) * cospi_16_64; | |
| 34 step2_0 = dct_const_round_shift(temp_1); | |
| 35 | |
| 36 temp_2 = (input[0] - input[4]) * cospi_16_64; | |
| 37 step2_1 = dct_const_round_shift(temp_2); | |
| 38 */ | |
| 39 "lh %[Temp0], 0(%[input]) \n\t" | |
| 40 "lh %[Temp1], 8(%[input]) \n\t" | |
| 41 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 42 "mthi $zero, $ac0 \n\t" | |
| 43 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 44 "mthi $zero, $ac1 \n\t" | |
| 45 "add %[Temp2], %[Temp0], %[Temp1] \n\t" | |
| 46 "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" | |
| 47 "extp %[Temp4], $ac0, 31 \n\t" | |
| 48 | |
| 49 "sub %[Temp3], %[Temp0], %[Temp1] \n\t" | |
| 50 "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" | |
| 51 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 52 "mthi $zero, $ac0 \n\t" | |
| 53 "extp %[Temp2], $ac1, 31 \n\t" | |
| 54 | |
| 55 /* | |
| 56 temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; | |
| 57 step2_2 = dct_const_round_shift(temp_1); | |
| 58 */ | |
| 59 "lh %[Temp0], 4(%[input]) \n\t" | |
| 60 "lh %[Temp1], 12(%[input]) \n\t" | |
| 61 "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" | |
| 62 "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" | |
| 63 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 64 "mthi $zero, $ac1 \n\t" | |
| 65 "extp %[Temp3], $ac0, 31 \n\t" | |
| 66 | |
| 67 /* | |
| 68 step1_1 = step2_1 + step2_2; | |
| 69 step1_2 = step2_1 - step2_2; | |
| 70 */ | |
| 71 "add %[step1_1], %[Temp2], %[Temp3] \n\t" | |
| 72 "sub %[step1_2], %[Temp2], %[Temp3] \n\t" | |
| 73 | |
| 74 /* | |
| 75 temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; | |
| 76 step2_3 = dct_const_round_shift(temp_2); | |
| 77 */ | |
| 78 "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" | |
| 79 "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" | |
| 80 "extp %[Temp1], $ac1, 31 \n\t" | |
| 81 | |
| 82 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 83 "mthi $zero, $ac0 \n\t" | |
| 84 | |
| 85 /* | |
| 86 step1_0 = step2_0 + step2_3; | |
| 87 step1_3 = step2_0 - step2_3; | |
| 88 */ | |
| 89 "add %[step1_0], %[Temp4], %[Temp1] \n\t" | |
| 90 "sub %[step1_3], %[Temp4], %[Temp1] \n\t" | |
| 91 | |
| 92 /* | |
| 93 temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; | |
| 94 step1_4 = dct_const_round_shift(temp_1); | |
| 95 */ | |
| 96 "lh %[Temp0], 2(%[input]) \n\t" | |
| 97 "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" | |
| 98 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 99 "mthi $zero, $ac1 \n\t" | |
| 100 "lh %[Temp1], 14(%[input]) \n\t" | |
| 101 "lh %[Temp0], 2(%[input]) \n\t" | |
| 102 "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" | |
| 103 "extp %[step1_4], $ac0, 31 \n\t" | |
| 104 | |
| 105 /* | |
| 106 temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; | |
| 107 step1_7 = dct_const_round_shift(temp_2); | |
| 108 */ | |
| 109 "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" | |
| 110 "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" | |
| 111 "extp %[step1_7], $ac1, 31 \n\t" | |
| 112 | |
| 113 /* | |
| 114 temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; | |
| 115 step1_5 = dct_const_round_shift(temp_1); | |
| 116 */ | |
| 117 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 118 "mthi $zero, $ac0 \n\t" | |
| 119 "lh %[Temp0], 10(%[input]) \n\t" | |
| 120 "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" | |
| 121 "lh %[Temp1], 6(%[input]) \n\t" | |
| 122 "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" | |
| 123 "extp %[step1_5], $ac0, 31 \n\t" | |
| 124 | |
| 125 /* | |
| 126 temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; | |
| 127 step1_6 = dct_const_round_shift(temp_2); | |
| 128 */ | |
| 129 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 130 "mthi $zero, $ac1 \n\t" | |
| 131 "lh %[Temp0], 10(%[input]) \n\t" | |
| 132 "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" | |
| 133 "lh %[Temp1], 6(%[input]) \n\t" | |
| 134 "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" | |
| 135 "extp %[step1_6], $ac1, 31 \n\t" | |
| 136 | |
| 137 /* | |
| 138 temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; | |
| 139 temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; | |
| 140 */ | |
| 141 "sub %[Temp0], %[step1_7], %[step1_6] \n\t" | |
| 142 "sub %[Temp0], %[Temp0], %[step1_4] \n\t" | |
| 143 "add %[Temp0], %[Temp0], %[step1_5] \n\t" | |
| 144 "sub %[Temp1], %[step1_4], %[step1_5] \n\t" | |
| 145 "sub %[Temp1], %[Temp1], %[step1_6] \n\t" | |
| 146 "add %[Temp1], %[Temp1], %[step1_7] \n\t" | |
| 147 | |
| 148 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 149 "mthi $zero, $ac0 \n\t" | |
| 150 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 151 "mthi $zero, $ac1 \n\t" | |
| 152 | |
| 153 "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" | |
| 154 "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" | |
| 155 | |
| 156 /* | |
| 157 step1_4 = step1_4 + step1_5; | |
| 158 step1_7 = step1_6 + step1_7; | |
| 159 */ | |
| 160 "add %[step1_4], %[step1_4], %[step1_5] \n\t" | |
| 161 "add %[step1_7], %[step1_7], %[step1_6] \n\t" | |
| 162 | |
| 163 "extp %[step1_5], $ac0, 31 \n\t" | |
| 164 "extp %[step1_6], $ac1, 31 \n\t" | |
| 165 | |
| 166 "add %[Temp0], %[step1_0], %[step1_7] \n\t" | |
| 167 "sh %[Temp0], 0(%[output]) \n\t" | |
| 168 "add %[Temp1], %[step1_1], %[step1_6] \n\t" | |
| 169 "sh %[Temp1], 16(%[output]) \n\t" | |
| 170 "add %[Temp0], %[step1_2], %[step1_5] \n\t" | |
| 171 "sh %[Temp0], 32(%[output]) \n\t" | |
| 172 "add %[Temp1], %[step1_3], %[step1_4] \n\t" | |
| 173 "sh %[Temp1], 48(%[output]) \n\t" | |
| 174 | |
| 175 "sub %[Temp0], %[step1_3], %[step1_4] \n\t" | |
| 176 "sh %[Temp0], 64(%[output]) \n\t" | |
| 177 "sub %[Temp1], %[step1_2], %[step1_5] \n\t" | |
| 178 "sh %[Temp1], 80(%[output]) \n\t" | |
| 179 "sub %[Temp0], %[step1_1], %[step1_6] \n\t" | |
| 180 "sh %[Temp0], 96(%[output]) \n\t" | |
| 181 "sub %[Temp1], %[step1_0], %[step1_7] \n\t" | |
| 182 "sh %[Temp1], 112(%[output]) \n\t" | |
| 183 | |
| 184 : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), | |
| 185 [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), | |
| 186 [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), | |
| 187 [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), | |
| 188 [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), | |
| 189 [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), | |
| 190 [Temp4] "=&r" (Temp4) | |
| 191 : [const_2_power_13] "r" (const_2_power_13), | |
| 192 [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), | |
| 193 [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), | |
| 194 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), | |
| 195 [cospi_24_64] "r" (cospi_24_64), | |
| 196 [output] "r" (output), [input] "r" (input) | |
| 197 ); | |
| 198 | |
| 199 input += 8; | |
| 200 output += 1; | |
| 201 } | |
| 202 } | |
| 203 | |
| 204 static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, | |
| 205 int dest_stride) { | |
| 206 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | |
| 207 int Temp0, Temp1, Temp2, Temp3; | |
| 208 int i; | |
| 209 const int const_2_power_13 = 8192; | |
| 210 uint8_t *dest_pix; | |
| 211 uint8_t *cm = vpx_ff_cropTbl; | |
| 212 | |
| 213 /* prefetch vpx_ff_cropTbl */ | |
| 214 prefetch_load(vpx_ff_cropTbl); | |
| 215 prefetch_load(vpx_ff_cropTbl + 32); | |
| 216 prefetch_load(vpx_ff_cropTbl + 64); | |
| 217 prefetch_load(vpx_ff_cropTbl + 96); | |
| 218 prefetch_load(vpx_ff_cropTbl + 128); | |
| 219 prefetch_load(vpx_ff_cropTbl + 160); | |
| 220 prefetch_load(vpx_ff_cropTbl + 192); | |
| 221 prefetch_load(vpx_ff_cropTbl + 224); | |
| 222 | |
| 223 for (i = 0; i < 8; ++i) { | |
| 224 dest_pix = (dest + i); | |
| 225 | |
| 226 __asm__ __volatile__ ( | |
| 227 /* | |
| 228 temp_1 = (input[0] + input[4]) * cospi_16_64; | |
| 229 step2_0 = dct_const_round_shift(temp_1); | |
| 230 | |
| 231 temp_2 = (input[0] - input[4]) * cospi_16_64; | |
| 232 step2_1 = dct_const_round_shift(temp_2); | |
| 233 */ | |
| 234 "lh %[Temp0], 0(%[input]) \n\t" | |
| 235 "lh %[Temp1], 8(%[input]) \n\t" | |
| 236 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 237 "mthi $zero, $ac0 \n\t" | |
| 238 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 239 "mthi $zero, $ac1 \n\t" | |
| 240 "add %[Temp2], %[Temp0], %[Temp1] \n\t" | |
| 241 "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" | |
| 242 "extp %[step1_6], $ac0, 31 \n\t" | |
| 243 | |
| 244 "sub %[Temp3], %[Temp0], %[Temp1] \n\t" | |
| 245 "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" | |
| 246 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 247 "mthi $zero, $ac0 \n\t" | |
| 248 "extp %[Temp2], $ac1, 31 \n\t" | |
| 249 | |
| 250 /* | |
| 251 temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; | |
| 252 step2_2 = dct_const_round_shift(temp_1); | |
| 253 */ | |
| 254 "lh %[Temp0], 4(%[input]) \n\t" | |
| 255 "lh %[Temp1], 12(%[input]) \n\t" | |
| 256 "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" | |
| 257 "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" | |
| 258 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 259 "mthi $zero, $ac1 \n\t" | |
| 260 "extp %[Temp3], $ac0, 31 \n\t" | |
| 261 | |
| 262 /* | |
| 263 step1_1 = step2_1 + step2_2; | |
| 264 step1_2 = step2_1 - step2_2; | |
| 265 */ | |
| 266 "add %[step1_1], %[Temp2], %[Temp3] \n\t" | |
| 267 "sub %[step1_2], %[Temp2], %[Temp3] \n\t" | |
| 268 | |
| 269 /* | |
| 270 temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; | |
| 271 step2_3 = dct_const_round_shift(temp_2); | |
| 272 */ | |
| 273 "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" | |
| 274 "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" | |
| 275 "extp %[Temp1], $ac1, 31 \n\t" | |
| 276 | |
| 277 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 278 "mthi $zero, $ac0 \n\t" | |
| 279 | |
| 280 /* | |
| 281 step1_0 = step2_0 + step2_3; | |
| 282 step1_3 = step2_0 - step2_3; | |
| 283 */ | |
| 284 "add %[step1_0], %[step1_6], %[Temp1] \n\t" | |
| 285 "sub %[step1_3], %[step1_6], %[Temp1] \n\t" | |
| 286 | |
| 287 /* | |
| 288 temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; | |
| 289 step1_4 = dct_const_round_shift(temp_1); | |
| 290 */ | |
| 291 "lh %[Temp0], 2(%[input]) \n\t" | |
| 292 "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" | |
| 293 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 294 "mthi $zero, $ac1 \n\t" | |
| 295 "lh %[Temp1], 14(%[input]) \n\t" | |
| 296 "lh %[Temp0], 2(%[input]) \n\t" | |
| 297 "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" | |
| 298 "extp %[step1_4], $ac0, 31 \n\t" | |
| 299 | |
| 300 /* | |
| 301 temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; | |
| 302 step1_7 = dct_const_round_shift(temp_2); | |
| 303 */ | |
| 304 "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" | |
| 305 "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" | |
| 306 "extp %[step1_7], $ac1, 31 \n\t" | |
| 307 | |
| 308 /* | |
| 309 temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; | |
| 310 step1_5 = dct_const_round_shift(temp_1); | |
| 311 */ | |
| 312 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 313 "mthi $zero, $ac0 \n\t" | |
| 314 "lh %[Temp0], 10(%[input]) \n\t" | |
| 315 "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" | |
| 316 "lh %[Temp1], 6(%[input]) \n\t" | |
| 317 "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" | |
| 318 "extp %[step1_5], $ac0, 31 \n\t" | |
| 319 | |
| 320 /* | |
| 321 temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; | |
| 322 step1_6 = dct_const_round_shift(temp_2); | |
| 323 */ | |
| 324 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 325 "mthi $zero, $ac1 \n\t" | |
| 326 "lh %[Temp0], 10(%[input]) \n\t" | |
| 327 "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" | |
| 328 "lh %[Temp1], 6(%[input]) \n\t" | |
| 329 "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" | |
| 330 "extp %[step1_6], $ac1, 31 \n\t" | |
| 331 | |
| 332 /* | |
| 333 temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; | |
| 334 temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; | |
| 335 */ | |
| 336 "sub %[Temp0], %[step1_7], %[step1_6] \n\t" | |
| 337 "sub %[Temp0], %[Temp0], %[step1_4] \n\t" | |
| 338 "add %[Temp0], %[Temp0], %[step1_5] \n\t" | |
| 339 "sub %[Temp1], %[step1_4], %[step1_5] \n\t" | |
| 340 "sub %[Temp1], %[Temp1], %[step1_6] \n\t" | |
| 341 "add %[Temp1], %[Temp1], %[step1_7] \n\t" | |
| 342 | |
| 343 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 344 "mthi $zero, $ac0 \n\t" | |
| 345 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 346 "mthi $zero, $ac1 \n\t" | |
| 347 | |
| 348 "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" | |
| 349 "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" | |
| 350 | |
| 351 /* | |
| 352 step1_4 = step1_4 + step1_5; | |
| 353 step1_7 = step1_6 + step1_7; | |
| 354 */ | |
| 355 "add %[step1_4], %[step1_4], %[step1_5] \n\t" | |
| 356 "add %[step1_7], %[step1_7], %[step1_6] \n\t" | |
| 357 | |
| 358 "extp %[step1_5], $ac0, 31 \n\t" | |
| 359 "extp %[step1_6], $ac1, 31 \n\t" | |
| 360 | |
| 361 /* add block */ | |
| 362 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
| 363 "add %[Temp0], %[step1_0], %[step1_7] \n\t" | |
| 364 "addi %[Temp0], %[Temp0], 16 \n\t" | |
| 365 "sra %[Temp0], %[Temp0], 5 \n\t" | |
| 366 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
| 367 "add %[Temp0], %[step1_1], %[step1_6] \n\t" | |
| 368 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
| 369 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
| 370 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 371 | |
| 372 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
| 373 "addi %[Temp0], %[Temp0], 16 \n\t" | |
| 374 "sra %[Temp0], %[Temp0], 5 \n\t" | |
| 375 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
| 376 "add %[Temp0], %[step1_2], %[step1_5] \n\t" | |
| 377 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
| 378 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
| 379 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 380 | |
| 381 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
| 382 "addi %[Temp0], %[Temp0], 16 \n\t" | |
| 383 "sra %[Temp0], %[Temp0], 5 \n\t" | |
| 384 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
| 385 "add %[Temp0], %[step1_3], %[step1_4] \n\t" | |
| 386 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
| 387 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
| 388 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 389 | |
| 390 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
| 391 "addi %[Temp0], %[Temp0], 16 \n\t" | |
| 392 "sra %[Temp0], %[Temp0], 5 \n\t" | |
| 393 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
| 394 "sub %[Temp0], %[step1_3], %[step1_4] \n\t" | |
| 395 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
| 396 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
| 397 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 398 | |
| 399 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
| 400 "addi %[Temp0], %[Temp0], 16 \n\t" | |
| 401 "sra %[Temp0], %[Temp0], 5 \n\t" | |
| 402 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
| 403 "sub %[Temp0], %[step1_2], %[step1_5] \n\t" | |
| 404 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
| 405 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
| 406 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 407 | |
| 408 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
| 409 "addi %[Temp0], %[Temp0], 16 \n\t" | |
| 410 "sra %[Temp0], %[Temp0], 5 \n\t" | |
| 411 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
| 412 "sub %[Temp0], %[step1_1], %[step1_6] \n\t" | |
| 413 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
| 414 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
| 415 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 416 | |
| 417 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
| 418 "addi %[Temp0], %[Temp0], 16 \n\t" | |
| 419 "sra %[Temp0], %[Temp0], 5 \n\t" | |
| 420 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
| 421 "sub %[Temp0], %[step1_0], %[step1_7] \n\t" | |
| 422 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
| 423 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
| 424 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 425 | |
| 426 "lbu %[Temp1], 0(%[dest_pix]) \n\t" | |
| 427 "addi %[Temp0], %[Temp0], 16 \n\t" | |
| 428 "sra %[Temp0], %[Temp0], 5 \n\t" | |
| 429 "add %[Temp1], %[Temp1], %[Temp0] \n\t" | |
| 430 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
| 431 "sb %[Temp2], 0(%[dest_pix]) \n\t" | |
| 432 | |
| 433 : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), | |
| 434 [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), | |
| 435 [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), | |
| 436 [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), | |
| 437 [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), | |
| 438 [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), | |
| 439 [dest_pix] "+r" (dest_pix) | |
| 440 : [const_2_power_13] "r" (const_2_power_13), | |
| 441 [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), | |
| 442 [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), | |
| 443 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), | |
| 444 [cospi_24_64] "r" (cospi_24_64), | |
| 445 [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) | |
| 446 ); | |
| 447 | |
| 448 input += 8; | |
| 449 } | |
| 450 } | |
| 451 | |
| 452 void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 453 int dest_stride) { | |
| 454 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); | |
| 455 int16_t *outptr = out; | |
| 456 uint32_t pos = 45; | |
| 457 | |
| 458 /* bit positon for extract from acc */ | |
| 459 __asm__ __volatile__ ( | |
| 460 "wrdsp %[pos], 1 \n\t" | |
| 461 : | |
| 462 : [pos] "r" (pos) | |
| 463 ); | |
| 464 | |
| 465 // First transform rows | |
| 466 idct8_rows_dspr2(input, outptr, 8); | |
| 467 | |
| 468 // Then transform columns and add to dest | |
| 469 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); | |
| 470 } | |
| 471 | |
| 472 static void iadst8_dspr2(const int16_t *input, int16_t *output) { | |
| 473 int s0, s1, s2, s3, s4, s5, s6, s7; | |
| 474 int x0, x1, x2, x3, x4, x5, x6, x7; | |
| 475 | |
| 476 x0 = input[7]; | |
| 477 x1 = input[0]; | |
| 478 x2 = input[5]; | |
| 479 x3 = input[2]; | |
| 480 x4 = input[3]; | |
| 481 x5 = input[4]; | |
| 482 x6 = input[1]; | |
| 483 x7 = input[6]; | |
| 484 | |
| 485 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { | |
| 486 output[0] = output[1] = output[2] = output[3] = output[4] | |
| 487 = output[5] = output[6] = output[7] = 0; | |
| 488 return; | |
| 489 } | |
| 490 | |
| 491 // stage 1 | |
| 492 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; | |
| 493 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; | |
| 494 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; | |
| 495 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; | |
| 496 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; | |
| 497 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; | |
| 498 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; | |
| 499 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; | |
| 500 | |
| 501 x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); | |
| 502 x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); | |
| 503 x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); | |
| 504 x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); | |
| 505 x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); | |
| 506 x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); | |
| 507 x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); | |
| 508 x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); | |
| 509 | |
| 510 // stage 2 | |
| 511 s0 = x0; | |
| 512 s1 = x1; | |
| 513 s2 = x2; | |
| 514 s3 = x3; | |
| 515 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; | |
| 516 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; | |
| 517 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; | |
| 518 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; | |
| 519 | |
| 520 x0 = s0 + s2; | |
| 521 x1 = s1 + s3; | |
| 522 x2 = s0 - s2; | |
| 523 x3 = s1 - s3; | |
| 524 x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); | |
| 525 x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); | |
| 526 x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); | |
| 527 x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); | |
| 528 | |
| 529 // stage 3 | |
| 530 s2 = cospi_16_64 * (x2 + x3); | |
| 531 s3 = cospi_16_64 * (x2 - x3); | |
| 532 s6 = cospi_16_64 * (x6 + x7); | |
| 533 s7 = cospi_16_64 * (x6 - x7); | |
| 534 | |
| 535 x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); | |
| 536 x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); | |
| 537 x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); | |
| 538 x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); | |
| 539 | |
| 540 output[0] = x0; | |
| 541 output[1] = -x4; | |
| 542 output[2] = x6; | |
| 543 output[3] = -x2; | |
| 544 output[4] = x3; | |
| 545 output[5] = -x7; | |
| 546 output[6] = x5; | |
| 547 output[7] = -x1; | |
| 548 } | |
| 549 | |
| 550 void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, | 23 void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, |
| 551 int dest_stride, int tx_type) { | 24 int dest_stride, int tx_type) { |
| 552 int i, j; | 25 int i, j; |
| 553 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); | 26 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); |
| 554 int16_t *outptr = out; | 27 int16_t *outptr = out; |
| 555 int16_t temp_in[8 * 8], temp_out[8]; | 28 int16_t temp_in[8 * 8], temp_out[8]; |
| 556 uint32_t pos = 45; | 29 uint32_t pos = 45; |
| 557 | 30 |
| 558 /* bit position for extract from acc */ | 31 /* bit position for extract from acc */ |
| 559 __asm__ __volatile__ ( | 32 __asm__ __volatile__ ( |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 610 dest[j * dest_stride + i] = | 83 dest[j * dest_stride + i] = |
| 611 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) | 84 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) |
| 612 + dest[j * dest_stride + i]); | 85 + dest[j * dest_stride + i]); |
| 613 } | 86 } |
| 614 break; | 87 break; |
| 615 default: | 88 default: |
| 616 printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); | 89 printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); |
| 617 break; | 90 break; |
| 618 } | 91 } |
| 619 } | 92 } |
| 620 | |
| 621 void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 622 int dest_stride) { | |
| 623 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); | |
| 624 int16_t *outptr = out; | |
| 625 uint32_t pos = 45; | |
| 626 | |
| 627 /* bit positon for extract from acc */ | |
| 628 __asm__ __volatile__ ( | |
| 629 "wrdsp %[pos], 1 \n\t" | |
| 630 : | |
| 631 : [pos] "r" (pos) | |
| 632 ); | |
| 633 | |
| 634 // First transform rows | |
| 635 idct8_rows_dspr2(input, outptr, 4); | |
| 636 | |
| 637 outptr += 4; | |
| 638 | |
| 639 __asm__ __volatile__ ( | |
| 640 "sw $zero, 0(%[outptr]) \n\t" | |
| 641 "sw $zero, 4(%[outptr]) \n\t" | |
| 642 "sw $zero, 16(%[outptr]) \n\t" | |
| 643 "sw $zero, 20(%[outptr]) \n\t" | |
| 644 "sw $zero, 32(%[outptr]) \n\t" | |
| 645 "sw $zero, 36(%[outptr]) \n\t" | |
| 646 "sw $zero, 48(%[outptr]) \n\t" | |
| 647 "sw $zero, 52(%[outptr]) \n\t" | |
| 648 "sw $zero, 64(%[outptr]) \n\t" | |
| 649 "sw $zero, 68(%[outptr]) \n\t" | |
| 650 "sw $zero, 80(%[outptr]) \n\t" | |
| 651 "sw $zero, 84(%[outptr]) \n\t" | |
| 652 "sw $zero, 96(%[outptr]) \n\t" | |
| 653 "sw $zero, 100(%[outptr]) \n\t" | |
| 654 "sw $zero, 112(%[outptr]) \n\t" | |
| 655 "sw $zero, 116(%[outptr]) \n\t" | |
| 656 | |
| 657 : | |
| 658 : [outptr] "r" (outptr) | |
| 659 ); | |
| 660 | |
| 661 | |
| 662 // Then transform columns and add to dest | |
| 663 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); | |
| 664 } | |
| 665 | |
/*
 * DC-only 8x8 inverse DCT: only input[0] is read. It is scaled with
 * DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64, rounded with (x + 16) >> 5, and
 * the resulting constant is added to (or, when negative, subtracted from)
 * all 64 pixels of the block at dest using saturating quad-byte arithmetic.
 *
 * dest must be four-byte aligned on every row, because each row is
 * processed with two word loads and two word stores.
 */
void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t t1, t2, vector_a1, vector_1, vector_2;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
      "wrdsp  %[pos], 1  \n\t"

      :
      : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  __asm__ __volatile__ (
      "addi   %[out], %[out], 16  \n\t"
      "sra    %[a1], %[out], 5  \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

  /* replv.qb replicates only the low byte of its source, so a magnitude
   * above 255 would wrap. Pixels are 8-bit, so a saturating add/sub of 255
   * already drives every pixel to 255/0: clamping keeps the fully clipped
   * result. */
  absa1 = (a1 < 0) ? -a1 : a1;
  if (absa1 > 255) {
    absa1 = 255;
  }

  __asm__ __volatile__ (
      "replv.qb  %[vector_a1], %[absa1]  \n\t"

      : [vector_a1] "=r" (vector_a1)
      : [absa1] "r" (absa1)
  );

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw         %[t1], 0(%[dest])  \n\t"
          "lw         %[t2], 4(%[dest])  \n\t"
          "subu_s.qb  %[vector_1], %[t1], %[vector_a1]  \n\t"
          "subu_s.qb  %[vector_2], %[t2], %[vector_a1]  \n\t"
          "sw         %[vector_1], 0(%[dest])  \n\t"
          "sw         %[vector_2], 4(%[dest])  \n\t"
          /* addu: pointer arithmetic must not raise an overflow trap */
          "addu       %[dest], %[dest], %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
          : "memory"
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw         %[t1], 0(%[dest])  \n\t"
          "lw         %[t2], 4(%[dest])  \n\t"
          "addu_s.qb  %[vector_1], %[t1], %[vector_a1]  \n\t"
          "addu_s.qb  %[vector_2], %[t2], %[vector_a1]  \n\t"
          "sw         %[vector_1], 0(%[dest])  \n\t"
          "sw         %[vector_2], 4(%[dest])  \n\t"
          /* addu: pointer arithmetic must not raise an overflow trap */
          "addu       %[dest], %[dest], %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
          : "memory"
      );
    }
  }
}
| 746 #endif // #if HAVE_DSPR2 | 93 #endif // #if HAVE_DSPR2 |
| OLD | NEW |