| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <assert.h> | 11 #include <assert.h> |
| 12 #include <stdio.h> | 12 #include <stdio.h> |
| 13 | 13 |
| 14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
| 15 #include "./vp9_rtcd.h" | 15 #include "./vp9_rtcd.h" |
| 16 #include "vp9/common/vp9_common.h" | 16 #include "vp9/common/vp9_common.h" |
| 17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
| 18 #include "vp9/common/vp9_idct.h" | 18 #include "vp9/common/vp9_idct.h" |
| 19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | 19 #include "vpx_dsp/mips/inv_txfm_dspr2.h" |
| 20 #include "vpx_dsp/txfm_common.h" | 20 #include "vpx_dsp/txfm_common.h" |
| 21 #include "vpx_ports/mem.h" | 21 #include "vpx_ports/mem.h" |
| 22 | 22 |
| 23 #if HAVE_DSPR2 | 23 #if HAVE_DSPR2 |
| 24 static void idct16_rows_dspr2(const int16_t *input, int16_t *output, | |
| 25 uint32_t no_rows) { | |
| 26 int i; | |
| 27 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | |
| 28 int step1_10, step1_11, step1_12, step1_13; | |
| 29 int step2_0, step2_1, step2_2, step2_3; | |
| 30 int step2_8, step2_9, step2_10, step2_11; | |
| 31 int step2_12, step2_13, step2_14, step2_15; | |
| 32 int load1, load2, load3, load4, load5, load6, load7, load8; | |
| 33 int result1, result2, result3, result4; | |
| 34 const int const_2_power_13 = 8192; | |
| 35 | |
| 36 for (i = no_rows; i--; ) { | |
| 37 /* prefetch row */ | |
| 38 prefetch_load((const uint8_t *)(input + 16)); | |
| 39 | |
| 40 __asm__ __volatile__ ( | |
| 41 "lh %[load1], 0(%[input]) \n\t" | |
| 42 "lh %[load2], 16(%[input]) \n\t" | |
| 43 "lh %[load3], 8(%[input]) \n\t" | |
| 44 "lh %[load4], 24(%[input]) \n\t" | |
| 45 | |
| 46 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 47 "mthi $zero, $ac1 \n\t" | |
| 48 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 49 "mthi $zero, $ac2 \n\t" | |
| 50 "add %[result1], %[load1], %[load2] \n\t" | |
| 51 "sub %[result2], %[load1], %[load2] \n\t" | |
| 52 "madd $ac1, %[result1], %[cospi_16_64] \n\t" | |
| 53 "madd $ac2, %[result2], %[cospi_16_64] \n\t" | |
| 54 "extp %[step2_0], $ac1, 31 \n\t" | |
| 55 "extp %[step2_1], $ac2, 31 \n\t" | |
| 56 | |
| 57 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 58 "mthi $zero, $ac3 \n\t" | |
| 59 "madd $ac3, %[load3], %[cospi_24_64] \n\t" | |
| 60 "msub $ac3, %[load4], %[cospi_8_64] \n\t" | |
| 61 "extp %[step2_2], $ac3, 31 \n\t" | |
| 62 | |
| 63 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 64 "mthi $zero, $ac1 \n\t" | |
| 65 "madd $ac1, %[load3], %[cospi_8_64] \n\t" | |
| 66 "madd $ac1, %[load4], %[cospi_24_64] \n\t" | |
| 67 "extp %[step2_3], $ac1, 31 \n\t" | |
| 68 | |
| 69 "add %[step1_0], %[step2_0], %[step2_3] \n\t" | |
| 70 "add %[step1_1], %[step2_1], %[step2_2] \n\t" | |
| 71 "sub %[step1_2], %[step2_1], %[step2_2] \n\t" | |
| 72 "sub %[step1_3], %[step2_0], %[step2_3] \n\t" | |
| 73 | |
| 74 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
| 75 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
| 76 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
| 77 [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), | |
| 78 [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), | |
| 79 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), | |
| 80 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) | |
| 81 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 82 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), | |
| 83 [cospi_16_64] "r" (cospi_16_64) | |
| 84 ); | |
| 85 | |
| 86 __asm__ __volatile__ ( | |
| 87 "lh %[load5], 2(%[input]) \n\t" | |
| 88 "lh %[load6], 30(%[input]) \n\t" | |
| 89 "lh %[load7], 18(%[input]) \n\t" | |
| 90 "lh %[load8], 14(%[input]) \n\t" | |
| 91 | |
| 92 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 93 "mthi $zero, $ac1 \n\t" | |
| 94 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 95 "mthi $zero, $ac3 \n\t" | |
| 96 | |
| 97 "madd $ac1, %[load5], %[cospi_30_64] \n\t" | |
| 98 "msub $ac1, %[load6], %[cospi_2_64] \n\t" | |
| 99 "extp %[result1], $ac1, 31 \n\t" | |
| 100 | |
| 101 "madd $ac3, %[load7], %[cospi_14_64] \n\t" | |
| 102 "msub $ac3, %[load8], %[cospi_18_64] \n\t" | |
| 103 "extp %[result2], $ac3, 31 \n\t" | |
| 104 | |
| 105 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 106 "mthi $zero, $ac1 \n\t" | |
| 107 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 108 "mthi $zero, $ac2 \n\t" | |
| 109 | |
| 110 "madd $ac1, %[load7], %[cospi_18_64] \n\t" | |
| 111 "madd $ac1, %[load8], %[cospi_14_64] \n\t" | |
| 112 "extp %[result3], $ac1, 31 \n\t" | |
| 113 | |
| 114 "madd $ac2, %[load5], %[cospi_2_64] \n\t" | |
| 115 "madd $ac2, %[load6], %[cospi_30_64] \n\t" | |
| 116 "extp %[result4], $ac2, 31 \n\t" | |
| 117 | |
| 118 "sub %[load5], %[result1], %[result2] \n\t" | |
| 119 "sub %[load6], %[result4], %[result3] \n\t" | |
| 120 | |
| 121 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 122 "mthi $zero, $ac1 \n\t" | |
| 123 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 124 "mthi $zero, $ac3 \n\t" | |
| 125 | |
| 126 "madd $ac1, %[load6], %[cospi_24_64] \n\t" | |
| 127 "msub $ac1, %[load5], %[cospi_8_64] \n\t" | |
| 128 "madd $ac3, %[load5], %[cospi_24_64] \n\t" | |
| 129 "madd $ac3, %[load6], %[cospi_8_64] \n\t" | |
| 130 | |
| 131 "extp %[step2_9], $ac1, 31 \n\t" | |
| 132 "extp %[step2_14], $ac3, 31 \n\t" | |
| 133 "add %[step2_8], %[result1], %[result2] \n\t" | |
| 134 "add %[step2_15], %[result4], %[result3] \n\t" | |
| 135 | |
| 136 : [load5] "=&r" (load5), [load6] "=&r" (load6), | |
| 137 [load7] "=&r" (load7), [load8] "=&r" (load8), | |
| 138 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
| 139 [result3] "=&r" (result3), [result4] "=&r" (result4), | |
| 140 [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), | |
| 141 [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) | |
| 142 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 143 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), | |
| 144 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), | |
| 145 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
| 146 ); | |
| 147 | |
| 148 __asm__ __volatile__ ( | |
| 149 "lh %[load1], 10(%[input]) \n\t" | |
| 150 "lh %[load2], 22(%[input]) \n\t" | |
| 151 "lh %[load3], 26(%[input]) \n\t" | |
| 152 "lh %[load4], 6(%[input]) \n\t" | |
| 153 | |
| 154 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 155 "mthi $zero, $ac1 \n\t" | |
| 156 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 157 "mthi $zero, $ac3 \n\t" | |
| 158 | |
| 159 "madd $ac1, %[load1], %[cospi_22_64] \n\t" | |
| 160 "msub $ac1, %[load2], %[cospi_10_64] \n\t" | |
| 161 "extp %[result1], $ac1, 31 \n\t" | |
| 162 | |
| 163 "madd $ac3, %[load3], %[cospi_6_64] \n\t" | |
| 164 "msub $ac3, %[load4], %[cospi_26_64] \n\t" | |
| 165 "extp %[result2], $ac3, 31 \n\t" | |
| 166 | |
| 167 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 168 "mthi $zero, $ac1 \n\t" | |
| 169 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 170 "mthi $zero, $ac2 \n\t" | |
| 171 | |
| 172 "madd $ac1, %[load1], %[cospi_10_64] \n\t" | |
| 173 "madd $ac1, %[load2], %[cospi_22_64] \n\t" | |
| 174 "extp %[result3], $ac1, 31 \n\t" | |
| 175 | |
| 176 "madd $ac2, %[load3], %[cospi_26_64] \n\t" | |
| 177 "madd $ac2, %[load4], %[cospi_6_64] \n\t" | |
| 178 "extp %[result4], $ac2, 31 \n\t" | |
| 179 | |
| 180 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 181 "mthi $zero, $ac1 \n\t" | |
| 182 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 183 "mthi $zero, $ac3 \n\t" | |
| 184 | |
| 185 "sub %[load1], %[result2], %[result1] \n\t" | |
| 186 "sub %[load2], %[result4], %[result3] \n\t" | |
| 187 | |
| 188 "msub $ac1, %[load1], %[cospi_24_64] \n\t" | |
| 189 "msub $ac1, %[load2], %[cospi_8_64] \n\t" | |
| 190 "madd $ac3, %[load2], %[cospi_24_64] \n\t" | |
| 191 "msub $ac3, %[load1], %[cospi_8_64] \n\t" | |
| 192 | |
| 193 "extp %[step2_10], $ac1, 31 \n\t" | |
| 194 "extp %[step2_13], $ac3, 31 \n\t" | |
| 195 "add %[step2_11], %[result1], %[result2] \n\t" | |
| 196 "add %[step2_12], %[result4], %[result3] \n\t" | |
| 197 | |
| 198 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
| 199 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
| 200 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
| 201 [result3] "=&r" (result3), [result4] "=&r" (result4), | |
| 202 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), | |
| 203 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) | |
| 204 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 205 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), | |
| 206 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), | |
| 207 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
| 208 ); | |
| 209 | |
| 210 __asm__ __volatile__ ( | |
| 211 "lh %[load5], 4(%[input]) \n\t" | |
| 212 "lh %[load6], 28(%[input]) \n\t" | |
| 213 "lh %[load7], 20(%[input]) \n\t" | |
| 214 "lh %[load8], 12(%[input]) \n\t" | |
| 215 | |
| 216 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 217 "mthi $zero, $ac1 \n\t" | |
| 218 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 219 "mthi $zero, $ac3 \n\t" | |
| 220 | |
| 221 "madd $ac1, %[load5], %[cospi_28_64] \n\t" | |
| 222 "msub $ac1, %[load6], %[cospi_4_64] \n\t" | |
| 223 "extp %[result1], $ac1, 31 \n\t" | |
| 224 | |
| 225 "madd $ac3, %[load7], %[cospi_12_64] \n\t" | |
| 226 "msub $ac3, %[load8], %[cospi_20_64] \n\t" | |
| 227 "extp %[result2], $ac3, 31 \n\t" | |
| 228 | |
| 229 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 230 "mthi $zero, $ac1 \n\t" | |
| 231 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 232 "mthi $zero, $ac2 \n\t" | |
| 233 | |
| 234 "madd $ac1, %[load7], %[cospi_20_64] \n\t" | |
| 235 "madd $ac1, %[load8], %[cospi_12_64] \n\t" | |
| 236 "extp %[result3], $ac1, 31 \n\t" | |
| 237 | |
| 238 "madd $ac2, %[load5], %[cospi_4_64] \n\t" | |
| 239 "madd $ac2, %[load6], %[cospi_28_64] \n\t" | |
| 240 "extp %[result4], $ac2, 31 \n\t" | |
| 241 | |
| 242 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 243 "mthi $zero, $ac1 \n\t" | |
| 244 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 245 "mthi $zero, $ac3 \n\t" | |
| 246 | |
| 247 "sub %[load5], %[result4], %[result3] \n\t" | |
| 248 "sub %[load5], %[load5], %[result1] \n\t" | |
| 249 "add %[load5], %[load5], %[result2] \n\t" | |
| 250 | |
| 251 "sub %[load6], %[result1], %[result2] \n\t" | |
| 252 "sub %[load6], %[load6], %[result3] \n\t" | |
| 253 "add %[load6], %[load6], %[result4] \n\t" | |
| 254 | |
| 255 "madd $ac1, %[load5], %[cospi_16_64] \n\t" | |
| 256 "madd $ac3, %[load6], %[cospi_16_64] \n\t" | |
| 257 | |
| 258 "extp %[step1_5], $ac1, 31 \n\t" | |
| 259 "extp %[step1_6], $ac3, 31 \n\t" | |
| 260 "add %[step1_4], %[result1], %[result2] \n\t" | |
| 261 "add %[step1_7], %[result4], %[result3] \n\t" | |
| 262 | |
| 263 : [load5] "=&r" (load5), [load6] "=&r" (load6), | |
| 264 [load7] "=&r" (load7), [load8] "=&r" (load8), | |
| 265 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
| 266 [result3] "=&r" (result3), [result4] "=&r" (result4), | |
| 267 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), | |
| 268 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) | |
| 269 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 270 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), | |
| 271 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), | |
| 272 [cospi_16_64] "r" (cospi_16_64) | |
| 273 ); | |
| 274 | |
| 275 __asm__ __volatile__ ( | |
| 276 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 277 "mthi $zero, $ac0 \n\t" | |
| 278 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 279 "mthi $zero, $ac1 \n\t" | |
| 280 | |
| 281 "sub %[load5], %[step2_14], %[step2_13] \n\t" | |
| 282 "sub %[load5], %[load5], %[step2_9] \n\t" | |
| 283 "add %[load5], %[load5], %[step2_10] \n\t" | |
| 284 | |
| 285 "madd $ac0, %[load5], %[cospi_16_64] \n\t" | |
| 286 | |
| 287 "sub %[load6], %[step2_14], %[step2_13] \n\t" | |
| 288 "sub %[load6], %[load6], %[step2_10] \n\t" | |
| 289 "add %[load6], %[load6], %[step2_9] \n\t" | |
| 290 | |
| 291 "madd $ac1, %[load6], %[cospi_16_64] \n\t" | |
| 292 | |
| 293 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 294 "mthi $zero, $ac2 \n\t" | |
| 295 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 296 "mthi $zero, $ac3 \n\t" | |
| 297 | |
| 298 "sub %[load5], %[step2_15], %[step2_12] \n\t" | |
| 299 "sub %[load5], %[load5], %[step2_8] \n\t" | |
| 300 "add %[load5], %[load5], %[step2_11] \n\t" | |
| 301 | |
| 302 "madd $ac2, %[load5], %[cospi_16_64] \n\t" | |
| 303 | |
| 304 "sub %[load6], %[step2_15], %[step2_12] \n\t" | |
| 305 "sub %[load6], %[load6], %[step2_11] \n\t" | |
| 306 "add %[load6], %[load6], %[step2_8] \n\t" | |
| 307 | |
| 308 "madd $ac3, %[load6], %[cospi_16_64] \n\t" | |
| 309 | |
| 310 "extp %[step1_10], $ac0, 31 \n\t" | |
| 311 "extp %[step1_13], $ac1, 31 \n\t" | |
| 312 "extp %[step1_11], $ac2, 31 \n\t" | |
| 313 "extp %[step1_12], $ac3, 31 \n\t" | |
| 314 | |
| 315 : [load5] "=&r" (load5), [load6] "=&r" (load6), | |
| 316 [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), | |
| 317 [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) | |
| 318 : [const_2_power_13] "r" (const_2_power_13), | |
| 319 [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), | |
| 320 [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), | |
| 321 [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), | |
| 322 [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), | |
| 323 [cospi_16_64] "r" (cospi_16_64) | |
| 324 ); | |
| 325 | |
| 326 __asm__ __volatile__ ( | |
| 327 "add %[load5], %[step1_0], %[step1_7] \n\t" | |
| 328 "add %[load5], %[load5], %[step2_12] \n\t" | |
| 329 "add %[load5], %[load5], %[step2_15] \n\t" | |
| 330 "add %[load6], %[step1_1], %[step1_6] \n\t" | |
| 331 "add %[load6], %[load6], %[step2_13] \n\t" | |
| 332 "add %[load6], %[load6], %[step2_14] \n\t" | |
| 333 "sh %[load5], 0(%[output]) \n\t" | |
| 334 "sh %[load6], 32(%[output]) \n\t" | |
| 335 "sub %[load5], %[step1_1], %[step1_6] \n\t" | |
| 336 "add %[load5], %[load5], %[step2_9] \n\t" | |
| 337 "add %[load5], %[load5], %[step2_10] \n\t" | |
| 338 "sub %[load6], %[step1_0], %[step1_7] \n\t" | |
| 339 "add %[load6], %[load6], %[step2_8] \n\t" | |
| 340 "add %[load6], %[load6], %[step2_11] \n\t" | |
| 341 "sh %[load5], 192(%[output]) \n\t" | |
| 342 "sh %[load6], 224(%[output]) \n\t" | |
| 343 "sub %[load5], %[step1_0], %[step1_7] \n\t" | |
| 344 "sub %[load5], %[load5], %[step2_8] \n\t" | |
| 345 "sub %[load5], %[load5], %[step2_11] \n\t" | |
| 346 "sub %[load6], %[step1_1], %[step1_6] \n\t" | |
| 347 "sub %[load6], %[load6], %[step2_9] \n\t" | |
| 348 "sub %[load6], %[load6], %[step2_10] \n\t" | |
| 349 "sh %[load5], 256(%[output]) \n\t" | |
| 350 "sh %[load6], 288(%[output]) \n\t" | |
| 351 "add %[load5], %[step1_1], %[step1_6] \n\t" | |
| 352 "sub %[load5], %[load5], %[step2_13] \n\t" | |
| 353 "sub %[load5], %[load5], %[step2_14] \n\t" | |
| 354 "add %[load6], %[step1_0], %[step1_7] \n\t" | |
| 355 "sub %[load6], %[load6], %[step2_12] \n\t" | |
| 356 "sub %[load6], %[load6], %[step2_15] \n\t" | |
| 357 "sh %[load5], 448(%[output]) \n\t" | |
| 358 "sh %[load6], 480(%[output]) \n\t" | |
| 359 | |
| 360 : [load5] "=&r" (load5), [load6] "=&r" (load6) | |
| 361 : [output] "r" (output), | |
| 362 [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), | |
| 363 [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), | |
| 364 [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), | |
| 365 [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), | |
| 366 [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), | |
| 367 [step2_14] "r" (step2_14), [step2_15] "r" (step2_15) | |
| 368 ); | |
| 369 | |
| 370 __asm__ __volatile__ ( | |
| 371 "add %[load5], %[step1_2], %[step1_5] \n\t" | |
| 372 "add %[load5], %[load5], %[step1_13] \n\t" | |
| 373 "add %[load6], %[step1_3], %[step1_4] \n\t" | |
| 374 "add %[load6], %[load6], %[step1_12] \n\t" | |
| 375 "sh %[load5], 64(%[output]) \n\t" | |
| 376 "sh %[load6], 96(%[output]) \n\t" | |
| 377 "sub %[load5], %[step1_3], %[step1_4] \n\t" | |
| 378 "add %[load5], %[load5], %[step1_11] \n\t" | |
| 379 "sub %[load6], %[step1_2], %[step1_5] \n\t" | |
| 380 "add %[load6], %[load6], %[step1_10] \n\t" | |
| 381 "sh %[load5], 128(%[output]) \n\t" | |
| 382 "sh %[load6], 160(%[output]) \n\t" | |
| 383 "sub %[load5], %[step1_2], %[step1_5] \n\t" | |
| 384 "sub %[load5], %[load5], %[step1_10] \n\t" | |
| 385 "sub %[load6], %[step1_3], %[step1_4] \n\t" | |
| 386 "sub %[load6], %[load6], %[step1_11] \n\t" | |
| 387 "sh %[load5], 320(%[output]) \n\t" | |
| 388 "sh %[load6], 352(%[output]) \n\t" | |
| 389 "add %[load5], %[step1_3], %[step1_4] \n\t" | |
| 390 "sub %[load5], %[load5], %[step1_12] \n\t" | |
| 391 "add %[load6], %[step1_2], %[step1_5] \n\t" | |
| 392 "sub %[load6], %[load6], %[step1_13] \n\t" | |
| 393 "sh %[load5], 384(%[output]) \n\t" | |
| 394 "sh %[load6], 416(%[output]) \n\t" | |
| 395 | |
| 396 : [load5] "=&r" (load5), [load6] "=&r" (load6) | |
| 397 : [output] "r" (output), | |
| 398 [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), | |
| 399 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), | |
| 400 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), | |
| 401 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) | |
| 402 ); | |
| 403 | |
| 404 input += 16; | |
| 405 output += 1; | |
| 406 } | |
| 407 } | |
| 408 | |
| 409 static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, | |
| 410 int dest_stride) { | |
| 411 int i; | |
| 412 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | |
| 413 int step1_8, step1_9, step1_10, step1_11; | |
| 414 int step1_12, step1_13, step1_14, step1_15; | |
| 415 int step2_0, step2_1, step2_2, step2_3; | |
| 416 int step2_8, step2_9, step2_10, step2_11; | |
| 417 int step2_12, step2_13, step2_14, step2_15; | |
| 418 int load1, load2, load3, load4, load5, load6, load7, load8; | |
| 419 int result1, result2, result3, result4; | |
| 420 const int const_2_power_13 = 8192; | |
| 421 uint8_t *dest_pix; | |
| 422 uint8_t *cm = vpx_ff_cropTbl; | |
| 423 | |
| 424 /* prefetch vpx_ff_cropTbl */ | |
| 425 prefetch_load(vpx_ff_cropTbl); | |
| 426 prefetch_load(vpx_ff_cropTbl + 32); | |
| 427 prefetch_load(vpx_ff_cropTbl + 64); | |
| 428 prefetch_load(vpx_ff_cropTbl + 96); | |
| 429 prefetch_load(vpx_ff_cropTbl + 128); | |
| 430 prefetch_load(vpx_ff_cropTbl + 160); | |
| 431 prefetch_load(vpx_ff_cropTbl + 192); | |
| 432 prefetch_load(vpx_ff_cropTbl + 224); | |
| 433 | |
| 434 for (i = 0; i < 16; ++i) { | |
| 435 dest_pix = (dest + i); | |
| 436 __asm__ __volatile__ ( | |
| 437 "lh %[load1], 0(%[input]) \n\t" | |
| 438 "lh %[load2], 16(%[input]) \n\t" | |
| 439 "lh %[load3], 8(%[input]) \n\t" | |
| 440 "lh %[load4], 24(%[input]) \n\t" | |
| 441 | |
| 442 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 443 "mthi $zero, $ac1 \n\t" | |
| 444 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 445 "mthi $zero, $ac2 \n\t" | |
| 446 "add %[result1], %[load1], %[load2] \n\t" | |
| 447 "sub %[result2], %[load1], %[load2] \n\t" | |
| 448 "madd $ac1, %[result1], %[cospi_16_64] \n\t" | |
| 449 "madd $ac2, %[result2], %[cospi_16_64] \n\t" | |
| 450 "extp %[step2_0], $ac1, 31 \n\t" | |
| 451 "extp %[step2_1], $ac2, 31 \n\t" | |
| 452 | |
| 453 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 454 "mthi $zero, $ac3 \n\t" | |
| 455 "madd $ac3, %[load3], %[cospi_24_64] \n\t" | |
| 456 "msub $ac3, %[load4], %[cospi_8_64] \n\t" | |
| 457 "extp %[step2_2], $ac3, 31 \n\t" | |
| 458 | |
| 459 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 460 "mthi $zero, $ac1 \n\t" | |
| 461 "madd $ac1, %[load3], %[cospi_8_64] \n\t" | |
| 462 "madd $ac1, %[load4], %[cospi_24_64] \n\t" | |
| 463 "extp %[step2_3], $ac1, 31 \n\t" | |
| 464 | |
| 465 "add %[step1_0], %[step2_0], %[step2_3] \n\t" | |
| 466 "add %[step1_1], %[step2_1], %[step2_2] \n\t" | |
| 467 "sub %[step1_2], %[step2_1], %[step2_2] \n\t" | |
| 468 "sub %[step1_3], %[step2_0], %[step2_3] \n\t" | |
| 469 | |
| 470 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
| 471 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
| 472 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
| 473 [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), | |
| 474 [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), | |
| 475 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), | |
| 476 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) | |
| 477 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 478 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), | |
| 479 [cospi_16_64] "r" (cospi_16_64) | |
| 480 ); | |
| 481 | |
| 482 __asm__ __volatile__ ( | |
| 483 "lh %[load5], 2(%[input]) \n\t" | |
| 484 "lh %[load6], 30(%[input]) \n\t" | |
| 485 "lh %[load7], 18(%[input]) \n\t" | |
| 486 "lh %[load8], 14(%[input]) \n\t" | |
| 487 | |
| 488 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 489 "mthi $zero, $ac1 \n\t" | |
| 490 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 491 "mthi $zero, $ac3 \n\t" | |
| 492 | |
| 493 "madd $ac1, %[load5], %[cospi_30_64] \n\t" | |
| 494 "msub $ac1, %[load6], %[cospi_2_64] \n\t" | |
| 495 "extp %[result1], $ac1, 31 \n\t" | |
| 496 | |
| 497 "madd $ac3, %[load7], %[cospi_14_64] \n\t" | |
| 498 "msub $ac3, %[load8], %[cospi_18_64] \n\t" | |
| 499 "extp %[result2], $ac3, 31 \n\t" | |
| 500 | |
| 501 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 502 "mthi $zero, $ac1 \n\t" | |
| 503 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 504 "mthi $zero, $ac2 \n\t" | |
| 505 | |
| 506 "madd $ac1, %[load7], %[cospi_18_64] \n\t" | |
| 507 "madd $ac1, %[load8], %[cospi_14_64] \n\t" | |
| 508 "extp %[result3], $ac1, 31 \n\t" | |
| 509 | |
| 510 "madd $ac2, %[load5], %[cospi_2_64] \n\t" | |
| 511 "madd $ac2, %[load6], %[cospi_30_64] \n\t" | |
| 512 "extp %[result4], $ac2, 31 \n\t" | |
| 513 | |
| 514 "sub %[load5], %[result1], %[result2] \n\t" | |
| 515 "sub %[load6], %[result4], %[result3] \n\t" | |
| 516 | |
| 517 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 518 "mthi $zero, $ac1 \n\t" | |
| 519 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 520 "mthi $zero, $ac3 \n\t" | |
| 521 | |
| 522 "madd $ac1, %[load6], %[cospi_24_64] \n\t" | |
| 523 "msub $ac1, %[load5], %[cospi_8_64] \n\t" | |
| 524 "madd $ac3, %[load5], %[cospi_24_64] \n\t" | |
| 525 "madd $ac3, %[load6], %[cospi_8_64] \n\t" | |
| 526 | |
| 527 "extp %[step2_9], $ac1, 31 \n\t" | |
| 528 "extp %[step2_14], $ac3, 31 \n\t" | |
| 529 "add %[step2_8], %[result1], %[result2] \n\t" | |
| 530 "add %[step2_15], %[result4], %[result3] \n\t" | |
| 531 | |
| 532 : [load5] "=&r" (load5), [load6] "=&r" (load6), | |
| 533 [load7] "=&r" (load7), [load8] "=&r" (load8), | |
| 534 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
| 535 [result3] "=&r" (result3), [result4] "=&r" (result4), | |
| 536 [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), | |
| 537 [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) | |
| 538 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 539 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), | |
| 540 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), | |
| 541 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
| 542 ); | |
| 543 | |
| 544 __asm__ __volatile__ ( | |
| 545 "lh %[load1], 10(%[input]) \n\t" | |
| 546 "lh %[load2], 22(%[input]) \n\t" | |
| 547 "lh %[load3], 26(%[input]) \n\t" | |
| 548 "lh %[load4], 6(%[input]) \n\t" | |
| 549 | |
| 550 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 551 "mthi $zero, $ac1 \n\t" | |
| 552 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 553 "mthi $zero, $ac3 \n\t" | |
| 554 | |
| 555 "madd $ac1, %[load1], %[cospi_22_64] \n\t" | |
| 556 "msub $ac1, %[load2], %[cospi_10_64] \n\t" | |
| 557 "extp %[result1], $ac1, 31 \n\t" | |
| 558 | |
| 559 "madd $ac3, %[load3], %[cospi_6_64] \n\t" | |
| 560 "msub $ac3, %[load4], %[cospi_26_64] \n\t" | |
| 561 "extp %[result2], $ac3, 31 \n\t" | |
| 562 | |
| 563 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 564 "mthi $zero, $ac1 \n\t" | |
| 565 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 566 "mthi $zero, $ac2 \n\t" | |
| 567 | |
| 568 "madd $ac1, %[load1], %[cospi_10_64] \n\t" | |
| 569 "madd $ac1, %[load2], %[cospi_22_64] \n\t" | |
| 570 "extp %[result3], $ac1, 31 \n\t" | |
| 571 | |
| 572 "madd $ac2, %[load3], %[cospi_26_64] \n\t" | |
| 573 "madd $ac2, %[load4], %[cospi_6_64] \n\t" | |
| 574 "extp %[result4], $ac2, 31 \n\t" | |
| 575 | |
| 576 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 577 "mthi $zero, $ac1 \n\t" | |
| 578 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 579 "mthi $zero, $ac3 \n\t" | |
| 580 | |
| 581 "sub %[load1], %[result2], %[result1] \n\t" | |
| 582 "sub %[load2], %[result4], %[result3] \n\t" | |
| 583 | |
| 584 "msub $ac1, %[load1], %[cospi_24_64] \n\t" | |
| 585 "msub $ac1, %[load2], %[cospi_8_64] \n\t" | |
| 586 "madd $ac3, %[load2], %[cospi_24_64] \n\t" | |
| 587 "msub $ac3, %[load1], %[cospi_8_64] \n\t" | |
| 588 | |
| 589 "extp %[step2_10], $ac1, 31 \n\t" | |
| 590 "extp %[step2_13], $ac3, 31 \n\t" | |
| 591 "add %[step2_11], %[result1], %[result2] \n\t" | |
| 592 "add %[step2_12], %[result4], %[result3] \n\t" | |
| 593 | |
| 594 : [load1] "=&r" (load1), [load2] "=&r" (load2), | |
| 595 [load3] "=&r" (load3), [load4] "=&r" (load4), | |
| 596 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
| 597 [result3] "=&r" (result3), [result4] "=&r" (result4), | |
| 598 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), | |
| 599 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) | |
| 600 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 601 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), | |
| 602 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), | |
| 603 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) | |
| 604 ); | |
| 605 | |
| 606 __asm__ __volatile__ ( | |
| 607 "lh %[load5], 4(%[input]) \n\t" | |
| 608 "lh %[load6], 28(%[input]) \n\t" | |
| 609 "lh %[load7], 20(%[input]) \n\t" | |
| 610 "lh %[load8], 12(%[input]) \n\t" | |
| 611 | |
| 612 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 613 "mthi $zero, $ac1 \n\t" | |
| 614 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 615 "mthi $zero, $ac3 \n\t" | |
| 616 | |
| 617 "madd $ac1, %[load5], %[cospi_28_64] \n\t" | |
| 618 "msub $ac1, %[load6], %[cospi_4_64] \n\t" | |
| 619 "extp %[result1], $ac1, 31 \n\t" | |
| 620 | |
| 621 "madd $ac3, %[load7], %[cospi_12_64] \n\t" | |
| 622 "msub $ac3, %[load8], %[cospi_20_64] \n\t" | |
| 623 "extp %[result2], $ac3, 31 \n\t" | |
| 624 | |
| 625 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 626 "mthi $zero, $ac1 \n\t" | |
| 627 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 628 "mthi $zero, $ac2 \n\t" | |
| 629 | |
| 630 "madd $ac1, %[load7], %[cospi_20_64] \n\t" | |
| 631 "madd $ac1, %[load8], %[cospi_12_64] \n\t" | |
| 632 "extp %[result3], $ac1, 31 \n\t" | |
| 633 | |
| 634 "madd $ac2, %[load5], %[cospi_4_64] \n\t" | |
| 635 "madd $ac2, %[load6], %[cospi_28_64] \n\t" | |
| 636 "extp %[result4], $ac2, 31 \n\t" | |
| 637 | |
| 638 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 639 "mthi $zero, $ac1 \n\t" | |
| 640 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 641 "mthi $zero, $ac3 \n\t" | |
| 642 | |
| 643 "sub %[load5], %[result4], %[result3] \n\t" | |
| 644 "sub %[load5], %[load5], %[result1] \n\t" | |
| 645 "add %[load5], %[load5], %[result2] \n\t" | |
| 646 | |
| 647 "sub %[load6], %[result1], %[result2] \n\t" | |
| 648 "sub %[load6], %[load6], %[result3] \n\t" | |
| 649 "add %[load6], %[load6], %[result4] \n\t" | |
| 650 | |
| 651 "madd $ac1, %[load5], %[cospi_16_64] \n\t" | |
| 652 "madd $ac3, %[load6], %[cospi_16_64] \n\t" | |
| 653 | |
| 654 "extp %[step1_5], $ac1, 31 \n\t" | |
| 655 "extp %[step1_6], $ac3, 31 \n\t" | |
| 656 | |
| 657 "add %[step1_4], %[result1], %[result2] \n\t" | |
| 658 "add %[step1_7], %[result4], %[result3] \n\t" | |
| 659 | |
| 660 : [load5] "=&r" (load5), [load6] "=&r" (load6), | |
| 661 [load7] "=&r" (load7), [load8] "=&r" (load8), | |
| 662 [result1] "=&r" (result1), [result2] "=&r" (result2), | |
| 663 [result3] "=&r" (result3), [result4] "=&r" (result4), | |
| 664 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), | |
| 665 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) | |
| 666 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), | |
| 667 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), | |
| 668 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), | |
| 669 [cospi_16_64] "r" (cospi_16_64) | |
| 670 ); | |
| 671 | |
| 672 __asm__ __volatile__ ( | |
| 673 "mtlo %[const_2_power_13], $ac0 \n\t" | |
| 674 "mthi $zero, $ac0 \n\t" | |
| 675 "mtlo %[const_2_power_13], $ac1 \n\t" | |
| 676 "mthi $zero, $ac1 \n\t" | |
| 677 | |
| 678 "sub %[load5], %[step2_14], %[step2_13] \n\t" | |
| 679 "sub %[load5], %[load5], %[step2_9] \n\t" | |
| 680 "add %[load5], %[load5], %[step2_10] \n\t" | |
| 681 | |
| 682 "madd $ac0, %[load5], %[cospi_16_64] \n\t" | |
| 683 | |
| 684 "sub %[load6], %[step2_14], %[step2_13] \n\t" | |
| 685 "sub %[load6], %[load6], %[step2_10] \n\t" | |
| 686 "add %[load6], %[load6], %[step2_9] \n\t" | |
| 687 | |
| 688 "madd $ac1, %[load6], %[cospi_16_64] \n\t" | |
| 689 | |
| 690 "mtlo %[const_2_power_13], $ac2 \n\t" | |
| 691 "mthi $zero, $ac2 \n\t" | |
| 692 "mtlo %[const_2_power_13], $ac3 \n\t" | |
| 693 "mthi $zero, $ac3 \n\t" | |
| 694 | |
| 695 "sub %[load5], %[step2_15], %[step2_12] \n\t" | |
| 696 "sub %[load5], %[load5], %[step2_8] \n\t" | |
| 697 "add %[load5], %[load5], %[step2_11] \n\t" | |
| 698 | |
| 699 "madd $ac2, %[load5], %[cospi_16_64] \n\t" | |
| 700 | |
| 701 "sub %[load6], %[step2_15], %[step2_12] \n\t" | |
| 702 "sub %[load6], %[load6], %[step2_11] \n\t" | |
| 703 "add %[load6], %[load6], %[step2_8] \n\t" | |
| 704 | |
| 705 "madd $ac3, %[load6], %[cospi_16_64] \n\t" | |
| 706 | |
| 707 "extp %[step1_10], $ac0, 31 \n\t" | |
| 708 "extp %[step1_13], $ac1, 31 \n\t" | |
| 709 "extp %[step1_11], $ac2, 31 \n\t" | |
| 710 "extp %[step1_12], $ac3, 31 \n\t" | |
| 711 | |
| 712 : [load5] "=&r" (load5), [load6] "=&r" (load6), | |
| 713 [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), | |
| 714 [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) | |
| 715 : [const_2_power_13] "r" (const_2_power_13), | |
| 716 [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), | |
| 717 [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), | |
| 718 [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), | |
| 719 [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), | |
| 720 [cospi_16_64] "r" (cospi_16_64) | |
| 721 ); | |
| 722 | |
| 723 step1_8 = step2_8 + step2_11; | |
| 724 step1_9 = step2_9 + step2_10; | |
| 725 step1_14 = step2_13 + step2_14; | |
| 726 step1_15 = step2_12 + step2_15; | |
| 727 | |
| 728 __asm__ __volatile__ ( | |
| 729 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
| 730 "add %[load5], %[step1_0], %[step1_7] \n\t" | |
| 731 "add %[load5], %[load5], %[step1_15] \n\t" | |
| 732 "addi %[load5], %[load5], 32 \n\t" | |
| 733 "sra %[load5], %[load5], 6 \n\t" | |
| 734 "add %[load7], %[load7], %[load5] \n\t" | |
| 735 "lbux %[load5], %[load7](%[cm]) \n\t" | |
| 736 "add %[load6], %[step1_1], %[step1_6] \n\t" | |
| 737 "add %[load6], %[load6], %[step1_14] \n\t" | |
| 738 "sb %[load5], 0(%[dest_pix]) \n\t" | |
| 739 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 740 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
| 741 "addi %[load6], %[load6], 32 \n\t" | |
| 742 "sra %[load6], %[load6], 6 \n\t" | |
| 743 "add %[load8], %[load8], %[load6] \n\t" | |
| 744 "lbux %[load6], %[load8](%[cm]) \n\t" | |
| 745 "sb %[load6], 0(%[dest_pix]) \n\t" | |
| 746 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 747 | |
| 748 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
| 749 "add %[load5], %[step1_2], %[step1_5] \n\t" | |
| 750 "add %[load5], %[load5], %[step1_13] \n\t" | |
| 751 "addi %[load5], %[load5], 32 \n\t" | |
| 752 "sra %[load5], %[load5], 6 \n\t" | |
| 753 "add %[load7], %[load7], %[load5] \n\t" | |
| 754 "lbux %[load5], %[load7](%[cm]) \n\t" | |
| 755 "add %[load6], %[step1_3], %[step1_4] \n\t" | |
| 756 "add %[load6], %[load6], %[step1_12] \n\t" | |
| 757 "sb %[load5], 0(%[dest_pix]) \n\t" | |
| 758 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 759 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
| 760 "addi %[load6], %[load6], 32 \n\t" | |
| 761 "sra %[load6], %[load6], 6 \n\t" | |
| 762 "add %[load8], %[load8], %[load6] \n\t" | |
| 763 "lbux %[load6], %[load8](%[cm]) \n\t" | |
| 764 "sb %[load6], 0(%[dest_pix]) \n\t" | |
| 765 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 766 | |
| 767 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
| 768 "sub %[load5], %[step1_3], %[step1_4] \n\t" | |
| 769 "add %[load5], %[load5], %[step1_11] \n\t" | |
| 770 "addi %[load5], %[load5], 32 \n\t" | |
| 771 "sra %[load5], %[load5], 6 \n\t" | |
| 772 "add %[load7], %[load7], %[load5] \n\t" | |
| 773 "lbux %[load5], %[load7](%[cm]) \n\t" | |
| 774 "sub %[load6], %[step1_2], %[step1_5] \n\t" | |
| 775 "add %[load6], %[load6], %[step1_10] \n\t" | |
| 776 "sb %[load5], 0(%[dest_pix]) \n\t" | |
| 777 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 778 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
| 779 "addi %[load6], %[load6], 32 \n\t" | |
| 780 "sra %[load6], %[load6], 6 \n\t" | |
| 781 "add %[load8], %[load8], %[load6] \n\t" | |
| 782 "lbux %[load6], %[load8](%[cm]) \n\t" | |
| 783 "sb %[load6], 0(%[dest_pix]) \n\t" | |
| 784 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 785 | |
| 786 "sub %[load5], %[step1_1], %[step1_6] \n\t" | |
| 787 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
| 788 "add %[load5], %[load5], %[step1_9] \n\t" | |
| 789 "addi %[load5], %[load5], 32 \n\t" | |
| 790 "sra %[load5], %[load5], 6 \n\t" | |
| 791 "add %[load7], %[load7], %[load5] \n\t" | |
| 792 "lbux %[load5], %[load7](%[cm]) \n\t" | |
| 793 "sub %[load6], %[step1_0], %[step1_7] \n\t" | |
| 794 "add %[load6], %[load6], %[step1_8] \n\t" | |
| 795 "sb %[load5], 0(%[dest_pix]) \n\t" | |
| 796 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 797 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
| 798 "addi %[load6], %[load6], 32 \n\t" | |
| 799 "sra %[load6], %[load6], 6 \n\t" | |
| 800 "add %[load8], %[load8], %[load6] \n\t" | |
| 801 "lbux %[load6], %[load8](%[cm]) \n\t" | |
| 802 "sb %[load6], 0(%[dest_pix]) \n\t" | |
| 803 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 804 | |
| 805 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
| 806 "sub %[load5], %[step1_0], %[step1_7] \n\t" | |
| 807 "sub %[load5], %[load5], %[step1_8] \n\t" | |
| 808 "addi %[load5], %[load5], 32 \n\t" | |
| 809 "sra %[load5], %[load5], 6 \n\t" | |
| 810 "add %[load7], %[load7], %[load5] \n\t" | |
| 811 "lbux %[load5], %[load7](%[cm]) \n\t" | |
| 812 "sub %[load6], %[step1_1], %[step1_6] \n\t" | |
| 813 "sub %[load6], %[load6], %[step1_9] \n\t" | |
| 814 "sb %[load5], 0(%[dest_pix]) \n\t" | |
| 815 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 816 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
| 817 "addi %[load6], %[load6], 32 \n\t" | |
| 818 "sra %[load6], %[load6], 6 \n\t" | |
| 819 "add %[load8], %[load8], %[load6] \n\t" | |
| 820 "lbux %[load6], %[load8](%[cm]) \n\t" | |
| 821 "sb %[load6], 0(%[dest_pix]) \n\t" | |
| 822 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 823 | |
| 824 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
| 825 "sub %[load5], %[step1_2], %[step1_5] \n\t" | |
| 826 "sub %[load5], %[load5], %[step1_10] \n\t" | |
| 827 "addi %[load5], %[load5], 32 \n\t" | |
| 828 "sra %[load5], %[load5], 6 \n\t" | |
| 829 "add %[load7], %[load7], %[load5] \n\t" | |
| 830 "lbux %[load5], %[load7](%[cm]) \n\t" | |
| 831 "sub %[load6], %[step1_3], %[step1_4] \n\t" | |
| 832 "sub %[load6], %[load6], %[step1_11] \n\t" | |
| 833 "sb %[load5], 0(%[dest_pix]) \n\t" | |
| 834 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 835 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
| 836 "addi %[load6], %[load6], 32 \n\t" | |
| 837 "sra %[load6], %[load6], 6 \n\t" | |
| 838 "add %[load8], %[load8], %[load6] \n\t" | |
| 839 "lbux %[load6], %[load8](%[cm]) \n\t" | |
| 840 "sb %[load6], 0(%[dest_pix]) \n\t" | |
| 841 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 842 | |
| 843 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
| 844 "add %[load5], %[step1_3], %[step1_4] \n\t" | |
| 845 "sub %[load5], %[load5], %[step1_12] \n\t" | |
| 846 "addi %[load5], %[load5], 32 \n\t" | |
| 847 "sra %[load5], %[load5], 6 \n\t" | |
| 848 "add %[load7], %[load7], %[load5] \n\t" | |
| 849 "lbux %[load5], %[load7](%[cm]) \n\t" | |
| 850 "add %[load6], %[step1_2], %[step1_5] \n\t" | |
| 851 "sub %[load6], %[load6], %[step1_13] \n\t" | |
| 852 "sb %[load5], 0(%[dest_pix]) \n\t" | |
| 853 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 854 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
| 855 "addi %[load6], %[load6], 32 \n\t" | |
| 856 "sra %[load6], %[load6], 6 \n\t" | |
| 857 "add %[load8], %[load8], %[load6] \n\t" | |
| 858 "lbux %[load6], %[load8](%[cm]) \n\t" | |
| 859 "sb %[load6], 0(%[dest_pix]) \n\t" | |
| 860 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 861 | |
| 862 "lbu %[load7], 0(%[dest_pix]) \n\t" | |
| 863 "add %[load5], %[step1_1], %[step1_6] \n\t" | |
| 864 "sub %[load5], %[load5], %[step1_14] \n\t" | |
| 865 "addi %[load5], %[load5], 32 \n\t" | |
| 866 "sra %[load5], %[load5], 6 \n\t" | |
| 867 "add %[load7], %[load7], %[load5] \n\t" | |
| 868 "lbux %[load5], %[load7](%[cm]) \n\t" | |
| 869 "add %[load6], %[step1_0], %[step1_7] \n\t" | |
| 870 "sub %[load6], %[load6], %[step1_15] \n\t" | |
| 871 "sb %[load5], 0(%[dest_pix]) \n\t" | |
| 872 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" | |
| 873 "lbu %[load8], 0(%[dest_pix]) \n\t" | |
| 874 "addi %[load6], %[load6], 32 \n\t" | |
| 875 "sra %[load6], %[load6], 6 \n\t" | |
| 876 "add %[load8], %[load8], %[load6] \n\t" | |
| 877 "lbux %[load6], %[load8](%[cm]) \n\t" | |
| 878 "sb %[load6], 0(%[dest_pix]) \n\t" | |
| 879 | |
| 880 : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7), | |
| 881 [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) | |
| 882 : [cm] "r" (cm), [dest_stride] "r" (dest_stride), | |
| 883 [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), | |
| 884 [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), | |
| 885 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), | |
| 886 [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), | |
| 887 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), | |
| 888 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), | |
| 889 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), | |
| 890 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) | |
| 891 ); | |
| 892 | |
| 893 input += 16; | |
| 894 } | |
| 895 } | |
| 896 | |
| 897 void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 898 int dest_stride) { | |
| 899 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | |
| 900 uint32_t pos = 45; | |
| 901 | |
| 902 /* bit position for extract from acc */ | 
| 903 __asm__ __volatile__ ( | |
| 904 "wrdsp %[pos], 1 \n\t" | |
| 905 : | |
| 906 : [pos] "r" (pos) | |
| 907 ); | |
| 908 | |
| 909 // First transform rows | |
| 910 idct16_rows_dspr2(input, out, 16); | |
| 911 | |
| 912 // Then transform columns and add to dest | |
| 913 idct16_cols_add_blk_dspr2(out, dest, dest_stride); | |
| 914 } | |
| 915 | |
| 916 static void iadst16(const int16_t *input, int16_t *output) { | |
| 917 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | |
| 918 | |
| 919 int x0 = input[15]; | |
| 920 int x1 = input[0]; | |
| 921 int x2 = input[13]; | |
| 922 int x3 = input[2]; | |
| 923 int x4 = input[11]; | |
| 924 int x5 = input[4]; | |
| 925 int x6 = input[9]; | |
| 926 int x7 = input[6]; | |
| 927 int x8 = input[7]; | |
| 928 int x9 = input[8]; | |
| 929 int x10 = input[5]; | |
| 930 int x11 = input[10]; | |
| 931 int x12 = input[3]; | |
| 932 int x13 = input[12]; | |
| 933 int x14 = input[1]; | |
| 934 int x15 = input[14]; | |
| 935 | |
| 936 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | |
| 937 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { | |
| 938 output[0] = output[1] = output[2] = output[3] = output[4] | |
| 939 = output[5] = output[6] = output[7] = output[8] | |
| 940 = output[9] = output[10] = output[11] = output[12] | |
| 941 = output[13] = output[14] = output[15] = 0; | |
| 942 return; | |
| 943 } | |
| 944 | |
| 945 // stage 1 | |
| 946 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; | |
| 947 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; | |
| 948 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; | |
| 949 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; | |
| 950 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; | |
| 951 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; | |
| 952 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; | |
| 953 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; | |
| 954 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; | |
| 955 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; | |
| 956 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; | |
| 957 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; | |
| 958 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; | |
| 959 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; | |
| 960 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; | |
| 961 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; | |
| 962 | |
| 963 x0 = dct_const_round_shift(s0 + s8); | |
| 964 x1 = dct_const_round_shift(s1 + s9); | |
| 965 x2 = dct_const_round_shift(s2 + s10); | |
| 966 x3 = dct_const_round_shift(s3 + s11); | |
| 967 x4 = dct_const_round_shift(s4 + s12); | |
| 968 x5 = dct_const_round_shift(s5 + s13); | |
| 969 x6 = dct_const_round_shift(s6 + s14); | |
| 970 x7 = dct_const_round_shift(s7 + s15); | |
| 971 x8 = dct_const_round_shift(s0 - s8); | |
| 972 x9 = dct_const_round_shift(s1 - s9); | |
| 973 x10 = dct_const_round_shift(s2 - s10); | |
| 974 x11 = dct_const_round_shift(s3 - s11); | |
| 975 x12 = dct_const_round_shift(s4 - s12); | |
| 976 x13 = dct_const_round_shift(s5 - s13); | |
| 977 x14 = dct_const_round_shift(s6 - s14); | |
| 978 x15 = dct_const_round_shift(s7 - s15); | |
| 979 | |
| 980 // stage 2 | |
| 981 s0 = x0; | |
| 982 s1 = x1; | |
| 983 s2 = x2; | |
| 984 s3 = x3; | |
| 985 s4 = x4; | |
| 986 s5 = x5; | |
| 987 s6 = x6; | |
| 988 s7 = x7; | |
| 989 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; | |
| 990 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; | |
| 991 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; | |
| 992 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; | |
| 993 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; | |
| 994 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; | |
| 995 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; | |
| 996 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; | |
| 997 | |
| 998 x0 = s0 + s4; | |
| 999 x1 = s1 + s5; | |
| 1000 x2 = s2 + s6; | |
| 1001 x3 = s3 + s7; | |
| 1002 x4 = s0 - s4; | |
| 1003 x5 = s1 - s5; | |
| 1004 x6 = s2 - s6; | |
| 1005 x7 = s3 - s7; | |
| 1006 x8 = dct_const_round_shift(s8 + s12); | |
| 1007 x9 = dct_const_round_shift(s9 + s13); | |
| 1008 x10 = dct_const_round_shift(s10 + s14); | |
| 1009 x11 = dct_const_round_shift(s11 + s15); | |
| 1010 x12 = dct_const_round_shift(s8 - s12); | |
| 1011 x13 = dct_const_round_shift(s9 - s13); | |
| 1012 x14 = dct_const_round_shift(s10 - s14); | |
| 1013 x15 = dct_const_round_shift(s11 - s15); | |
| 1014 | |
| 1015 // stage 3 | |
| 1016 s0 = x0; | |
| 1017 s1 = x1; | |
| 1018 s2 = x2; | |
| 1019 s3 = x3; | |
| 1020 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; | |
| 1021 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; | |
| 1022 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; | |
| 1023 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; | |
| 1024 s8 = x8; | |
| 1025 s9 = x9; | |
| 1026 s10 = x10; | |
| 1027 s11 = x11; | |
| 1028 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; | |
| 1029 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; | |
| 1030 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; | |
| 1031 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; | |
| 1032 | |
| 1033 x0 = s0 + s2; | |
| 1034 x1 = s1 + s3; | |
| 1035 x2 = s0 - s2; | |
| 1036 x3 = s1 - s3; | |
| 1037 x4 = dct_const_round_shift(s4 + s6); | |
| 1038 x5 = dct_const_round_shift(s5 + s7); | |
| 1039 x6 = dct_const_round_shift(s4 - s6); | |
| 1040 x7 = dct_const_round_shift(s5 - s7); | |
| 1041 x8 = s8 + s10; | |
| 1042 x9 = s9 + s11; | |
| 1043 x10 = s8 - s10; | |
| 1044 x11 = s9 - s11; | |
| 1045 x12 = dct_const_round_shift(s12 + s14); | |
| 1046 x13 = dct_const_round_shift(s13 + s15); | |
| 1047 x14 = dct_const_round_shift(s12 - s14); | |
| 1048 x15 = dct_const_round_shift(s13 - s15); | |
| 1049 | |
| 1050 // stage 4 | |
| 1051 s2 = (- cospi_16_64) * (x2 + x3); | |
| 1052 s3 = cospi_16_64 * (x2 - x3); | |
| 1053 s6 = cospi_16_64 * (x6 + x7); | |
| 1054 s7 = cospi_16_64 * (- x6 + x7); | |
| 1055 s10 = cospi_16_64 * (x10 + x11); | |
| 1056 s11 = cospi_16_64 * (- x10 + x11); | |
| 1057 s14 = (- cospi_16_64) * (x14 + x15); | |
| 1058 s15 = cospi_16_64 * (x14 - x15); | |
| 1059 | |
| 1060 x2 = dct_const_round_shift(s2); | |
| 1061 x3 = dct_const_round_shift(s3); | |
| 1062 x6 = dct_const_round_shift(s6); | |
| 1063 x7 = dct_const_round_shift(s7); | |
| 1064 x10 = dct_const_round_shift(s10); | |
| 1065 x11 = dct_const_round_shift(s11); | |
| 1066 x14 = dct_const_round_shift(s14); | |
| 1067 x15 = dct_const_round_shift(s15); | |
| 1068 | |
| 1069 output[0] = x0; | |
| 1070 output[1] = -x8; | |
| 1071 output[2] = x12; | |
| 1072 output[3] = -x4; | |
| 1073 output[4] = x6; | |
| 1074 output[5] = x14; | |
| 1075 output[6] = x10; | |
| 1076 output[7] = x2; | |
| 1077 output[8] = x3; | |
| 1078 output[9] = x11; | |
| 1079 output[10] = x15; | |
| 1080 output[11] = x7; | |
| 1081 output[12] = x5; | |
| 1082 output[13] = -x13; | |
| 1083 output[14] = x9; | |
| 1084 output[15] = -x1; | |
| 1085 } | |
| 1086 | |
| 1087 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, | 24 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
| 1088 int pitch, int tx_type) { | 25 int pitch, int tx_type) { |
| 1089 int i, j; | 26 int i, j; |
| 1090 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | 27 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
| 1091 int16_t *outptr = out; | 28 int16_t *outptr = out; |
| 1092 int16_t temp_out[16]; | 29 int16_t temp_out[16]; |
| 1093 uint32_t pos = 45; | 30 uint32_t pos = 45; |
| 1094 | 31 |
| 1095 /* bit position for extract from acc */ | 32 /* bit position for extract from acc */ |
| 1096 __asm__ __volatile__ ( | 33 __asm__ __volatile__ ( |
| 1097 "wrdsp %[pos], 1 \n\t" | 34 "wrdsp %[pos], 1 \n\t" |
| 1098 : | 35 : |
| 1099 : [pos] "r" (pos) | 36 : [pos] "r" (pos) |
| 1100 ); | 37 ); |
| 1101 | 38 |
| 1102 switch (tx_type) { | 39 switch (tx_type) { |
| 1103 case DCT_DCT: // DCT in both horizontal and vertical | 40 case DCT_DCT: // DCT in both horizontal and vertical |
| 1104 idct16_rows_dspr2(input, outptr, 16); | 41 idct16_rows_dspr2(input, outptr, 16); |
| 1105 idct16_cols_add_blk_dspr2(out, dest, pitch); | 42 idct16_cols_add_blk_dspr2(out, dest, pitch); |
| 1106 break; | 43 break; |
| 1107 case ADST_DCT: // ADST in vertical, DCT in horizontal | 44 case ADST_DCT: // ADST in vertical, DCT in horizontal |
| 1108 idct16_rows_dspr2(input, outptr, 16); | 45 idct16_rows_dspr2(input, outptr, 16); |
| 1109 | 46 |
| 1110 outptr = out; | 47 outptr = out; |
| 1111 | 48 |
| 1112 for (i = 0; i < 16; ++i) { | 49 for (i = 0; i < 16; ++i) { |
| 1113 iadst16(outptr, temp_out); | 50 iadst16_dspr2(outptr, temp_out); |
| 1114 | 51 |
| 1115 for (j = 0; j < 16; ++j) | 52 for (j = 0; j < 16; ++j) |
| 1116 dest[j * pitch + i] = | 53 dest[j * pitch + i] = |
| 1117 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 54 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
| 1118 + dest[j * pitch + i]); | 55 + dest[j * pitch + i]); |
| 1119 outptr += 16; | 56 outptr += 16; |
| 1120 } | 57 } |
| 1121 break; | 58 break; |
| 1122 case DCT_ADST: // DCT in vertical, ADST in horizontal | 59 case DCT_ADST: // DCT in vertical, ADST in horizontal |
| 1123 { | 60 { |
| 1124 int16_t temp_in[16 * 16]; | 61 int16_t temp_in[16 * 16]; |
| 1125 | 62 |
| 1126 for (i = 0; i < 16; ++i) { | 63 for (i = 0; i < 16; ++i) { |
| 1127 /* prefetch row */ | 64 /* prefetch row */ |
| 1128 prefetch_load((const uint8_t *)(input + 16)); | 65 prefetch_load((const uint8_t *)(input + 16)); |
| 1129 | 66 |
| 1130 iadst16(input, outptr); | 67 iadst16_dspr2(input, outptr); |
| 1131 input += 16; | 68 input += 16; |
| 1132 outptr += 16; | 69 outptr += 16; |
| 1133 } | 70 } |
| 1134 | 71 |
| 1135 for (i = 0; i < 16; ++i) | 72 for (i = 0; i < 16; ++i) |
| 1136 for (j = 0; j < 16; ++j) | 73 for (j = 0; j < 16; ++j) |
| 1137 temp_in[j * 16 + i] = out[i * 16 + j]; | 74 temp_in[j * 16 + i] = out[i * 16 + j]; |
| 1138 | 75 |
| 1139 idct16_cols_add_blk_dspr2(temp_in, dest, pitch); | 76 idct16_cols_add_blk_dspr2(temp_in, dest, pitch); |
| 1140 } | 77 } |
| 1141 break; | 78 break; |
| 1142 case ADST_ADST: // ADST in both directions | 79 case ADST_ADST: // ADST in both directions |
| 1143 { | 80 { |
| 1144 int16_t temp_in[16]; | 81 int16_t temp_in[16]; |
| 1145 | 82 |
| 1146 for (i = 0; i < 16; ++i) { | 83 for (i = 0; i < 16; ++i) { |
| 1147 /* prefetch row */ | 84 /* prefetch row */ |
| 1148 prefetch_load((const uint8_t *)(input + 16)); | 85 prefetch_load((const uint8_t *)(input + 16)); |
| 1149 | 86 |
| 1150 iadst16(input, outptr); | 87 iadst16_dspr2(input, outptr); |
| 1151 input += 16; | 88 input += 16; |
| 1152 outptr += 16; | 89 outptr += 16; |
| 1153 } | 90 } |
| 1154 | 91 |
| 1155 for (i = 0; i < 16; ++i) { | 92 for (i = 0; i < 16; ++i) { |
| 1156 for (j = 0; j < 16; ++j) | 93 for (j = 0; j < 16; ++j) |
| 1157 temp_in[j] = out[j * 16 + i]; | 94 temp_in[j] = out[j * 16 + i]; |
| 1158 iadst16(temp_in, temp_out); | 95 iadst16_dspr2(temp_in, temp_out); |
| 1159 for (j = 0; j < 16; ++j) | 96 for (j = 0; j < 16; ++j) |
| 1160 dest[j * pitch + i] = | 97 dest[j * pitch + i] = |
| 1161 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 98 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
| 1162 + dest[j * pitch + i]); | 99 + dest[j * pitch + i]); |
| 1163 } | 100 } |
| 1164 } | 101 } |
| 1165 break; | 102 break; |
| 1166 default: | 103 default: |
| 1167 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); | 104 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); |
| 1168 break; | 105 break; |
| 1169 } | 106 } |
| 1170 } | 107 } |
| 1171 | |
| 1172 void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 1173 int dest_stride) { | |
| 1174 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | |
| 1175 int16_t *outptr = out; | |
| 1176 uint32_t i; | |
| 1177 uint32_t pos = 45; | |
| 1178 | |
| 1179 /* bit position for extract from acc */ | 
| 1180 __asm__ __volatile__ ( | |
| 1181 "wrdsp %[pos], 1 \n\t" | |
| 1182 : | |
| 1183 : [pos] "r" (pos) | |
| 1184 ); | |
| 1185 | |
| 1186 // First transform rows. Since all non-zero dct coefficients are in the | 
| 1187 // upper-left 4x4 area, we only need to calculate the first 4 rows here. | 
| 1188 idct16_rows_dspr2(input, outptr, 4); | |
| 1189 | |
| 1190 outptr += 4; | |
| 1191 for (i = 0; i < 6; ++i) { | |
| 1192 __asm__ __volatile__ ( | |
| 1193 "sw $zero, 0(%[outptr]) \n\t" | |
| 1194 "sw $zero, 32(%[outptr]) \n\t" | |
| 1195 "sw $zero, 64(%[outptr]) \n\t" | |
| 1196 "sw $zero, 96(%[outptr]) \n\t" | |
| 1197 "sw $zero, 128(%[outptr]) \n\t" | |
| 1198 "sw $zero, 160(%[outptr]) \n\t" | |
| 1199 "sw $zero, 192(%[outptr]) \n\t" | |
| 1200 "sw $zero, 224(%[outptr]) \n\t" | |
| 1201 "sw $zero, 256(%[outptr]) \n\t" | |
| 1202 "sw $zero, 288(%[outptr]) \n\t" | |
| 1203 "sw $zero, 320(%[outptr]) \n\t" | |
| 1204 "sw $zero, 352(%[outptr]) \n\t" | |
| 1205 "sw $zero, 384(%[outptr]) \n\t" | |
| 1206 "sw $zero, 416(%[outptr]) \n\t" | |
| 1207 "sw $zero, 448(%[outptr]) \n\t" | |
| 1208 "sw $zero, 480(%[outptr]) \n\t" | |
| 1209 | |
| 1210 : | |
| 1211 : [outptr] "r" (outptr) | |
| 1212 ); | |
| 1213 | |
| 1214 outptr += 2; | |
| 1215 } | |
| 1216 | |
| 1217 // Then transform columns | |
| 1218 idct16_cols_add_blk_dspr2(out, dest, dest_stride); | |
| 1219 } | |
| 1220 | |
| 1221 void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, | |
| 1222 int dest_stride) { | |
| 1223 uint32_t pos = 45; | |
| 1224 int32_t out; | |
| 1225 int32_t r; | |
| 1226 int32_t a1, absa1; | |
| 1227 int32_t vector_a1; | |
| 1228 int32_t t1, t2, t3, t4; | |
| 1229 int32_t vector_1, vector_2, vector_3, vector_4; | |
| 1230 | |
| 1231 /* bit position for extract from acc */ | 
| 1232 __asm__ __volatile__ ( | |
| 1233 "wrdsp %[pos], 1 \n\t" | |
| 1234 | |
| 1235 : | |
| 1236 : [pos] "r" (pos) | |
| 1237 ); | |
| 1238 | |
| 1239 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); | |
| 1240 __asm__ __volatile__ ( | |
| 1241 "addi %[out], %[out], 32 \n\t" | |
| 1242 "sra %[a1], %[out], 6 \n\t" | |
| 1243 | |
| 1244 : [out] "+r" (out), [a1] "=r" (a1) | |
| 1245 : | |
| 1246 ); | |
| 1247 | |
| 1248 if (a1 < 0) { | |
| 1249 /* use quad-byte operations; | 
| 1250 * input and output memory are four-byte aligned */ | 
| 1251 __asm__ __volatile__ ( | |
| 1252 "abs %[absa1], %[a1] \n\t" | |
| 1253 "replv.qb %[vector_a1], %[absa1] \n\t" | |
| 1254 | |
| 1255 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) | |
| 1256 : [a1] "r" (a1) | |
| 1257 ); | |
| 1258 | |
| 1259 for (r = 16; r--;) { | |
| 1260 __asm__ __volatile__ ( | |
| 1261 "lw %[t1], 0(%[dest]) \n\t" | |
| 1262 "lw %[t2], 4(%[dest]) \n\t" | |
| 1263 "lw %[t3], 8(%[dest]) \n\t" | |
| 1264 "lw %[t4], 12(%[dest]) \n\t" | |
| 1265 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
| 1266 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
| 1267 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" | |
| 1268 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" | |
| 1269 "sw %[vector_1], 0(%[dest]) \n\t" | |
| 1270 "sw %[vector_2], 4(%[dest]) \n\t" | |
| 1271 "sw %[vector_3], 8(%[dest]) \n\t" | |
| 1272 "sw %[vector_4], 12(%[dest]) \n\t" | |
| 1273 "add %[dest], %[dest], %[dest_stride] \n\t" | |
| 1274 | |
| 1275 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), | |
| 1276 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | |
| 1277 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), | |
| 1278 [dest] "+&r" (dest) | |
| 1279 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | |
| 1280 ); | |
| 1281 } | |
| 1282 } else { | |
| 1283 /* use quad-byte operations; | 
| 1284 * input and output memory are four-byte aligned */ | 
| 1285 __asm__ __volatile__ ( | |
| 1286 "replv.qb %[vector_a1], %[a1] \n\t" | |
| 1287 | |
| 1288 : [vector_a1] "=r" (vector_a1) | |
| 1289 : [a1] "r" (a1) | |
| 1290 ); | |
| 1291 | |
| 1292 for (r = 16; r--;) { | |
| 1293 __asm__ __volatile__ ( | |
| 1294 "lw %[t1], 0(%[dest]) \n\t" | |
| 1295 "lw %[t2], 4(%[dest]) \n\t" | |
| 1296 "lw %[t3], 8(%[dest]) \n\t" | |
| 1297 "lw %[t4], 12(%[dest]) \n\t" | |
| 1298 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
| 1299 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
| 1300 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" | |
| 1301 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" | |
| 1302 "sw %[vector_1], 0(%[dest]) \n\t" | |
| 1303 "sw %[vector_2], 4(%[dest]) \n\t" | |
| 1304 "sw %[vector_3], 8(%[dest]) \n\t" | |
| 1305 "sw %[vector_4], 12(%[dest]) \n\t" | |
| 1306 "add %[dest], %[dest], %[dest_stride] \n\t" | |
| 1307 | |
| 1308 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), | |
| 1309 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | |
| 1310 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), | |
| 1311 [dest] "+&r" (dest) | |
| 1312 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | |
| 1313 ); | |
| 1314 } | |
| 1315 } | |
| 1316 } | |
| 1317 #endif // #if HAVE_DSPR2 | 108 #endif // #if HAVE_DSPR2 |
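Note (editorial addition, not part of the diff above): the mtlo/madd/msub/extp sequences throughout this file mirror the rounding that dct_const_round_shift() performs in the C reference path, where the cospi_*_64 constants are Q14 fixed-point values, 2^13 is added as a bias, and the result is shifted right by 14. A minimal scalar sketch of one such sequence, assuming that convention (the helper name below is illustrative, not from the source tree):

    /* Scalar equivalent of: mtlo const_2_power_13; madd; madd; extp ..., 31
     * (with the extract position set to 45 by the wrdsp at function entry). */
    static int rounded_q14_mac(int a, int cos_a, int b, int cos_b) {
      int64_t acc = 8192;          /* 1 << 13: rounding bias, const_2_power_13 */
      acc += (int64_t)a * cos_a;   /* madd; an msub would subtract instead */
      acc += (int64_t)b * cos_b;
      return (int)(acc >> 14);     /* extp extracts the accumulator >> 14 */
    }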