OLD | NEW |
| 1 /* |
| 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include <assert.h> |
| 12 #include <stdio.h> |
| 13 |
| 14 #include "./vpx_config.h" |
| 15 #include "./vp9_rtcd.h" |
| 16 #include "vp9/common/vp9_common.h" |
| 17 #include "vpx/vpx_integer.h" |
| 18 #include "vpx_ports/mem.h" |
| 19 #include "vp9/common/vp9_filter.h" |
| 20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" |
| 21 |
| 22 #if HAVE_DSPR2 |
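| /* Clamping lookup table: CROP_WIDTH guard entries of 0, the identity map for |
|    0..255, then CROP_WIDTH guard entries of 255. vp9_ff_cropTbl points at the |
|    identity section, so the lbux lookups below clamp filter results to [0, 255]. */ |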
| 23 uint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH]; |
| 24 uint8_t *vp9_ff_cropTbl; |
| 25 |
| 26 void vp9_dsputil_static_init(void) { |
| 27 int i; |
| 28 |
| 29 for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i; |
| 30 |
| 31 for (i = 0; i < CROP_WIDTH; i++) { |
| 32 vp9_ff_cropTbl_a[i] = 0; |
| 33 vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255; |
| 34 } |
| 35 |
| 36 vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH]; |
| 37 } |
| 38 |
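| /* 8-tap horizontal filter for 4-wide blocks that writes its output transposed: |
|    successive filtered pixels of a source row land dst_stride bytes apart (down a |
|    dst column), and each new source row advances dst by a single byte. */ |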
| 39 static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, |
| 40 int32_t src_stride, |
| 41 uint8_t *dst, |
| 42 int32_t dst_stride, |
| 43 const int16_t *filter_x0, |
| 44 int32_t h) { |
| 45 int32_t y; |
| 46 uint8_t *cm = vp9_ff_cropTbl; |
| 47 uint8_t *dst_ptr; |
| 48 int32_t vector1b, vector2b, vector3b, vector4b; |
| 49 int32_t Temp1, Temp2, Temp3, Temp4; |
| 50 uint32_t vector4a = 64; |
| 51 uint32_t tp1, tp2; |
| 52 uint32_t p1, p2, p3, p4; |
| 53 uint32_t tn1, tn2; |
| 54 |
| 55 vector1b = ((const int32_t *)filter_x0)[0]; |
| 56 vector2b = ((const int32_t *)filter_x0)[1]; |
| 57 vector3b = ((const int32_t *)filter_x0)[2]; |
| 58 vector4b = ((const int32_t *)filter_x0)[3]; |
| 59 |
| 60 for (y = h; y--;) { |
| 61 dst_ptr = dst; |
| 62 /* prefetch data to cache memory */ |
| 63 vp9_prefetch_load(src + src_stride); |
| 64 vp9_prefetch_load(src + src_stride + 32); |
| 65 |
| 66 __asm__ __volatile__ ( |
| 67 "ulw %[tp1], 0(%[src]) \n\t" |
| 68 "ulw %[tp2], 4(%[src]) \n\t" |
| 69 |
| 70 /* even 1. pixel */ |
| 71 "mtlo %[vector4a], $ac3 \n\t" |
| 72 "mthi $zero, $ac3 \n\t" |
| 73 "preceu.ph.qbr %[p1], %[tp1] \n\t" |
| 74 "preceu.ph.qbl %[p2], %[tp1] \n\t" |
| 75 "preceu.ph.qbr %[p3], %[tp2] \n\t" |
| 76 "preceu.ph.qbl %[p4], %[tp2] \n\t" |
| 77 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" |
| 78 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" |
| 79 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" |
| 80 "ulw %[tn2], 8(%[src]) \n\t" |
| 81 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" |
| 82 "extp %[Temp1], $ac3, 31 \n\t" |
| 83 |
| 84 /* even 2. pixel */ |
| 85 "mtlo %[vector4a], $ac2 \n\t" |
| 86 "mthi $zero, $ac2 \n\t" |
| 87 "preceu.ph.qbr %[p1], %[tn2] \n\t" |
| 88 "balign %[tn1], %[tn2], 3 \n\t" |
| 89 "balign %[tn2], %[tp2], 3 \n\t" |
| 90 "balign %[tp2], %[tp1], 3 \n\t" |
| 91 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" |
| 92 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" |
| 93 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" |
| 94 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" |
| 95 "extp %[Temp3], $ac2, 31 \n\t" |
| 96 |
| 97 /* odd 1. pixel */ |
| 98 "lbux %[tp1], %[Temp1](%[cm]) \n\t" |
| 99 "mtlo %[vector4a], $ac3 \n\t" |
| 100 "mthi $zero, $ac3 \n\t" |
| 101 "preceu.ph.qbr %[p1], %[tp2] \n\t" |
| 102 "preceu.ph.qbl %[p2], %[tp2] \n\t" |
| 103 "preceu.ph.qbr %[p3], %[tn2] \n\t" |
| 104 "preceu.ph.qbl %[p4], %[tn2] \n\t" |
| 105 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" |
| 106 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" |
| 107 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" |
| 108 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" |
| 109 "extp %[Temp2], $ac3, 31 \n\t" |
| 110 |
| 111 /* odd 2. pixel */ |
| 112 "lbux %[tp2], %[Temp3](%[cm]) \n\t" |
| 113 "mtlo %[vector4a], $ac2 \n\t" |
| 114 "mthi $zero, $ac2 \n\t" |
| 115 "preceu.ph.qbr %[p1], %[tn1] \n\t" |
| 116 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" |
| 117 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" |
| 118 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" |
| 119 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" |
| 120 "extp %[Temp4], $ac2, 31 \n\t" |
| 121 |
| 122 /* clamp */ |
| 123 "lbux %[tn1], %[Temp2](%[cm]) \n\t" |
| 124 "lbux %[p2], %[Temp4](%[cm]) \n\t" |
| 125 |
| 126 /* store bytes */ |
| 127 "sb %[tp1], 0(%[dst_ptr]) \n\t" |
| 128 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" |
| 129 |
| 130 "sb %[tn1], 0(%[dst_ptr]) \n\t" |
| 131 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" |
| 132 |
| 133 "sb %[tp2], 0(%[dst_ptr]) \n\t" |
| 134 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" |
| 135 |
| 136 "sb %[p2], 0(%[dst_ptr]) \n\t" |
| 137 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" |
| 138 |
| 139       : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), |
| 140 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), |
| 141         [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4), |
| 142 [dst_ptr] "+r" (dst_ptr) |
| 143 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), |
| 144 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), |
| 145 [vector4a] "r" (vector4a), |
| 146 [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride) |
| 147 ); |
| 148 |
| 149 /* Next row... */ |
| 150 src += src_stride; |
| 151 dst += 1; |
| 152 } |
| 153 } |
| 154 |
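| /* Same transposed-output scheme for 8-wide blocks: even and odd output pixels |
|    are interleaved through dst_ptr and odd_dst with a pitch of 2 * dst_stride. */ |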
| 155 static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, |
| 156 int32_t src_stride, |
| 157 uint8_t *dst, |
| 158 int32_t dst_stride, |
| 159 const int16_t *filter_x0, |
| 160 int32_t h) { |
| 161 int32_t y; |
| 162 uint8_t *cm = vp9_ff_cropTbl; |
| 163 uint8_t *dst_ptr; |
| 164 uint32_t vector4a = 64; |
| 165 int32_t vector1b, vector2b, vector3b, vector4b; |
| 166 int32_t Temp1, Temp2, Temp3; |
| 167 uint32_t tp1, tp2, tp3; |
| 168 uint32_t p1, p2, p3, p4, n1; |
| 169 uint8_t *odd_dst; |
| 170 uint32_t dst_pitch_2 = (dst_stride << 1); |
| 171 |
| 172 vector1b = ((const int32_t *)filter_x0)[0]; |
| 173 vector2b = ((const int32_t *)filter_x0)[1]; |
| 174 vector3b = ((const int32_t *)filter_x0)[2]; |
| 175 vector4b = ((const int32_t *)filter_x0)[3]; |
| 176 |
| 177 for (y = h; y--;) { |
| 178 /* prefetch data to cache memory */ |
| 179 vp9_prefetch_load(src + src_stride); |
| 180 vp9_prefetch_load(src + src_stride + 32); |
| 181 |
| 182 dst_ptr = dst; |
| 183 odd_dst = (dst_ptr + dst_stride); |
| 184 |
| 185 __asm__ __volatile__ ( |
| 186 "ulw %[tp2], 0(%[src]) \n\t" |
| 187 "ulw %[tp1], 4(%[src]) \n\t" |
| 188 |
| 189 /* even 1. pixel */ |
| 190 "mtlo %[vector4a], $ac3 \n\t" |
| 191 "mthi $zero, $ac3 \n\t" |
| 192 "mtlo %[vector4a], $ac2 \n\t" |
| 193 "mthi $zero, $ac2 \n\t" |
| 194 "preceu.ph.qbr %[p1], %[tp2] \n\t" |
| 195 "preceu.ph.qbl %[p2], %[tp2] \n\t" |
| 196 "preceu.ph.qbr %[p3], %[tp1] \n\t" |
| 197 "preceu.ph.qbl %[p4], %[tp1] \n\t" |
| 198 "ulw %[tp3], 8(%[src]) \n\t" |
| 199 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" |
| 200 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" |
| 201 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" |
| 202 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" |
| 203 "extp %[Temp1], $ac3, 31 \n\t" |
| 204 |
| 205 /* even 2. pixel */ |
| 206 "preceu.ph.qbr %[p1], %[tp3] \n\t" |
| 207 "preceu.ph.qbl %[n1], %[tp3] \n\t" |
| 208 "ulw %[tp2], 12(%[src]) \n\t" |
| 209 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" |
| 210 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" |
| 211 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" |
| 212 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" |
| 213 "extp %[Temp3], $ac2, 31 \n\t" |
| 214 |
| 215 /* even 3. pixel */ |
| 216 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" |
| 217 "mtlo %[vector4a], $ac1 \n\t" |
| 218 "mthi $zero, $ac1 \n\t" |
| 219 "preceu.ph.qbr %[p2], %[tp2] \n\t" |
| 220 "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" |
| 221 "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" |
| 222 "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" |
| 223 "lbux %[tp3], %[Temp3](%[cm]) \n\t" |
| 224 "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" |
| 225 "extp %[p3], $ac1, 31 \n\t" |
| 226 |
| 227 /* even 4. pixel */ |
| 228 "mtlo %[vector4a], $ac2 \n\t" |
| 229 "mthi $zero, $ac2 \n\t" |
| 230 "mtlo %[vector4a], $ac3 \n\t" |
| 231 "mthi $zero, $ac3 \n\t" |
| 232 "sb %[Temp2], 0(%[dst_ptr]) \n\t" |
| 233 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" |
| 234 "sb %[tp3], 0(%[dst_ptr]) \n\t" |
| 235 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" |
| 236 |
| 237 "ulw %[tp1], 1(%[src]) \n\t" |
| 238 "ulw %[tp3], 5(%[src]) \n\t" |
| 239 |
| 240 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" |
| 241 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" |
| 242 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" |
| 243 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" |
| 244 "extp %[Temp3], $ac2, 31 \n\t" |
| 245 |
| 246 "lbux %[tp2], %[p3](%[cm]) \n\t" |
| 247 |
| 248 /* odd 1. pixel */ |
| 249 "mtlo %[vector4a], $ac1 \n\t" |
| 250 "mthi $zero, $ac1 \n\t" |
| 251 "preceu.ph.qbr %[p1], %[tp1] \n\t" |
| 252 "preceu.ph.qbl %[p2], %[tp1] \n\t" |
| 253 "preceu.ph.qbr %[p3], %[tp3] \n\t" |
| 254 "preceu.ph.qbl %[p4], %[tp3] \n\t" |
| 255 "sb %[tp2], 0(%[dst_ptr]) \n\t" |
| 256 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" |
| 257 "ulw %[tp2], 9(%[src]) \n\t" |
| 258 |
| 259 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" |
| 260 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" |
| 261 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" |
| 262 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" |
| 263 "extp %[Temp2], $ac3, 31 \n\t" |
| 264 |
| 265 /* odd 2. pixel */ |
| 266 "lbux %[tp1], %[Temp3](%[cm]) \n\t" |
| 267 "mtlo %[vector4a], $ac3 \n\t" |
| 268 "mthi $zero, $ac3 \n\t" |
| 269 "mtlo %[vector4a], $ac2 \n\t" |
| 270 "mthi $zero, $ac2 \n\t" |
| 271 "preceu.ph.qbr %[p1], %[tp2] \n\t" |
| 272 "preceu.ph.qbl %[n1], %[tp2] \n\t" |
| 273 "ulw %[Temp1], 13(%[src]) \n\t" |
| 274 "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" |
| 275 "sb %[tp1], 0(%[dst_ptr]) \n\t" |
| 276 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" |
| 277 "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" |
| 278 "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" |
| 279 "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" |
| 280 "extp %[Temp3], $ac1, 31 \n\t" |
| 281 |
| 282 /* odd 3. pixel */ |
| 283 "lbux %[tp3], %[Temp2](%[cm]) \n\t" |
| 284 "preceu.ph.qbr %[p2], %[Temp1] \n\t" |
| 285 "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" |
| 286 "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" |
| 287 "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" |
| 288 "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" |
| 289 "extp %[Temp2], $ac3, 31 \n\t" |
| 290 |
| 291 /* odd 4. pixel */ |
| 292 "sb %[tp3], 0(%[odd_dst]) \n\t" |
| 293 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" |
| 294 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" |
| 295 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" |
| 296 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" |
| 297 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" |
| 298 "extp %[Temp1], $ac2, 31 \n\t" |
| 299 |
| 300 /* clamp */ |
| 301 "lbux %[p4], %[Temp3](%[cm]) \n\t" |
| 302 "lbux %[p2], %[Temp2](%[cm]) \n\t" |
| 303 "lbux %[n1], %[Temp1](%[cm]) \n\t" |
| 304 |
| 305 /* store bytes */ |
| 306 "sb %[p4], 0(%[odd_dst]) \n\t" |
| 307 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" |
| 308 |
| 309 "sb %[p2], 0(%[odd_dst]) \n\t" |
| 310 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" |
| 311 |
| 312 "sb %[n1], 0(%[odd_dst]) \n\t" |
| 313 |
| 314 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), |
| 315 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), |
| 316 [n1] "=&r" (n1), |
| 317 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), |
| 318 [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst) |
| 319 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), |
| 320 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), |
| 321 [vector4a] "r" (vector4a), [cm] "r" (cm), |
| 322 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) |
| 323 ); |
| 324 |
| 325 /* Next row... */ |
| 326 src += src_stride; |
| 327 dst += 1; |
| 328 } |
| 329 } |
| 330 |
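| /* Transposed 8-tap horizontal filter for 16-wide chunks: each inner-loop pass |
|    produces 8 even and 8 odd output pixels on the three DSP accumulators, storing |
|    them through dst and odd_dst. count is the number of 16-pixel chunks (w / 16). */ |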
| 331 static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr, |
| 332 int32_t src_stride, |
| 333 uint8_t *dst_ptr, |
| 334 int32_t dst_stride, |
| 335 const int16_t *filter_x0, |
| 336 int32_t h, |
| 337 int32_t count) { |
| 338 int32_t c, y; |
| 339 const uint8_t *src; |
| 340 uint8_t *dst; |
| 341 uint8_t *cm = vp9_ff_cropTbl; |
| 342 uint32_t vector_64 = 64; |
| 343 int32_t filter12, filter34, filter56, filter78; |
| 344 int32_t Temp1, Temp2, Temp3; |
| 345 uint32_t qload1, qload2; |
| 346 uint32_t p1, p2, p3, p4, p5; |
| 347 uint32_t st1, st2, st3; |
| 348 uint32_t dst_pitch_2 = (dst_stride << 1); |
| 349 uint8_t *odd_dst; |
| 350 |
| 351 filter12 = ((const int32_t *)filter_x0)[0]; |
| 352 filter34 = ((const int32_t *)filter_x0)[1]; |
| 353 filter56 = ((const int32_t *)filter_x0)[2]; |
| 354 filter78 = ((const int32_t *)filter_x0)[3]; |
| 355 |
| 356 for (y = h; y--;) { |
| 357 /* prefetch data to cache memory */ |
| 358 vp9_prefetch_load(src_ptr + src_stride); |
| 359 vp9_prefetch_load(src_ptr + src_stride + 32); |
| 360 |
| 361 src = src_ptr; |
| 362 dst = dst_ptr; |
| 363 |
| 364 odd_dst = (dst + dst_stride); |
| 365 |
| 366 for (c = 0; c < count; c++) { |
| 367 __asm__ __volatile__ ( |
| 368 "ulw %[qload1], 0(%[src]) \n
\t" |
| 369 "ulw %[qload2], 4(%[src]) \n
\t" |
| 370 |
| 371 /* even 1. pixel */ |
| 372 "mtlo %[vector_64], $ac1 \n
\t" /* even 1 */ |
| 373 "mthi $zero, $ac1 \n
\t" |
| 374 "mtlo %[vector_64], $ac2 \n
\t" /* even 2 */ |
| 375 "mthi $zero, $ac2 \n
\t" |
| 376 "preceu.ph.qbr %[p3], %[qload2] \n
\t" |
| 377 "preceu.ph.qbl %[p4], %[qload2] \n
\t" |
| 378 "preceu.ph.qbr %[p1], %[qload1] \n
\t" |
| 379 "preceu.ph.qbl %[p2], %[qload1] \n
\t" |
| 380 "ulw %[qload2], 8(%[src]) \n
\t" |
| 381 "dpa.w.ph $ac1, %[p1], %[filter12] \n
\t" /* even 1 */ |
| 382 "dpa.w.ph $ac1, %[p2], %[filter34] \n
\t" /* even 1 */ |
| 383 "dpa.w.ph $ac1, %[p3], %[filter56] \n
\t" /* even 1 */ |
| 384 "dpa.w.ph $ac1, %[p4], %[filter78] \n
\t" /* even 1 */ |
| 385 "extp %[Temp1], $ac1, 31 \n
\t" /* even 1 */ |
| 386 |
| 387 /* even 2. pixel */ |
| 388 "mtlo %[vector_64], $ac3 \n
\t" /* even 3 */ |
| 389 "mthi $zero, $ac3 \n
\t" |
| 390 "preceu.ph.qbr %[p1], %[qload2] \n
\t" |
| 391 "preceu.ph.qbl %[p5], %[qload2] \n
\t" |
| 392 "ulw %[qload1], 12(%[src]) \n
\t" |
| 393 "dpa.w.ph $ac2, %[p2], %[filter12] \n
\t" /* even 1 */ |
| 394 "dpa.w.ph $ac2, %[p3], %[filter34] \n
\t" /* even 1 */ |
| 395 "dpa.w.ph $ac2, %[p4], %[filter56] \n
\t" /* even 1 */ |
| 396 "dpa.w.ph $ac2, %[p1], %[filter78] \n
\t" /* even 1 */ |
| 397 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 1 */ |
| 398 "extp %[Temp2], $ac2, 31 \n
\t" /* even 1 */ |
| 399 |
| 400 /* even 3. pixel */ |
| 401 "mtlo %[vector_64], $ac1 \n
\t" /* even 4 */ |
| 402 "mthi $zero, $ac1 \n
\t" |
| 403 "preceu.ph.qbr %[p2], %[qload1] \n
\t" |
| 404 "sb %[st1], 0(%[dst]) \n
\t" /* even 1 */ |
| 405 "addu %[dst], %[dst], %[dst_pitch_2]
\n\t" |
| 406 "dpa.w.ph $ac3, %[p3], %[filter12] \n
\t" /* even 3 */ |
| 407 "dpa.w.ph $ac3, %[p4], %[filter34] \n
\t" /* even 3 */ |
| 408 "dpa.w.ph $ac3, %[p1], %[filter56] \n
\t" /* even 3 */ |
| 409 "dpa.w.ph $ac3, %[p5], %[filter78] \n
\t" /* even 3 */ |
| 410 "extp %[Temp3], $ac3, 31 \n
\t" /* even 3 */ |
| 411 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 1 */ |
| 412 |
| 413 /* even 4. pixel */ |
| 414 "mtlo %[vector_64], $ac2 \n
\t" /* even 5 */ |
| 415 "mthi $zero, $ac2 \n
\t" |
| 416 "preceu.ph.qbl %[p3], %[qload1] \n
\t" |
| 417 "sb %[st2], 0(%[dst]) \n
\t" /* even 2 */ |
| 418 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" |
| 419 "ulw %[qload2], 16(%[src]) \n
\t" |
| 420 "dpa.w.ph $ac1, %[p4], %[filter12] \n
\t" /* even 4 */ |
| 421 "dpa.w.ph $ac1, %[p1], %[filter34] \n
\t" /* even 4 */ |
| 422 "dpa.w.ph $ac1, %[p5], %[filter56] \n
\t" /* even 4 */ |
| 423 "dpa.w.ph $ac1, %[p2], %[filter78] \n
\t" /* even 4 */ |
| 424 "extp %[Temp1], $ac1, 31 \n
\t" /* even 4 */ |
| 425 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 3 */ |
| 426 |
| 427 /* even 5. pixel */ |
| 428 "mtlo %[vector_64], $ac3 \n
\t" /* even 6 */ |
| 429 "mthi $zero, $ac3 \n
\t" |
| 430 "preceu.ph.qbr %[p4], %[qload2] \n
\t" |
| 431 "sb %[st3], 0(%[dst]) \n
\t" /* even 3 */ |
| 432 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" |
| 433 "dpa.w.ph $ac2, %[p1], %[filter12] \n
\t" /* even 5 */ |
| 434 "dpa.w.ph $ac2, %[p5], %[filter34] \n
\t" /* even 5 */ |
| 435 "dpa.w.ph $ac2, %[p2], %[filter56] \n
\t" /* even 5 */ |
| 436 "dpa.w.ph $ac2, %[p3], %[filter78] \n
\t" /* even 5 */ |
| 437 "extp %[Temp2], $ac2, 31 \n
\t" /* even 5 */ |
| 438 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 4 */ |
| 439 |
| 440 /* even 6. pixel */ |
| 441 "mtlo %[vector_64], $ac1 \n
\t" /* even 7 */ |
| 442 "mthi $zero, $ac1 \n
\t" |
| 443 "preceu.ph.qbl %[p1], %[qload2] \n
\t" |
| 444 "sb %[st1], 0(%[dst]) \n
\t" /* even 4 */ |
| 445 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" |
| 446 "ulw %[qload1], 20(%[src]) \n
\t" |
| 447 "dpa.w.ph $ac3, %[p5], %[filter12] \n
\t" /* even 6 */ |
| 448 "dpa.w.ph $ac3, %[p2], %[filter34] \n
\t" /* even 6 */ |
| 449 "dpa.w.ph $ac3, %[p3], %[filter56] \n
\t" /* even 6 */ |
| 450 "dpa.w.ph $ac3, %[p4], %[filter78] \n
\t" /* even 6 */ |
| 451 "extp %[Temp3], $ac3, 31 \n
\t" /* even 6 */ |
| 452 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 5 */ |
| 453 |
| 454 /* even 7. pixel */ |
| 455 "mtlo %[vector_64], $ac2 \n
\t" /* even 8 */ |
| 456 "mthi $zero, $ac2 \n
\t" |
| 457 "preceu.ph.qbr %[p5], %[qload1] \n
\t" |
| 458 "sb %[st2], 0(%[dst]) \n
\t" /* even 5 */ |
| 459 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" |
| 460 "dpa.w.ph $ac1, %[p2], %[filter12] \n
\t" /* even 7 */ |
| 461 "dpa.w.ph $ac1, %[p3], %[filter34] \n
\t" /* even 7 */ |
| 462 "dpa.w.ph $ac1, %[p4], %[filter56] \n
\t" /* even 7 */ |
| 463 "dpa.w.ph $ac1, %[p1], %[filter78] \n
\t" /* even 7 */ |
| 464 "extp %[Temp1], $ac1, 31 \n
\t" /* even 7 */ |
| 465 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 6 */ |
| 466 |
| 467 /* even 8. pixel */ |
| 468 "mtlo %[vector_64], $ac3 \n
\t" /* odd 1 */ |
| 469 "mthi $zero, $ac3 \n
\t" |
| 470 "dpa.w.ph $ac2, %[p3], %[filter12] \n
\t" /* even 8 */ |
| 471 "dpa.w.ph $ac2, %[p4], %[filter34] \n
\t" /* even 8 */ |
| 472 "sb %[st3], 0(%[dst]) \n
\t" /* even 6 */ |
| 473 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" |
| 474 "dpa.w.ph $ac2, %[p1], %[filter56] \n
\t" /* even 8 */ |
| 475 "dpa.w.ph $ac2, %[p5], %[filter78] \n
\t" /* even 8 */ |
| 476 "extp %[Temp2], $ac2, 31 \n
\t" /* even 8 */ |
| 477 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 7 */ |
| 478 |
| 479 /* ODD pixels */ |
| 480 "ulw %[qload1], 1(%[src]) \n
\t" |
| 481 "ulw %[qload2], 5(%[src]) \n
\t" |
| 482 |
| 483 /* odd 1. pixel */ |
| 484 "mtlo %[vector_64], $ac1 \n
\t" /* odd 2 */ |
| 485 "mthi $zero, $ac1 \n
\t" |
| 486 "preceu.ph.qbr %[p1], %[qload1] \n
\t" |
| 487 "preceu.ph.qbl %[p2], %[qload1] \n
\t" |
| 488 "preceu.ph.qbr %[p3], %[qload2] \n
\t" |
| 489 "preceu.ph.qbl %[p4], %[qload2] \n
\t" |
| 490 "sb %[st1], 0(%[dst]) \n
\t" /* even 7 */ |
| 491 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" |
| 492 "ulw %[qload2], 9(%[src]) \n
\t" |
| 493 "dpa.w.ph $ac3, %[p1], %[filter12] \n
\t" /* odd 1 */ |
| 494 "dpa.w.ph $ac3, %[p2], %[filter34] \n
\t" /* odd 1 */ |
| 495 "dpa.w.ph $ac3, %[p3], %[filter56] \n
\t" /* odd 1 */ |
| 496 "dpa.w.ph $ac3, %[p4], %[filter78] \n
\t" /* odd 1 */ |
| 497 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 1 */ |
| 498 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 8 */ |
| 499 |
| 500 /* odd 2. pixel */ |
| 501 "mtlo %[vector_64], $ac2 \n
\t" /* odd 3 */ |
| 502 "mthi $zero, $ac2 \n
\t" |
| 503 "preceu.ph.qbr %[p1], %[qload2] \n
\t" |
| 504 "preceu.ph.qbl %[p5], %[qload2] \n
\t" |
| 505 "sb %[st2], 0(%[dst]) \n
\t" /* even 8 */ |
| 506 "ulw %[qload1], 13(%[src]) \n
\t" |
| 507 "dpa.w.ph $ac1, %[p2], %[filter12] \n
\t" /* odd 2 */ |
| 508 "dpa.w.ph $ac1, %[p3], %[filter34] \n
\t" /* odd 2 */ |
| 509 "dpa.w.ph $ac1, %[p4], %[filter56] \n
\t" /* odd 2 */ |
| 510 "dpa.w.ph $ac1, %[p1], %[filter78] \n
\t" /* odd 2 */ |
| 511 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 2 */ |
| 512 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 1 */ |
| 513 |
| 514 /* odd 3. pixel */ |
| 515 "mtlo %[vector_64], $ac3 \n
\t" /* odd 4 */ |
| 516 "mthi $zero, $ac3 \n
\t" |
| 517 "preceu.ph.qbr %[p2], %[qload1] \n
\t" |
| 518 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 1 */ |
| 519 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 520 "dpa.w.ph $ac2, %[p3], %[filter12] \n
\t" /* odd 3 */ |
| 521 "dpa.w.ph $ac2, %[p4], %[filter34] \n
\t" /* odd 3 */ |
| 522 "dpa.w.ph $ac2, %[p1], %[filter56] \n
\t" /* odd 3 */ |
| 523 "dpa.w.ph $ac2, %[p5], %[filter78] \n
\t" /* odd 3 */ |
| 524 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 3 */ |
| 525 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 2 */ |
| 526 |
| 527 /* odd 4. pixel */ |
| 528 "mtlo %[vector_64], $ac1 \n
\t" /* odd 5 */ |
| 529 "mthi $zero, $ac1 \n
\t" |
| 530 "preceu.ph.qbl %[p3], %[qload1] \n
\t" |
| 531 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 2 */ |
| 532 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 533 "ulw %[qload2], 17(%[src]) \n
\t" |
| 534 "dpa.w.ph $ac3, %[p4], %[filter12] \n
\t" /* odd 4 */ |
| 535 "dpa.w.ph $ac3, %[p1], %[filter34] \n
\t" /* odd 4 */ |
| 536 "dpa.w.ph $ac3, %[p5], %[filter56] \n
\t" /* odd 4 */ |
| 537 "dpa.w.ph $ac3, %[p2], %[filter78] \n
\t" /* odd 4 */ |
| 538 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 4 */ |
| 539 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 3 */ |
| 540 |
| 541 /* odd 5. pixel */ |
| 542 "mtlo %[vector_64], $ac2 \n
\t" /* odd 6 */ |
| 543 "mthi $zero, $ac2 \n
\t" |
| 544 "preceu.ph.qbr %[p4], %[qload2] \n
\t" |
| 545 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 3 */ |
| 546 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 547 "dpa.w.ph $ac1, %[p1], %[filter12] \n
\t" /* odd 5 */ |
| 548 "dpa.w.ph $ac1, %[p5], %[filter34] \n
\t" /* odd 5 */ |
| 549 "dpa.w.ph $ac1, %[p2], %[filter56] \n
\t" /* odd 5 */ |
| 550 "dpa.w.ph $ac1, %[p3], %[filter78] \n
\t" /* odd 5 */ |
| 551 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 5 */ |
| 552 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 4 */ |
| 553 |
| 554 /* odd 6. pixel */ |
| 555 "mtlo %[vector_64], $ac3 \n
\t" /* odd 7 */ |
| 556 "mthi $zero, $ac3 \n
\t" |
| 557 "preceu.ph.qbl %[p1], %[qload2] \n
\t" |
| 558 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 4 */ |
| 559 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 560 "ulw %[qload1], 21(%[src]) \n
\t" |
| 561 "dpa.w.ph $ac2, %[p5], %[filter12] \n
\t" /* odd 6 */ |
| 562 "dpa.w.ph $ac2, %[p2], %[filter34] \n
\t" /* odd 6 */ |
| 563 "dpa.w.ph $ac2, %[p3], %[filter56] \n
\t" /* odd 6 */ |
| 564 "dpa.w.ph $ac2, %[p4], %[filter78] \n
\t" /* odd 6 */ |
| 565 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 6 */ |
| 566 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 5 */ |
| 567 |
| 568 /* odd 7. pixel */ |
| 569 "mtlo %[vector_64], $ac1 \n
\t" /* odd 8 */ |
| 570 "mthi $zero, $ac1 \n
\t" |
| 571 "preceu.ph.qbr %[p5], %[qload1] \n
\t" |
| 572 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 5 */ |
| 573 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 574 "dpa.w.ph $ac3, %[p2], %[filter12] \n
\t" /* odd 7 */ |
| 575 "dpa.w.ph $ac3, %[p3], %[filter34] \n
\t" /* odd 7 */ |
| 576 "dpa.w.ph $ac3, %[p4], %[filter56] \n
\t" /* odd 7 */ |
| 577 "dpa.w.ph $ac3, %[p1], %[filter78] \n
\t" /* odd 7 */ |
| 578 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 7 */ |
| 579 |
| 580 /* odd 8. pixel */ |
| 581 "dpa.w.ph $ac1, %[p3], %[filter12] \n
\t" /* odd 8 */ |
| 582 "dpa.w.ph $ac1, %[p4], %[filter34] \n
\t" /* odd 8 */ |
| 583 "dpa.w.ph $ac1, %[p1], %[filter56] \n
\t" /* odd 8 */ |
| 584 "dpa.w.ph $ac1, %[p5], %[filter78] \n
\t" /* odd 8 */ |
| 585 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 8 */ |
| 586 |
| 587 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 6 */ |
| 588 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 7 */ |
| 589 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 8 */ |
| 590 |
| 591 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 6 */ |
| 592 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 593 |
| 594 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 7 */ |
| 595 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 596 |
| 597 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 8 */ |
| 598 |
| 599 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), |
| 600 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), |
| 601 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), |
| 602 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), |
| 603 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) |
| 604 : [filter12] "r" (filter12), [filter34] "r" (filter34), |
| 605 [filter56] "r" (filter56), [filter78] "r" (filter78), |
| 606 [vector_64] "r" (vector_64), [cm] "r" (cm), |
| 607 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) |
| 608 ); |
| 609 |
| 610 src += 16; |
| 611 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); |
| 612 odd_dst = (dst + dst_stride); |
| 613 } |
| 614 |
| 615 /* Next row... */ |
| 616 src_ptr += src_stride; |
| 617 |
| 618 dst_ptr += 1; |
| 619 } |
| 620 } |
| 621 |
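| /* Identical to the 16-wide version, but hard-wired to four 16-pixel chunks per |
|    row (w == 64) and with one additional prefetch per source row. */ |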
| 622 static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr, |
| 623 int32_t src_stride, |
| 624 uint8_t *dst_ptr, |
| 625 int32_t dst_stride, |
| 626 const int16_t *filter_x0, |
| 627 int32_t h) { |
| 628 int32_t c, y; |
| 629 const uint8_t *src; |
| 630 uint8_t *dst; |
| 631 uint8_t *cm = vp9_ff_cropTbl; |
| 632 uint32_t vector_64 = 64; |
| 633 int32_t filter12, filter34, filter56, filter78; |
| 634 int32_t Temp1, Temp2, Temp3; |
| 635 uint32_t qload1, qload2; |
| 636 uint32_t p1, p2, p3, p4, p5; |
| 637 uint32_t st1, st2, st3; |
| 638 uint32_t dst_pitch_2 = (dst_stride << 1); |
| 639 uint8_t *odd_dst; |
| 640 |
| 641 filter12 = ((const int32_t *)filter_x0)[0]; |
| 642 filter34 = ((const int32_t *)filter_x0)[1]; |
| 643 filter56 = ((const int32_t *)filter_x0)[2]; |
| 644 filter78 = ((const int32_t *)filter_x0)[3]; |
| 645 |
| 646 for (y = h; y--;) { |
| 647 /* prefetch data to cache memory */ |
| 648 vp9_prefetch_load(src_ptr + src_stride); |
| 649 vp9_prefetch_load(src_ptr + src_stride + 32); |
| 650 vp9_prefetch_load(src_ptr + src_stride + 64); |
| 651 |
| 652 src = src_ptr; |
| 653 dst = dst_ptr; |
| 654 |
| 655 odd_dst = (dst + dst_stride); |
| 656 |
| 657 for (c = 0; c < 4; c++) { |
| 658 __asm__ __volatile__ ( |
| 659 "ulw %[qload1], 0(%[src]) \n
\t" |
| 660 "ulw %[qload2], 4(%[src]) \n
\t" |
| 661 |
| 662 /* even 1. pixel */ |
| 663 "mtlo %[vector_64], $ac1 \n
\t" /* even 1 */ |
| 664 "mthi $zero, $ac1 \n
\t" |
| 665 "mtlo %[vector_64], $ac2 \n
\t" /* even 2 */ |
| 666 "mthi $zero, $ac2 \n
\t" |
| 667 "preceu.ph.qbr %[p3], %[qload2] \n
\t" |
| 668 "preceu.ph.qbl %[p4], %[qload2] \n
\t" |
| 669 "preceu.ph.qbr %[p1], %[qload1] \n
\t" |
| 670 "preceu.ph.qbl %[p2], %[qload1] \n
\t" |
| 671 "ulw %[qload2], 8(%[src]) \n
\t" |
| 672 "dpa.w.ph $ac1, %[p1], %[filter12] \n
\t" /* even 1 */ |
| 673 "dpa.w.ph $ac1, %[p2], %[filter34] \n
\t" /* even 1 */ |
| 674 "dpa.w.ph $ac1, %[p3], %[filter56] \n
\t" /* even 1 */ |
| 675 "dpa.w.ph $ac1, %[p4], %[filter78] \n
\t" /* even 1 */ |
| 676 "extp %[Temp1], $ac1, 31 \n
\t" /* even 1 */ |
| 677 |
| 678 /* even 2. pixel */ |
| 679 "mtlo %[vector_64], $ac3 \n
\t" /* even 3 */ |
| 680 "mthi $zero, $ac3 \n
\t" |
| 681 "preceu.ph.qbr %[p1], %[qload2] \n
\t" |
| 682 "preceu.ph.qbl %[p5], %[qload2] \n
\t" |
| 683 "ulw %[qload1], 12(%[src]) \n
\t" |
| 684 "dpa.w.ph $ac2, %[p2], %[filter12] \n
\t" /* even 1 */ |
| 685 "dpa.w.ph $ac2, %[p3], %[filter34] \n
\t" /* even 1 */ |
| 686 "dpa.w.ph $ac2, %[p4], %[filter56] \n
\t" /* even 1 */ |
| 687 "dpa.w.ph $ac2, %[p1], %[filter78] \n
\t" /* even 1 */ |
| 688 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 1 */ |
| 689 "extp %[Temp2], $ac2, 31 \n
\t" /* even 1 */ |
| 690 |
| 691 /* even 3. pixel */ |
| 692 "mtlo %[vector_64], $ac1 \n
\t" /* even 4 */ |
| 693 "mthi $zero, $ac1 \n
\t" |
| 694 "preceu.ph.qbr %[p2], %[qload1] \n
\t" |
| 695 "sb %[st1], 0(%[dst]) \n
\t" /* even 1 */ |
| 696 "addu %[dst], %[dst], %[dst_pitch_2]
\n\t" |
| 697 "dpa.w.ph $ac3, %[p3], %[filter12] \n
\t" /* even 3 */ |
| 698 "dpa.w.ph $ac3, %[p4], %[filter34] \n
\t" /* even 3 */ |
| 699 "dpa.w.ph $ac3, %[p1], %[filter56] \n
\t" /* even 3 */ |
| 700 "dpa.w.ph $ac3, %[p5], %[filter78] \n
\t" /* even 3 */ |
| 701 "extp %[Temp3], $ac3, 31 \n
\t" /* even 3 */ |
| 702 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 1 */ |
| 703 |
| 704 /* even 4. pixel */ |
| 705 "mtlo %[vector_64], $ac2 \n
\t" /* even 5 */ |
| 706 "mthi $zero, $ac2 \n
\t" |
| 707 "preceu.ph.qbl %[p3], %[qload1] \n
\t" |
| 708 "sb %[st2], 0(%[dst]) \n
\t" /* even 2 */ |
| 709 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" |
| 710 "ulw %[qload2], 16(%[src]) \n
\t" |
| 711 "dpa.w.ph $ac1, %[p4], %[filter12] \n
\t" /* even 4 */ |
| 712 "dpa.w.ph $ac1, %[p1], %[filter34] \n
\t" /* even 4 */ |
| 713 "dpa.w.ph $ac1, %[p5], %[filter56] \n
\t" /* even 4 */ |
| 714 "dpa.w.ph $ac1, %[p2], %[filter78] \n
\t" /* even 4 */ |
| 715 "extp %[Temp1], $ac1, 31 \n
\t" /* even 4 */ |
| 716 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 3 */ |
| 717 |
| 718 /* even 5. pixel */ |
| 719 "mtlo %[vector_64], $ac3 \n
\t" /* even 6 */ |
| 720 "mthi $zero, $ac3 \n
\t" |
| 721 "preceu.ph.qbr %[p4], %[qload2] \n
\t" |
| 722 "sb %[st3], 0(%[dst]) \n
\t" /* even 3 */ |
| 723 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" |
| 724 "dpa.w.ph $ac2, %[p1], %[filter12] \n
\t" /* even 5 */ |
| 725 "dpa.w.ph $ac2, %[p5], %[filter34] \n
\t" /* even 5 */ |
| 726 "dpa.w.ph $ac2, %[p2], %[filter56] \n
\t" /* even 5 */ |
| 727 "dpa.w.ph $ac2, %[p3], %[filter78] \n
\t" /* even 5 */ |
| 728 "extp %[Temp2], $ac2, 31 \n
\t" /* even 5 */ |
| 729 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 4 */ |
| 730 |
| 731 /* even 6. pixel */ |
| 732 "mtlo %[vector_64], $ac1 \n
\t" /* even 7 */ |
| 733 "mthi $zero, $ac1 \n
\t" |
| 734 "preceu.ph.qbl %[p1], %[qload2] \n
\t" |
| 735 "sb %[st1], 0(%[dst]) \n
\t" /* even 4 */ |
| 736 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" |
| 737 "ulw %[qload1], 20(%[src]) \n
\t" |
| 738 "dpa.w.ph $ac3, %[p5], %[filter12] \n
\t" /* even 6 */ |
| 739 "dpa.w.ph $ac3, %[p2], %[filter34] \n
\t" /* even 6 */ |
| 740 "dpa.w.ph $ac3, %[p3], %[filter56] \n
\t" /* even 6 */ |
| 741 "dpa.w.ph $ac3, %[p4], %[filter78] \n
\t" /* even 6 */ |
| 742 "extp %[Temp3], $ac3, 31 \n
\t" /* even 6 */ |
| 743 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 5 */ |
| 744 |
| 745 /* even 7. pixel */ |
| 746 "mtlo %[vector_64], $ac2 \n
\t" /* even 8 */ |
| 747 "mthi $zero, $ac2 \n
\t" |
| 748 "preceu.ph.qbr %[p5], %[qload1] \n
\t" |
| 749 "sb %[st2], 0(%[dst]) \n
\t" /* even 5 */ |
| 750 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" |
| 751 "dpa.w.ph $ac1, %[p2], %[filter12] \n
\t" /* even 7 */ |
| 752 "dpa.w.ph $ac1, %[p3], %[filter34] \n
\t" /* even 7 */ |
| 753 "dpa.w.ph $ac1, %[p4], %[filter56] \n
\t" /* even 7 */ |
| 754 "dpa.w.ph $ac1, %[p1], %[filter78] \n
\t" /* even 7 */ |
| 755 "extp %[Temp1], $ac1, 31 \n
\t" /* even 7 */ |
| 756 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 6 */ |
| 757 |
| 758 /* even 8. pixel */ |
| 759 "mtlo %[vector_64], $ac3 \n
\t" /* odd 1 */ |
| 760 "mthi $zero, $ac3 \n
\t" |
| 761 "dpa.w.ph $ac2, %[p3], %[filter12] \n
\t" /* even 8 */ |
| 762 "dpa.w.ph $ac2, %[p4], %[filter34] \n
\t" /* even 8 */ |
| 763 "sb %[st3], 0(%[dst]) \n
\t" /* even 6 */ |
| 764 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" |
| 765 "dpa.w.ph $ac2, %[p1], %[filter56] \n
\t" /* even 8 */ |
| 766 "dpa.w.ph $ac2, %[p5], %[filter78] \n
\t" /* even 8 */ |
| 767 "extp %[Temp2], $ac2, 31 \n
\t" /* even 8 */ |
| 768 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 7 */ |
| 769 |
| 770 /* ODD pixels */ |
| 771 "ulw %[qload1], 1(%[src]) \n
\t" |
| 772 "ulw %[qload2], 5(%[src]) \n
\t" |
| 773 |
| 774 /* odd 1. pixel */ |
| 775 "mtlo %[vector_64], $ac1 \n
\t" /* odd 2 */ |
| 776 "mthi $zero, $ac1 \n
\t" |
| 777 "preceu.ph.qbr %[p1], %[qload1] \n
\t" |
| 778 "preceu.ph.qbl %[p2], %[qload1] \n
\t" |
| 779 "preceu.ph.qbr %[p3], %[qload2] \n
\t" |
| 780 "preceu.ph.qbl %[p4], %[qload2] \n
\t" |
| 781 "sb %[st1], 0(%[dst]) \n
\t" /* even 7 */ |
| 782 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" |
| 783 "ulw %[qload2], 9(%[src]) \n
\t" |
| 784 "dpa.w.ph $ac3, %[p1], %[filter12] \n
\t" /* odd 1 */ |
| 785 "dpa.w.ph $ac3, %[p2], %[filter34] \n
\t" /* odd 1 */ |
| 786 "dpa.w.ph $ac3, %[p3], %[filter56] \n
\t" /* odd 1 */ |
| 787 "dpa.w.ph $ac3, %[p4], %[filter78] \n
\t" /* odd 1 */ |
| 788 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 1 */ |
| 789 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 8 */ |
| 790 |
| 791 /* odd 2. pixel */ |
| 792 "mtlo %[vector_64], $ac2 \n
\t" /* odd 3 */ |
| 793 "mthi $zero, $ac2 \n
\t" |
| 794 "preceu.ph.qbr %[p1], %[qload2] \n
\t" |
| 795 "preceu.ph.qbl %[p5], %[qload2] \n
\t" |
| 796 "sb %[st2], 0(%[dst]) \n
\t" /* even 8 */ |
| 797 "ulw %[qload1], 13(%[src]) \n
\t" |
| 798 "dpa.w.ph $ac1, %[p2], %[filter12] \n
\t" /* odd 2 */ |
| 799 "dpa.w.ph $ac1, %[p3], %[filter34] \n
\t" /* odd 2 */ |
| 800 "dpa.w.ph $ac1, %[p4], %[filter56] \n
\t" /* odd 2 */ |
| 801 "dpa.w.ph $ac1, %[p1], %[filter78] \n
\t" /* odd 2 */ |
| 802 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 2 */ |
| 803 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 1 */ |
| 804 |
| 805 /* odd 3. pixel */ |
| 806 "mtlo %[vector_64], $ac3 \n
\t" /* odd 4 */ |
| 807 "mthi $zero, $ac3 \n
\t" |
| 808 "preceu.ph.qbr %[p2], %[qload1] \n
\t" |
| 809 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 1 */ |
| 810 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 811 "dpa.w.ph $ac2, %[p3], %[filter12] \n
\t" /* odd 3 */ |
| 812 "dpa.w.ph $ac2, %[p4], %[filter34] \n
\t" /* odd 3 */ |
| 813 "dpa.w.ph $ac2, %[p1], %[filter56] \n
\t" /* odd 3 */ |
| 814 "dpa.w.ph $ac2, %[p5], %[filter78] \n
\t" /* odd 3 */ |
| 815 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 3 */ |
| 816 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 2 */ |
| 817 |
| 818 /* odd 4. pixel */ |
| 819 "mtlo %[vector_64], $ac1 \n
\t" /* odd 5 */ |
| 820 "mthi $zero, $ac1 \n
\t" |
| 821 "preceu.ph.qbl %[p3], %[qload1] \n
\t" |
| 822 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 2 */ |
| 823 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 824 "ulw %[qload2], 17(%[src]) \n
\t" |
| 825 "dpa.w.ph $ac3, %[p4], %[filter12] \n
\t" /* odd 4 */ |
| 826 "dpa.w.ph $ac3, %[p1], %[filter34] \n
\t" /* odd 4 */ |
| 827 "dpa.w.ph $ac3, %[p5], %[filter56] \n
\t" /* odd 4 */ |
| 828 "dpa.w.ph $ac3, %[p2], %[filter78] \n
\t" /* odd 4 */ |
| 829 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 4 */ |
| 830 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 3 */ |
| 831 |
| 832 /* odd 5. pixel */ |
| 833 "mtlo %[vector_64], $ac2 \n
\t" /* odd 6 */ |
| 834 "mthi $zero, $ac2 \n
\t" |
| 835 "preceu.ph.qbr %[p4], %[qload2] \n
\t" |
| 836 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 3 */ |
| 837 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 838 "dpa.w.ph $ac1, %[p1], %[filter12] \n
\t" /* odd 5 */ |
| 839 "dpa.w.ph $ac1, %[p5], %[filter34] \n
\t" /* odd 5 */ |
| 840 "dpa.w.ph $ac1, %[p2], %[filter56] \n
\t" /* odd 5 */ |
| 841 "dpa.w.ph $ac1, %[p3], %[filter78] \n
\t" /* odd 5 */ |
| 842 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 5 */ |
| 843 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 4 */ |
| 844 |
| 845 /* odd 6. pixel */ |
| 846 "mtlo %[vector_64], $ac3 \n
\t" /* odd 7 */ |
| 847 "mthi $zero, $ac3 \n
\t" |
| 848 "preceu.ph.qbl %[p1], %[qload2] \n
\t" |
| 849 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 4 */ |
| 850 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 851 "ulw %[qload1], 21(%[src]) \n
\t" |
| 852 "dpa.w.ph $ac2, %[p5], %[filter12] \n
\t" /* odd 6 */ |
| 853 "dpa.w.ph $ac2, %[p2], %[filter34] \n
\t" /* odd 6 */ |
| 854 "dpa.w.ph $ac2, %[p3], %[filter56] \n
\t" /* odd 6 */ |
| 855 "dpa.w.ph $ac2, %[p4], %[filter78] \n
\t" /* odd 6 */ |
| 856 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 6 */ |
| 857 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 5 */ |
| 858 |
| 859 /* odd 7. pixel */ |
| 860 "mtlo %[vector_64], $ac1 \n
\t" /* odd 8 */ |
| 861 "mthi $zero, $ac1 \n
\t" |
| 862 "preceu.ph.qbr %[p5], %[qload1] \n
\t" |
| 863 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 5 */ |
| 864 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 865 "dpa.w.ph $ac3, %[p2], %[filter12] \n
\t" /* odd 7 */ |
| 866 "dpa.w.ph $ac3, %[p3], %[filter34] \n
\t" /* odd 7 */ |
| 867 "dpa.w.ph $ac3, %[p4], %[filter56] \n
\t" /* odd 7 */ |
| 868 "dpa.w.ph $ac3, %[p1], %[filter78] \n
\t" /* odd 7 */ |
| 869 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 7 */ |
| 870 |
| 871 /* odd 8. pixel */ |
| 872 "dpa.w.ph $ac1, %[p3], %[filter12] \n
\t" /* odd 8 */ |
| 873 "dpa.w.ph $ac1, %[p4], %[filter34] \n
\t" /* odd 8 */ |
| 874 "dpa.w.ph $ac1, %[p1], %[filter56] \n
\t" /* odd 8 */ |
| 875 "dpa.w.ph $ac1, %[p5], %[filter78] \n
\t" /* odd 8 */ |
| 876 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 8 */ |
| 877 |
| 878 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 6 */ |
| 879 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 7 */ |
| 880 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 8 */ |
| 881 |
| 882 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 6 */ |
| 883 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 884 |
| 885 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 7 */ |
| 886 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" |
| 887 |
| 888 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 8 */ |
| 889 |
| 890 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), |
| 891 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), |
| 892 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), |
| 893 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), |
| 894 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) |
| 895 : [filter12] "r" (filter12), [filter34] "r" (filter34), |
| 896 [filter56] "r" (filter56), [filter78] "r" (filter78), |
| 897 [vector_64] "r" (vector_64), [cm] "r" (cm), |
| 898 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) |
| 899 ); |
| 900 |
| 901 src += 16; |
| 902 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); |
| 903 odd_dst = (dst + dst_stride); |
| 904 } |
| 905 |
| 906 /* Next row... */ |
| 907 src_ptr += src_stride; |
| 908 |
| 909 dst_ptr += 1; |
| 910 } |
| 911 } |
| 912 |
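| /* Plain C fallback for block sizes without a dedicated DSPR2 path above; it |
|    writes its output transposed in the same way as the assembly versions. */ |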
| 913 void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, |
| 914 uint8_t *dst, ptrdiff_t dst_stride, |
| 915 const int16_t *filter, int w, int h) { |
| 916 int x, y, k; |
| 917 |
| 918 for (y = 0; y < h; ++y) { |
| 919 for (x = 0; x < w; ++x) { |
| 920 int sum = 0; |
| 921 |
| 922 for (k = 0; k < 8; ++k) |
| 923 sum += src[x + k] * filter[k]; |
| 924 |
| 925 dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); |
| 926 } |
| 927 |
| 928 src += src_stride; |
| 929 dst += 1; |
| 930 } |
| 931 } |
| 932 |
| 933 void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, |
| 934 uint8_t *dst, ptrdiff_t dst_stride, |
| 935 int w, int h) { |
| 936 int x, y; |
| 937 |
| 938 for (y = 0; y < h; ++y) { |
| 939 for (x = 0; x < w; ++x) { |
| 940 dst[x * dst_stride] = src[x]; |
| 941 } |
| 942 |
| 943 src += src_stride; |
| 944 dst += 1; |
| 945 } |
| 946 } |
| 947 |
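| /* The 2-D 8-tap filter is implemented as two transposed horizontal passes: the |
|    first pass filters the (vertically padded) source into temp, which has a |
|    stride of intermediate_height and is written transposed, so the second pass |
|    over temp applies the vertical filter and transposes the block back into dst. */ |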
| 948 void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, |
| 949 uint8_t *dst, ptrdiff_t dst_stride, |
| 950 const int16_t *filter_x, int x_step_q4, |
| 951 const int16_t *filter_y, int y_step_q4, |
| 952 int w, int h) { |
| 953 DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135); |
| 954 int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; |
| 955 uint32_t pos = 38; |
| 956 |
| 957 /* bit position for extract from acc */ |
| 958 __asm__ __volatile__ ( |
| 959 "wrdsp %[pos], 1 \n\t" |
| 960 : |
| 961 : [pos] "r" (pos) |
| 962 ); |
| 963 |
| 964 if (intermediate_height < h) |
| 965 intermediate_height = h; |
| 966 |
| 967 if (x_step_q4 != 16 || y_step_q4 != 16) |
| 968 return vp9_convolve8_c(src, src_stride, |
| 969 dst, dst_stride, |
| 970 filter_x, x_step_q4, |
| 971 filter_y, y_step_q4, |
| 972 w, h); |
| 973 |
| 974 if ((((const int32_t *)filter_x)[1] == 0x800000) |
| 975 && (((const int32_t *)filter_y)[1] == 0x800000)) |
| 976 return vp9_convolve_copy(src, src_stride, |
| 977 dst, dst_stride, |
| 978 filter_x, x_step_q4, |
| 979 filter_y, y_step_q4, |
| 980 w, h); |
| 981 |
| 982 /* first (horizontal) pass: copy when only the centre tap is set (filter_x[3] == 0x80), use the 2-tap path when the first four taps are zero, otherwise the full 8-tap filter */ |
| 983 if (filter_x[3] == 0x80) { |
| 984 copy_horiz_transposed(src - src_stride * 3, src_stride, |
| 985 temp, intermediate_height, |
| 986 w, intermediate_height); |
| 987 } else if (((const int32_t *)filter_x)[0] == 0) { |
| 988 vp9_convolve2_dspr2(src - src_stride * 3, src_stride, |
| 989 temp, intermediate_height, |
| 990 filter_x, |
| 991 w, intermediate_height); |
| 992 } else { |
| 993 src -= (src_stride * 3 + 3); |
| 994 |
| 995 /* prefetch data to cache memory */ |
| 996 vp9_prefetch_load(src); |
| 997 vp9_prefetch_load(src + 32); |
| 998 |
| 999 switch (w) { |
| 1000 case 4: |
| 1001 convolve_horiz_4_transposed_dspr2(src, src_stride, |
| 1002 temp, intermediate_height, |
| 1003 filter_x, intermediate_height); |
| 1004 break; |
| 1005 case 8: |
| 1006 convolve_horiz_8_transposed_dspr2(src, src_stride, |
| 1007 temp, intermediate_height, |
| 1008 filter_x, intermediate_height); |
| 1009 break; |
| 1010 case 16: |
| 1011 case 32: |
| 1012 convolve_horiz_16_transposed_dspr2(src, src_stride, |
| 1013 temp, intermediate_height, |
| 1014 filter_x, intermediate_height, |
| 1015 (w/16)); |
| 1016 break; |
| 1017 case 64: |
| 1018 vp9_prefetch_load(src + 32); |
| 1019 convolve_horiz_64_transposed_dspr2(src, src_stride, |
| 1020 temp, intermediate_height, |
| 1021 filter_x, intermediate_height); |
| 1022 break; |
| 1023 default: |
| 1024 convolve_horiz_transposed(src, src_stride, |
| 1025 temp, intermediate_height, |
| 1026 filter_x, w, intermediate_height); |
| 1027 break; |
| 1028 } |
| 1029 } |
| 1030 |
| 1031 /* second (vertical) pass over the transposed temp: same copy / 2-tap / 8-tap selection, based on filter_y */ |
| 1032 if (filter_y[3] == 0x80) { |
| 1033 copy_horiz_transposed(temp + 3, intermediate_height, |
| 1034 dst, dst_stride, |
| 1035 h, w); |
| 1036 } else if (((const int32_t *)filter_y)[0] == 0) { |
| 1037 vp9_convolve2_dspr2(temp + 3, intermediate_height, |
| 1038 dst, dst_stride, |
| 1039 filter_y, |
| 1040 h, w); |
| 1041 } else { |
| 1042 switch (h) { |
| 1043 case 4: |
| 1044 convolve_horiz_4_transposed_dspr2(temp, intermediate_height, |
| 1045 dst, dst_stride, |
| 1046 filter_y, w); |
| 1047 break; |
| 1048 case 8: |
| 1049 convolve_horiz_8_transposed_dspr2(temp, intermediate_height, |
| 1050 dst, dst_stride, |
| 1051 filter_y, w); |
| 1052 break; |
| 1053 case 16: |
| 1054 case 32: |
| 1055 convolve_horiz_16_transposed_dspr2(temp, intermediate_height, |
| 1056 dst, dst_stride, |
| 1057 filter_y, w, (h/16)); |
| 1058 break; |
| 1059 case 64: |
| 1060 convolve_horiz_64_transposed_dspr2(temp, intermediate_height, |
| 1061 dst, dst_stride, |
| 1062 filter_y, w); |
| 1063 break; |
| 1064 default: |
| 1065 convolve_horiz_transposed(temp, intermediate_height, |
| 1066 dst, dst_stride, |
| 1067 filter_y, h, w); |
| 1068 break; |
| 1069 } |
| 1070 } |
| 1071 } |
| 1072 |
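| /* Straight copy specialised per block width, using unaligned word loads and |
|    aligned word stores with prefetching; the filter arguments are unused. */ |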
| 1073 void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, |
| 1074 uint8_t *dst, ptrdiff_t dst_stride, |
| 1075 const int16_t *filter_x, int filter_x_stride, |
| 1076 const int16_t *filter_y, int filter_y_stride, |
| 1077 int w, int h) { |
| 1078 int x, y; |
| 1079 |
| 1080 /* prefetch data to cache memory */ |
| 1081 vp9_prefetch_load(src); |
| 1082 vp9_prefetch_load(src + 32); |
| 1083 vp9_prefetch_store(dst); |
| 1084 |
| 1085 switch (w) { |
| 1086 case 4: |
| 1087 { |
| 1088 uint32_t tp1; |
| 1089 |
| 1090 /* 1 word storage */ |
| 1091 for (y = h; y--; ) { |
| 1092 vp9_prefetch_load(src + src_stride); |
| 1093 vp9_prefetch_load(src + src_stride + 32); |
| 1094 vp9_prefetch_store(dst + dst_stride); |
| 1095 |
| 1096 __asm__ __volatile__ ( |
| 1097 "ulw %[tp1], (%[src]) \n\t" |
| 1098 "sw %[tp1], (%[dst]) \n\t" /* store */ |
| 1099 |
| 1100 : [tp1] "=&r" (tp1) |
| 1101 : [src] "r" (src), [dst] "r" (dst) |
| 1102 ); |
| 1103 |
| 1104 src += src_stride; |
| 1105 dst += dst_stride; |
| 1106 } |
| 1107 } |
| 1108 break; |
| 1109 case 8: |
| 1110 { |
| 1111 uint32_t tp1, tp2; |
| 1112 |
| 1113 /* 2 word storage */ |
| 1114 for (y = h; y--; ) { |
| 1115 vp9_prefetch_load(src + src_stride); |
| 1116 vp9_prefetch_load(src + src_stride + 32); |
| 1117 vp9_prefetch_store(dst + dst_stride); |
| 1118 |
| 1119 __asm__ __volatile__ ( |
| 1120 "ulw %[tp1], 0(%[src]) \n\t" |
| 1121 "ulw %[tp2], 4(%[src]) \n\t" |
| 1122 "sw %[tp1], 0(%[dst]) \n\t" /* store */ |
| 1123 "sw %[tp2], 4(%[dst]) \n\t" /* store */ |
| 1124 |
| 1125 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2) |
| 1126 : [src] "r" (src), [dst] "r" (dst) |
| 1127 ); |
| 1128 |
| 1129 src += src_stride; |
| 1130 dst += dst_stride; |
| 1131 } |
| 1132 } |
| 1133 break; |
| 1134 case 16: |
| 1135 { |
| 1136 uint32_t tp1, tp2, tp3, tp4; |
| 1137 |
| 1138 /* 4 word storage */ |
| 1139 for (y = h; y--; ) { |
| 1140 vp9_prefetch_load(src + src_stride); |
| 1141 vp9_prefetch_load(src + src_stride + 32); |
| 1142 vp9_prefetch_store(dst + dst_stride); |
| 1143 |
| 1144 __asm__ __volatile__ ( |
| 1145 "ulw %[tp1], 0(%[src]) \n\t" |
| 1146 "ulw %[tp2], 4(%[src]) \n\t" |
| 1147 "ulw %[tp3], 8(%[src]) \n\t" |
| 1148 "ulw %[tp4], 12(%[src]) \n\t" |
| 1149 |
| 1150 "sw %[tp1], 0(%[dst]) \n\t" /* store */ |
| 1151 "sw %[tp2], 4(%[dst]) \n\t" /* store */ |
| 1152 "sw %[tp3], 8(%[dst]) \n\t" /* store */ |
| 1153 "sw %[tp4], 12(%[dst]) \n\t" /* store */ |
| 1154 |
| 1155 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), |
| 1156 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4) |
| 1157 : [src] "r" (src), [dst] "r" (dst) |
| 1158 ); |
| 1159 |
| 1160 src += src_stride; |
| 1161 dst += dst_stride; |
| 1162 } |
| 1163 } |
| 1164 break; |
| 1165 case 32: |
| 1166 { |
| 1167 uint32_t tp1, tp2, tp3, tp4; |
| 1168 uint32_t tp5, tp6, tp7, tp8; |
| 1169 |
| 1170 /* 8 word storage */ |
| 1171 for (y = h; y--; ) { |
| 1172 vp9_prefetch_load(src + src_stride); |
| 1173 vp9_prefetch_load(src + src_stride + 32); |
| 1174 vp9_prefetch_store(dst + dst_stride); |
| 1175 |
| 1176 __asm__ __volatile__ ( |
| 1177 "ulw %[tp1], 0(%[src]) \n\t" |
| 1178 "ulw %[tp2], 4(%[src]) \n\t" |
| 1179 "ulw %[tp3], 8(%[src]) \n\t" |
| 1180 "ulw %[tp4], 12(%[src]) \n\t" |
| 1181 "ulw %[tp5], 16(%[src]) \n\t" |
| 1182 "ulw %[tp6], 20(%[src]) \n\t" |
| 1183 "ulw %[tp7], 24(%[src]) \n\t" |
| 1184 "ulw %[tp8], 28(%[src]) \n\t" |
| 1185 |
| 1186 "sw %[tp1], 0(%[dst]) \n\t" /* store */ |
| 1187 "sw %[tp2], 4(%[dst]) \n\t" /* store */ |
| 1188 "sw %[tp3], 8(%[dst]) \n\t" /* store */ |
| 1189 "sw %[tp4], 12(%[dst]) \n\t" /* store */ |
| 1190 "sw %[tp5], 16(%[dst]) \n\t" /* store */ |
| 1191 "sw %[tp6], 20(%[dst]) \n\t" /* store */ |
| 1192 "sw %[tp7], 24(%[dst]) \n\t" /* store */ |
| 1193 "sw %[tp8], 28(%[dst]) \n\t" /* store */ |
| 1194 |
| 1195 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), |
| 1196 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), |
| 1197 [tp5] "=&r" (tp5), [tp6] "=&r" (tp6), |
| 1198 [tp7] "=&r" (tp7), [tp8] "=&r" (tp8) |
| 1199 : [src] "r" (src), [dst] "r" (dst) |
| 1200 ); |
| 1201 |
| 1202 src += src_stride; |
| 1203 dst += dst_stride; |
| 1204 } |
| 1205 } |
| 1206 break; |
| 1207 case 64: |
| 1208 { |
| 1209 uint32_t tp1, tp2, tp3, tp4; |
| 1210 uint32_t tp5, tp6, tp7, tp8; |
| 1211 |
| 1212 vp9_prefetch_load(src + 64); |
| 1213 vp9_prefetch_store(dst + 32); |
| 1214 |
| 1215 /* 16 word storage */ |
| 1216 for (y = h; y--; ) { |
| 1217 vp9_prefetch_load(src + src_stride); |
| 1218 vp9_prefetch_load(src + src_stride + 32); |
| 1219 vp9_prefetch_load(src + src_stride + 64); |
| 1220 vp9_prefetch_store(dst + dst_stride); |
| 1221 vp9_prefetch_store(dst + dst_stride + 32); |
| 1222 |
| 1223 __asm__ __volatile__ ( |
| 1224 "ulw %[tp1], 0(%[src]) \n\t" |
| 1225 "ulw %[tp2], 4(%[src]) \n\t" |
| 1226 "ulw %[tp3], 8(%[src]) \n\t" |
| 1227 "ulw %[tp4], 12(%[src]) \n\t" |
| 1228 "ulw %[tp5], 16(%[src]) \n\t" |
| 1229 "ulw %[tp6], 20(%[src]) \n\t" |
| 1230 "ulw %[tp7], 24(%[src]) \n\t" |
| 1231 "ulw %[tp8], 28(%[src]) \n\t" |
| 1232 |
| 1233 "sw %[tp1], 0(%[dst]) \n\t" /* store */ |
| 1234 "sw %[tp2], 4(%[dst]) \n\t" /* store */ |
| 1235 "sw %[tp3], 8(%[dst]) \n\t" /* store */ |
| 1236 "sw %[tp4], 12(%[dst]) \n\t" /* store */ |
| 1237 "sw %[tp5], 16(%[dst]) \n\t" /* store */ |
| 1238 "sw %[tp6], 20(%[dst]) \n\t" /* store */ |
| 1239 "sw %[tp7], 24(%[dst]) \n\t" /* store */ |
| 1240 "sw %[tp8], 28(%[dst]) \n\t" /* store */ |
| 1241 |
| 1242 "ulw %[tp1], 32(%[src]) \n\t" |
| 1243 "ulw %[tp2], 36(%[src]) \n\t" |
| 1244 "ulw %[tp3], 40(%[src]) \n\t" |
| 1245 "ulw %[tp4], 44(%[src]) \n\t" |
| 1246 "ulw %[tp5], 48(%[src]) \n\t" |
| 1247 "ulw %[tp6], 52(%[src]) \n\t" |
| 1248 "ulw %[tp7], 56(%[src]) \n\t" |
| 1249 "ulw %[tp8], 60(%[src]) \n\t" |
| 1250 |
| 1251 "sw %[tp1], 32(%[dst]) \n\t" /* store */ |
| 1252 "sw %[tp2], 36(%[dst]) \n\t" /* store */ |
| 1253 "sw %[tp3], 40(%[dst]) \n\t" /* store */ |
| 1254 "sw %[tp4], 44(%[dst]) \n\t" /* store */ |
| 1255 "sw %[tp5], 48(%[dst]) \n\t" /* store */ |
| 1256 "sw %[tp6], 52(%[dst]) \n\t" /* store */ |
| 1257 "sw %[tp7], 56(%[dst]) \n\t" /* store */ |
| 1258 "sw %[tp8], 60(%[dst]) \n\t" /* store */ |
| 1259 |
| 1260 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), |
| 1261 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), |
| 1262 [tp5] "=&r" (tp5), [tp6] "=&r" (tp6), |
| 1263 [tp7] "=&r" (tp7), [tp8] "=&r" (tp8) |
| 1264 : [src] "r" (src), [dst] "r" (dst) |
| 1265 ); |
| 1266 |
| 1267 src += src_stride; |
| 1268 dst += dst_stride; |
| 1269 } |
| 1270 } |
| 1271 break; |
| 1272 default: |
| 1273 for (y = h; y--; ) { |
| 1274 for (x = 0; x < w; ++x) { |
| 1275 dst[x] = src[x]; |
| 1276 } |
| 1277 |
| 1278 src += src_stride; |
| 1279 dst += dst_stride; |
| 1280 } |
| 1281 break; |
| 1282 } |
| 1283 } |
| 1284 #endif |