| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license | |
| 5 * that can be found in the LICENSE file in the root of the source | |
| 6 * tree. An additional intellectual property rights grant can be found | |
| 7 * in the file PATENTS. All contributing project authors may | |
| 8 * be found in the AUTHORS file in the root of the source tree. | |
| 9 */ | |
| 10 | |
| 11 #include <assert.h> | |
| 12 #include <stdio.h> | |
| 13 | |
| 14 #include "./vpx_dsp_rtcd.h" | |
| 15 #include "vpx_dsp/mips/vpx_common_dspr2.h" | |
| 16 #include "vpx_dsp/vpx_dsp_common.h" | |
| 17 #include "vpx_dsp/vpx_filter.h" | |
| 18 #include "vpx_ports/mem.h" | |
| 19 | |
| 20 #if HAVE_DSPR2 | |
| 21 /* Backing storage for the clamp lookup table. After
 * vpx_dsputil_static_init() it holds CROP_WIDTH zeros, then the identity
 * ramp 0..255, then CROP_WIDTH entries of 255. */ uint8_t vpx_ff_cropTbl_a[256 + 2 * CROP_WIDTH]; | |
| 22 /* Set by vpx_dsputil_static_init() to &vpx_ff_cropTbl_a[CROP_WIDTH], so
 * vpx_ff_cropTbl[v] yields v clamped to [0, 255] for any index v in
 * [-CROP_WIDTH, 256 + CROP_WIDTH). */ uint8_t *vpx_ff_cropTbl; | |
| 23 | |
| 24 /* Fills vpx_ff_cropTbl_a and points vpx_ff_cropTbl at its identity part.
 * Takes no arguments and returns nothing; it only writes the two globals
 * above. */ void vpx_dsputil_static_init(void) { | |
| 25 int i; | |
| 26 | |
| 27 /* identity part: table entry CROP_WIDTH + i maps to i */ for (i = 0; i < 256; i++) vpx_ff_cropTbl_a[i + CROP_WIDTH] = i; | |
| 28 | |
| 29 /* guard bands on both sides of the identity part */ for (i = 0; i < CROP_WIDTH; i++) { | |
| 30 vpx_ff_cropTbl_a[i] = 0; /* below-range indices clamp to 0 */ | |
| 31 vpx_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255; /* above-range indices clamp to 255 */ | |
| 32 } | |
| 33 | |
| 34 vpx_ff_cropTbl = &vpx_ff_cropTbl_a[CROP_WIDTH]; /* index 0 of the public pointer is value 0 */ | |
| 35 } | |
| 36 | |
| 37 /* Horizontal 8-tap filter of a 4-pixel-wide block, written transposed.
 * The four outputs computed from one source row are stored to dst[0],
 * dst[dst_stride], dst[2 * dst_stride] and dst[3 * dst_stride]; each
 * following source row advances dst by one byte.
 *
 * src, src_stride: source rows; the asm loads 12 bytes (offsets 0..11) of
 *                  every row.
 * dst, dst_stride: destination, written one byte at a time.
 * filter_x0:       taps, read below as four 32-bit words.
 * h:               number of source rows to process.
 */ static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, | |
| 38 int32_t src_stride, | |
| 39 uint8_t *dst, | |
| 40 int32_t dst_stride, | |
| 41 const int16_t *filter_x0, | |
| 42 int32_t h) { | |
| 43 int32_t y; | |
| 44 uint8_t *cm = vpx_ff_cropTbl; /* table indexed by lbux below to get the byte that is stored */ | |
| 45 uint8_t *dst_ptr; | |
| 46 int32_t vector1b, vector2b, vector3b, vector4b; | |
| 47 int32_t Temp1, Temp2, Temp3, Temp4; | |
| 48 uint32_t vector4a = 64; /* value every accumulator is preset to (mtlo) before the taps are added; presumably the rounding term -- confirm against the DSPControl setup done by the caller */ | |
| 49 uint32_t tp1, tp2; | |
| 50 uint32_t p1, p2, p3, p4; | |
| 51 uint32_t tn1, tn2; | |
| 52 /* Each 32-bit word below carries two adjacent taps for dpa.w.ph.
 * NOTE(review): the int16_t* -> int32_t* cast assumes filter_x0 is 4-byte
 * aligned -- confirm at the callers. */ | |
| 53 vector1b = ((const int32_t *)filter_x0)[0]; | |
| 54 vector2b = ((const int32_t *)filter_x0)[1]; | |
| 55 vector3b = ((const int32_t *)filter_x0)[2]; | |
| 56 vector4b = ((const int32_t *)filter_x0)[3]; | |
| 57 | |
| 58 for (y = h; y--;) { | |
| 59 dst_ptr = dst; | |
| 60 /* prefetch data of the next source row (src + src_stride) to cache memory */ | |
| 61 prefetch_load(src + src_stride); | |
| 62 prefetch_load(src + src_stride + 32); | |
| 63 | |
| 64 __asm__ __volatile__ ( | |
| 65 "ulw %[tp1], 0(%[src]) \n\t" | |
| 66 "ulw %[tp2], 4(%[src]) \n\t" | |
| 67 | |
| 68 /* even 1. pixel: taps applied to the words loaded at offsets 0 and 4 */ | |
| 69 "mtlo %[vector4a], $ac3 \n\t" | |
| 70 "mthi $zero, $ac3 \n\t" | |
| 71 "preceu.ph.qbr %[p1], %[tp1] \n\t" | |
| 72 "preceu.ph.qbl %[p2], %[tp1] \n\t" | |
| 73 "preceu.ph.qbr %[p3], %[tp2] \n\t" | |
| 74 "preceu.ph.qbl %[p4], %[tp2] \n\t" | |
| 75 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" | |
| 76 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" | |
| 77 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" | |
| 78 "ulw %[tn2], 8(%[src]) \n\t" | |
| 79 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" | |
| 80 "extp %[Temp1], $ac3, 31 \n\t" | |
| 81 | |
| 82 /* even 2. pixel: reuses p2..p4 and adds the first half of the word at offset 8 */ | |
| 83 "mtlo %[vector4a], $ac2 \n\t" | |
| 84 "mthi $zero, $ac2 \n\t" | |
| 85 "preceu.ph.qbr %[p1], %[tn2] \n\t" | |
| 86 "balign %[tn1], %[tn2], 3 \n\t" | |
| 87 "balign %[tn2], %[tp2], 3 \n\t" | |
| 88 "balign %[tp2], %[tp1], 3 \n\t" | |
| 89 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" | |
| 90 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" | |
| 91 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" | |
| 92 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" | |
| 93 "extp %[Temp3], $ac2, 31 \n\t" | |
| 94 | |
| 95 /* odd 1. pixel: works on the balign-ed copies of the loaded words (presumably the same data shifted by one byte -- see the MIPS DSP ASE description of balign) */ | |
| 96 "lbux %[tp1], %[Temp1](%[cm]) \n\t" | |
| 97 "mtlo %[vector4a], $ac3 \n\t" | |
| 98 "mthi $zero, $ac3 \n\t" | |
| 99 "preceu.ph.qbr %[p1], %[tp2] \n\t" | |
| 100 "preceu.ph.qbl %[p2], %[tp2] \n\t" | |
| 101 "preceu.ph.qbr %[p3], %[tn2] \n\t" | |
| 102 "preceu.ph.qbl %[p4], %[tn2] \n\t" | |
| 103 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" | |
| 104 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" | |
| 105 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" | |
| 106 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" | |
| 107 "extp %[Temp2], $ac3, 31 \n\t" | |
| 108 | |
| 109 /* odd 2. pixel */ | |
| 110 "lbux %[tp2], %[Temp3](%[cm]) \n\t" | |
| 111 "mtlo %[vector4a], $ac2 \n\t" | |
| 112 "mthi $zero, $ac2 \n\t" | |
| 113 "preceu.ph.qbr %[p1], %[tn1] \n\t" | |
| 114 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" | |
| 115 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" | |
| 116 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" | |
| 117 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" | |
| 118 "extp %[Temp4], $ac2, 31 \n\t" | |
| 119 | |
| 120 /* clamp: look the two odd results up in the cm table */ | |
| 121 "lbux %[tn1], %[Temp2](%[cm]) \n\t" | |
| 122 "lbux %[p2], %[Temp4](%[cm]) \n\t" | |
| 123 | |
| 124 /* store bytes in output order: even 1, odd 1, even 2, odd 2, one destination row apart */ | |
| 125 "sb %[tp1], 0(%[dst_ptr]) \n\t" | |
| 126 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" | |
| 127 | |
| 128 "sb %[tn1], 0(%[dst_ptr]) \n\t" | |
| 129 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" | |
| 130 | |
| 131 "sb %[tp2], 0(%[dst_ptr]) \n\t" | |
| 132 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" | |
| 133 | |
| 134 "sb %[p2], 0(%[dst_ptr]) \n\t" | |
| 135 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" | |
| 136 | |
| 137 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (
tn2), | |
| 138 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
| 139 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [
Temp4] "=&r" (Temp4), | |
| 140 [dst_ptr] "+r" (dst_ptr) | |
| 141 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), | |
| 142 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), | |
| 143 [vector4a] "r" (vector4a), | |
| 144 [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride) | |
| 145 /* NOTE(review): there is no clobber list although the asm stores through
 * dst_ptr and writes $ac2/$ac3 -- confirm whether "memory" and accumulator
 * clobbers are required by the toolchain in use. */ ); | |
| 146 | |
| 147 /* Next row... */ | |
| 148 src += src_stride; | |
| 149 dst += 1; /* transposed output: the next source row fills the next destination column */ | |
| 150 } | |
| 151 } | |
| 152 | |
| 153 /* Horizontal 8-tap filter of an 8-pixel-wide block, written transposed.
 * The eight outputs computed from one source row go to eight consecutive
 * destination rows: the even outputs are stored through dst_ptr and the odd
 * outputs through odd_dst (which starts one row lower), both stepping by
 * 2 * dst_stride. Each following source row advances dst by one byte.
 *
 * src, src_stride: source rows; the asm loads 17 bytes (offsets 0..16) of
 *                  every row.
 * dst, dst_stride: destination, written one byte at a time.
 * filter_x0:       taps, read below as four 32-bit words.
 * h:               number of source rows to process.
 */ static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, | |
| 154 int32_t src_stride, | |
| 155 uint8_t *dst, | |
| 156 int32_t dst_stride, | |
| 157 const int16_t *filter_x0, | |
| 158 int32_t h) { | |
| 159 int32_t y; | |
| 160 uint8_t *cm = vpx_ff_cropTbl; /* table indexed by lbux below to get the byte that is stored */ | |
| 161 uint8_t *dst_ptr; | |
| 162 uint32_t vector4a = 64; /* value every accumulator is preset to (mtlo) before the taps are added; presumably the rounding term -- confirm against the DSPControl setup done by the caller */ | |
| 163 int32_t vector1b, vector2b, vector3b, vector4b; | |
| 164 int32_t Temp1, Temp2, Temp3; | |
| 165 uint32_t tp1, tp2, tp3; | |
| 166 uint32_t p1, p2, p3, p4, n1; | |
| 167 uint8_t *odd_dst; | |
| 168 uint32_t dst_pitch_2 = (dst_stride << 1); /* distance between two even (or two odd) output rows */ | |
| 169 /* Each 32-bit word below carries two adjacent taps for dpa.w.ph.
 * NOTE(review): the int16_t* -> int32_t* cast assumes filter_x0 is 4-byte
 * aligned -- confirm at the callers. */ | |
| 170 vector1b = ((const int32_t *)filter_x0)[0]; | |
| 171 vector2b = ((const int32_t *)filter_x0)[1]; | |
| 172 vector3b = ((const int32_t *)filter_x0)[2]; | |
| 173 vector4b = ((const int32_t *)filter_x0)[3]; | |
| 174 | |
| 175 for (y = h; y--;) { | |
| 176 /* prefetch data of the next source row (src + src_stride) to cache memory */ | |
| 177 prefetch_load(src + src_stride); | |
| 178 prefetch_load(src + src_stride + 32); | |
| 179 | |
| 180 dst_ptr = dst; | |
| 181 odd_dst = (dst_ptr + dst_stride); /* odd outputs start one destination row below the even ones */ | |
| 182 | |
| 183 __asm__ __volatile__ ( | |
| 184 "ulw %[tp2], 0(%[src]) \n\t" | |
| 185 "ulw %[tp1], 4(%[src]) \n\t" | |
| 186 | |
| 187 /* even 1. pixel ($ac3); $ac2 is preset here as well for even 2 */ | |
| 188 "mtlo %[vector4a], $ac3 \n\t" | |
| 189 "mthi $zero, $ac3 \n\t" | |
| 190 "mtlo %[vector4a], $ac2 \n\t" | |
| 191 "mthi $zero, $ac2 \n\t" | |
| 192 "preceu.ph.qbr %[p1], %[tp2] \n\t" | |
| 193 "preceu.ph.qbl %[p2], %[tp2] \n\t" | |
| 194 "preceu.ph.qbr %[p3], %[tp1] \n\t" | |
| 195 "preceu.ph.qbl %[p4], %[tp1] \n\t" | |
| 196 "ulw %[tp3], 8(%[src]) \n\t" | |
| 197 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" | |
| 198 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" | |
| 199 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" | |
| 200 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" | |
| 201 "extp %[Temp1], $ac3, 31 \n\t" | |
| 202 | |
| 203 /* even 2. pixel ($ac2) */ | |
| 204 "preceu.ph.qbr %[p1], %[tp3] \n\t" | |
| 205 "preceu.ph.qbl %[n1], %[tp3] \n\t" | |
| 206 "ulw %[tp2], 12(%[src]) \n\t" | |
| 207 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" | |
| 208 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" | |
| 209 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" | |
| 210 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" | |
| 211 "extp %[Temp3], $ac2, 31 \n\t" | |
| 212 | |
| 213 /* even 3. pixel ($ac1); the result is extracted into p3 */ | |
| 214 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" | |
| 215 "mtlo %[vector4a], $ac1 \n\t" | |
| 216 "mthi $zero, $ac1 \n\t" | |
| 217 "preceu.ph.qbr %[p2], %[tp2] \n\t" | |
| 218 "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" | |
| 219 "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" | |
| 220 "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" | |
| 221 "lbux %[tp3], %[Temp3](%[cm]) \n\t" | |
| 222 "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" | |
| 223 "extp %[p3], $ac1, 31 \n\t" | |
| 224 | |
| 225 /* even 4. pixel ($ac2); even 1 and even 2 are stored and the odd-offset words are loaded meanwhile */ | |
| 226 "mtlo %[vector4a], $ac2 \n\t" | |
| 227 "mthi $zero, $ac2 \n\t" | |
| 228 "mtlo %[vector4a], $ac3 \n\t" | |
| 229 "mthi $zero, $ac3 \n\t" | |
| 230 "sb %[Temp2], 0(%[dst_ptr]) \n\t" | |
| 231 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" | |
| 232 "sb %[tp3], 0(%[dst_ptr]) \n\t" | |
| 233 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" | |
| 234 | |
| 235 "ulw %[tp1], 1(%[src]) \n\t" | |
| 236 "ulw %[tp3], 5(%[src]) \n\t" | |
| 237 | |
| 238 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" | |
| 239 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" | |
| 240 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" | |
| 241 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" | |
| 242 "extp %[Temp3], $ac2, 31 \n\t" | |
| 243 | |
| 244 "lbux %[tp2], %[p3](%[cm]) \n\t" | |
| 245 | |
| 246 /* odd 1. pixel ($ac3, preset above); even 3 is stored here */ | |
| 247 "mtlo %[vector4a], $ac1 \n\t" | |
| 248 "mthi $zero, $ac1 \n\t" | |
| 249 "preceu.ph.qbr %[p1], %[tp1] \n\t" | |
| 250 "preceu.ph.qbl %[p2], %[tp1] \n\t" | |
| 251 "preceu.ph.qbr %[p3], %[tp3] \n\t" | |
| 252 "preceu.ph.qbl %[p4], %[tp3] \n\t" | |
| 253 "sb %[tp2], 0(%[dst_ptr]) \n\t" | |
| 254 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" | |
| 255 "ulw %[tp2], 9(%[src]) \n\t" | |
| 256 | |
| 257 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" | |
| 258 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" | |
| 259 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" | |
| 260 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" | |
| 261 "extp %[Temp2], $ac3, 31 \n\t" | |
| 262 | |
| 263 /* odd 2. pixel ($ac1); even 4 is stored here */ | |
| 264 "lbux %[tp1], %[Temp3](%[cm]) \n\t" | |
| 265 "mtlo %[vector4a], $ac3 \n\t" | |
| 266 "mthi $zero, $ac3 \n\t" | |
| 267 "mtlo %[vector4a], $ac2 \n\t" | |
| 268 "mthi $zero, $ac2 \n\t" | |
| 269 "preceu.ph.qbr %[p1], %[tp2] \n\t" | |
| 270 "preceu.ph.qbl %[n1], %[tp2] \n\t" | |
| 271 "ulw %[Temp1], 13(%[src]) \n\t" | |
| 272 "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" | |
| 273 "sb %[tp1], 0(%[dst_ptr]) \n\t" | |
| 274 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" | |
| 275 "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" | |
| 276 "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" | |
| 277 "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" | |
| 278 "extp %[Temp3], $ac1, 31 \n\t" | |
| 279 | |
| 280 /* odd 3. pixel ($ac3) */ | |
| 281 "lbux %[tp3], %[Temp2](%[cm]) \n\t" | |
| 282 "preceu.ph.qbr %[p2], %[Temp1] \n\t" | |
| 283 "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" | |
| 284 "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" | |
| 285 "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" | |
| 286 "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" | |
| 287 "extp %[Temp2], $ac3, 31 \n\t" | |
| 288 | |
| 289 /* odd 4. pixel ($ac2); odd 1 is stored here */ | |
| 290 "sb %[tp3], 0(%[odd_dst]) \n\t" | |
| 291 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" | |
| 292 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" | |
| 293 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" | |
| 294 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" | |
| 295 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" | |
| 296 "extp %[Temp1], $ac2, 31 \n\t" | |
| 297 | |
| 298 /* clamp: look odd 2, odd 3 and odd 4 up in the cm table */ | |
| 299 "lbux %[p4], %[Temp3](%[cm]) \n\t" | |
| 300 "lbux %[p2], %[Temp2](%[cm]) \n\t" | |
| 301 "lbux %[n1], %[Temp1](%[cm]) \n\t" | |
| 302 | |
| 303 /* store bytes: odd 2, odd 3, odd 4 */ | |
| 304 "sb %[p4], 0(%[odd_dst]) \n\t" | |
| 305 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" | |
| 306 | |
| 307 "sb %[p2], 0(%[odd_dst]) \n\t" | |
| 308 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" | |
| 309 | |
| 310 "sb %[n1], 0(%[odd_dst]) \n\t" | |
| 311 | |
| 312 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), | |
| 313 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
| 314 [n1] "=&r" (n1), | |
| 315 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), | |
| 316 [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst) | |
| 317 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), | |
| 318 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), | |
| 319 [vector4a] "r" (vector4a), [cm] "r" (cm), | |
| 320 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) | |
| 321 /* NOTE(review): there is no clobber list although the asm stores through
 * dst_ptr/odd_dst and writes $ac1..$ac3 -- confirm whether "memory" and
 * accumulator clobbers are required by the toolchain in use. */ ); | |
| 322 | |
| 323 /* Next row... */ | |
| 324 src += src_stride; | |
| 325 dst += 1; /* transposed output: the next source row fills the next destination column */ | |
| 326 } | |
| 327 } | |
| 328 | |
| 329 /* Horizontal 8-tap filter of a block that is 16 * count pixels wide,
 * written transposed. For each of the h source rows the inner loop runs
 * count times; every pass loads 25 bytes (offsets 0..24) from src, computes
 * 16 outputs and stores them to 16 consecutive destination rows (even
 * outputs through dst, odd outputs through odd_dst, both stepping by
 * 2 * dst_stride). After a source row is finished dst_ptr advances by one
 * byte.
 *
 * The instruction stream is interleaved: the trailing comment on an asm
 * line names the output pixel that instruction belongs to, which need not
 * match the section heading above it.
 *
 * src_ptr, src_stride: source rows.
 * dst_ptr, dst_stride: destination, written one byte at a time.
 * filter_x0:           taps, read below as four 32-bit words.
 * h:                   number of source rows to process.
 * count:               number of 16-pixel groups per source row.
 */ static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr, | |
| 330 int32_t src_stride, | |
| 331 uint8_t *dst_ptr, | |
| 332 int32_t dst_stride, | |
| 333 const int16_t *filter_x0, | |
| 334 int32_t h, | |
| 335 int32_t count) { | |
| 336 int32_t c, y; | |
| 337 const uint8_t *src; | |
| 338 uint8_t *dst; | |
| 339 uint8_t *cm = vpx_ff_cropTbl; /* table indexed by lbux below to get the byte that is stored */ | |
| 340 uint32_t vector_64 = 64; /* value every accumulator is preset to (mtlo) before the taps are added; presumably the rounding term -- confirm against the DSPControl setup done by the caller */ | |
| 341 int32_t filter12, filter34, filter56, filter78; | |
| 342 int32_t Temp1, Temp2, Temp3; | |
| 343 uint32_t qload1, qload2; | |
| 344 uint32_t p1, p2, p3, p4, p5; | |
| 345 uint32_t st1, st2, st3; | |
| 346 uint32_t dst_pitch_2 = (dst_stride << 1); /* distance between two even (or two odd) output rows */ | |
| 347 uint8_t *odd_dst; | |
| 348 /* Each 32-bit word below carries two adjacent taps for dpa.w.ph.
 * NOTE(review): the int16_t* -> int32_t* cast assumes filter_x0 is 4-byte
 * aligned -- confirm at the callers. */ | |
| 349 filter12 = ((const int32_t *)filter_x0)[0]; | |
| 350 filter34 = ((const int32_t *)filter_x0)[1]; | |
| 351 filter56 = ((const int32_t *)filter_x0)[2]; | |
| 352 filter78 = ((const int32_t *)filter_x0)[3]; | |
| 353 | |
| 354 for (y = h; y--;) { | |
| 355 /* prefetch data of the next source row (src_ptr + src_stride) to cache memory */ | |
| 356 prefetch_load(src_ptr + src_stride); | |
| 357 prefetch_load(src_ptr + src_stride + 32); | |
| 358 | |
| 359 src = src_ptr; | |
| 360 dst = dst_ptr; | |
| 361 | |
| 362 odd_dst = (dst + dst_stride); /* odd outputs start one destination row below the even ones */ | |
| 363 | |
| 364 for (c = 0; c < count; c++) { | |
| 365 __asm__ __volatile__ ( | |
| 366 "ulw %[qload1], 0(%[src]) \n
\t" | |
| 367 "ulw %[qload2], 4(%[src]) \n
\t" | |
| 368 | |
| 369 /* even 1. pixel */ | |
| 370 "mtlo %[vector_64], $ac1 \n
\t" /* even 1 */ | |
| 371 "mthi $zero, $ac1 \n
\t" | |
| 372 "mtlo %[vector_64], $ac2 \n
\t" /* even 2 */ | |
| 373 "mthi $zero, $ac2 \n
\t" | |
| 374 "preceu.ph.qbr %[p3], %[qload2] \n
\t" | |
| 375 "preceu.ph.qbl %[p4], %[qload2] \n
\t" | |
| 376 "preceu.ph.qbr %[p1], %[qload1] \n
\t" | |
| 377 "preceu.ph.qbl %[p2], %[qload1] \n
\t" | |
| 378 "ulw %[qload2], 8(%[src]) \n
\t" | |
| 379 "dpa.w.ph $ac1, %[p1], %[filter12] \n
\t" /* even 1 */ | |
| 380 "dpa.w.ph $ac1, %[p2], %[filter34] \n
\t" /* even 1 */ | |
| 381 "dpa.w.ph $ac1, %[p3], %[filter56] \n
\t" /* even 1 */ | |
| 382 "dpa.w.ph $ac1, %[p4], %[filter78] \n
\t" /* even 1 */ | |
| 383 "extp %[Temp1], $ac1, 31 \n
\t" /* even 1 */ | |
| 384 | |
| 385 /* even 2. pixel */ | |
| 386 "mtlo %[vector_64], $ac3 \n
\t" /* even 3 */ | |
| 387 "mthi $zero, $ac3 \n
\t" | |
| 388 "preceu.ph.qbr %[p1], %[qload2] \n
\t" | |
| 389 "preceu.ph.qbl %[p5], %[qload2] \n
\t" | |
| 390 "ulw %[qload1], 12(%[src]) \n
\t" | |
| 391 "dpa.w.ph $ac2, %[p2], %[filter12] \n
\t" /* even 2 */ | |
| 392 "dpa.w.ph $ac2, %[p3], %[filter34] \n
\t" /* even 2 */ | |
| 393 "dpa.w.ph $ac2, %[p4], %[filter56] \n
\t" /* even 2 */ | |
| 394 "dpa.w.ph $ac2, %[p1], %[filter78] \n
\t" /* even 2 */ | |
| 395 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 1 */ | |
| 396 "extp %[Temp2], $ac2, 31 \n
\t" /* even 2 */ | |
| 397 | |
| 398 /* even 3. pixel */ | |
| 399 "mtlo %[vector_64], $ac1 \n
\t" /* even 4 */ | |
| 400 "mthi $zero, $ac1 \n
\t" | |
| 401 "preceu.ph.qbr %[p2], %[qload1] \n
\t" | |
| 402 "sb %[st1], 0(%[dst]) \n
\t" /* even 1 */ | |
| 403 "addu %[dst], %[dst], %[dst_pitch_2]
\n\t" | |
| 404 "dpa.w.ph $ac3, %[p3], %[filter12] \n
\t" /* even 3 */ | |
| 405 "dpa.w.ph $ac3, %[p4], %[filter34] \n
\t" /* even 3 */ | |
| 406 "dpa.w.ph $ac3, %[p1], %[filter56] \n
\t" /* even 3 */ | |
| 407 "dpa.w.ph $ac3, %[p5], %[filter78] \n
\t" /* even 3 */ | |
| 408 "extp %[Temp3], $ac3, 31 \n
\t" /* even 3 */ | |
| 409 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 2 */ | |
| 410 | |
| 411 /* even 4. pixel */ | |
| 412 "mtlo %[vector_64], $ac2 \n
\t" /* even 5 */ | |
| 413 "mthi $zero, $ac2 \n
\t" | |
| 414 "preceu.ph.qbl %[p3], %[qload1] \n
\t" | |
| 415 "sb %[st2], 0(%[dst]) \n
\t" /* even 2 */ | |
| 416 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
| 417 "ulw %[qload2], 16(%[src]) \n
\t" | |
| 418 "dpa.w.ph $ac1, %[p4], %[filter12] \n
\t" /* even 4 */ | |
| 419 "dpa.w.ph $ac1, %[p1], %[filter34] \n
\t" /* even 4 */ | |
| 420 "dpa.w.ph $ac1, %[p5], %[filter56] \n
\t" /* even 4 */ | |
| 421 "dpa.w.ph $ac1, %[p2], %[filter78] \n
\t" /* even 4 */ | |
| 422 "extp %[Temp1], $ac1, 31 \n
\t" /* even 4 */ | |
| 423 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 3 */ | |
| 424 | |
| 425 /* even 5. pixel */ | |
| 426 "mtlo %[vector_64], $ac3 \n
\t" /* even 6 */ | |
| 427 "mthi $zero, $ac3 \n
\t" | |
| 428 "preceu.ph.qbr %[p4], %[qload2] \n
\t" | |
| 429 "sb %[st3], 0(%[dst]) \n
\t" /* even 3 */ | |
| 430 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
| 431 "dpa.w.ph $ac2, %[p1], %[filter12] \n
\t" /* even 5 */ | |
| 432 "dpa.w.ph $ac2, %[p5], %[filter34] \n
\t" /* even 5 */ | |
| 433 "dpa.w.ph $ac2, %[p2], %[filter56] \n
\t" /* even 5 */ | |
| 434 "dpa.w.ph $ac2, %[p3], %[filter78] \n
\t" /* even 5 */ | |
| 435 "extp %[Temp2], $ac2, 31 \n
\t" /* even 5 */ | |
| 436 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 4 */ | |
| 437 | |
| 438 /* even 6. pixel */ | |
| 439 "mtlo %[vector_64], $ac1 \n
\t" /* even 7 */ | |
| 440 "mthi $zero, $ac1 \n
\t" | |
| 441 "preceu.ph.qbl %[p1], %[qload2] \n
\t" | |
| 442 "sb %[st1], 0(%[dst]) \n
\t" /* even 4 */ | |
| 443 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
| 444 "ulw %[qload1], 20(%[src]) \n
\t" | |
| 445 "dpa.w.ph $ac3, %[p5], %[filter12] \n
\t" /* even 6 */ | |
| 446 "dpa.w.ph $ac3, %[p2], %[filter34] \n
\t" /* even 6 */ | |
| 447 "dpa.w.ph $ac3, %[p3], %[filter56] \n
\t" /* even 6 */ | |
| 448 "dpa.w.ph $ac3, %[p4], %[filter78] \n
\t" /* even 6 */ | |
| 449 "extp %[Temp3], $ac3, 31 \n
\t" /* even 6 */ | |
| 450 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 5 */ | |
| 451 | |
| 452 /* even 7. pixel */ | |
| 453 "mtlo %[vector_64], $ac2 \n
\t" /* even 8 */ | |
| 454 "mthi $zero, $ac2 \n
\t" | |
| 455 "preceu.ph.qbr %[p5], %[qload1] \n
\t" | |
| 456 "sb %[st2], 0(%[dst]) \n
\t" /* even 5 */ | |
| 457 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
| 458 "dpa.w.ph $ac1, %[p2], %[filter12] \n
\t" /* even 7 */ | |
| 459 "dpa.w.ph $ac1, %[p3], %[filter34] \n
\t" /* even 7 */ | |
| 460 "dpa.w.ph $ac1, %[p4], %[filter56] \n
\t" /* even 7 */ | |
| 461 "dpa.w.ph $ac1, %[p1], %[filter78] \n
\t" /* even 7 */ | |
| 462 "extp %[Temp1], $ac1, 31 \n
\t" /* even 7 */ | |
| 463 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 6 */ | |
| 464 | |
| 465 /* even 8. pixel */ | |
| 466 "mtlo %[vector_64], $ac3 \n
\t" /* odd 1 */ | |
| 467 "mthi $zero, $ac3 \n
\t" | |
| 468 "dpa.w.ph $ac2, %[p3], %[filter12] \n
\t" /* even 8 */ | |
| 469 "dpa.w.ph $ac2, %[p4], %[filter34] \n
\t" /* even 8 */ | |
| 470 "sb %[st3], 0(%[dst]) \n
\t" /* even 6 */ | |
| 471 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
| 472 "dpa.w.ph $ac2, %[p1], %[filter56] \n
\t" /* even 8 */ | |
| 473 "dpa.w.ph $ac2, %[p5], %[filter78] \n
\t" /* even 8 */ | |
| 474 "extp %[Temp2], $ac2, 31 \n
\t" /* even 8 */ | |
| 475 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 7 */ | |
| 476 | |
| 477 /* ODD pixels: same scheme on the words loaded from offsets 1, 5, 9, ... */ | |
| 478 "ulw %[qload1], 1(%[src]) \n
\t" | |
| 479 "ulw %[qload2], 5(%[src]) \n
\t" | |
| 480 | |
| 481 /* odd 1. pixel */ | |
| 482 "mtlo %[vector_64], $ac1 \n
\t" /* odd 2 */ | |
| 483 "mthi $zero, $ac1 \n
\t" | |
| 484 "preceu.ph.qbr %[p1], %[qload1] \n
\t" | |
| 485 "preceu.ph.qbl %[p2], %[qload1] \n
\t" | |
| 486 "preceu.ph.qbr %[p3], %[qload2] \n
\t" | |
| 487 "preceu.ph.qbl %[p4], %[qload2] \n
\t" | |
| 488 "sb %[st1], 0(%[dst]) \n
\t" /* even 7 */ | |
| 489 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
| 490 "ulw %[qload2], 9(%[src]) \n
\t" | |
| 491 "dpa.w.ph $ac3, %[p1], %[filter12] \n
\t" /* odd 1 */ | |
| 492 "dpa.w.ph $ac3, %[p2], %[filter34] \n
\t" /* odd 1 */ | |
| 493 "dpa.w.ph $ac3, %[p3], %[filter56] \n
\t" /* odd 1 */ | |
| 494 "dpa.w.ph $ac3, %[p4], %[filter78] \n
\t" /* odd 1 */ | |
| 495 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 1 */ | |
| 496 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 8 */ | |
| 497 | |
| 498 /* odd 2. pixel */ | |
| 499 "mtlo %[vector_64], $ac2 \n
\t" /* odd 3 */ | |
| 500 "mthi $zero, $ac2 \n
\t" | |
| 501 "preceu.ph.qbr %[p1], %[qload2] \n
\t" | |
| 502 "preceu.ph.qbl %[p5], %[qload2] \n
\t" | |
| 503 "sb %[st2], 0(%[dst]) \n
\t" /* even 8 */ | |
| 504 "ulw %[qload1], 13(%[src]) \n
\t" | |
| 505 "dpa.w.ph $ac1, %[p2], %[filter12] \n
\t" /* odd 2 */ | |
| 506 "dpa.w.ph $ac1, %[p3], %[filter34] \n
\t" /* odd 2 */ | |
| 507 "dpa.w.ph $ac1, %[p4], %[filter56] \n
\t" /* odd 2 */ | |
| 508 "dpa.w.ph $ac1, %[p1], %[filter78] \n
\t" /* odd 2 */ | |
| 509 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 2 */ | |
| 510 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 1 */ | |
| 511 | |
| 512 /* odd 3. pixel */ | |
| 513 "mtlo %[vector_64], $ac3 \n
\t" /* odd 4 */ | |
| 514 "mthi $zero, $ac3 \n
\t" | |
| 515 "preceu.ph.qbr %[p2], %[qload1] \n
\t" | |
| 516 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 1 */ | |
| 517 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 518 "dpa.w.ph $ac2, %[p3], %[filter12] \n
\t" /* odd 3 */ | |
| 519 "dpa.w.ph $ac2, %[p4], %[filter34] \n
\t" /* odd 3 */ | |
| 520 "dpa.w.ph $ac2, %[p1], %[filter56] \n
\t" /* odd 3 */ | |
| 521 "dpa.w.ph $ac2, %[p5], %[filter78] \n
\t" /* odd 3 */ | |
| 522 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 3 */ | |
| 523 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 2 */ | |
| 524 | |
| 525 /* odd 4. pixel */ | |
| 526 "mtlo %[vector_64], $ac1 \n
\t" /* odd 5 */ | |
| 527 "mthi $zero, $ac1 \n
\t" | |
| 528 "preceu.ph.qbl %[p3], %[qload1] \n
\t" | |
| 529 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 2 */ | |
| 530 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 531 "ulw %[qload2], 17(%[src]) \n
\t" | |
| 532 "dpa.w.ph $ac3, %[p4], %[filter12] \n
\t" /* odd 4 */ | |
| 533 "dpa.w.ph $ac3, %[p1], %[filter34] \n
\t" /* odd 4 */ | |
| 534 "dpa.w.ph $ac3, %[p5], %[filter56] \n
\t" /* odd 4 */ | |
| 535 "dpa.w.ph $ac3, %[p2], %[filter78] \n
\t" /* odd 4 */ | |
| 536 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 4 */ | |
| 537 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 3 */ | |
| 538 | |
| 539 /* odd 5. pixel */ | |
| 540 "mtlo %[vector_64], $ac2 \n
\t" /* odd 6 */ | |
| 541 "mthi $zero, $ac2 \n
\t" | |
| 542 "preceu.ph.qbr %[p4], %[qload2] \n
\t" | |
| 543 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 3 */ | |
| 544 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 545 "dpa.w.ph $ac1, %[p1], %[filter12] \n
\t" /* odd 5 */ | |
| 546 "dpa.w.ph $ac1, %[p5], %[filter34] \n
\t" /* odd 5 */ | |
| 547 "dpa.w.ph $ac1, %[p2], %[filter56] \n
\t" /* odd 5 */ | |
| 548 "dpa.w.ph $ac1, %[p3], %[filter78] \n
\t" /* odd 5 */ | |
| 549 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 5 */ | |
| 550 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 4 */ | |
| 551 | |
| 552 /* odd 6. pixel */ | |
| 553 "mtlo %[vector_64], $ac3 \n
\t" /* odd 7 */ | |
| 554 "mthi $zero, $ac3 \n
\t" | |
| 555 "preceu.ph.qbl %[p1], %[qload2] \n
\t" | |
| 556 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 4 */ | |
| 557 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 558 "ulw %[qload1], 21(%[src]) \n
\t" | |
| 559 "dpa.w.ph $ac2, %[p5], %[filter12] \n
\t" /* odd 6 */ | |
| 560 "dpa.w.ph $ac2, %[p2], %[filter34] \n
\t" /* odd 6 */ | |
| 561 "dpa.w.ph $ac2, %[p3], %[filter56] \n
\t" /* odd 6 */ | |
| 562 "dpa.w.ph $ac2, %[p4], %[filter78] \n
\t" /* odd 6 */ | |
| 563 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 6 */ | |
| 564 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 5 */ | |
| 565 | |
| 566 /* odd 7. pixel */ | |
| 567 "mtlo %[vector_64], $ac1 \n
\t" /* odd 8 */ | |
| 568 "mthi $zero, $ac1 \n
\t" | |
| 569 "preceu.ph.qbr %[p5], %[qload1] \n
\t" | |
| 570 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 5 */ | |
| 571 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 572 "dpa.w.ph $ac3, %[p2], %[filter12] \n
\t" /* odd 7 */ | |
| 573 "dpa.w.ph $ac3, %[p3], %[filter34] \n
\t" /* odd 7 */ | |
| 574 "dpa.w.ph $ac3, %[p4], %[filter56] \n
\t" /* odd 7 */ | |
| 575 "dpa.w.ph $ac3, %[p1], %[filter78] \n
\t" /* odd 7 */ | |
| 576 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 7 */ | |
| 577 | |
| 578 /* odd 8. pixel */ | |
| 579 "dpa.w.ph $ac1, %[p3], %[filter12] \n
\t" /* odd 8 */ | |
| 580 "dpa.w.ph $ac1, %[p4], %[filter34] \n
\t" /* odd 8 */ | |
| 581 "dpa.w.ph $ac1, %[p1], %[filter56] \n
\t" /* odd 8 */ | |
| 582 "dpa.w.ph $ac1, %[p5], %[filter78] \n
\t" /* odd 8 */ | |
| 583 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 8 */ | |
| 584 | |
| 585 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 6 */ | |
| 586 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 7 */ | |
| 587 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 8 */ | |
| 588 | |
| 589 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 6 */ | |
| 590 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 591 | |
| 592 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 7 */ | |
| 593 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 594 | |
| 595 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 8 */ | |
| 596 | |
| 597 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), | |
| 598 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), | |
| 599 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
| 600 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), | |
| 601 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) | |
| 602 : [filter12] "r" (filter12), [filter34] "r" (filter34), | |
| 603 [filter56] "r" (filter56), [filter78] "r" (filter78), | |
| 604 [vector_64] "r" (vector_64), [cm] "r" (cm), | |
| 605 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) | |
| 606 /* NOTE(review): there is no clobber list although the asm stores through
 * dst/odd_dst and writes $ac1..$ac3 -- confirm whether "memory" and
 * accumulator clobbers are required by the toolchain in use. */ ); | |
| 607 | |
| 608 src += 16; /* next group of 16 source pixels in this row */ | |
| 609 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); /* recomputed from dst_ptr: the asm leaves dst partway through the group */ | |
| 610 odd_dst = (dst + dst_stride); | |
| 611 } | |
| 612 | |
| 613 /* Next row... */ | |
| 614 src_ptr += src_stride; | |
| 615 | |
| 616 dst_ptr += 1; /* transposed output: the next source row fills the next destination column */ | |
| 617 } | |
| 618 } | |
| 619 | |
| 620 static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr, | |
| 621 int32_t src_stride, | |
| 622 uint8_t *dst_ptr, | |
| 623 int32_t dst_stride, | |
| 624 const int16_t *filter_x0, | |
| 625 int32_t h) { | |
| 626 int32_t c, y; | |
| 627 const uint8_t *src; | |
| 628 uint8_t *dst; | |
| 629 uint8_t *cm = vpx_ff_cropTbl; | |
| 630 uint32_t vector_64 = 64; | |
| 631 int32_t filter12, filter34, filter56, filter78; | |
| 632 int32_t Temp1, Temp2, Temp3; | |
| 633 uint32_t qload1, qload2; | |
| 634 uint32_t p1, p2, p3, p4, p5; | |
| 635 uint32_t st1, st2, st3; | |
| 636 uint32_t dst_pitch_2 = (dst_stride << 1); | |
| 637 uint8_t *odd_dst; | |
| 638 | |
| 639 filter12 = ((const int32_t *)filter_x0)[0]; | |
| 640 filter34 = ((const int32_t *)filter_x0)[1]; | |
| 641 filter56 = ((const int32_t *)filter_x0)[2]; | |
| 642 filter78 = ((const int32_t *)filter_x0)[3]; | |
| 643 | |
| 644 for (y = h; y--;) { | |
| 645 /* prefetch data to cache memory */ | |
| 646 prefetch_load(src_ptr + src_stride); | |
| 647 prefetch_load(src_ptr + src_stride + 32); | |
| 648 prefetch_load(src_ptr + src_stride + 64); | |
| 649 | |
| 650 src = src_ptr; | |
| 651 dst = dst_ptr; | |
| 652 | |
| 653 odd_dst = (dst + dst_stride); | |
| 654 | |
| 655 for (c = 0; c < 4; c++) { | |
| 656 __asm__ __volatile__ ( | |
| 657 "ulw %[qload1], 0(%[src]) \n
\t" | |
| 658 "ulw %[qload2], 4(%[src]) \n
\t" | |
| 659 | |
| 660 /* even 1. pixel */ | |
| 661 "mtlo %[vector_64], $ac1 \n
\t" /* even 1 */ | |
| 662 "mthi $zero, $ac1 \n
\t" | |
| 663 "mtlo %[vector_64], $ac2 \n
\t" /* even 2 */ | |
| 664 "mthi $zero, $ac2 \n
\t" | |
| 665 "preceu.ph.qbr %[p3], %[qload2] \n
\t" | |
| 666 "preceu.ph.qbl %[p4], %[qload2] \n
\t" | |
| 667 "preceu.ph.qbr %[p1], %[qload1] \n
\t" | |
| 668 "preceu.ph.qbl %[p2], %[qload1] \n
\t" | |
| 669 "ulw %[qload2], 8(%[src]) \n
\t" | |
| 670 "dpa.w.ph $ac1, %[p1], %[filter12] \n
\t" /* even 1 */ | |
| 671 "dpa.w.ph $ac1, %[p2], %[filter34] \n
\t" /* even 1 */ | |
| 672 "dpa.w.ph $ac1, %[p3], %[filter56] \n
\t" /* even 1 */ | |
| 673 "dpa.w.ph $ac1, %[p4], %[filter78] \n
\t" /* even 1 */ | |
| 674 "extp %[Temp1], $ac1, 31 \n
\t" /* even 1 */ | |
| 675 | |
| 676 /* even 2. pixel */ | |
| 677 "mtlo %[vector_64], $ac3 \n
\t" /* even 3 */ | |
| 678 "mthi $zero, $ac3 \n
\t" | |
| 679 "preceu.ph.qbr %[p1], %[qload2] \n
\t" | |
| 680 "preceu.ph.qbl %[p5], %[qload2] \n
\t" | |
| 681 "ulw %[qload1], 12(%[src]) \n
\t" | |
| 682 "dpa.w.ph $ac2, %[p2], %[filter12] \n
\t" /* even 1 */ | |
| 683 "dpa.w.ph $ac2, %[p3], %[filter34] \n
\t" /* even 1 */ | |
| 684 "dpa.w.ph $ac2, %[p4], %[filter56] \n
\t" /* even 1 */ | |
| 685 "dpa.w.ph $ac2, %[p1], %[filter78] \n
\t" /* even 1 */ | |
| 686 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 1 */ | |
| 687 "extp %[Temp2], $ac2, 31 \n
\t" /* even 1 */ | |
| 688 | |
| 689 /* even 3. pixel */ | |
| 690 "mtlo %[vector_64], $ac1 \n
\t" /* even 4 */ | |
| 691 "mthi $zero, $ac1 \n
\t" | |
| 692 "preceu.ph.qbr %[p2], %[qload1] \n
\t" | |
| 693 "sb %[st1], 0(%[dst]) \n
\t" /* even 1 */ | |
| 694 "addu %[dst], %[dst], %[dst_pitch_2]
\n\t" | |
| 695 "dpa.w.ph $ac3, %[p3], %[filter12] \n
\t" /* even 3 */ | |
| 696 "dpa.w.ph $ac3, %[p4], %[filter34] \n
\t" /* even 3 */ | |
| 697 "dpa.w.ph $ac3, %[p1], %[filter56] \n
\t" /* even 3 */ | |
| 698 "dpa.w.ph $ac3, %[p5], %[filter78] \n
\t" /* even 3 */ | |
| 699 "extp %[Temp3], $ac3, 31 \n
\t" /* even 3 */ | |
| 700 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 1 */ | |
| 701 | |
| 702 /* even 4. pixel */ | |
| 703 "mtlo %[vector_64], $ac2 \n
\t" /* even 5 */ | |
| 704 "mthi $zero, $ac2 \n
\t" | |
| 705 "preceu.ph.qbl %[p3], %[qload1] \n
\t" | |
| 706 "sb %[st2], 0(%[dst]) \n
\t" /* even 2 */ | |
| 707 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
| 708 "ulw %[qload2], 16(%[src]) \n
\t" | |
| 709 "dpa.w.ph $ac1, %[p4], %[filter12] \n
\t" /* even 4 */ | |
| 710 "dpa.w.ph $ac1, %[p1], %[filter34] \n
\t" /* even 4 */ | |
| 711 "dpa.w.ph $ac1, %[p5], %[filter56] \n
\t" /* even 4 */ | |
| 712 "dpa.w.ph $ac1, %[p2], %[filter78] \n
\t" /* even 4 */ | |
| 713 "extp %[Temp1], $ac1, 31 \n
\t" /* even 4 */ | |
| 714 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 3 */ | |
| 715 | |
| 716 /* even 5. pixel */ | |
| 717 "mtlo %[vector_64], $ac3 \n
\t" /* even 6 */ | |
| 718 "mthi $zero, $ac3 \n
\t" | |
| 719 "preceu.ph.qbr %[p4], %[qload2] \n
\t" | |
| 720 "sb %[st3], 0(%[dst]) \n
\t" /* even 3 */ | |
| 721 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
| 722 "dpa.w.ph $ac2, %[p1], %[filter12] \n
\t" /* even 5 */ | |
| 723 "dpa.w.ph $ac2, %[p5], %[filter34] \n
\t" /* even 5 */ | |
| 724 "dpa.w.ph $ac2, %[p2], %[filter56] \n
\t" /* even 5 */ | |
| 725 "dpa.w.ph $ac2, %[p3], %[filter78] \n
\t" /* even 5 */ | |
| 726 "extp %[Temp2], $ac2, 31 \n
\t" /* even 5 */ | |
| 727 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 4 */ | |
| 728 | |
| 729 /* even 6. pixel */ | |
| 730 "mtlo %[vector_64], $ac1 \n
\t" /* even 7 */ | |
| 731 "mthi $zero, $ac1 \n
\t" | |
| 732 "preceu.ph.qbl %[p1], %[qload2] \n
\t" | |
| 733 "sb %[st1], 0(%[dst]) \n
\t" /* even 4 */ | |
| 734 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
| 735 "ulw %[qload1], 20(%[src]) \n
\t" | |
| 736 "dpa.w.ph $ac3, %[p5], %[filter12] \n
\t" /* even 6 */ | |
| 737 "dpa.w.ph $ac3, %[p2], %[filter34] \n
\t" /* even 6 */ | |
| 738 "dpa.w.ph $ac3, %[p3], %[filter56] \n
\t" /* even 6 */ | |
| 739 "dpa.w.ph $ac3, %[p4], %[filter78] \n
\t" /* even 6 */ | |
| 740 "extp %[Temp3], $ac3, 31 \n
\t" /* even 6 */ | |
| 741 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 5 */ | |
| 742 | |
| 743 /* even 7. pixel */ | |
| 744 "mtlo %[vector_64], $ac2 \n
\t" /* even 8 */ | |
| 745 "mthi $zero, $ac2 \n
\t" | |
| 746 "preceu.ph.qbr %[p5], %[qload1] \n
\t" | |
| 747 "sb %[st2], 0(%[dst]) \n
\t" /* even 5 */ | |
| 748 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
| 749 "dpa.w.ph $ac1, %[p2], %[filter12] \n
\t" /* even 7 */ | |
| 750 "dpa.w.ph $ac1, %[p3], %[filter34] \n
\t" /* even 7 */ | |
| 751 "dpa.w.ph $ac1, %[p4], %[filter56] \n
\t" /* even 7 */ | |
| 752 "dpa.w.ph $ac1, %[p1], %[filter78] \n
\t" /* even 7 */ | |
| 753 "extp %[Temp1], $ac1, 31 \n
\t" /* even 7 */ | |
| 754 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* even 6 */ | |
| 755 | |
| 756 /* even 8. pixel */ | |
| 757 "mtlo %[vector_64], $ac3 \n
\t" /* odd 1 */ | |
| 758 "mthi $zero, $ac3 \n
\t" | |
| 759 "dpa.w.ph $ac2, %[p3], %[filter12] \n
\t" /* even 8 */ | |
| 760 "dpa.w.ph $ac2, %[p4], %[filter34] \n
\t" /* even 8 */ | |
| 761 "sb %[st3], 0(%[dst]) \n
\t" /* even 6 */ | |
| 762 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
| 763 "dpa.w.ph $ac2, %[p1], %[filter56] \n
\t" /* even 8 */ | |
| 764 "dpa.w.ph $ac2, %[p5], %[filter78] \n
\t" /* even 8 */ | |
| 765 "extp %[Temp2], $ac2, 31 \n
\t" /* even 8 */ | |
| 766 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* even 7 */ | |
| 767 | |
| 768 /* ODD pixels */ | |
| 769 "ulw %[qload1], 1(%[src]) \n
\t" | |
| 770 "ulw %[qload2], 5(%[src]) \n
\t" | |
| 771 | |
| 772 /* odd 1. pixel */ | |
| 773 "mtlo %[vector_64], $ac1 \n
\t" /* odd 2 */ | |
| 774 "mthi $zero, $ac1 \n
\t" | |
| 775 "preceu.ph.qbr %[p1], %[qload1] \n
\t" | |
| 776 "preceu.ph.qbl %[p2], %[qload1] \n
\t" | |
| 777 "preceu.ph.qbr %[p3], %[qload2] \n
\t" | |
| 778 "preceu.ph.qbl %[p4], %[qload2] \n
\t" | |
| 779 "sb %[st1], 0(%[dst]) \n
\t" /* even 7 */ | |
| 780 "addu %[dst], %[dst], %[dst_pitch_2] \n
\t" | |
| 781 "ulw %[qload2], 9(%[src]) \n
\t" | |
| 782 "dpa.w.ph $ac3, %[p1], %[filter12] \n
\t" /* odd 1 */ | |
| 783 "dpa.w.ph $ac3, %[p2], %[filter34] \n
\t" /* odd 1 */ | |
| 784 "dpa.w.ph $ac3, %[p3], %[filter56] \n
\t" /* odd 1 */ | |
| 785 "dpa.w.ph $ac3, %[p4], %[filter78] \n
\t" /* odd 1 */ | |
| 786 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 1 */ | |
| 787 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* even 8 */ | |
| 788 | |
| 789 /* odd 2. pixel */ | |
| 790 "mtlo %[vector_64], $ac2 \n
\t" /* odd 3 */ | |
| 791 "mthi $zero, $ac2 \n
\t" | |
| 792 "preceu.ph.qbr %[p1], %[qload2] \n
\t" | |
| 793 "preceu.ph.qbl %[p5], %[qload2] \n
\t" | |
| 794 "sb %[st2], 0(%[dst]) \n
\t" /* even 8 */ | |
| 795 "ulw %[qload1], 13(%[src]) \n
\t" | |
| 796 "dpa.w.ph $ac1, %[p2], %[filter12] \n
\t" /* odd 2 */ | |
| 797 "dpa.w.ph $ac1, %[p3], %[filter34] \n
\t" /* odd 2 */ | |
| 798 "dpa.w.ph $ac1, %[p4], %[filter56] \n
\t" /* odd 2 */ | |
| 799 "dpa.w.ph $ac1, %[p1], %[filter78] \n
\t" /* odd 2 */ | |
| 800 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 2 */ | |
| 801 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 1 */ | |
| 802 | |
| 803 /* odd 3. pixel */ | |
| 804 "mtlo %[vector_64], $ac3 \n
\t" /* odd 4 */ | |
| 805 "mthi $zero, $ac3 \n
\t" | |
| 806 "preceu.ph.qbr %[p2], %[qload1] \n
\t" | |
| 807 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 1 */ | |
| 808 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 809 "dpa.w.ph $ac2, %[p3], %[filter12] \n
\t" /* odd 3 */ | |
| 810 "dpa.w.ph $ac2, %[p4], %[filter34] \n
\t" /* odd 3 */ | |
| 811 "dpa.w.ph $ac2, %[p1], %[filter56] \n
\t" /* odd 3 */ | |
| 812 "dpa.w.ph $ac2, %[p5], %[filter78] \n
\t" /* odd 3 */ | |
| 813 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 3 */ | |
| 814 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 2 */ | |
| 815 | |
| 816 /* odd 4. pixel */ | |
| 817 "mtlo %[vector_64], $ac1 \n
\t" /* odd 5 */ | |
| 818 "mthi $zero, $ac1 \n
\t" | |
| 819 "preceu.ph.qbl %[p3], %[qload1] \n
\t" | |
| 820 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 2 */ | |
| 821 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 822 "ulw %[qload2], 17(%[src]) \n
\t" | |
| 823 "dpa.w.ph $ac3, %[p4], %[filter12] \n
\t" /* odd 4 */ | |
| 824 "dpa.w.ph $ac3, %[p1], %[filter34] \n
\t" /* odd 4 */ | |
| 825 "dpa.w.ph $ac3, %[p5], %[filter56] \n
\t" /* odd 4 */ | |
| 826 "dpa.w.ph $ac3, %[p2], %[filter78] \n
\t" /* odd 4 */ | |
| 827 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 4 */ | |
| 828 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 3 */ | |
| 829 | |
| 830 /* odd 5. pixel */ | |
| 831 "mtlo %[vector_64], $ac2 \n
\t" /* odd 6 */ | |
| 832 "mthi $zero, $ac2 \n
\t" | |
| 833 "preceu.ph.qbr %[p4], %[qload2] \n
\t" | |
| 834 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 3 */ | |
| 835 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 836 "dpa.w.ph $ac1, %[p1], %[filter12] \n
\t" /* odd 5 */ | |
| 837 "dpa.w.ph $ac1, %[p5], %[filter34] \n
\t" /* odd 5 */ | |
| 838 "dpa.w.ph $ac1, %[p2], %[filter56] \n
\t" /* odd 5 */ | |
| 839 "dpa.w.ph $ac1, %[p3], %[filter78] \n
\t" /* odd 5 */ | |
| 840 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 5 */ | |
| 841 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 4 */ | |
| 842 | |
| 843 /* odd 6. pixel */ | |
| 844 "mtlo %[vector_64], $ac3 \n
\t" /* odd 7 */ | |
| 845 "mthi $zero, $ac3 \n
\t" | |
| 846 "preceu.ph.qbl %[p1], %[qload2] \n
\t" | |
| 847 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 4 */ | |
| 848 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 849 "ulw %[qload1], 21(%[src]) \n
\t" | |
| 850 "dpa.w.ph $ac2, %[p5], %[filter12] \n
\t" /* odd 6 */ | |
| 851 "dpa.w.ph $ac2, %[p2], %[filter34] \n
\t" /* odd 6 */ | |
| 852 "dpa.w.ph $ac2, %[p3], %[filter56] \n
\t" /* odd 6 */ | |
| 853 "dpa.w.ph $ac2, %[p4], %[filter78] \n
\t" /* odd 6 */ | |
| 854 "extp %[Temp2], $ac2, 31 \n
\t" /* odd 6 */ | |
| 855 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 5 */ | |
| 856 | |
| 857 /* odd 7. pixel */ | |
| 858 "mtlo %[vector_64], $ac1 \n
\t" /* odd 8 */ | |
| 859 "mthi $zero, $ac1 \n
\t" | |
| 860 "preceu.ph.qbr %[p5], %[qload1] \n
\t" | |
| 861 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 5 */ | |
| 862 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 863 "dpa.w.ph $ac3, %[p2], %[filter12] \n
\t" /* odd 7 */ | |
| 864 "dpa.w.ph $ac3, %[p3], %[filter34] \n
\t" /* odd 7 */ | |
| 865 "dpa.w.ph $ac3, %[p4], %[filter56] \n
\t" /* odd 7 */ | |
| 866 "dpa.w.ph $ac3, %[p1], %[filter78] \n
\t" /* odd 7 */ | |
| 867 "extp %[Temp3], $ac3, 31 \n
\t" /* odd 7 */ | |
| 868 | |
| 869 /* odd 8. pixel */ | |
| 870 "dpa.w.ph $ac1, %[p3], %[filter12] \n
\t" /* odd 8 */ | |
| 871 "dpa.w.ph $ac1, %[p4], %[filter34] \n
\t" /* odd 8 */ | |
| 872 "dpa.w.ph $ac1, %[p1], %[filter56] \n
\t" /* odd 8 */ | |
| 873 "dpa.w.ph $ac1, %[p5], %[filter78] \n
\t" /* odd 8 */ | |
| 874 "extp %[Temp1], $ac1, 31 \n
\t" /* odd 8 */ | |
| 875 | |
| 876 "lbux %[st2], %[Temp2](%[cm]) \n
\t" /* odd 6 */ | |
| 877 "lbux %[st3], %[Temp3](%[cm]) \n
\t" /* odd 7 */ | |
| 878 "lbux %[st1], %[Temp1](%[cm]) \n
\t" /* odd 8 */ | |
| 879 | |
| 880 "sb %[st2], 0(%[odd_dst]) \n
\t" /* odd 6 */ | |
| 881 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 882 | |
| 883 "sb %[st3], 0(%[odd_dst]) \n
\t" /* odd 7 */ | |
| 884 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n
\t" | |
| 885 | |
| 886 "sb %[st1], 0(%[odd_dst]) \n
\t" /* odd 8 */ | |
| 887 | |
| 888 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), | |
| 889 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), | |
| 890 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), | |
| 891 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), | |
| 892 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) | |
| 893 : [filter12] "r" (filter12), [filter34] "r" (filter34), | |
| 894 [filter56] "r" (filter56), [filter78] "r" (filter78), | |
| 895 [vector_64] "r" (vector_64), [cm] "r" (cm), | |
| 896 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) | |
| 897 ); | |
| 898 | |
| 899 src += 16; | |
| 900 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); | |
| 901 odd_dst = (dst + dst_stride); | |
| 902 } | |
| 903 | |
| 904 /* Next row... */ | |
| 905 src_ptr += src_stride; | |
| 906 | |
| 907 dst_ptr += 1; | |
| 908 } | |
| 909 } | |
| 910 | |
| 911 void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, | |
| 912 uint8_t *dst, ptrdiff_t dst_stride, | |
| 913 const int16_t *filter, int w, int h) { | |
| 914 int x, y, k; | |
| 915 | |
| 916 for (y = 0; y < h; ++y) { | |
| 917 for (x = 0; x < w; ++x) { | |
| 918 int sum = 0; | |
| 919 | |
| 920 for (k = 0; k < 8; ++k) | |
| 921 sum += src[x + k] * filter[k]; | |
| 922 | |
| 923 dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); | |
| 924 } | |
| 925 | |
| 926 src += src_stride; | |
| 927 dst += 1; | |
| 928 } | |
| 929 } | |
| 930 | |
/*
 * Copies a w x h block while transposing it: the sample at input row y,
 * column x is stored at dst[x * dst_stride + y].
 *
 * src/src_stride  first input sample and the input row pitch
 * dst/dst_stride  output buffer; dst_stride separates consecutive x outputs
 * w, h            number of input columns and rows
 */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           int w, int h) {
  int row = 0;

  while (row < h) {
    int col = 0;

    /* One input row is scattered down one output column. */
    while (col < w) {
      dst[col * dst_stride] = src[col];
      ++col;
    }

    src += src_stride;
    dst += 1;
    ++row;
  }
}
| 945 | |
| 946 void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, | |
| 947 uint8_t *dst, ptrdiff_t dst_stride, | |
| 948 const int16_t *filter_x, int x_step_q4, | |
| 949 const int16_t *filter_y, int y_step_q4, | |
| 950 int w, int h) { | |
| 951 DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); | |
| 952 int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; | |
| 953 uint32_t pos = 38; | |
| 954 | |
| 955 /* bit positon for extract from acc */ | |
| 956 __asm__ __volatile__ ( | |
| 957 "wrdsp %[pos], 1 \n\t" | |
| 958 : | |
| 959 : [pos] "r" (pos) | |
| 960 ); | |
| 961 | |
| 962 if (intermediate_height < h) | |
| 963 intermediate_height = h; | |
| 964 | |
| 965 if (x_step_q4 != 16 || y_step_q4 != 16) | |
| 966 return vpx_convolve8_c(src, src_stride, | |
| 967 dst, dst_stride, | |
| 968 filter_x, x_step_q4, | |
| 969 filter_y, y_step_q4, | |
| 970 w, h); | |
| 971 | |
| 972 if ((((const int32_t *)filter_x)[1] == 0x800000) | |
| 973 && (((const int32_t *)filter_y)[1] == 0x800000)) | |
| 974 return vpx_convolve_copy(src, src_stride, | |
| 975 dst, dst_stride, | |
| 976 filter_x, x_step_q4, | |
| 977 filter_y, y_step_q4, | |
| 978 w, h); | |
| 979 | |
| 980 /* copy the src to dst */ | |
| 981 if (filter_x[3] == 0x80) { | |
| 982 copy_horiz_transposed(src - src_stride * 3, src_stride, | |
| 983 temp, intermediate_height, | |
| 984 w, intermediate_height); | |
| 985 } else if (((const int32_t *)filter_x)[0] == 0) { | |
| 986 vpx_convolve2_dspr2(src - src_stride * 3, src_stride, | |
| 987 temp, intermediate_height, | |
| 988 filter_x, | |
| 989 w, intermediate_height); | |
| 990 } else { | |
| 991 src -= (src_stride * 3 + 3); | |
| 992 | |
| 993 /* prefetch data to cache memory */ | |
| 994 prefetch_load(src); | |
| 995 prefetch_load(src + 32); | |
| 996 | |
| 997 switch (w) { | |
| 998 case 4: | |
| 999 convolve_horiz_4_transposed_dspr2(src, src_stride, | |
| 1000 temp, intermediate_height, | |
| 1001 filter_x, intermediate_height); | |
| 1002 break; | |
| 1003 case 8: | |
| 1004 convolve_horiz_8_transposed_dspr2(src, src_stride, | |
| 1005 temp, intermediate_height, | |
| 1006 filter_x, intermediate_height); | |
| 1007 break; | |
| 1008 case 16: | |
| 1009 case 32: | |
| 1010 convolve_horiz_16_transposed_dspr2(src, src_stride, | |
| 1011 temp, intermediate_height, | |
| 1012 filter_x, intermediate_height, | |
| 1013 (w/16)); | |
| 1014 break; | |
| 1015 case 64: | |
| 1016 prefetch_load(src + 32); | |
| 1017 convolve_horiz_64_transposed_dspr2(src, src_stride, | |
| 1018 temp, intermediate_height, | |
| 1019 filter_x, intermediate_height); | |
| 1020 break; | |
| 1021 default: | |
| 1022 convolve_horiz_transposed(src, src_stride, | |
| 1023 temp, intermediate_height, | |
| 1024 filter_x, w, intermediate_height); | |
| 1025 break; | |
| 1026 } | |
| 1027 } | |
| 1028 | |
| 1029 /* copy the src to dst */ | |
| 1030 if (filter_y[3] == 0x80) { | |
| 1031 copy_horiz_transposed(temp + 3, intermediate_height, | |
| 1032 dst, dst_stride, | |
| 1033 h, w); | |
| 1034 } else if (((const int32_t *)filter_y)[0] == 0) { | |
| 1035 vpx_convolve2_dspr2(temp + 3, intermediate_height, | |
| 1036 dst, dst_stride, | |
| 1037 filter_y, | |
| 1038 h, w); | |
| 1039 } else { | |
| 1040 switch (h) { | |
| 1041 case 4: | |
| 1042 convolve_horiz_4_transposed_dspr2(temp, intermediate_height, | |
| 1043 dst, dst_stride, | |
| 1044 filter_y, w); | |
| 1045 break; | |
| 1046 case 8: | |
| 1047 convolve_horiz_8_transposed_dspr2(temp, intermediate_height, | |
| 1048 dst, dst_stride, | |
| 1049 filter_y, w); | |
| 1050 break; | |
| 1051 case 16: | |
| 1052 case 32: | |
| 1053 convolve_horiz_16_transposed_dspr2(temp, intermediate_height, | |
| 1054 dst, dst_stride, | |
| 1055 filter_y, w, (h/16)); | |
| 1056 break; | |
| 1057 case 64: | |
| 1058 convolve_horiz_64_transposed_dspr2(temp, intermediate_height, | |
| 1059 dst, dst_stride, | |
| 1060 filter_y, w); | |
| 1061 break; | |
| 1062 default: | |
| 1063 convolve_horiz_transposed(temp, intermediate_height, | |
| 1064 dst, dst_stride, | |
| 1065 filter_y, h, w); | |
| 1066 break; | |
| 1067 } | |
| 1068 } | |
| 1069 } | |
| 1070 | |
/*
 * Copies a w x h block from src to dst, DSPr2 build.
 *
 * Widths 4, 8, 16, 32 and 64 are copied one row at a time with inline
 * assembly (unaligned word loads from src, word stores to dst); any other
 * width uses a byte loop.  The filter and step arguments are not used.
 *
 * Every asm statement lists "memory" as a clobber: it reads through src and
 * writes through dst, but both are passed only as address registers, so
 * without the clobber the compiler is not told that memory is accessed.
 *
 * NOTE(review): the stores use `sw`, so dst rows are presumably 4-byte
 * aligned for the word-copy widths -- confirm against callers.
 *
 * src/src_stride  input block and its row pitch
 * dst/dst_stride  output block and its row pitch
 * w, h            block width and height
 */
void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      {
      uint32_t tp1;

      /* 1 word storage */
      for (y = h; y--; ) {
        /* prefetch the next row while this one is copied */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         (%[src])      \n\t"
            "sw               %[tp1],         (%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1)
            : [src] "r" (src), [dst] "r" (dst)
            : "memory"
        );

        src += src_stride;
        dst += dst_stride;
      }
      }
      break;
    case 8:
      {
      uint32_t tp1, tp2;

      /* 2 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
            : "memory"
        );

        src += src_stride;
        dst += dst_stride;
      }
      }
      break;
    case 16:
      {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
            : [src] "r" (src), [dst] "r" (dst)
            : "memory"
        );

        src += src_stride;
        dst += dst_stride;
      }
      }
      break;
    case 32:
      {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
            : [src] "r" (src), [dst] "r" (dst)
            : "memory"
        );

        src += src_stride;
        dst += dst_stride;
      }
      }
      break;
    case 64:
      {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--; ) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        /* The eight temporaries are reused for the two 32-byte halves. */
        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
            "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
            "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
            "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
            "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
            "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
            "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
            "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         36(%[src])     \n\t"
            "ulw              %[tp3],         40(%[src])     \n\t"
            "ulw              %[tp4],         44(%[src])     \n\t"
            "ulw              %[tp5],         48(%[src])     \n\t"
            "ulw              %[tp6],         52(%[src])     \n\t"
            "ulw              %[tp7],         56(%[src])     \n\t"
            "ulw              %[tp8],         60(%[src])     \n\t"

            "sw               %[tp1],         32(%[dst])     \n\t"  /* store */
            "sw               %[tp2],         36(%[dst])     \n\t"  /* store */
            "sw               %[tp3],         40(%[dst])     \n\t"  /* store */
            "sw               %[tp4],         44(%[dst])     \n\t"  /* store */
            "sw               %[tp5],         48(%[dst])     \n\t"  /* store */
            "sw               %[tp6],         52(%[dst])     \n\t"  /* store */
            "sw               %[tp7],         56(%[dst])     \n\t"  /* store */
            "sw               %[tp8],         60(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
              [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
            : [src] "r" (src), [dst] "r" (dst)
            : "memory"
        );

        src += src_stride;
        dst += dst_stride;
      }
      }
      break;
    default:
      /* any other width: plain byte copy */
      for (y = h; y--; ) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
| 1282 #endif | |
| OLD | NEW |