| OLD | NEW |
| (Empty) |
| 1 ; | |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
| 3 ; | |
| 4 ; Use of this source code is governed by a BSD-style license | |
| 5 ; that can be found in the LICENSE file in the root of the source | |
| 6 ; tree. An additional intellectual property rights grant can be found | |
| 7 ; in the file PATENTS. All contributing project authors may | |
| 8 ; be found in the AUTHORS file in the root of the source tree. | |
| 9 ; | |
| 10 | |
| 11 | |
| 12 .globl vp8_sub_pixel_variance4x4_ppc | |
| 13 .globl vp8_sub_pixel_variance8x8_ppc | |
| 14 .globl vp8_sub_pixel_variance8x16_ppc | |
| 15 .globl vp8_sub_pixel_variance16x8_ppc | |
| 16 .globl vp8_sub_pixel_variance16x16_ppc | |
| 17 | |
| 18 .macro load_c V, LABEL, OFF, R0, R1 | |
| 19 lis \R0, \LABEL@ha ;# R0 = high-adjusted upper half of LABEL's address | |
| 20 la \R1, \LABEL@l(\R0) ;# R1 = R0 + low half = address of LABEL | |
| 21 lvx \V, \OFF, \R1 ;# V = vector at LABEL + OFF (callers pass 0 or an index register) | |
| 22 .endm | |
| 23 | |
| 24 .macro load_vfilter V0, V1 | |
| 25 load_c \V0, vfilter_b, r6, r12, r10 ;# V0 = row of vfilter_b at byte offset r6; leaves r10 = &vfilter_b, clobbers r12 | |
| 26 | |
| 27 addi r6, r6, 16 ;# advance r6 to the following 16-byte row | |
| 28 lvx \V1, r6, r10 ;# V1 = next row of vfilter_b | |
| 29 .endm | |
| 30 | |
| 31 .macro HProlog jump_label | |
| 32 ;# r5 = xoffset * 16, the byte offset of the tap row in hfilter_b | |
| 33 slwi. r5, r5, 4 ;# index into horizontal filter array; record form sets CR0 | |
| 34 | |
| 35 ;# offset of the second vector in the row. | |
| 36 li r10, 16 | |
| 37 | |
| 38 ;# downshift by 7 ( divide by 128 ) at the end | |
| 39 vspltish v19, 7 | |
| 40 | |
| 41 ;# If r5 is zero there isn't any horizontal filtering to be done, so | |
| 42 ;# branch to jump_label (the caller's unfiltered load path). | |
| 43 beq \jump_label | |
| 44 | |
| 45 load_c v20, hfilter_b, r5, r12, r0 | |
| 46 | |
| 47 ;# setup constants | |
| 48 ;# v28 permutation that reorders the packed hfilter_16 output | |
| 49 load_c v28, b_hperm_b, 0, r12, r0 | |
| 50 | |
| 51 ;# offset of the third vector in the row (read by hfilter_16). | |
| 52 li r12, 32 | |
| 53 | |
| 54 ;# rounding added in on the multiply | |
| 55 vspltisw v21, 8 | |
| 56 vspltisw v18, 3 | |
| 57 vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 | |
| 58 | |
| 59 slwi. r6, r6, 5 ;# index into vertical filter array; sets CR0 for the caller's beq after the horizontal pass | |
| 60 .endm | |
| 61 | |
| 62 ;# Filters a horizontal line (8 output pixels) | |
| 63 ;# expects: | |
| 64 ;# r3 src_ptr | |
| 65 ;# r4 pitch | |
| 66 ;# r10 16 | |
| 67 ;# r12 32 (unused by hfilter_8) | |
| 68 ;# v17 perm input | |
| 69 ;# v18 rounding | |
| 70 ;# v19 shift | |
| 71 ;# v20 filter taps | |
| 72 ;# v21 tmp | |
| 73 ;# v22 tmp | |
| 74 ;# v23 tmp (unused by hfilter_8) | |
| 75 ;# v24 tmp | |
| 76 ;# v25 tmp | |
| 77 ;# v26 tmp (unused by hfilter_8) | |
| 78 ;# v27 tmp (unused by hfilter_8) | |
| 79 ;# v28 perm output (unused by hfilter_8) | |
| 80 ;# | |
| 81 | |
| 82 .macro hfilter_8 V, hp, lp, increment_counter | |
| 83 lvsl v17, 0, r3 ;# permutate value for alignment | |
| 84 | |
| 85 ;# input to filter is 9 bytes wide, output is 8 bytes. | |
| 86 lvx v21, 0, r3 | |
| 87 lvx v22, r10, r3 | |
| 88 | |
| 89 .if \increment_counter | |
| 90 add r3, r3, r4 | |
| 91 .endif | |
| 92 vperm v21, v21, v22, v17 | |
| 93 | |
| 94 vperm v24, v21, v21, \hp ;# v24 = 0123 1234 2345 3456 when hp holds b_0123_b | |
| 95 vperm v25, v21, v21, \lp ;# v25 = 4567 5678 6789 789A when lp holds b_4567_b | |
| 96 | |
| 97 vmsummbm v24, v20, v24, v18 | |
| 98 vmsummbm v25, v20, v25, v18 | |
| 99 | |
| 100 vpkswus v24, v24, v25 ;# v24 = 0 1 2 3 4 5 6 7 (16-bit), already in pixel order | |
| 101 | |
| 102 vsrh v24, v24, v19 ;# divide by 128 | |
| 103 | |
| 104 vpkuhus \V, v24, v24 ;# V = 8-bit result, the same 8 pixels in both halves | |
| 105 .endm | |
| 106 | |
| 107 .macro vfilter_16 P0 P1 | |
| 108 vmuleub v22, \P0, v20 ;# even bytes of P0 times the v20 taps | |
| 109 vadduhm v22, v18, v22 | |
| 110 vmuloub v23, \P0, v20 | |
| 111 vadduhm v23, v18, v23 | |
| 112 | |
| 113 vmuleub v24, \P1, v21 | |
| 114 vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary | |
| 115 vmuloub v25, \P1, v21 | |
| 116 vadduhm v23, v23, v25 ;# Ro = odds | |
| 117 | |
| 118 vsrh v22, v22, v19 ;# divide by 128 | |
| 119 vsrh v23, v23, v19 ;# v22 v23 = evens, odds | |
| 120 vmrghh \P0, v22, v23 ;# P0 v23 = 16-bit result in order (high half here, low half next) | |
| 121 vmrglh v23, v22, v23 | |
| 122 vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result | |
| 123 .endm | |
| 124 | |
| 125 .macro compute_sum_sse src, ref, sum, sse, t1, t2, z0 | |
| 126 ;# Compute sum first. Unpack bytes to halfwords (merged with z0) so a | |
| 127 ;# signed subtract can be used; only a halfword signed subtract is | |
| 128 ;# used here. Do the high 8 bytes, then the low 8 bytes. | |
| 129 vmrghb \t1, \z0, \src | |
| 130 vmrghb \t2, \z0, \ref | |
| 131 vsubshs \t1, \t1, \t2 | |
| 132 vsum4shs \sum, \t1, \sum | |
| 133 | |
| 134 vmrglb \t1, \z0, \src | |
| 135 vmrglb \t2, \z0, \ref | |
| 136 vsubshs \t1, \t1, \t2 | |
| 137 vsum4shs \sum, \t1, \sum | |
| 138 | |
| 139 ;# Now compute sse: OR the two saturating differences, then multiply-sum the result with itself. | |
| 140 vsububs \t1, \src, \ref | |
| 141 vsububs \t2, \ref, \src | |
| 142 vor \t1, \t1, \t2 | |
| 143 | |
| 144 vmsumubm \sse, \t1, \t1, \sse | |
| 145 .endm | |
| 146 | |
| 147 .macro variance_final sum, sse, z0, DS | |
| 148 vsumsws \sum, \sum, \z0 | |
| 149 vsumsws \sse, \sse, \z0 | |
| 150 | |
| 151 stvx \sum, 0, r1 | |
| 152 lwz r3, 12(r1) | |
| 153 | |
| 154 stvx \sse, 0, r1 | |
| 155 lwz r4, 12(r1) | |
| 156 | |
| 157 stw r4, 0(r9) ;# store sse through the pointer in r9 | |
| 158 | |
| 159 mullw r3, r3, r3 ;# sum*sum | |
| 160 srlwi r3, r3, \DS ;# (sum*sum) >> DS | |
| 161 subf r3, r3, r4 ;# r3 = sse - ((sum*sum) >> DS) | |
| 162 .endm | |
| 163 | |
| 164 .macro compute_sum_sse_16 V, increment_counter | |
| 165 load_and_align_16 v16, r7, r8, \increment_counter ;# v16 = 16 bytes at r7; r7 += r8 when increment_counter is nonzero | |
| 166 compute_sum_sse \V, v16, v18, v19, v20, v21, v23 ;# accumulate sum in v18 and sse in v19; v20/v21 are scratch, v23 is passed as z0 | |
| 167 .endm | |
| 168 | |
| 169 .macro load_and_align_16 V, R, P, increment_counter | |
| 170 lvsl v17, 0, \R ;# permutate value for alignment | |
| 171 | |
| 172 ;# the 16 wanted bytes may straddle two aligned vectors, | |
| 173 ;# so load both and merge them with the lvsl permute (needs r10 = 16). | |
| 174 lvx v21, 0, \R | |
| 175 lvx v22, r10, \R | |
| 176 | |
| 177 .if \increment_counter | |
| 178 add \R, \R, \P | |
| 179 .endif | |
| 180 | |
| 181 vperm \V, v21, v22, v17 | |
| 182 .endm | |
| 183 | |
| 184 .align 2 | |
| 185 ;# r3 unsigned char *src_ptr | |
| 186 ;# r4 int src_pixels_per_line | |
| 187 ;# r5 int xoffset | |
| 188 ;# r6 int yoffset | |
| 189 ;# r7 unsigned char *dst_ptr | |
| 190 ;# r8 int dst_pixels_per_line | |
| 191 ;# r9 unsigned int *sse | |
| 192 ;# | |
| 193 ;# r3 return value: sse - ((sum*sum) >> 4) | |
| 194 vp8_sub_pixel_variance4x4_ppc: | |
| 195 mfspr r11, 256 ;# get old VRSAVE | |
| 196 oris r12, r11, 0xff30 ;# mark v0-v7, v10, v11 (v5-v7 hold reference rows below) | |
| 197 ori r12, r12, 0xfff8 ;# mark v16-v28 | |
| 198 mtspr 256, r12 ;# set VRSAVE | |
| 199 | |
| 200 stwu r1,-32(r1) ;# create space on the stack | |
| 201 | |
| 202 HProlog second_pass_4x4_pre_copy_b | |
| 203 | |
| 204 ;# Load up the byte-gather permutes that feed hfilter_8 | |
| 205 load_c v10, b_0123_b, 0, r12, r0 | |
| 206 load_c v11, b_4567_b, 0, r12, r0 | |
| 207 | |
| 208 hfilter_8 v0, v10, v11, 1 | |
| 209 hfilter_8 v1, v10, v11, 1 | |
| 210 hfilter_8 v2, v10, v11, 1 | |
| 211 hfilter_8 v3, v10, v11, 1 | |
| 212 | |
| 213 ;# Finished filtering main horizontal block. If there is no | |
| 214 ;# vertical filtering (CR0 from HProlog), jump to computing sum | |
| 215 ;# and sse. Otherwise load up and filter the additional line | |
| 216 ;# that is needed for the vertical filter. | |
| 217 beq compute_sum_sse_4x4_b | |
| 218 | |
| 219 hfilter_8 v4, v10, v11, 0 | |
| 220 | |
| 221 b second_pass_4x4_b | |
| 222 | |
| 223 second_pass_4x4_pre_copy_b: | |
| 224 slwi r6, r6, 5 ;# index into vertical filter array | |
| 225 | |
| 226 load_and_align_16 v0, r3, r4, 1 | |
| 227 load_and_align_16 v1, r3, r4, 1 | |
| 228 load_and_align_16 v2, r3, r4, 1 | |
| 229 load_and_align_16 v3, r3, r4, 1 | |
| 230 load_and_align_16 v4, r3, r4, 0 | |
| 231 | |
| 232 second_pass_4x4_b: | |
| 233 vspltish v20, 8 | |
| 234 vspltish v18, 3 | |
| 235 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 | |
| 236 | |
| 237 load_vfilter v20, v21 | |
| 238 | |
| 239 vfilter_16 v0, v1 | |
| 240 vfilter_16 v1, v2 | |
| 241 vfilter_16 v2, v3 | |
| 242 vfilter_16 v3, v4 | |
| 243 | |
| 244 compute_sum_sse_4x4_b: | |
| 245 vspltish v18, 0 ;# sum | |
| 246 vspltish v19, 0 ;# sse | |
| 247 vspltish v23, 0 ;# unpack | |
| 248 li r10, 16 ;# load_and_align_16 needs r10 = 16 (load_vfilter may have overwritten it) | |
| 249 | |
| 250 load_and_align_16 v4, r7, r8, 1 | |
| 251 load_and_align_16 v5, r7, r8, 1 | |
| 252 load_and_align_16 v6, r7, r8, 1 | |
| 253 load_and_align_16 v7, r7, r8, 1 | |
| 254 | |
| 255 vmrghb v0, v0, v1 | |
| 256 vmrghb v1, v2, v3 | |
| 257 | |
| 258 vmrghb v2, v4, v5 | |
| 259 vmrghb v3, v6, v7 | |
| 260 | |
| 261 load_c v10, b_hilo_b, 0, r12, r0 | |
| 262 | |
| 263 vperm v0, v0, v1, v10 | |
| 264 vperm v1, v2, v3, v10 | |
| 265 | |
| 266 compute_sum_sse v0, v1, v18, v19, v20, v21, v23 | |
| 267 | |
| 268 variance_final v18, v19, v23, 4 | |
| 269 | |
| 270 addi r1, r1, 32 ;# recover stack | |
| 271 mtspr 256, r11 ;# reset old VRSAVE | |
| 272 | |
| 273 blr | |
| 274 | |
| 275 .align 2 | |
| 276 ;# r3 unsigned char *src_ptr | |
| 277 ;# r4 int src_pixels_per_line | |
| 278 ;# r5 int xoffset | |
| 279 ;# r6 int yoffset | |
| 280 ;# r7 unsigned char *dst_ptr | |
| 281 ;# r8 int dst_pixels_per_line | |
| 282 ;# r9 unsigned int *sse | |
| 283 ;# | |
| 284 ;# r3 return value: sse - ((sum*sum) >> 6) | |
| 285 vp8_sub_pixel_variance8x8_ppc: | |
| 286 mfspr r11, 256 ;# get old VRSAVE | |
| 287 oris r12, r11, 0xfff0 | |
| 288 ori r12, r12, 0xffff | |
| 289 mtspr 256, r12 ;# set VRSAVE | |
| 290 | |
| 291 stwu r1,-32(r1) ;# create space on the stack | |
| 292 | |
| 293 HProlog second_pass_8x8_pre_copy_b | |
| 294 | |
| 295 ;# Load up the byte-gather permutes that feed hfilter_8 | |
| 296 load_c v10, b_0123_b, 0, r12, r0 | |
| 297 load_c v11, b_4567_b, 0, r12, r0 | |
| 298 | |
| 299 hfilter_8 v0, v10, v11, 1 | |
| 300 hfilter_8 v1, v10, v11, 1 | |
| 301 hfilter_8 v2, v10, v11, 1 | |
| 302 hfilter_8 v3, v10, v11, 1 | |
| 303 hfilter_8 v4, v10, v11, 1 | |
| 304 hfilter_8 v5, v10, v11, 1 | |
| 305 hfilter_8 v6, v10, v11, 1 | |
| 306 hfilter_8 v7, v10, v11, 1 | |
| 307 | |
| 308 ;# Finished filtering main horizontal block. If there is no | |
| 309 ;# vertical filtering (CR0 from HProlog), jump to computing sum | |
| 310 ;# and sse. Otherwise load up and filter the additional line | |
| 311 ;# that is needed for the vertical filter. | |
| 312 beq compute_sum_sse_8x8_b | |
| 313 | |
| 314 hfilter_8 v8, v10, v11, 0 | |
| 315 | |
| 316 b second_pass_8x8_b | |
| 317 | |
| 318 second_pass_8x8_pre_copy_b: | |
| 319 slwi. r6, r6, 5 ;# index into vertical filter array; sets CR0 for the beq after the loads | |
| 320 | |
| 321 load_and_align_16 v0, r3, r4, 1 | |
| 322 load_and_align_16 v1, r3, r4, 1 | |
| 323 load_and_align_16 v2, r3, r4, 1 | |
| 324 load_and_align_16 v3, r3, r4, 1 | |
| 325 load_and_align_16 v4, r3, r4, 1 | |
| 326 load_and_align_16 v5, r3, r4, 1 | |
| 327 load_and_align_16 v6, r3, r4, 1 | |
| 328 load_and_align_16 v7, r3, r4, 1 | |
| 329 load_and_align_16 v8, r3, r4, 0 | |
| 330 | |
| 331 beq compute_sum_sse_8x8_b | |
| 332 | |
| 333 second_pass_8x8_b: | |
| 334 vspltish v20, 8 | |
| 335 vspltish v18, 3 | |
| 336 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 | |
| 337 | |
| 338 load_vfilter v20, v21 | |
| 339 | |
| 340 vfilter_16 v0, v1 | |
| 341 vfilter_16 v1, v2 | |
| 342 vfilter_16 v2, v3 | |
| 343 vfilter_16 v3, v4 | |
| 344 vfilter_16 v4, v5 | |
| 345 vfilter_16 v5, v6 | |
| 346 vfilter_16 v6, v7 | |
| 347 vfilter_16 v7, v8 | |
| 348 | |
| 349 compute_sum_sse_8x8_b: | |
| 350 vspltish v18, 0 ;# sum accumulator | |
| 351 vspltish v19, 0 ;# sse accumulator | |
| 352 vspltish v23, 0 ;# zero vector used for unpacking | |
| 353 li r10, 16 | |
| 354 | |
| 355 vmrghb v0, v0, v1 | |
| 356 vmrghb v1, v2, v3 | |
| 357 vmrghb v2, v4, v5 | |
| 358 vmrghb v3, v6, v7 | |
| 359 | |
| 360 load_and_align_16 v4, r7, r8, 1 | |
| 361 load_and_align_16 v5, r7, r8, 1 | |
| 362 load_and_align_16 v6, r7, r8, 1 | |
| 363 load_and_align_16 v7, r7, r8, 1 | |
| 364 load_and_align_16 v8, r7, r8, 1 | |
| 365 load_and_align_16 v9, r7, r8, 1 | |
| 366 load_and_align_16 v10, r7, r8, 1 | |
| 367 load_and_align_16 v11, r7, r8, 0 | |
| 368 | |
| 369 vmrghb v4, v4, v5 | |
| 370 vmrghb v5, v6, v7 | |
| 371 vmrghb v6, v8, v9 | |
| 372 vmrghb v7, v10, v11 | |
| 373 | |
| 374 compute_sum_sse v0, v4, v18, v19, v20, v21, v23 | |
| 375 compute_sum_sse v1, v5, v18, v19, v20, v21, v23 | |
| 376 compute_sum_sse v2, v6, v18, v19, v20, v21, v23 | |
| 377 compute_sum_sse v3, v7, v18, v19, v20, v21, v23 | |
| 378 | |
| 379 variance_final v18, v19, v23, 6 | |
| 380 | |
| 381 addi r1, r1, 32 ;# recover stack | |
| 382 mtspr 256, r11 ;# reset old VRSAVE | |
| 383 blr | |
| 384 | |
| 385 .align 2 | |
| 386 ;# r3 unsigned char *src_ptr | |
| 387 ;# r4 int src_pixels_per_line | |
| 388 ;# r5 int xoffset | |
| 389 ;# r6 int yoffset | |
| 390 ;# r7 unsigned char *dst_ptr | |
| 391 ;# r8 int dst_pixels_per_line | |
| 392 ;# r9 unsigned int *sse | |
| 393 ;# | |
| 394 ;# r3 return value: sse - ((sum*sum) >> 7) | |
| 395 vp8_sub_pixel_variance8x16_ppc: | |
| 396 mfspr r11, 256 ;# get old VRSAVE | |
| 397 oris r12, r11, 0xffff ;# mark v0-v15 | |
| 398 ori r12, r12, 0xfffe ;# mark v16-v30 (v29/v30 hold the permute constants) | |
| 399 mtspr 256, r12 ;# set VRSAVE | |
| 400 | |
| 401 stwu r1,-32(r1) ;# create space on the stack | |
| 402 | |
| 403 HProlog second_pass_8x16_pre_copy_b | |
| 404 | |
| 405 ;# Load up permutation constants. NOTE(review): v29/v30 are not saved or restored here - confirm the target ABI treats them as volatile. | |
| 406 load_c v29, b_0123_b, 0, r12, r0 | |
| 407 load_c v30, b_4567_b, 0, r12, r0 | |
| 408 | |
| 409 hfilter_8 v0, v29, v30, 1 | |
| 410 hfilter_8 v1, v29, v30, 1 | |
| 411 hfilter_8 v2, v29, v30, 1 | |
| 412 hfilter_8 v3, v29, v30, 1 | |
| 413 hfilter_8 v4, v29, v30, 1 | |
| 414 hfilter_8 v5, v29, v30, 1 | |
| 415 hfilter_8 v6, v29, v30, 1 | |
| 416 hfilter_8 v7, v29, v30, 1 | |
| 417 hfilter_8 v8, v29, v30, 1 | |
| 418 hfilter_8 v9, v29, v30, 1 | |
| 419 hfilter_8 v10, v29, v30, 1 | |
| 420 hfilter_8 v11, v29, v30, 1 | |
| 421 hfilter_8 v12, v29, v30, 1 | |
| 422 hfilter_8 v13, v29, v30, 1 | |
| 423 hfilter_8 v14, v29, v30, 1 | |
| 424 hfilter_8 v15, v29, v30, 1 | |
| 425 | |
| 426 ;# Finished filtering main horizontal block. If there is no | |
| 427 ;# vertical filtering (CR0 from HProlog), jump to computing sum | |
| 428 ;# and sse. Otherwise load up and filter the additional line | |
| 429 ;# that is needed for the vertical filter. | |
| 430 beq compute_sum_sse_8x16_b | |
| 431 | |
| 432 hfilter_8 v16, v29, v30, 0 | |
| 433 | |
| 434 b second_pass_8x16_b | |
| 435 | |
| 436 second_pass_8x16_pre_copy_b: | |
| 437 slwi. r6, r6, 5 ;# index into vertical filter array; sets CR0 for the beq after the loads | |
| 438 | |
| 439 load_and_align_16 v0, r3, r4, 1 | |
| 440 load_and_align_16 v1, r3, r4, 1 | |
| 441 load_and_align_16 v2, r3, r4, 1 | |
| 442 load_and_align_16 v3, r3, r4, 1 | |
| 443 load_and_align_16 v4, r3, r4, 1 | |
| 444 load_and_align_16 v5, r3, r4, 1 | |
| 445 load_and_align_16 v6, r3, r4, 1 | |
| 446 load_and_align_16 v7, r3, r4, 1 | |
| 447 load_and_align_16 v8, r3, r4, 1 | |
| 448 load_and_align_16 v9, r3, r4, 1 | |
| 449 load_and_align_16 v10, r3, r4, 1 | |
| 450 load_and_align_16 v11, r3, r4, 1 | |
| 451 load_and_align_16 v12, r3, r4, 1 | |
| 452 load_and_align_16 v13, r3, r4, 1 | |
| 453 load_and_align_16 v14, r3, r4, 1 | |
| 454 load_and_align_16 v15, r3, r4, 1 | |
| 455 load_and_align_16 v16, r3, r4, 0 | |
| 456 | |
| 457 beq compute_sum_sse_8x16_b | |
| 458 | |
| 459 second_pass_8x16_b: | |
| 460 vspltish v20, 8 | |
| 461 vspltish v18, 3 | |
| 462 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 | |
| 463 | |
| 464 load_vfilter v20, v21 | |
| 465 | |
| 466 vfilter_16 v0, v1 | |
| 467 vfilter_16 v1, v2 | |
| 468 vfilter_16 v2, v3 | |
| 469 vfilter_16 v3, v4 | |
| 470 vfilter_16 v4, v5 | |
| 471 vfilter_16 v5, v6 | |
| 472 vfilter_16 v6, v7 | |
| 473 vfilter_16 v7, v8 | |
| 474 vfilter_16 v8, v9 | |
| 475 vfilter_16 v9, v10 | |
| 476 vfilter_16 v10, v11 | |
| 477 vfilter_16 v11, v12 | |
| 478 vfilter_16 v12, v13 | |
| 479 vfilter_16 v13, v14 | |
| 480 vfilter_16 v14, v15 | |
| 481 vfilter_16 v15, v16 | |
| 482 | |
| 483 compute_sum_sse_8x16_b: | |
| 484 vspltish v18, 0 ;# sum | |
| 485 vspltish v19, 0 ;# sse | |
| 486 vspltish v23, 0 ;# unpack | |
| 487 li r10, 16 ;# load_and_align_16 needs r10 = 16 (load_vfilter may have overwritten it) | |
| 488 | |
| 489 vmrghb v0, v0, v1 | |
| 490 vmrghb v1, v2, v3 | |
| 491 vmrghb v2, v4, v5 | |
| 492 vmrghb v3, v6, v7 | |
| 493 vmrghb v4, v8, v9 | |
| 494 vmrghb v5, v10, v11 | |
| 495 vmrghb v6, v12, v13 | |
| 496 vmrghb v7, v14, v15 | |
| 497 | |
| 498 load_and_align_16 v8, r7, r8, 1 | |
| 499 load_and_align_16 v9, r7, r8, 1 | |
| 500 load_and_align_16 v10, r7, r8, 1 | |
| 501 load_and_align_16 v11, r7, r8, 1 | |
| 502 load_and_align_16 v12, r7, r8, 1 | |
| 503 load_and_align_16 v13, r7, r8, 1 | |
| 504 load_and_align_16 v14, r7, r8, 1 | |
| 505 load_and_align_16 v15, r7, r8, 1 | |
| 506 | |
| 507 vmrghb v8, v8, v9 | |
| 508 vmrghb v9, v10, v11 | |
| 509 vmrghb v10, v12, v13 | |
| 510 vmrghb v11, v14, v15 | |
| 511 | |
| 512 compute_sum_sse v0, v8, v18, v19, v20, v21, v23 | |
| 513 compute_sum_sse v1, v9, v18, v19, v20, v21, v23 | |
| 514 compute_sum_sse v2, v10, v18, v19, v20, v21, v23 | |
| 515 compute_sum_sse v3, v11, v18, v19, v20, v21, v23 | |
| 516 | |
| 517 load_and_align_16 v8, r7, r8, 1 | |
| 518 load_and_align_16 v9, r7, r8, 1 | |
| 519 load_and_align_16 v10, r7, r8, 1 | |
| 520 load_and_align_16 v11, r7, r8, 1 | |
| 521 load_and_align_16 v12, r7, r8, 1 | |
| 522 load_and_align_16 v13, r7, r8, 1 | |
| 523 load_and_align_16 v14, r7, r8, 1 | |
| 524 load_and_align_16 v15, r7, r8, 0 | |
| 525 | |
| 526 vmrghb v8, v8, v9 | |
| 527 vmrghb v9, v10, v11 | |
| 528 vmrghb v10, v12, v13 | |
| 529 vmrghb v11, v14, v15 | |
| 530 | |
| 531 compute_sum_sse v4, v8, v18, v19, v20, v21, v23 | |
| 532 compute_sum_sse v5, v9, v18, v19, v20, v21, v23 | |
| 533 compute_sum_sse v6, v10, v18, v19, v20, v21, v23 | |
| 534 compute_sum_sse v7, v11, v18, v19, v20, v21, v23 | |
| 535 | |
| 536 variance_final v18, v19, v23, 7 | |
| 537 | |
| 538 addi r1, r1, 32 ;# recover stack | |
| 539 mtspr 256, r11 ;# reset old VRSAVE | |
| 540 blr | |
| 541 | |
| 542 ;# Filters a horizontal line (16 output pixels) | |
| 543 ;# expects: | |
| 544 ;# r3 src_ptr | |
| 545 ;# r4 pitch | |
| 546 ;# r10 16 | |
| 547 ;# r12 32 | |
| 548 ;# v17 perm input | |
| 549 ;# v18 rounding | |
| 550 ;# v19 shift | |
| 551 ;# v20 filter taps | |
| 552 ;# v21 tmp | |
| 553 ;# v22 tmp | |
| 554 ;# v23 tmp | |
| 555 ;# v24 tmp | |
| 556 ;# v25 tmp | |
| 557 ;# v26 tmp | |
| 558 ;# v27 tmp | |
| 559 ;# v28 perm output | |
| 560 ;# | |
| 561 .macro hfilter_16 V, increment_counter | |
| 562 | |
| 563 lvsl v17, 0, r3 ;# permutate value for alignment | |
| 564 | |
| 565 ;# output is 16 bytes; three aligned vectors are loaded so that | |
| 566 ;# v21:v22 can hold 32 left-justified input bytes after the permutes. | |
| 567 lvx v21, 0, r3 | |
| 568 lvx v22, r10, r3 | |
| 569 lvx v23, r12, r3 | |
| 570 | |
| 571 .if \increment_counter | |
| 572 add r3, r3, r4 | |
| 573 .endif | |
| 574 vperm v21, v21, v22, v17 | |
| 575 vperm v22, v22, v23, v17 ;# v21 v22 = input pixels left-justified | |
| 576 | |
| 577 ;# set 0 | |
| 578 vmsummbm v24, v20, v21, v18 ;# taps times elements | |
| 579 | |
| 580 ;# set 1 | |
| 581 vsldoi v23, v21, v22, 1 | |
| 582 vmsummbm v25, v20, v23, v18 | |
| 583 | |
| 584 ;# set 2 | |
| 585 vsldoi v23, v21, v22, 2 | |
| 586 vmsummbm v26, v20, v23, v18 | |
| 587 | |
| 588 ;# set 3 | |
| 589 vsldoi v23, v21, v22, 3 | |
| 590 vmsummbm v27, v20, v23, v18 | |
| 591 | |
| 592 vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) | |
| 593 vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F | |
| 594 | |
| 595 vsrh v24, v24, v19 ;# divide v24, v25 by 128 | |
| 596 vsrh v25, v25, v19 | |
| 597 | |
| 598 vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result | |
| 599 vperm \V, \V, v0, v28 ;# \V = correctly-ordered result (v28 = b_hperm_b selects only from the first source) | |
| 600 .endm | |
| 601 | |
| 602 .align 2 | |
| 603 ;# r3 unsigned char *src_ptr | |
| 604 ;# r4 int src_pixels_per_line | |
| 605 ;# r5 int xoffset | |
| 606 ;# r6 int yoffset | |
| 607 ;# r7 unsigned char *dst_ptr | |
| 608 ;# r8 int dst_pixels_per_line | |
| 609 ;# r9 unsigned int *sse | |
| 610 ;# | |
| 611 ;# r3 return value: sse - ((sum*sum) >> 7) | |
| 612 vp8_sub_pixel_variance16x8_ppc: | |
| 613 mfspr r11, 256 ;# get old VRSAVE | |
| 614 oris r12, r11, 0xffff | |
| 615 ori r12, r12, 0xfff8 | |
| 616 mtspr 256, r12 ;# set VRSAVE | |
| 617 | |
| 618 stwu r1, -32(r1) ;# create space on the stack | |
| 619 | |
| 620 HProlog second_pass_16x8_pre_copy_b | |
| 621 | |
| 622 hfilter_16 v0, 1 | |
| 623 hfilter_16 v1, 1 | |
| 624 hfilter_16 v2, 1 | |
| 625 hfilter_16 v3, 1 | |
| 626 hfilter_16 v4, 1 | |
| 627 hfilter_16 v5, 1 | |
| 628 hfilter_16 v6, 1 | |
| 629 hfilter_16 v7, 1 | |
| 630 | |
| 631 ;# Finished filtering main horizontal block. If there is no | |
| 632 ;# vertical filtering (CR0 from HProlog), jump to computing sum | |
| 633 ;# and sse. Otherwise load up and filter the additional line | |
| 634 ;# that is needed for the vertical filter. | |
| 635 beq compute_sum_sse_16x8_b | |
| 636 | |
| 637 hfilter_16 v8, 0 | |
| 638 | |
| 639 b second_pass_16x8_b | |
| 640 | |
| 641 second_pass_16x8_pre_copy_b: | |
| 642 slwi. r6, r6, 5 ;# index into vertical filter array; sets CR0 for the beq after the loads | |
| 643 | |
| 644 load_and_align_16 v0, r3, r4, 1 | |
| 645 load_and_align_16 v1, r3, r4, 1 | |
| 646 load_and_align_16 v2, r3, r4, 1 | |
| 647 load_and_align_16 v3, r3, r4, 1 | |
| 648 load_and_align_16 v4, r3, r4, 1 | |
| 649 load_and_align_16 v5, r3, r4, 1 | |
| 650 load_and_align_16 v6, r3, r4, 1 | |
| 651 load_and_align_16 v7, r3, r4, 1 | |
| 652 load_and_align_16 v8, r3, r4, 1 | |
| 653 | |
| 654 beq compute_sum_sse_16x8_b | |
| 655 | |
| 656 second_pass_16x8_b: | |
| 657 vspltish v20, 8 | |
| 658 vspltish v18, 3 | |
| 659 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 | |
| 660 | |
| 661 load_vfilter v20, v21 | |
| 662 | |
| 663 vfilter_16 v0, v1 | |
| 664 vfilter_16 v1, v2 | |
| 665 vfilter_16 v2, v3 | |
| 666 vfilter_16 v3, v4 | |
| 667 vfilter_16 v4, v5 | |
| 668 vfilter_16 v5, v6 | |
| 669 vfilter_16 v6, v7 | |
| 670 vfilter_16 v7, v8 | |
| 671 | |
| 672 compute_sum_sse_16x8_b: | |
| 673 vspltish v18, 0 ;# sum accumulator | |
| 674 vspltish v19, 0 ;# sse accumulator | |
| 675 vspltish v23, 0 ;# zero vector used for unpacking | |
| 676 li r10, 16 | |
| 677 | |
| 678 compute_sum_sse_16 v0, 1 | |
| 679 compute_sum_sse_16 v1, 1 | |
| 680 compute_sum_sse_16 v2, 1 | |
| 681 compute_sum_sse_16 v3, 1 | |
| 682 compute_sum_sse_16 v4, 1 | |
| 683 compute_sum_sse_16 v5, 1 | |
| 684 compute_sum_sse_16 v6, 1 | |
| 685 compute_sum_sse_16 v7, 0 | |
| 686 | |
| 687 variance_final v18, v19, v23, 7 | |
| 688 | |
| 689 addi r1, r1, 32 ;# recover stack | |
| 690 | |
| 691 mtspr 256, r11 ;# reset old VRSAVE | |
| 692 | |
| 693 blr | |
| 694 | |
| 695 .align 2 | |
| 696 ;# r3 unsigned char *src_ptr | |
| 697 ;# r4 int src_pixels_per_line | |
| 698 ;# r5 int xoffset | |
| 699 ;# r6 int yoffset | |
| 700 ;# r7 unsigned char *dst_ptr | |
| 701 ;# r8 int dst_pixels_per_line | |
| 702 ;# r9 unsigned int *sse | |
| 703 ;# | |
| 704 ;# r3 return value: sse - ((sum*sum) >> 8) | |
| 705 vp8_sub_pixel_variance16x16_ppc: | |
| 706 mfspr r11, 256 ;# get old VRSAVE | |
| 707 oris r12, r11, 0xffff | |
| 708 ori r12, r12, 0xfff8 | |
| 709 mtspr 256, r12 ;# set VRSAVE | |
| 710 | |
| 711 stwu r1, -32(r1) ;# create space on the stack | |
| 712 | |
| 713 HProlog second_pass_16x16_pre_copy_b | |
| 714 | |
| 715 hfilter_16 v0, 1 | |
| 716 hfilter_16 v1, 1 | |
| 717 hfilter_16 v2, 1 | |
| 718 hfilter_16 v3, 1 | |
| 719 hfilter_16 v4, 1 | |
| 720 hfilter_16 v5, 1 | |
| 721 hfilter_16 v6, 1 | |
| 722 hfilter_16 v7, 1 | |
| 723 hfilter_16 v8, 1 | |
| 724 hfilter_16 v9, 1 | |
| 725 hfilter_16 v10, 1 | |
| 726 hfilter_16 v11, 1 | |
| 727 hfilter_16 v12, 1 | |
| 728 hfilter_16 v13, 1 | |
| 729 hfilter_16 v14, 1 | |
| 730 hfilter_16 v15, 1 | |
| 731 | |
| 732 ;# Finished filtering main horizontal block. If there is no | |
| 733 ;# vertical filtering (CR0 from HProlog), jump to computing sum | |
| 734 ;# and sse. Otherwise load up and filter the additional line | |
| 735 ;# that is needed for the vertical filter. | |
| 736 beq compute_sum_sse_16x16_b | |
| 737 | |
| 738 hfilter_16 v16, 0 | |
| 739 | |
| 740 b second_pass_16x16_b | |
| 741 | |
| 742 second_pass_16x16_pre_copy_b: | |
| 743 slwi. r6, r6, 5 ;# index into vertical filter array; sets CR0 for the beq after the loads | |
| 744 | |
| 745 load_and_align_16 v0, r3, r4, 1 | |
| 746 load_and_align_16 v1, r3, r4, 1 | |
| 747 load_and_align_16 v2, r3, r4, 1 | |
| 748 load_and_align_16 v3, r3, r4, 1 | |
| 749 load_and_align_16 v4, r3, r4, 1 | |
| 750 load_and_align_16 v5, r3, r4, 1 | |
| 751 load_and_align_16 v6, r3, r4, 1 | |
| 752 load_and_align_16 v7, r3, r4, 1 | |
| 753 load_and_align_16 v8, r3, r4, 1 | |
| 754 load_and_align_16 v9, r3, r4, 1 | |
| 755 load_and_align_16 v10, r3, r4, 1 | |
| 756 load_and_align_16 v11, r3, r4, 1 | |
| 757 load_and_align_16 v12, r3, r4, 1 | |
| 758 load_and_align_16 v13, r3, r4, 1 | |
| 759 load_and_align_16 v14, r3, r4, 1 | |
| 760 load_and_align_16 v15, r3, r4, 1 | |
| 761 load_and_align_16 v16, r3, r4, 0 | |
| 762 | |
| 763 beq compute_sum_sse_16x16_b | |
| 764 | |
| 765 second_pass_16x16_b: | |
| 766 vspltish v20, 8 | |
| 767 vspltish v18, 3 | |
| 768 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 | |
| 769 | |
| 770 load_vfilter v20, v21 | |
| 771 | |
| 772 vfilter_16 v0, v1 | |
| 773 vfilter_16 v1, v2 | |
| 774 vfilter_16 v2, v3 | |
| 775 vfilter_16 v3, v4 | |
| 776 vfilter_16 v4, v5 | |
| 777 vfilter_16 v5, v6 | |
| 778 vfilter_16 v6, v7 | |
| 779 vfilter_16 v7, v8 | |
| 780 vfilter_16 v8, v9 | |
| 781 vfilter_16 v9, v10 | |
| 782 vfilter_16 v10, v11 | |
| 783 vfilter_16 v11, v12 | |
| 784 vfilter_16 v12, v13 | |
| 785 vfilter_16 v13, v14 | |
| 786 vfilter_16 v14, v15 | |
| 787 vfilter_16 v15, v16 | |
| 788 | |
| 789 compute_sum_sse_16x16_b: | |
| 790 vspltish v18, 0 ;# sum accumulator | |
| 791 vspltish v19, 0 ;# sse accumulator | |
| 792 vspltish v23, 0 ;# zero vector used for unpacking | |
| 793 li r10, 16 | |
| 794 | |
| 795 compute_sum_sse_16 v0, 1 | |
| 796 compute_sum_sse_16 v1, 1 | |
| 797 compute_sum_sse_16 v2, 1 | |
| 798 compute_sum_sse_16 v3, 1 | |
| 799 compute_sum_sse_16 v4, 1 | |
| 800 compute_sum_sse_16 v5, 1 | |
| 801 compute_sum_sse_16 v6, 1 | |
| 802 compute_sum_sse_16 v7, 1 | |
| 803 compute_sum_sse_16 v8, 1 | |
| 804 compute_sum_sse_16 v9, 1 | |
| 805 compute_sum_sse_16 v10, 1 | |
| 806 compute_sum_sse_16 v11, 1 | |
| 807 compute_sum_sse_16 v12, 1 | |
| 808 compute_sum_sse_16 v13, 1 | |
| 809 compute_sum_sse_16 v14, 1 | |
| 810 compute_sum_sse_16 v15, 0 | |
| 811 | |
| 812 variance_final v18, v19, v23, 8 | |
| 813 | |
| 814 addi r1, r1, 32 ;# recover stack | |
| 815 | |
| 816 mtspr 256, r11 ;# reset old VRSAVE | |
| 817 | |
| 818 blr | |
| 819 | |
| 820 .data | |
| 821 | |
| 822 .align 4 | |
| 823 hfilter_b: ;# one 16-byte row per xoffset (HProlog indexes with xoffset * 16): two taps then two zeros, repeated for each word | |
| 824 .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 | |
| 825 .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 | |
| 826 .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 | |
| 827 .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 | |
| 828 .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 | |
| 829 .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 | |
| 830 .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0 | |
| 831 .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 | |
| 832 | |
| 833 .align 4 | |
| 834 vfilter_b: ;# two 16-byte rows per yoffset (indexed with yoffset * 32): first tap splatted, then second tap splatted | |
| 835 .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 | |
| 836 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
| 837 .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 | |
| 838 .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 | |
| 839 .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 | |
| 840 .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 | |
| 841 .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 | |
| 842 .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 | |
| 843 .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 | |
| 844 .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 | |
| 845 .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 | |
| 846 .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 | |
| 847 .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 | |
| 848 .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 | |
| 849 .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 | |
| 850 .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 | |
| 851 | |
| 852 .align 4 | |
| 853 b_hperm_b: ;# vperm selector loaded into v28 by HProlog; puts hfilter_16's 0 4 8 C 1 5 9 D ... bytes back in pixel order | |
| 854 .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 | |
| 855 | |
| 856 .align 4 | |
| 857 b_0123_b: ;# vperm selector passed as hp to hfilter_8: 4-byte windows starting at bytes 0, 1, 2, 3 | |
| 858 .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 | |
| 859 | |
| 860 .align 4 | |
| 861 b_4567_b: ;# vperm selector passed as lp to hfilter_8: 4-byte windows starting at bytes 4, 5, 6, 7 | |
| 862 .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 | |
| 863 .align 4 ;# this table is fetched with lvx, so align it explicitly like the other tables | |
| 864 b_hilo_b: ;# vperm selector: first 8 bytes of the first source, then first 8 bytes of the second | |
| 865 .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 | |
| OLD | NEW |