OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 |
| 12 .globl vp9_sub_pixel_variance4x4_ppc |
| 13 .globl vp9_sub_pixel_variance8x8_ppc |
| 14 .globl vp9_sub_pixel_variance8x16_ppc |
| 15 .globl vp9_sub_pixel_variance16x8_ppc |
| 16 .globl vp9_sub_pixel_variance16x16_ppc |
| 17 |
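|         ;# Loads the 16-byte vector at LABEL + OFF into V, using R0 and R1
|         ;# as scratch to materialize the address (lis/la), then lvx.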
| 18 .macro load_c V, LABEL, OFF, R0, R1 |
| 19 lis \R0, \LABEL@ha |
| 20 la \R1, \LABEL@l(\R0) |
| 21 lvx \V, \OFF, \R1 |
| 22 .endm |
| 23 |
| 24 .macro load_vfilter V0, V1 |
| 25 load_c \V0, vfilter_b, r6, r12, r10 |
| 26 |
| 27 addi r6, r6, 16 |
| 28 lvx \V1, r6, r10 |
| 29 .endm |
| 30 |
| 31 .macro HProlog jump_label |
| 32 ;# load up horizontal filter |
| 33 slwi. r5, r5, 4 ;# index into horizontal filter array |
| 34 |
| 35 ;# index to the next set of vectors in the row. |
| 36 li r10, 16 |
| 37 |
| 38 ;# downshift by 7 ( divide by 128 ) at the end |
| 39 vspltish v19, 7 |
| 40 |
| 41 ;# If there isn't any filtering to be done for the horizontal, then |
| 42 ;# just skip to the second pass. |
| 43 beq \jump_label |
| 44 |
| 45 load_c v20, hfilter_b, r5, r12, r0 |
| 46 |
| 47 ;# setup constants |
| 48      ;# v28 permutation for reordering hfilter_16 output
| 49 load_c v28, b_hperm_b, 0, r12, r0 |
| 50 |
| 51 ;# index to the next set of vectors in the row. |
| 52 li r12, 32 |
| 53 |
| 54 ;# rounding added in on the multiply |
| 55 vspltisw v21, 8 |
| 56 vspltisw v18, 3 |
| 57 vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 |
| 58 |
| 59 slwi. r6, r6, 5 ;# index into vertical filter array |
| 60 .endm |
| 61 |
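|         ;# Both passes share one fixed-point scheme: each bilinear filter's
|         ;# two taps sum to 128, with rounding constant 64 and a final shift
|         ;# by 7.  Illustrative scalar form (exposition only, not assembled):
|         ;#   out = (f0 * a + f1 * b + 64) >> 7;
| 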
| 62 ;# Filters a horizontal line |
| 63 ;# expects: |
| 64 ;# r3 src_ptr |
| 65 ;# r4 pitch |
| 66 ;# r10 16 |
| 67 ;# r12 32 |
| 68      ;# v17 perm input
| 69 ;# v18 rounding |
| 70 ;# v19 shift |
| 71 ;# v20 filter taps |
| 72 ;# v21 tmp |
| 73 ;# v22 tmp |
| 74 ;# v23 tmp |
| 75 ;# v24 tmp |
| 76 ;# v25 tmp |
| 77 ;# v26 tmp |
| 78 ;# v27 tmp |
| 79 ;# v28 perm output |
| 80 ;# |
| 81 |
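|         ;# Illustrative scalar equivalent of one 8-wide line (exposition
|         ;# only): 9 input bytes produce 8 output bytes.
|         ;#   for (i = 0; i < 8; i++)
|         ;#       dst[i] = (f0 * src[i] + f1 * src[i + 1] + 64) >> 7;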
| 82 .macro hfilter_8 V, hp, lp, increment_counter |
| 83     lvsl    v17,  0, r3 ;# permute vector for alignment
| 84 |
| 85 ;# input to filter is 9 bytes wide, output is 8 bytes. |
| 86 lvx v21, 0, r3 |
| 87 lvx v22, r10, r3 |
| 88 |
| 89 .if \increment_counter |
| 90 add r3, r3, r4 |
| 91 .endif |
| 92 vperm v21, v21, v22, v17 |
| 93 |
| 94     vperm   v24, v21, v21, \hp ;# v24 = 0123 1234 2345 3456
| 95     vperm   v25, v21, v21, \lp ;# v25 = 4567 5678 6789 789A
| 96 |
| 97 vmsummbm v24, v20, v24, v18 |
| 98 vmsummbm v25, v20, v25, v18 |
| 99 |
| 100     vpkswus v24, v24, v25  ;# v24 = 0 1 2 3 4 5 6 7 (16-bit)
| 101 |
| 102     vsrh    v24, v24, v19  ;# divide by 128
| 103 
| 104     vpkuhus \V, v24, v24  ;# \V = 8-bit result, in order
| 105 .endm |
| 106 |
| 107 .macro vfilter_16 P0 P1 |
| 108     vmuleub v22, \P0, v20   ;# even bytes * tap 0, 16-bit products
| 109     vadduhm v22, v18, v22   ;# add rounding (64)
| 110     vmuloub v23, \P0, v20   ;# odd bytes * tap 0
| 111     vadduhm v23, v18, v23
| 112 |
| 113 vmuleub v24, \P1, v21 |
| 114 vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary |
| 115 vmuloub v25, \P1, v21 |
| 116 vadduhm v23, v23, v25 ;# Ro = odds |
| 117 |
| 118 vsrh v22, v22, v19 ;# divide by 128 |
| 119     vsrh    v23, v23, v19  ;# v22 v23 = evens, odds
| 120     vmrghh  \P0, v22, v23  ;# \P0 v23 = 16-bit results in order
| 121 vmrglh v23, v22, v23 |
| 122 vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result |
| 123 .endm |
| 124 |
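|         ;# vmuleub/vmuloub give 16-bit products of the even/odd bytes, so
|         ;# the line is filtered as two interleaved halves and re-merged.
|         ;# Scalar form (exposition only):
|         ;#   dst[i] = (f0 * row0[i] + f1 * row1[i] + 64) >> 7;
| 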
| 125 .macro compute_sum_sse src, ref, sum, sse, t1, t2, z0 |
| 126     ;# Compute sum first.  Unpack so that signed subtract
| 127     ;# can be used.  Only a halfword signed subtract is
| 128     ;# available, so do high, then low.
| 129 vmrghb \t1, \z0, \src |
| 130 vmrghb \t2, \z0, \ref |
| 131 vsubshs \t1, \t1, \t2 |
| 132 vsum4shs \sum, \t1, \sum |
| 133 |
| 134 vmrglb \t1, \z0, \src |
| 135 vmrglb \t2, \z0, \ref |
| 136 vsubshs \t1, \t1, \t2 |
| 137 vsum4shs \sum, \t1, \sum |
| 138 |
| 139 ;# Now compute sse. |
| 140 vsububs \t1, \src, \ref |
| 141 vsububs \t2, \ref, \src |
| 142 vor \t1, \t1, \t2 |
| 143 |
| 144 vmsumubm \sse, \t1, \t1, \sse |
| 145 .endm |
| 146 |
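|         ;# Per 16-byte block this accumulates, in scalar terms (exposition
|         ;# only):
|         ;#   int d = src[i] - ref[i];   /* zero-extend + vsubshs        */
|         ;#   sum += d;                  /* pair-summed by vsum4shs      */
|         ;#   sse += d * d;              /* |d| via saturating subtracts, */
|         ;#                              /* squared/summed by vmsumubm   */
| 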
| 147 .macro variance_final sum, sse, z0, DS |
| 148 vsumsws \sum, \sum, \z0 |
| 149 vsumsws \sse, \sse, \z0 |
| 150 |
| 151 stvx \sum, 0, r1 |
| 152 lwz r3, 12(r1) |
| 153 |
| 154 stvx \sse, 0, r1 |
| 155 lwz r4, 12(r1) |
| 156 |
| 157 stw r4, 0(r9) ;# sse |
| 158 |
| 159 mullw r3, r3, r3 ;# sum*sum |
| 160     srawi   r3, r3, \DS    ;# (sum*sum) >> DS
| 161     subf    r3, r3, r4     ;# sse - ((sum*sum) >> DS)
| 162 .endm |
| 163 |
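|         ;# Computes variance = sse - (sum * sum) / N for N = w * h pixels,
|         ;# so \DS = log2(N): 4 for 4x4, 6 for 8x8, 7 for 8x16 and 16x8,
|         ;# 8 for 16x16.
| 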
| 164 .macro compute_sum_sse_16 V, increment_counter |
| 165 load_and_align_16 v16, r7, r8, \increment_counter |
| 166 compute_sum_sse \V, v16, v18, v19, v20, v21, v23 |
| 167 .endm |
| 168 |
| 169 .macro load_and_align_16 V, R, P, increment_counter |
| 170     lvsl    v17,  0, \R ;# permute vector for alignment
| 171 |
| 172     ;# input is 16 bytes wide and may span two vectors
| 173     ;# if not aligned correctly.
| 174 lvx v21, 0, \R |
| 175 lvx v22, r10, \R |
| 176 |
| 177 .if \increment_counter |
| 178 add \R, \R, \P |
| 179 .endif |
| 180 |
| 181 vperm \V, v21, v22, v17 |
| 182 .endm |
| 183 |
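|         ;# lvsl + two lvx + vperm is the standard AltiVec unaligned load:
|         ;# lvx ignores the low four address bits, so the two aligned
|         ;# quadwords bracketing the address are merged by the lvsl mask.
| 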
| 184 .align 2 |
| 185 ;# r3 unsigned char *src_ptr |
| 186 ;# r4 int src_pixels_per_line |
| 187 ;# r5 int xoffset |
| 188 ;# r6 int yoffset |
| 189 ;# r7 unsigned char *dst_ptr |
| 190 ;# r8 int dst_pixels_per_line |
| 191 ;# r9 unsigned int *sse |
| 192 ;# |
| 193 ;# r3 return value |
| 194 vp9_sub_pixel_variance4x4_ppc: |
| 195 mfspr r11, 256 ;# get old VRSAVE |
| 196 oris r12, r11, 0xf830 |
| 197 ori r12, r12, 0xfff8 |
| 198 mtspr 256, r12 ;# set VRSAVE |
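| 
|         ;# (Each VRSAVE bit flags one vector register as live so the OS
|         ;# preserves it across context switches; each function's mask
|         ;# matches the set of vector registers it uses.)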
| 199 |
| 200 stwu r1,-32(r1) ;# create space on the stack |
| 201 |
| 202 HProlog second_pass_4x4_pre_copy_b |
| 203 |
| 204 ;# Load up permutation constants |
| 205 load_c v10, b_0123_b, 0, r12, r0 |
| 206 load_c v11, b_4567_b, 0, r12, r0 |
| 207 |
| 208 hfilter_8 v0, v10, v11, 1 |
| 209 hfilter_8 v1, v10, v11, 1 |
| 210 hfilter_8 v2, v10, v11, 1 |
| 211 hfilter_8 v3, v10, v11, 1 |
| 212 |
| 213 ;# Finished filtering main horizontal block. If there is no |
| 214     ;# vertical filtering, jump to computing the sum and sse.  Otherwise
| 215 ;# load up and filter the additional line that is needed |
| 216 ;# for the vertical filter. |
| 217 beq compute_sum_sse_4x4_b |
| 218 |
| 219 hfilter_8 v4, v10, v11, 0 |
| 220 |
| 221 b second_pass_4x4_b |
| 222 |
| 223 second_pass_4x4_pre_copy_b: |
| 224 slwi r6, r6, 5 ;# index into vertical filter array |
| 225 |
| 226 load_and_align_16 v0, r3, r4, 1 |
| 227 load_and_align_16 v1, r3, r4, 1 |
| 228 load_and_align_16 v2, r3, r4, 1 |
| 229 load_and_align_16 v3, r3, r4, 1 |
| 230 load_and_align_16 v4, r3, r4, 0 |
| 231 |
| 232 second_pass_4x4_b: |
| 233 vspltish v20, 8 |
| 234 vspltish v18, 3 |
| 235 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 |
| 236 |
| 237 load_vfilter v20, v21 |
| 238 |
| 239 vfilter_16 v0, v1 |
| 240 vfilter_16 v1, v2 |
| 241 vfilter_16 v2, v3 |
| 242 vfilter_16 v3, v4 |
| 243 |
| 244 compute_sum_sse_4x4_b: |
| 245 vspltish v18, 0 ;# sum |
| 246 vspltish v19, 0 ;# sse |
| 247 vspltish v23, 0 ;# unpack |
| 248 li r10, 16 |
| 249 |
| 250 load_and_align_16 v4, r7, r8, 1 |
| 251 load_and_align_16 v5, r7, r8, 1 |
| 252 load_and_align_16 v6, r7, r8, 1 |
| 253 load_and_align_16 v7, r7, r8, 1 |
| 254 |
| 255 vmrghb v0, v0, v1 |
| 256 vmrghb v1, v2, v3 |
| 257 |
| 258 vmrghb v2, v4, v5 |
| 259 vmrghb v3, v6, v7 |
| 260 |
| 261 load_c v10, b_hilo_b, 0, r12, r0 |
| 262 |
| 263 vperm v0, v0, v1, v10 |
| 264 vperm v1, v2, v3, v10 |
| 265 |
| 266 compute_sum_sse v0, v1, v18, v19, v20, v21, v23 |
| 267 |
| 268 variance_final v18, v19, v23, 4 |
| 269 |
| 270 addi r1, r1, 32 ;# recover stack |
| 271 mtspr 256, r11 ;# reset old VRSAVE |
| 272 |
| 273 blr |
| 274 |
| 275 .align 2 |
| 276 ;# r3 unsigned char *src_ptr |
| 277 ;# r4 int src_pixels_per_line |
| 278 ;# r5 int xoffset |
| 279 ;# r6 int yoffset |
| 280 ;# r7 unsigned char *dst_ptr |
| 281 ;# r8 int dst_pixels_per_line |
| 282 ;# r9 unsigned int *sse |
| 283 ;# |
| 284 ;# r3 return value |
| 285 vp9_sub_pixel_variance8x8_ppc: |
| 286 mfspr r11, 256 ;# get old VRSAVE |
| 287 oris r12, r11, 0xfff0 |
| 288 ori r12, r12, 0xffff |
| 289 mtspr 256, r12 ;# set VRSAVE |
| 290 |
| 291 stwu r1,-32(r1) ;# create space on the stack |
| 292 |
| 293 HProlog second_pass_8x8_pre_copy_b |
| 294 |
| 295 ;# Load up permutation constants |
| 296 load_c v10, b_0123_b, 0, r12, r0 |
| 297 load_c v11, b_4567_b, 0, r12, r0 |
| 298 |
| 299 hfilter_8 v0, v10, v11, 1 |
| 300 hfilter_8 v1, v10, v11, 1 |
| 301 hfilter_8 v2, v10, v11, 1 |
| 302 hfilter_8 v3, v10, v11, 1 |
| 303 hfilter_8 v4, v10, v11, 1 |
| 304 hfilter_8 v5, v10, v11, 1 |
| 305 hfilter_8 v6, v10, v11, 1 |
| 306 hfilter_8 v7, v10, v11, 1 |
| 307 |
| 308 ;# Finished filtering main horizontal block. If there is no |
| 309     ;# vertical filtering, jump to computing the sum and sse.  Otherwise
| 310 ;# load up and filter the additional line that is needed |
| 311 ;# for the vertical filter. |
| 312 beq compute_sum_sse_8x8_b |
| 313 |
| 314 hfilter_8 v8, v10, v11, 0 |
| 315 |
| 316 b second_pass_8x8_b |
| 317 |
| 318 second_pass_8x8_pre_copy_b: |
| 319 slwi. r6, r6, 5 ;# index into vertical filter array |
| 320 |
| 321 load_and_align_16 v0, r3, r4, 1 |
| 322 load_and_align_16 v1, r3, r4, 1 |
| 323 load_and_align_16 v2, r3, r4, 1 |
| 324 load_and_align_16 v3, r3, r4, 1 |
| 325 load_and_align_16 v4, r3, r4, 1 |
| 326 load_and_align_16 v5, r3, r4, 1 |
| 327 load_and_align_16 v6, r3, r4, 1 |
| 328 load_and_align_16 v7, r3, r4, 1 |
| 329 load_and_align_16 v8, r3, r4, 0 |
| 330 |
| 331 beq compute_sum_sse_8x8_b |
| 332 |
| 333 second_pass_8x8_b: |
| 334 vspltish v20, 8 |
| 335 vspltish v18, 3 |
| 336 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 |
| 337 |
| 338 load_vfilter v20, v21 |
| 339 |
| 340 vfilter_16 v0, v1 |
| 341 vfilter_16 v1, v2 |
| 342 vfilter_16 v2, v3 |
| 343 vfilter_16 v3, v4 |
| 344 vfilter_16 v4, v5 |
| 345 vfilter_16 v5, v6 |
| 346 vfilter_16 v6, v7 |
| 347 vfilter_16 v7, v8 |
| 348 |
| 349 compute_sum_sse_8x8_b: |
| 350 vspltish v18, 0 ;# sum |
| 351 vspltish v19, 0 ;# sse |
| 352 vspltish v23, 0 ;# unpack |
| 353 li r10, 16 |
| 354 |
| 355 vmrghb v0, v0, v1 |
| 356 vmrghb v1, v2, v3 |
| 357 vmrghb v2, v4, v5 |
| 358 vmrghb v3, v6, v7 |
| 359 |
| 360 load_and_align_16 v4, r7, r8, 1 |
| 361 load_and_align_16 v5, r7, r8, 1 |
| 362 load_and_align_16 v6, r7, r8, 1 |
| 363 load_and_align_16 v7, r7, r8, 1 |
| 364 load_and_align_16 v8, r7, r8, 1 |
| 365 load_and_align_16 v9, r7, r8, 1 |
| 366 load_and_align_16 v10, r7, r8, 1 |
| 367 load_and_align_16 v11, r7, r8, 0 |
| 368 |
| 369 vmrghb v4, v4, v5 |
| 370 vmrghb v5, v6, v7 |
| 371 vmrghb v6, v8, v9 |
| 372 vmrghb v7, v10, v11 |
| 373 |
| 374 compute_sum_sse v0, v4, v18, v19, v20, v21, v23 |
| 375 compute_sum_sse v1, v5, v18, v19, v20, v21, v23 |
| 376 compute_sum_sse v2, v6, v18, v19, v20, v21, v23 |
| 377 compute_sum_sse v3, v7, v18, v19, v20, v21, v23 |
| 378 |
| 379 variance_final v18, v19, v23, 6 |
| 380 |
| 381 addi r1, r1, 32 ;# recover stack |
| 382 mtspr 256, r11 ;# reset old VRSAVE |
| 383 blr |
| 384 |
| 385 .align 2 |
| 386 ;# r3 unsigned char *src_ptr |
| 387 ;# r4 int src_pixels_per_line |
| 388 ;# r5 int xoffset |
| 389 ;# r6 int yoffset |
| 390 ;# r7 unsigned char *dst_ptr |
| 391 ;# r8 int dst_pixels_per_line |
| 392 ;# r9 unsigned int *sse |
| 393 ;# |
| 394 ;# r3 return value |
| 395 vp9_sub_pixel_variance8x16_ppc: |
| 396 mfspr r11, 256 ;# get old VRSAVE |
| 397 oris r12, r11, 0xffff |
| 398 ori r12, r12, 0xfffc |
| 399 mtspr 256, r12 ;# set VRSAVE |
| 400 |
| 401 stwu r1,-32(r1) ;# create space on the stack |
| 402 |
| 403 HProlog second_pass_8x16_pre_copy_b |
| 404 |
| 405 ;# Load up permutation constants |
| 406 load_c v29, b_0123_b, 0, r12, r0 |
| 407 load_c v30, b_4567_b, 0, r12, r0 |
| 408 |
| 409 hfilter_8 v0, v29, v30, 1 |
| 410 hfilter_8 v1, v29, v30, 1 |
| 411 hfilter_8 v2, v29, v30, 1 |
| 412 hfilter_8 v3, v29, v30, 1 |
| 413 hfilter_8 v4, v29, v30, 1 |
| 414 hfilter_8 v5, v29, v30, 1 |
| 415 hfilter_8 v6, v29, v30, 1 |
| 416 hfilter_8 v7, v29, v30, 1 |
| 417 hfilter_8 v8, v29, v30, 1 |
| 418 hfilter_8 v9, v29, v30, 1 |
| 419 hfilter_8 v10, v29, v30, 1 |
| 420 hfilter_8 v11, v29, v30, 1 |
| 421 hfilter_8 v12, v29, v30, 1 |
| 422 hfilter_8 v13, v29, v30, 1 |
| 423 hfilter_8 v14, v29, v30, 1 |
| 424 hfilter_8 v15, v29, v30, 1 |
| 425 |
| 426 ;# Finished filtering main horizontal block. If there is no |
| 427     ;# vertical filtering, jump to computing the sum and sse.  Otherwise
| 428 ;# load up and filter the additional line that is needed |
| 429 ;# for the vertical filter. |
| 430 beq compute_sum_sse_8x16_b |
| 431 |
| 432 hfilter_8 v16, v29, v30, 0 |
| 433 |
| 434 b second_pass_8x16_b |
| 435 |
| 436 second_pass_8x16_pre_copy_b: |
| 437 slwi. r6, r6, 5 ;# index into vertical filter array |
| 438 |
| 439 load_and_align_16 v0, r3, r4, 1 |
| 440 load_and_align_16 v1, r3, r4, 1 |
| 441 load_and_align_16 v2, r3, r4, 1 |
| 442 load_and_align_16 v3, r3, r4, 1 |
| 443 load_and_align_16 v4, r3, r4, 1 |
| 444 load_and_align_16 v5, r3, r4, 1 |
| 445 load_and_align_16 v6, r3, r4, 1 |
| 446 load_and_align_16 v7, r3, r4, 1 |
| 447 load_and_align_16 v8, r3, r4, 1 |
| 448 load_and_align_16 v9, r3, r4, 1 |
| 449 load_and_align_16 v10, r3, r4, 1 |
| 450 load_and_align_16 v11, r3, r4, 1 |
| 451 load_and_align_16 v12, r3, r4, 1 |
| 452 load_and_align_16 v13, r3, r4, 1 |
| 453 load_and_align_16 v14, r3, r4, 1 |
| 454 load_and_align_16 v15, r3, r4, 1 |
| 455 load_and_align_16 v16, r3, r4, 0 |
| 456 |
| 457 beq compute_sum_sse_8x16_b |
| 458 |
| 459 second_pass_8x16_b: |
| 460 vspltish v20, 8 |
| 461 vspltish v18, 3 |
| 462 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 |
| 463 |
| 464 load_vfilter v20, v21 |
| 465 |
| 466 vfilter_16 v0, v1 |
| 467 vfilter_16 v1, v2 |
| 468 vfilter_16 v2, v3 |
| 469 vfilter_16 v3, v4 |
| 470 vfilter_16 v4, v5 |
| 471 vfilter_16 v5, v6 |
| 472 vfilter_16 v6, v7 |
| 473 vfilter_16 v7, v8 |
| 474 vfilter_16 v8, v9 |
| 475 vfilter_16 v9, v10 |
| 476 vfilter_16 v10, v11 |
| 477 vfilter_16 v11, v12 |
| 478 vfilter_16 v12, v13 |
| 479 vfilter_16 v13, v14 |
| 480 vfilter_16 v14, v15 |
| 481 vfilter_16 v15, v16 |
| 482 |
| 483 compute_sum_sse_8x16_b: |
| 484 vspltish v18, 0 ;# sum |
| 485 vspltish v19, 0 ;# sse |
| 486 vspltish v23, 0 ;# unpack |
| 487 li r10, 16 |
| 488 |
| 489 vmrghb v0, v0, v1 |
| 490 vmrghb v1, v2, v3 |
| 491 vmrghb v2, v4, v5 |
| 492 vmrghb v3, v6, v7 |
| 493 vmrghb v4, v8, v9 |
| 494 vmrghb v5, v10, v11 |
| 495 vmrghb v6, v12, v13 |
| 496 vmrghb v7, v14, v15 |
| 497 |
| 498 load_and_align_16 v8, r7, r8, 1 |
| 499 load_and_align_16 v9, r7, r8, 1 |
| 500 load_and_align_16 v10, r7, r8, 1 |
| 501 load_and_align_16 v11, r7, r8, 1 |
| 502 load_and_align_16 v12, r7, r8, 1 |
| 503 load_and_align_16 v13, r7, r8, 1 |
| 504 load_and_align_16 v14, r7, r8, 1 |
| 505 load_and_align_16 v15, r7, r8, 1 |
| 506 |
| 507 vmrghb v8, v8, v9 |
| 508 vmrghb v9, v10, v11 |
| 509 vmrghb v10, v12, v13 |
| 510 vmrghb v11, v14, v15 |
| 511 |
| 512 compute_sum_sse v0, v8, v18, v19, v20, v21, v23 |
| 513 compute_sum_sse v1, v9, v18, v19, v20, v21, v23 |
| 514 compute_sum_sse v2, v10, v18, v19, v20, v21, v23 |
| 515 compute_sum_sse v3, v11, v18, v19, v20, v21, v23 |
| 516 |
| 517 load_and_align_16 v8, r7, r8, 1 |
| 518 load_and_align_16 v9, r7, r8, 1 |
| 519 load_and_align_16 v10, r7, r8, 1 |
| 520 load_and_align_16 v11, r7, r8, 1 |
| 521 load_and_align_16 v12, r7, r8, 1 |
| 522 load_and_align_16 v13, r7, r8, 1 |
| 523 load_and_align_16 v14, r7, r8, 1 |
| 524 load_and_align_16 v15, r7, r8, 0 |
| 525 |
| 526 vmrghb v8, v8, v9 |
| 527 vmrghb v9, v10, v11 |
| 528 vmrghb v10, v12, v13 |
| 529 vmrghb v11, v14, v15 |
| 530 |
| 531 compute_sum_sse v4, v8, v18, v19, v20, v21, v23 |
| 532 compute_sum_sse v5, v9, v18, v19, v20, v21, v23 |
| 533 compute_sum_sse v6, v10, v18, v19, v20, v21, v23 |
| 534 compute_sum_sse v7, v11, v18, v19, v20, v21, v23 |
| 535 |
| 536 variance_final v18, v19, v23, 7 |
| 537 |
| 538 addi r1, r1, 32 ;# recover stack |
| 539 mtspr 256, r11 ;# reset old VRSAVE |
| 540 blr |
| 541 |
| 542 ;# Filters a horizontal line |
| 543 ;# expects: |
| 544 ;# r3 src_ptr |
| 545 ;# r4 pitch |
| 546 ;# r10 16 |
| 547 ;# r12 32 |
| 548      ;# v17 perm input
| 549 ;# v18 rounding |
| 550 ;# v19 shift |
| 551 ;# v20 filter taps |
| 552 ;# v21 tmp |
| 553 ;# v22 tmp |
| 554 ;# v23 tmp |
| 555 ;# v24 tmp |
| 556 ;# v25 tmp |
| 557 ;# v26 tmp |
| 558 ;# v27 tmp |
| 559 ;# v28 perm output |
| 560 ;# |
| 561 .macro hfilter_16 V, increment_counter |
| 562 |
| 563     lvsl    v17,  0, r3 ;# permute vector for alignment
| 564 |
| 565 ;# input to filter is 21 bytes wide, output is 16 bytes. |
| 566     ;# input can span three vectors if not aligned correctly.
| 567 lvx v21, 0, r3 |
| 568 lvx v22, r10, r3 |
| 569 lvx v23, r12, r3 |
| 570 |
| 571 .if \increment_counter |
| 572 add r3, r3, r4 |
| 573 .endif |
| 574 vperm v21, v21, v22, v17 |
| 575     vperm   v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
| 576 |
| 577 ;# set 0 |
| 578 vmsummbm v24, v20, v21, v18 ;# taps times elements |
| 579 |
| 580 ;# set 1 |
| 581 vsldoi v23, v21, v22, 1 |
| 582 vmsummbm v25, v20, v23, v18 |
| 583 |
| 584 ;# set 2 |
| 585 vsldoi v23, v21, v22, 2 |
| 586 vmsummbm v26, v20, v23, v18 |
| 587 |
| 588 ;# set 3 |
| 589 vsldoi v23, v21, v22, 3 |
| 590 vmsummbm v27, v20, v23, v18 |
| 591 |
| 592 vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) |
| 593 vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F |
| 594 |
| 595     vsrh    v24, v24, v19  ;# divide v24, v25 by 128
| 596 vsrh v25, v25, v19 |
| 597 |
| 598 vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result |
| 599 vperm \V, \V, v0, v28 ;# \V = correctly-ordered result |
| 600 .endm |
| 601 |
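|         ;# Each vmsummbm takes four 4-byte dot products against taps
|         ;# (f0, f1, 0, 0), producing outputs i, i+4, i+8, i+12; the vsldoi
|         ;# shifts supply the other three phases and b_hperm_b restores
|         ;# natural order.  Scalar form (exposition only):
|         ;#   dst[i] = (f0 * src[i] + f1 * src[i + 1] + 64) >> 7;
| 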
| 602 .align 2 |
| 603 ;# r3 unsigned char *src_ptr |
| 604 ;# r4 int src_pixels_per_line |
| 605 ;# r5 int xoffset |
| 606 ;# r6 int yoffset |
| 607 ;# r7 unsigned char *dst_ptr |
| 608 ;# r8 int dst_pixels_per_line |
| 609 ;# r9 unsigned int *sse |
| 610 ;# |
| 611 ;# r3 return value |
| 612 vp9_sub_pixel_variance16x8_ppc: |
| 613 mfspr r11, 256 ;# get old VRSAVE |
| 614 oris r12, r11, 0xffff |
| 615 ori r12, r12, 0xfff8 |
| 616 mtspr 256, r12 ;# set VRSAVE |
| 617 |
| 618 stwu r1, -32(r1) ;# create space on the stack |
| 619 |
| 620 HProlog second_pass_16x8_pre_copy_b |
| 621 |
| 622 hfilter_16 v0, 1 |
| 623 hfilter_16 v1, 1 |
| 624 hfilter_16 v2, 1 |
| 625 hfilter_16 v3, 1 |
| 626 hfilter_16 v4, 1 |
| 627 hfilter_16 v5, 1 |
| 628 hfilter_16 v6, 1 |
| 629 hfilter_16 v7, 1 |
| 630 |
| 631 ;# Finished filtering main horizontal block. If there is no |
| 632     ;# vertical filtering, jump to computing the sum and sse.  Otherwise
| 633 ;# load up and filter the additional line that is needed |
| 634 ;# for the vertical filter. |
| 635 beq compute_sum_sse_16x8_b |
| 636 |
| 637 hfilter_16 v8, 0 |
| 638 |
| 639 b second_pass_16x8_b |
| 640 |
| 641 second_pass_16x8_pre_copy_b: |
| 642 slwi. r6, r6, 5 ;# index into vertical filter array |
| 643 |
| 644 load_and_align_16 v0, r3, r4, 1 |
| 645 load_and_align_16 v1, r3, r4, 1 |
| 646 load_and_align_16 v2, r3, r4, 1 |
| 647 load_and_align_16 v3, r3, r4, 1 |
| 648 load_and_align_16 v4, r3, r4, 1 |
| 649 load_and_align_16 v5, r3, r4, 1 |
| 650 load_and_align_16 v6, r3, r4, 1 |
| 651 load_and_align_16 v7, r3, r4, 1 |
| 652 load_and_align_16 v8, r3, r4, 1 |
| 653 |
| 654 beq compute_sum_sse_16x8_b |
| 655 |
| 656 second_pass_16x8_b: |
| 657 vspltish v20, 8 |
| 658 vspltish v18, 3 |
| 659 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 |
| 660 |
| 661 load_vfilter v20, v21 |
| 662 |
| 663 vfilter_16 v0, v1 |
| 664 vfilter_16 v1, v2 |
| 665 vfilter_16 v2, v3 |
| 666 vfilter_16 v3, v4 |
| 667 vfilter_16 v4, v5 |
| 668 vfilter_16 v5, v6 |
| 669 vfilter_16 v6, v7 |
| 670 vfilter_16 v7, v8 |
| 671 |
| 672 compute_sum_sse_16x8_b: |
| 673 vspltish v18, 0 ;# sum |
| 674 vspltish v19, 0 ;# sse |
| 675 vspltish v23, 0 ;# unpack |
| 676 li r10, 16 |
| 677 |
| 678 compute_sum_sse_16 v0, 1 |
| 679 compute_sum_sse_16 v1, 1 |
| 680 compute_sum_sse_16 v2, 1 |
| 681 compute_sum_sse_16 v3, 1 |
| 682 compute_sum_sse_16 v4, 1 |
| 683 compute_sum_sse_16 v5, 1 |
| 684 compute_sum_sse_16 v6, 1 |
| 685 compute_sum_sse_16 v7, 0 |
| 686 |
| 687 variance_final v18, v19, v23, 7 |
| 688 |
| 689 addi r1, r1, 32 ;# recover stack |
| 690 |
| 691 mtspr 256, r11 ;# reset old VRSAVE |
| 692 |
| 693 blr |
| 694 |
| 695 .align 2 |
| 696 ;# r3 unsigned char *src_ptr |
| 697 ;# r4 int src_pixels_per_line |
| 698 ;# r5 int xoffset |
| 699 ;# r6 int yoffset |
| 700 ;# r7 unsigned char *dst_ptr |
| 701 ;# r8 int dst_pixels_per_line |
| 702 ;# r9 unsigned int *sse |
| 703 ;# |
| 704 ;# r3 return value |
| 705 vp9_sub_pixel_variance16x16_ppc: |
| 706 mfspr r11, 256 ;# get old VRSAVE |
| 707 oris r12, r11, 0xffff |
| 708 ori r12, r12, 0xfff8 |
| 709 mtspr 256, r12 ;# set VRSAVE |
| 710 |
| 711 stwu r1, -32(r1) ;# create space on the stack |
| 712 |
| 713 HProlog second_pass_16x16_pre_copy_b |
| 714 |
| 715 hfilter_16 v0, 1 |
| 716 hfilter_16 v1, 1 |
| 717 hfilter_16 v2, 1 |
| 718 hfilter_16 v3, 1 |
| 719 hfilter_16 v4, 1 |
| 720 hfilter_16 v5, 1 |
| 721 hfilter_16 v6, 1 |
| 722 hfilter_16 v7, 1 |
| 723 hfilter_16 v8, 1 |
| 724 hfilter_16 v9, 1 |
| 725 hfilter_16 v10, 1 |
| 726 hfilter_16 v11, 1 |
| 727 hfilter_16 v12, 1 |
| 728 hfilter_16 v13, 1 |
| 729 hfilter_16 v14, 1 |
| 730 hfilter_16 v15, 1 |
| 731 |
| 732 ;# Finished filtering main horizontal block. If there is no |
| 733     ;# vertical filtering, jump to computing the sum and sse.  Otherwise
| 734 ;# load up and filter the additional line that is needed |
| 735 ;# for the vertical filter. |
| 736 beq compute_sum_sse_16x16_b |
| 737 |
| 738 hfilter_16 v16, 0 |
| 739 |
| 740 b second_pass_16x16_b |
| 741 |
| 742 second_pass_16x16_pre_copy_b: |
| 743 slwi. r6, r6, 5 ;# index into vertical filter array |
| 744 |
| 745 load_and_align_16 v0, r3, r4, 1 |
| 746 load_and_align_16 v1, r3, r4, 1 |
| 747 load_and_align_16 v2, r3, r4, 1 |
| 748 load_and_align_16 v3, r3, r4, 1 |
| 749 load_and_align_16 v4, r3, r4, 1 |
| 750 load_and_align_16 v5, r3, r4, 1 |
| 751 load_and_align_16 v6, r3, r4, 1 |
| 752 load_and_align_16 v7, r3, r4, 1 |
| 753 load_and_align_16 v8, r3, r4, 1 |
| 754 load_and_align_16 v9, r3, r4, 1 |
| 755 load_and_align_16 v10, r3, r4, 1 |
| 756 load_and_align_16 v11, r3, r4, 1 |
| 757 load_and_align_16 v12, r3, r4, 1 |
| 758 load_and_align_16 v13, r3, r4, 1 |
| 759 load_and_align_16 v14, r3, r4, 1 |
| 760 load_and_align_16 v15, r3, r4, 1 |
| 761 load_and_align_16 v16, r3, r4, 0 |
| 762 |
| 763 beq compute_sum_sse_16x16_b |
| 764 |
| 765 second_pass_16x16_b: |
| 766 vspltish v20, 8 |
| 767 vspltish v18, 3 |
| 768 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 |
| 769 |
| 770 load_vfilter v20, v21 |
| 771 |
| 772 vfilter_16 v0, v1 |
| 773 vfilter_16 v1, v2 |
| 774 vfilter_16 v2, v3 |
| 775 vfilter_16 v3, v4 |
| 776 vfilter_16 v4, v5 |
| 777 vfilter_16 v5, v6 |
| 778 vfilter_16 v6, v7 |
| 779 vfilter_16 v7, v8 |
| 780 vfilter_16 v8, v9 |
| 781 vfilter_16 v9, v10 |
| 782 vfilter_16 v10, v11 |
| 783 vfilter_16 v11, v12 |
| 784 vfilter_16 v12, v13 |
| 785 vfilter_16 v13, v14 |
| 786 vfilter_16 v14, v15 |
| 787 vfilter_16 v15, v16 |
| 788 |
| 789 compute_sum_sse_16x16_b: |
| 790 vspltish v18, 0 ;# sum |
| 791 vspltish v19, 0 ;# sse |
| 792 vspltish v23, 0 ;# unpack |
| 793 li r10, 16 |
| 794 |
| 795 compute_sum_sse_16 v0, 1 |
| 796 compute_sum_sse_16 v1, 1 |
| 797 compute_sum_sse_16 v2, 1 |
| 798 compute_sum_sse_16 v3, 1 |
| 799 compute_sum_sse_16 v4, 1 |
| 800 compute_sum_sse_16 v5, 1 |
| 801 compute_sum_sse_16 v6, 1 |
| 802 compute_sum_sse_16 v7, 1 |
| 803 compute_sum_sse_16 v8, 1 |
| 804 compute_sum_sse_16 v9, 1 |
| 805 compute_sum_sse_16 v10, 1 |
| 806 compute_sum_sse_16 v11, 1 |
| 807 compute_sum_sse_16 v12, 1 |
| 808 compute_sum_sse_16 v13, 1 |
| 809 compute_sum_sse_16 v14, 1 |
| 810 compute_sum_sse_16 v15, 0 |
| 811 |
| 812 variance_final v18, v19, v23, 8 |
| 813 |
| 814 addi r1, r1, 32 ;# recover stack |
| 815 |
| 816 mtspr 256, r11 ;# reset old VRSAVE |
| 817 |
| 818 blr |
| 819 |
| 820 .data |
| 821 |
| 822 .align 4 |
| 823 hfilter_b: |
| 824 .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 |
| 825 .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 |
| 826 .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 |
| 827 .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 |
| 828 .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 |
| 829 .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 |
| 830 .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0 |
| 831 .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 |
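| 
|         ;# Each 16-byte row repeats one tap pair (128 - 16*x, 16*x, 0, 0);
|         ;# xoffset is scaled by 16 (xoffset << 4) to index a row.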
| 832 |
| 833 .align 4 |
| 834 vfilter_b: |
| 835 .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 |
| 836 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| 837 .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 |
| 838 .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 |
| 839 .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 |
| 840 .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 |
| 841 .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 |
| 842 .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 |
| 843 .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 |
| 844 .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 |
| 845 .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 |
| 846 .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 |
| 847 .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 |
| 848 .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 |
| 849 .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 |
| 850 .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 |
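| 
|         ;# Row pairs splat one tap pair (128 - 16*y, 16*y) across full
|         ;# vectors; yoffset is scaled by 32 (yoffset << 5) to index a pair.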
| 851 |
| 852 .align 4 |
| 853 b_hperm_b: |
| 854 .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 |
| 855 |
| 856 .align 4 |
| 857 b_0123_b: |
| 858 .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 |
| 859 |
| 860 .align 4 |
| 861 b_4567_b: |
| 862 .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 |
| 863 |
|         .align 4
| 864 b_hilo_b:
| 865 .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 |
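| 
|         ;# b_0123_b/b_4567_b build hfilter_8's sliding 4-byte windows,
|         ;# b_hperm_b unscrambles hfilter_16 output, and b_hilo_b
|         ;# concatenates the top halves of two vectors.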