| OLD | NEW |
| (Empty) |
| 1 ; | |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
| 3 ; | |
| 4 ; Use of this source code is governed by a BSD-style license | |
| 5 ; that can be found in the LICENSE file in the root of the source | |
| 6 ; tree. An additional intellectual property rights grant can be found | |
| 7 ; in the file PATENTS. All contributing project authors may | |
| 8 ; be found in the AUTHORS file in the root of the source tree. | |
| 9 ; | |
| 10 | |
| 11 | |
;# Exported entry points: VP8 six-tap sub-pixel predictors for 4x4, 8x4,
;#  8x8 and 16x16 blocks (AltiVec implementations).
    .globl sixtap_predict_ppc
    .globl sixtap_predict8x4_ppc
    .globl sixtap_predict8x8_ppc
    .globl sixtap_predict16x16_ppc
| 16 | |
;# load_c: load the 16-byte constant at LABEL+OFF into vector V.
;#  R0 and R1 are GPR scratch used to form the address (LABEL@ha/@l).
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm
| 22 | |
;# load_hfilter: load the pair of 16-byte horizontal-filter vectors for the
;#  subpel position selected by r5 (r5 = x_offset << 5 on entry) into V0/V1.
;#  Clobbers r5, r9, r10.
.macro load_hfilter V0, V1
    load_c \V0, HFilter, r5, r9, r10

    addi    r5, r5, 16
    lvx     \V1, r5, r10
.endm
| 29 | |
;# Vertical filtering
;# Vprolog: load the VFilter row selected by r6 and splat its six byte taps
;#  into v0..v5; v6 = per-halfword rounding constant 0x0040.
;#  Clobbers r3, r10.
.macro Vprolog
    load_c v0, VFilter, r6, r3, r10

    vspltish v5, 8
    vspltish v6, 3
    vslh    v6, v5, v6          ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v1, v0, 1
    vspltb  v2, v0, 2
    vspltb  v3, v0, 3
    vspltb  v4, v0, 4
    vspltb  v5, v0, 5
    vspltb  v0, v0, 0
.endm
| 45 | |
;# vpre_load: run Vprolog, then prime v10..v14 with the first five 16-byte
;#  rows of the temporary buffer at r9 (rows are contiguous, 16 bytes apart).
;#  Leaves r9 pointing at the row holding v14; r10 = 16.
.macro vpre_load
    Vprolog
    li      r10, 16
    lvx     v10,   0, r9        ;# v10..v14 = first 5 rows
    lvx     v11, r10, r9
    addi    r9, r9, 32
    lvx     v12,   0, r9
    lvx     v13, r10, r9
    addi    r9, r9, 32
    lvx     v14,   0, r9
.endm
| 57 | |
;# Msum: multiply-accumulate one filter tap.
;#  (Re,Ro) += V * T, where Re accumulates the even byte products and
;#  Ro the odd byte products (16-bit lanes).  TMP is trashed.
.macro Msum Re, Ro, V, T, TMP
    ;# (Re,Ro) += (V*T)
    vmuleub \TMP, \V, \T        ;# trashes \TMP
    vadduhm \Re, \Re, \TMP      ;# Re = evens, saturation unnecessary
    vmuloub \TMP, \V, \T
    vadduhm \Ro, \Ro, \TMP      ;# Ro = odds
.endm
| 65 | |
;# vinterp_no_store: apply the six-tap vertical filter to rows P0..P5 using
;#  taps v0..v5 (positive taps accumulated, the two negative taps summed
;#  separately and subtracted with saturation), round with v6, shift right
;#  by v7 (7 -> divide by 128), and pack the 8-bit result back into \P0.
;#  Scratch: v8, v16..v19.
.macro vinterp_no_store P0 P1 P2 P3 P4 P5
    vmuleub v8, \P0, v0         ;# 64 + 4 positive taps
    vadduhm v16, v6, v8
    vmuloub v8, \P0, v0
    vadduhm v17, v6, v8
    Msum v16, v17, \P2, v2, v8
    Msum v16, v17, \P3, v3, v8
    Msum v16, v17, \P5, v5, v8

    vmuleub v18, \P1, v1        ;# 2 negative taps
    vmuloub v19, \P1, v1
    Msum v18, v19, \P4, v4, v8

    vsubuhs v16, v16, v18       ;# subtract neg from pos
    vsubuhs v17, v17, v19
    vsrh    v16, v16, v7        ;# divide by 128
    vsrh    v17, v17, v7        ;# v16 v17 = evens, odds
    vmrghh  v18, v16, v17       ;# v18 v19 = 16-bit result in order
    vmrglh  v19, v16, v17
    vpkuhus \P0, v18, v19       ;# P0 = 8-bit result
.endm
| 87 | |
;# vinterp_no_store_8x8: same six-tap vertical filter as vinterp_no_store,
;#  but with the register assignment used by the 8-wide paths: taps in
;#  v13..v18, rounding constant in v20, downshift amount (7) in v19.
;#  Result packed into \P0.  Scratch: v21..v25.
.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
    vmuleub v24, \P0, v13       ;# 64 + 4 positive taps
    vadduhm v21, v20, v24
    vmuloub v24, \P0, v13
    vadduhm v22, v20, v24
    Msum v21, v22, \P2, v15, v25
    Msum v21, v22, \P3, v16, v25
    Msum v21, v22, \P5, v18, v25

    vmuleub v23, \P1, v14       ;# 2 negative taps
    vmuloub v24, \P1, v14
    Msum v23, v24, \P4, v17, v25

    vsubuhs v21, v21, v23       ;# subtract neg from pos
    vsubuhs v22, v22, v24
    vsrh    v21, v21, v19       ;# divide by 128
    vsrh    v22, v22, v19       ;# v21 v22 = evens, odds
    vmrghh  v23, v21, v22       ;# v23 v24 = 16-bit result in order
    vmrglh  v24, v21, v22
    vpkuhus \P0, v23, v24       ;# P0 = 8-bit result
.endm
| 109 | |
| 110 | |
;# Vinterp: vertically filter one 16-pel row (result left in \P0), store it
;#  at the dst pointer r7, then advance r7 by the dst pitch in r8.
.macro Vinterp P0 P1 P2 P3 P4 P5
    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
    stvx    \P0, 0, r7
    add     r7, r7, r8          ;# 33 ops per 16 pels
.endm
| 116 | |
| 117 | |
;# luma_v: slide the six-row filter window down by one line: load the newest
;#  input row (advancing r9 by 16) into \P5, then filter and store the row.
.macro luma_v P0, P1, P2, P3, P4, P5
    addi    r9, r9, 16          ;# P5 = newest input row
    lvx     \P5, 0, r9
    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
.endm
| 123 | |
;# luma_vtwo: produce two output rows, rotating the v10..v15 row registers.
.macro luma_vtwo
    luma_v v10, v11, v12, v13, v14, v15
    luma_v v11, v12, v13, v14, v15, v10
.endm
| 128 | |
;# luma_vfour: produce four output rows (two more rotations after luma_vtwo).
.macro luma_vfour
    luma_vtwo
    luma_v v12, v13, v14, v15, v10, v11
    luma_v v13, v14, v15, v10, v11, v12
.endm
| 134 | |
;# luma_vsix: produce six output rows; after this the row registers are back
;#  in their starting rotation (v10 oldest).
.macro luma_vsix
    luma_vfour
    luma_v v14, v15, v10, v11, v12, v13
    luma_v v15, v10, v11, v12, v13, v14
.endm
| 140 | |
;# Interp4: one 4-pixel horizontal six-tap multiply-sum:
;#  \R = v15 (rounding) + taps(v13)*\I + taps(v14)*\I4, 32-bit lane sums.
.macro Interp4 R I I4
    vmsummbm \R, v13, \I, v15
    vmsummbm \R, v14, \I4, \R
.endm
| 145 | |
;# Read8x8: load one possibly-unaligned 16-byte row from \RS into \VD,
;#  optionally post-incrementing \RS by the pitch \RP.
;#  Requires r10 = 16; scratch: v20, v21.
.macro Read8x8 VD, RS, RP, increment_counter
    lvsl    v21,  0, \RS        ;# permutate value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     \VD,   0, \RS
    lvx     v20, r10, \RS

.if \increment_counter
    add     \RS, \RS, \RP
.endif

    vperm   \VD, \VD, v20, v21
.endm
| 160 | |
;# interp_8x8: horizontally six-tap filter the row in \R, producing the
;#  8 output pixels packed (duplicated) into \R.
;#  Requires: permute constants v16/v17/v18 (B_0123/B_4567/B_89AB),
;#  taps v13/v14, rounding v15, downshift amount v19.  Scratch: v20, v21.
.macro interp_8x8 R
    vperm   v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
    vperm   v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
    Interp4 v20, v20, v21       ;# v20 = result 0 1 2 3
    vperm   \R, \R, \R, v18     ;# R   = 89AB 9ABC ABCx BCxx
    Interp4 v21, v21, \R        ;# v21 = result 4 5 6 7

    vpkswus \R, v20, v21        ;# R = 0 1 2 3 4 5 6 7
    vsrh    \R, \R, v19         ;# divide by 128

    vpkuhus \R, \R, \R          ;# saturate and pack

.endm
| 174 | |
;# Read4x4: load one possibly-unaligned row from \RS into \VD (only the low
;#  4 bytes are meaningful), optionally post-incrementing \RS by \RP.
;#  Scratch: v20, v21.
;# NOTE(review): unlike Read8x8, only a single vector is loaded, so the
;#  permute wraps within one 16-byte line; macro is unused in this file.
.macro Read4x4 VD, RS, RP, increment_counter
    lvsl    v21,  0, \RS        ;# permutate value for alignment

    lvx     v20,   0, \RS

.if \increment_counter
    add     \RS, \RS, \RP
.endif

    vperm   \VD, v20, v20, v21
.endm
    .text
| 189 | |
    .align 2
;# sixtap_predict_ppc: VP8 six-tap sub-pixel prediction of a 4x4 block.
;#  Two-pass filter: horizontal pass (skipped when x_offset == 0), then
;#  vertical pass (skipped when y_offset == 0).  Uses 32 bytes of stack as
;#  a bounce buffer for the 4-byte-wide stores.
;#
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset        (selects horizontal filter; 0 = no h-filter)
;# r6 int y_offset        (selects vertical filter; 0 = no v-filter)
;# r7 unsigned char * dst
;# r8 int dst_pitch
sixtap_predict_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xff87
    ori     r12, r12, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    slwi.   r5, r5, 5           ;# index into horizontal filter array

    ;# downshift amount: divide by 128 at the end
    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    vertical_only_4x4

    ;# load up horizontal filter
    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0           ;# r9 remembers the first-row src pointer
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi.   r6, r6, 4           ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_4x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0, r9, r4, 1       ;# two rows above the block
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1       ;# three rows below the block
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b       second_pass_4x4

vertical_only_4x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0, r3, r4, 1       ;# 9 unfiltered source rows for the v-pass
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    slwi    r6, r6, 4           ;# index into vertical filter array

second_pass_4x4:
    load_c v20, b_hilo_4x4, 0, r9, r10
    load_c v21, b_hilo, 0, r9, r10

    ;# reposition input so that it can go through the
    ;#  filtering phase with one pass.
    vperm   v0, v0, v1, v20     ;# 0 1 x x
    vperm   v2, v2, v3, v20     ;# 2 3 x x
    vperm   v4, v4, v5, v20     ;# 4 5 x x
    vperm   v6, v6, v7, v20     ;# 6 7 x x

    vperm   v0, v0, v2, v21     ;# 0 1 2 3
    vperm   v4, v4, v6, v21     ;# 4 5 6 7

    vsldoi  v1, v0, v4, 4       ;# shifted row windows for the 6-tap filter
    vsldoi  v2, v0, v4, 8
    vsldoi  v3, v0, v4, 12

    vsldoi  v5, v4, v8, 4

    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1         ;# splat the six vertical taps
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5

    ;# bounce through the stack: one 4-byte store per output row
    stvx    v0, 0, r1

    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 4(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 8(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 12(r1)
    stw     r0, 0(r7)

    b       exit_4x4

store_4x4:
    ;# horizontal-only result: first 4 bytes of each filtered row
    stvx    v2, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v3, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v4, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v5, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)

exit_4x4:

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr
| 370 | |
;# w_8x8: write the first 8 bytes of vector \V to the dst pointer r7 by
;#  bouncing through the stack (r1), then advance \D by pitch \P.
;#  \R is a GPR scratch.  Used when dst_pitch != 8 (unaligned dst rows).
.macro w_8x8 V, D, R, P
    stvx    \V, 0, r1
    lwz     \R, 0(r1)
    stw     \R, 0(r7)
    lwz     \R, 4(r1)
    stw     \R, 4(r7)
    add     \D, \D, \P
.endm
| 379 | |
    .align 2
;# sixtap_predict8x4_ppc: VP8 six-tap sub-pixel prediction of an 8x4 block.
;#  Horizontal pass (skipped when x_offset == 0) then vertical pass
;#  (skipped when y_offset == 0).  When dst_pitch == 8 the four output rows
;#  are stored as two aligned 16-byte vectors; otherwise 8 bytes at a time
;#  via the w_8x8 stack bounce.
;#
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

sixtap_predict8x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    slwi.   r5, r5, 5           ;# index into horizontal filter array

    ;# downshift amount: divide by 128 at the end
    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    second_pass_pre_copy_8x4

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0           ;# r9 remembers the first-row src pointer
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi.   r6, r6, 4           ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_8x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0, r9, r4, 1       ;# two rows above the block
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1       ;# three rows below the block
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b       second_pass_8x4

second_pass_pre_copy_8x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0, r3, r4, 1       ;# 9 unfiltered source rows for the v-pass
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1

    slwi    r6, r6, 4           ;# index into vertical filter array

second_pass_8x4:
    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1         ;# splat the six vertical taps
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
    vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
    vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
    vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8

    cmpi    cr0, r8, 8
    beq     cr0, store_aligned_8x4

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8

    b       exit_8x4

store_aligned_8x4:
    ;# dst_pitch == 8: pack row pairs and store two aligned vectors
    load_c v10, b_hilo, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7

    b       exit_8x4

store_8x4:
    ;# horizontal-only result lives in v2..v5
    cmpi    cr0, r8, 8
    beq     cr0, store_aligned2_8x4

    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8

    b       exit_8x4

store_aligned2_8x4:
    load_c v10, b_hilo, 0, r9, r10

    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10

    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7

exit_8x4:

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr
| 549 | |
    .align 2
;# sixtap_predict8x8_ppc: VP8 six-tap sub-pixel prediction of an 8x8 block.
;#  Same structure as the 8x4 variant, with 8 output rows (13 input rows
;#  when both filters are active).
;#
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Because the width that needs to be filtered will fit in a single altivec
;#  register there is no need to loop.  Everything can stay in registers.
sixtap_predict8x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    slwi.   r5, r5, 5           ;# index into horizontal filter array

    ;# downshift amount: divide by 128 at the end
    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    second_pass_pre_copy_8x8

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0           ;# r9 remembers the first-row src pointer
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1
    Read8x8 v9, r3, r4, 1

    slwi.   r6, r6, 4           ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8
    interp_8x8 v9

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_8x8

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0,  r9, r4, 1      ;# two rows above the block
    Read8x8 v1,  r9, r4, 0
    Read8x8 v10, r3, r4, 1      ;# three rows below the block
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v10
    interp_8x8 v11
    interp_8x8 v12

    b       second_pass_8x8

second_pass_pre_copy_8x8:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0,  r3, r4, 1      ;# 13 unfiltered source rows for the v-pass
    Read8x8 v1,  r3, r4, 1
    Read8x8 v2,  r3, r4, 1
    Read8x8 v3,  r3, r4, 1
    Read8x8 v4,  r3, r4, 1
    Read8x8 v5,  r3, r4, 1
    Read8x8 v6,  r3, r4, 1
    Read8x8 v7,  r3, r4, 1
    Read8x8 v8,  r3, r4, 1
    Read8x8 v9,  r3, r4, 1
    Read8x8 v10, r3, r4, 1
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    slwi    r6, r6, 4           ;# index into vertical filter array

second_pass_8x8:
    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1         ;# splat the six vertical taps
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
    vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
    vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
    vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
    vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
    vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
    vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12

    cmpi    cr0, r8, 8
    beq     cr0, store_aligned_8x8

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8

    b       exit_8x8

store_aligned_8x8:
    ;# dst_pitch == 8: pack row pairs and store four aligned vectors
    load_c v10, b_hilo, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7

    b       exit_8x8

store_8x8:
    ;# horizontal-only result lives in v2..v9
    cmpi    cr0, r8, 8
    beq     cr0, store_aligned2_8x8

    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8
    w_8x8   v8, r7, r0, r8
    w_8x8   v9, r7, r0, r8

    b       exit_8x8

store_aligned2_8x8:
    load_c v10, b_hilo, 0, r9, r10

    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10
    vperm   v8, v8, v9, v10

    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7
    addi    r7, r7, 16
    stvx    v8, 0, r7

exit_8x8:

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr
| 756 | |
    .align 2
;# sixtap_predict16x16_ppc: VP8 six-tap sub-pixel prediction of a 16x16
;#  block.
;#
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Two pass filtering.  First pass is Horizontal edges, second pass is vertical
;#  edges.  One of the filters can be null, but both won't be.  Needs to use a
;#  temporary buffer because the source buffer can't be modified and the buffer
;#  for the destination is not large enough to hold the temporary data.
sixtap_predict16x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xf000
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-416(r1)         ;# create space on the stack (21-row temp + pad)

    ;# Three possibilities
    ;# 1. First filter is null.  Don't use a temp buffer.
    ;# 2. Second filter is null.  Don't use a temp buffer.
    ;# 3. Neither are null, use temp buffer.

    ;# First Pass (horizontal edge)
    ;#  setup pointers for src
    ;#  if possibility (1) then setup the src pointer to be the original and jump
    ;#  to second pass.  this is based on if x_offset is 0.

    ;# load up horizontal filter
    slwi.   r5, r5, 5           ;# index into horizontal filter array

    load_hfilter v4, v5

    beq-    copy_horizontal_16x21

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    slwi.   r6, r6, 4           ;# index into vertical filter array

    ;# setup constants
    ;# v14 permutation value for alignment
    load_c v14, b_hperm, 0, r9, r10

    ;# These statements are guessing that there won't be a second pass,
    ;#  but if there is then inside the bypass they need to be set
    li      r0, 16              ;# prepare for no vertical filter

    ;# Change the output pointer and pitch to be the actual
    ;#  destination instead of a temporary buffer.
    addi    r9, r7, 0
    addi    r5, r8, 0

    ;# no vertical filter, so write the output from the first pass
    ;#  directly into the output buffer.
    beq-    no_vertical_filter_bypass

    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4

    ;# setup counter for the number of lines that are going to be filtered
    li      r0, 21

    ;# use the stack as temporary storage
    la      r9, 48(r1)
    li      r5, 16

no_vertical_filter_bypass:

    mtctr   r0                  ;# CTR = rows to filter (16 or 21)

    ;# rounding added in on the multiply
    vspltisw v10, 8
    vspltisw v12, 3
    vslw    v12, v10, v12       ;# 0x00000040000000400000004000000040

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v13, 7

    ;# index to the next set of vectors in the row.
    li      r10, 16
    li      r12, 32

horizontal_loop_16x16:

    lvsl    v15,  0, r3         ;# permutate value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     v1,   0, r3
    lvx     v2, r10, r3
    lvx     v3, r12, r3

    vperm   v8, v1, v2, v15
    vperm   v9, v2, v3, v15     ;# v8 v9 = 21 input pixels left-justified

    vsldoi  v11, v8, v9, 4

    ;# set 0
    vmsummbm v6, v4, v8, v12    ;# taps times elements
    vmsummbm v0, v5, v11, v6

    ;# set 1
    vsldoi  v10, v8, v9, 1
    vsldoi  v11, v8, v9, 5

    vmsummbm v6, v4, v10, v12
    vmsummbm v1, v5, v11, v6

    ;# set 2
    vsldoi  v10, v8, v9, 2
    vsldoi  v11, v8, v9, 6

    vmsummbm v6, v4, v10, v12
    vmsummbm v2, v5, v11, v6

    ;# set 3
    vsldoi  v10, v8, v9, 3
    vsldoi  v11, v8, v9, 7

    vmsummbm v6, v4, v10, v12
    vmsummbm v3, v5, v11, v6

    vpkswus v0, v0, v1          ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v1, v2, v3          ;# v1 = 2 6 A E 3 7 B F

    vsrh    v0, v0, v13         ;# divide v0, v1 by 128
    vsrh    v1, v1, v13

    vpkuhus v0, v0, v1          ;# v0 = scrambled 8-bit result
    vperm   v0, v0, v0, v14     ;# v0 = correctly-ordered result

    stvx    v0, 0, r9
    add     r9, r9, r5

    add     r3, r3, r4

    bdnz    horizontal_loop_16x16

    ;# check again to see if vertical filter needs to be done.
    cmpi    cr0, r6, 0
    beq     cr0, end_16x16

    ;# yes there is, so go to the second pass
    b       second_pass_16x16

copy_horizontal_16x21:
    ;# possibility (1): no horizontal filter; copy 21 raw rows to the stack
    li      r10, 21
    mtctr   r10

    li      r10, 16

    sub     r3, r3, r4
    sub     r3, r3, r4

    ;# this is done above if there is a horizontal filter,
    ;#  if not it needs to be done down here.
    slwi    r6, r6, 4           ;# index into vertical filter array

    ;# always write to the stack when doing a horizontal copy
    la      r9, 48(r1)

copy_horizontal_loop_16x21:
    lvsl    v15,  0, r3         ;# permutate value for alignment

    lvx     v1,   0, r3
    lvx     v2, r10, r3

    vperm   v8, v1, v2, v15

    stvx    v8, 0, r9
    addi    r9, r9, 16

    add     r3, r3, r4

    bdnz    copy_horizontal_loop_16x21

second_pass_16x16:

    ;# always read from the stack when doing a vertical filter
    la      r9, 48(r1)

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v7, 7

    vpre_load

    luma_vsix                   ;# 6 + 6 + 4 = 16 output rows
    luma_vsix
    luma_vfour

end_16x16:

    addi    r1, r1, 416         ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr
| 958 | |
    .data

;# HFilter: horizontal six-tap coefficient table, one 32-byte entry per
;#  subpel position (r5 = x_offset << 5 indexes it).  Taps are stored
;#  signed, interleaved in groups of four for the vmsummbm sums in
;#  interp_8x8 / horizontal_loop_16x16.
    .align 4
HFilter:
    .byte     0,  0,128,  0,   0,  0,128,  0,   0,  0,128,  0,   0,  0,128,  0
    .byte     0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     0, -6,123, 12,   0, -6,123, 12,   0, -6,123, 12,   0, -6,123, 12
    .byte    -1,  0,  0,  0,  -1,  0,  0,  0,  -1,  0,  0,  0,  -1,  0,  0,  0
    .byte     2,-11,108, 36,   2,-11,108, 36,   2,-11,108, 36,   2,-11,108, 36
    .byte    -8,  1,  0,  0,  -8,  1,  0,  0,  -8,  1,  0,  0,  -8,  1,  0,  0
    .byte     0, -9, 93, 50,   0, -9, 93, 50,   0, -9, 93, 50,   0, -9, 93, 50
    .byte    -6,  0,  0,  0,  -6,  0,  0,  0,  -6,  0,  0,  0,  -6,  0,  0,  0
    .byte     3,-16, 77, 77,   3,-16, 77, 77,   3,-16, 77, 77,   3,-16, 77, 77
    .byte   -16,  3,  0,  0, -16,  3,  0,  0, -16,  3,  0,  0, -16,  3,  0,  0
    .byte     0, -6, 50, 93,   0, -6, 50, 93,   0, -6, 50, 93,   0, -6, 50, 93
    .byte    -9,  0,  0,  0,  -9,  0,  0,  0,  -9,  0,  0,  0,  -9,  0,  0,  0
    .byte     1, -8, 36,108,   1, -8, 36,108,   1, -8, 36,108,   1, -8, 36,108
    .byte   -11,  2,  0,  0, -11,  2,  0,  0, -11,  2,  0,  0, -11,  2,  0,  0
    .byte     0, -1, 12,123,   0, -1, 12,123,   0, -1, 12,123,   0, -1, 12,123
    .byte    -6,  0,  0,  0,  -6,  0,  0,  0,  -6,  0,  0,  0,  -6,  0,  0,  0

;# VFilter: vertical six-tap coefficients, one 16-byte entry per subpel
;#  position (r6 = y_offset << 4 indexes it).  The taps are stored as
;#  positive magnitudes: the vertical filter code adds the positive taps
;#  and subtracts the two negative taps explicitly (see vinterp_no_store).
    .align 4
VFilter:
    .byte     0,  0,128,  0,   0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     0,  6,123, 12,   1,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     2, 11,108, 36,   8,  1,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     0,  9, 93, 50,   6,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     3, 16, 77, 77,  16,  3,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     0,  6, 50, 93,   9,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     1,  8, 36,108,  11,  2,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     0,  1, 12,123,   6,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0

;# b_hperm: de-interleaves the scrambled 16x16 horizontal-pass output.
    .align 4
b_hperm:
    .byte     0,  4,  8, 12,   1,  5,  9, 13,   2,  6, 10, 14,   3,  7, 11, 15

;# B_0123 / B_4567 / B_89AB: sliding 4-byte windows used by interp_8x8 to
;#  gather the six-tap input for each output pixel.
    .align 4
B_0123:
    .byte     0,  1,  2,  3,   1,  2,  3,  4,   2,  3,  4,  5,   3,  4,  5,  6

    .align 4
B_4567:
    .byte     4,  5,  6,  7,   5,  6,  7,  8,   6,  7,  8,  9,   7,  8,  9, 10

    .align 4
B_89AB:
    .byte     8,  9, 10, 11,   9, 10, 11, 12,  10, 11, 12, 13,  11, 12, 13, 14

;# b_hilo: merges the low 8 bytes of two vectors into one.
    .align 4
b_hilo:
    .byte     0,  1,  2,  3,   4,  5,  6,  7,  16, 17, 18, 19,  20, 21, 22, 23

;# b_hilo_4x4: merges the low 4 bytes of two vectors (remainder zeroed).
    .align 4
b_hilo_4x4:
    .byte     0,  1,  2,  3,  16, 17, 18, 19,   0,  0,  0,  0,   0,  0,  0,  0
| OLD | NEW |