| OLD | NEW |
| (Empty) |
| 1 ; | |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
| 3 ; | |
| 4 ; Use of this source code is governed by a BSD-style license | |
| 5 ; that can be found in the LICENSE file in the root of the source | |
| 6 ; tree. An additional intellectual property rights grant can be found | |
| 7 ; in the file PATENTS. All contributing project authors may | |
| 8 ; be found in the AUTHORS file in the root of the source tree. | |
| 9 ; | |
| 10 | |
| 11 ;# Entry points exported from this file; each label is defined further down. | |
| 12 .globl vp8_get8x8var_ppc | |
| 13 .globl vp8_get16x16var_ppc | |
| 14 .globl vp8_mse16x16_ppc | |
| 15 .globl vp8_variance16x16_ppc | |
| 16 .globl vp8_variance16x8_ppc | |
| 17 .globl vp8_variance8x16_ppc | |
| 18 .globl vp8_variance8x8_ppc | |
| 19 .globl vp8_variance4x4_ppc | |
| 20 ;# load_aligned_16 V R O: gather the 16 bytes that start at address \R (any alignment) into \V. \O is a register holding the byte offset of the second load (every caller in this file passes r10 = 16). Clobbers v1, v2, v3. | |
| 21 .macro load_aligned_16 V R O | |
| 22 lvsl v3, 0, \R ;# permute control built from the low 4 bits of \R | |
| 23 ;# lvx ignores the low 4 address bits, so these two loads fetch the aligned quadwords that straddle \R. | |
| 24 lvx v1, 0, \R | |
| 25 lvx v2, \O, \R | |
| 26 ;# NOTE(review): when \R is already 16-byte aligned the second lvx still reads the quadword at \R+\O; confirm callers' buffers make that read safe. | |
| 27 vperm \V, v1, v2, v3 ;# pick the 16 wanted bytes out of the v1:v2 pair | |
| 28 .endm | |
| 29 ;# prologue: shared entry sequence. Leaves r11 = caller's VRSAVE, r10 = 16, v7 = v8 = v9 = 0 and a 32-byte frame at r1. Clobbers r12. | |
| 30 .macro prologue | |
| 31 stwu r1, -32(r1) ;# open a 32-byte stack frame | |
| 32 | |
| 33 li r10, 16 ;# r10 = 16: second-load offset and 16-row loop count | |
| 34 | |
| 35 mfspr r11, 256 ;# r11 = incoming VRSAVE (SPR 256), restored by epilogue | |
| 36 oris r12, r11, 0xffc0 ;# additionally mark v0-v9 as in use | |
| 37 mtspr 256, r12 ;# publish the new VRSAVE before any vector register is written | |
| 38 | |
| 39 vspltisw v9, 0 ;# squared-difference accumulator starts at zero | |
| 40 vspltisw v8, 0 ;# difference accumulator starts at zero | |
| 41 vspltisw v7, 0 ;# constant zero used for byte merging | |
| 42 .endm | |
| 43 ;# epilogue: undo prologue - restore VRSAVE from r11 and release the 32-byte frame. r3 is left untouched. | |
| 44 .macro epilogue | |
| 45 mtspr 256, r11 ;# put back the VRSAVE value saved in r11 | |
| 46 | |
| 47 addi r1, r1, 32 ;# pop the 32-byte frame | |
| 48 .endm | |
| 49 ;# compute_sum_sse: fold one 16-byte batch into the accumulators. In: v4, v5 = byte vectors, v7 = zero. Adds the signed differences (v4 - v5) into the word lanes of v8 and the squared differences into the word lanes of v9. Clobbers v2, v3. | |
| 50 .macro compute_sum_sse | |
| 51 ;# Compute sum first. Unpack to halfwords so a signed subtract | |
| 52 ;# can be used; only a halfword signed subtract is available. | |
| 53 ;# Do the high half, then the low half. | |
| 54 vmrghb v2, v7, v4 ;# zero-extend the high 8 bytes of v4 to halfwords | |
| 55 vmrghb v3, v7, v5 ;# same for v5 | |
| 56 vsubshs v2, v2, v3 ;# per-halfword v4 - v5 | |
| 57 vsum4shs v8, v2, v8 ;# add each halfword pair into the matching word lane of v8 | |
| 58 | |
| 59 vmrglb v2, v7, v4 ;# repeat for the low 8 bytes | |
| 60 vmrglb v3, v7, v5 | |
| 61 vsubshs v2, v2, v3 | |
| 62 vsum4shs v8, v2, v8 | |
| 63 | |
| 64 ;# Now compute sse. Each saturating unsigned subtract gives the difference in one direction and 0 in the other, so OR-ing them yields |v4 - v5| per byte. | |
| 65 vsububs v2, v4, v5 | |
| 66 vsububs v3, v5, v4 | |
| 67 vor v2, v2, v3 | |
| 68 | |
| 69 vmsumubm v9, v2, v2, v9 ;# square every byte and add each group of four into a word lane of v9 | |
| 70 .endm | |
| 71 ;# variance_16 DS loop_label store_sum: loop body and tail for 16-byte-wide blocks. Expects the prologue state (r10 = 16, v7 = 0, v8 = v9 = 0, 32-byte frame at r1) and CTR = row count. In: r3/r4 = first pointer and stride, r5/r6 = second pointer and stride, r7 = where sse is stored, r8 = where sum is stored when store_sum is non-zero. Out: r3 = sse - ((sum*sum) >> DS), r4 = sse. | |
| 72 .macro variance_16 DS loop_label store_sum | |
| 73 \loop_label: | |
| 74 ;# gather one 16-byte row from each block; either pointer may be unaligned. | |
| 75 load_aligned_16 v4, r3, r10 | |
| 76 load_aligned_16 v5, r5, r10 | |
| 77 | |
| 78 ;# move onto the next line | |
| 79 add r3, r3, r4 | |
| 80 add r5, r5, r6 | |
| 81 | |
| 82 compute_sum_sse | |
| 83 | |
| 84 bdnz \loop_label | |
| 85 | |
| 86 vsumsws v8, v8, v7 ;# fold the four word lanes; the total lands in the last word | |
| 87 vsumsws v9, v9, v7 | |
| 88 ;# Spill through the upper quadword of the frame (r10 = 16) so the back-chain word written by stwu at 0(r1) stays intact. | |
| 89 stvx v8, r10, r1 | |
| 90 lwz r3, 28(r1) ;# r3 = sum (last word of the spilled vector) | |
| 91 | |
| 92 stvx v9, r10, r1 | |
| 93 lwz r4, 28(r1) ;# r4 = sse | |
| 94 | |
| 95 .if \store_sum | |
| 96 stw r3, 0(r8) ;# sum | |
| 97 .endif | |
| 98 stw r4, 0(r7) ;# sse | |
| 99 | |
| 100 mullw r3, r3, r3 ;# sum*sum | |
| 101 srlwi r3, r3, \DS ;# (sum*sum) >> DS | |
| 102 subf r3, r3, r4 ;# sse - ((sum*sum) >> DS) | |
| 103 .endm | |
| 104 ;# variance_8 DS loop_label store_sum: loop body and tail for 8-byte-wide blocks; two rows are packed into one vector per pass, so CTR = rows / 2. Expects the prologue state (r10 = 16, v7 = 0, v8 = v9 = 0, 32-byte frame at r1). In: r3/r4 = first pointer and stride, r5/r6 = second pointer and stride, r7 = where sse is stored, r8 = where sum is stored when store_sum is non-zero. Out: r3 = sse - ((sum*sum) >> DS), r4 = sse. Clobbers v0, v6 in addition to the helper macros' registers. | |
| 105 .macro variance_8 DS loop_label store_sum | |
| 106 \loop_label: | |
| 107 ;# first row of the pair; either pointer may be unaligned. | |
| 108 load_aligned_16 v4, r3, r10 | |
| 109 load_aligned_16 v5, r5, r10 | |
| 110 | |
| 111 ;# move onto the next line | |
| 112 add r3, r3, r4 | |
| 113 add r5, r5, r6 | |
| 114 | |
| 115 ;# second row of the pair | |
| 116 load_aligned_16 v6, r3, r10 | |
| 117 load_aligned_16 v0, r5, r10 | |
| 118 | |
| 119 ;# move onto the next line | |
| 120 add r3, r3, r4 | |
| 121 add r5, r5, r6 | |
| 122 ;# keep only the first 8 bytes of each row, interleaved; both inputs use the same byte order so differences still pair up | |
| 123 vmrghb v4, v4, v6 | |
| 124 vmrghb v5, v5, v0 | |
| 125 | |
| 126 compute_sum_sse | |
| 127 | |
| 128 bdnz \loop_label | |
| 129 | |
| 130 vsumsws v8, v8, v7 ;# fold the four word lanes; the total lands in the last word | |
| 131 vsumsws v9, v9, v7 | |
| 132 ;# Spill through the upper quadword of the frame (r10 = 16) so the back-chain word written by stwu at 0(r1) stays intact. | |
| 133 stvx v8, r10, r1 | |
| 134 lwz r3, 28(r1) ;# r3 = sum (last word of the spilled vector) | |
| 135 | |
| 136 stvx v9, r10, r1 | |
| 137 lwz r4, 28(r1) ;# r4 = sse | |
| 138 | |
| 139 .if \store_sum | |
| 140 stw r3, 0(r8) ;# sum | |
| 141 .endif | |
| 142 stw r4, 0(r7) ;# sse | |
| 143 | |
| 144 mullw r3, r3, r3 ;# sum*sum | |
| 145 srlwi r3, r3, \DS ;# (sum*sum) >> DS | |
| 146 subf r3, r3, r4 ;# sse - ((sum*sum) >> DS) | |
| 147 .endm | |
| 148 ;# vp8_get8x8var_ppc: 8x8 block. Stores sse through r7 and sum through r8, and returns sse - ((sum*sum) >> 6). | |
| 149 .align 2 | |
| 150 ;# r3 unsigned char *src_ptr | |
| 151 ;# r4 int source_stride | |
| 152 ;# r5 unsigned char *ref_ptr | |
| 153 ;# r6 int recon_stride | |
| 154 ;# r7 unsigned int *SSE | |
| 155 ;# r8 int *Sum | |
| 156 ;# | |
| 157 ;# r3 return value | |
| 158 vp8_get8x8var_ppc: | |
| 159 | |
| 160 prologue | |
| 161 | |
| 162 li r9, 4 ;# 4 passes: variance_8 consumes two rows per pass | |
| 163 mtctr r9 | |
| 164 ;# DS = 6, store_sum = 1 (also write the sum through r8) | |
| 165 variance_8 6, get8x8var_loop, 1 | |
| 166 | |
| 167 epilogue | |
| 168 | |
| 169 blr | |
| 170 ;# vp8_get16x16var_ppc: 16x16 block. Stores sse through r7 and sum through r8, and returns sse - ((sum*sum) >> 8). | |
| 171 .align 2 | |
| 172 ;# r3 unsigned char *src_ptr | |
| 173 ;# r4 int source_stride | |
| 174 ;# r5 unsigned char *ref_ptr | |
| 175 ;# r6 int recon_stride | |
| 176 ;# r7 unsigned int *SSE | |
| 177 ;# r8 int *Sum | |
| 178 ;# | |
| 179 ;# r3 return value | |
| 180 vp8_get16x16var_ppc: | |
| 181 | |
| 182 prologue | |
| 183 | |
| 184 mtctr r10 ;# r10 is 16 after prologue: one pass per row | |
| 185 ;# DS = 8, store_sum = 1 (also write the sum through r8) | |
| 186 variance_16 8, get16x16var_loop, 1 | |
| 187 | |
| 188 epilogue | |
| 189 | |
| 190 blr | |
| 191 ;# vp8_mse16x16_ppc: sum of squared differences over a 16x16 block; the value is stored through r7 and also returned in r3. | |
| 192 .align 2 | |
| 193 ;# r3 unsigned char *src_ptr | |
| 194 ;# r4 int source_stride | |
| 195 ;# r5 unsigned char *ref_ptr | |
| 196 ;# r6 int recon_stride | |
| 197 ;# r7 unsigned int *sse | |
| 198 ;# | |
| 199 ;# r3 return value (sse) | |
| 200 vp8_mse16x16_ppc: | |
| 201 prologue | |
| 202 | |
| 203 mtctr r10 ;# r10 is 16 after prologue: one pass per row | |
| 204 | |
| 205 mse16x16_loop: | |
| 206 ;# gather one 16-byte row from each block; either pointer may be unaligned. | |
| 207 load_aligned_16 v4, r3, r10 | |
| 208 load_aligned_16 v5, r5, r10 | |
| 209 | |
| 210 ;# move onto the next line | |
| 211 add r3, r3, r4 | |
| 212 add r5, r5, r6 | |
| 213 | |
| 214 ;# Now compute sse: OR of the two saturating subtracts is |v4 - v5| per byte. | |
| 215 vsububs v2, v4, v5 | |
| 216 vsububs v3, v5, v4 | |
| 217 vor v2, v2, v3 | |
| 218 | |
| 219 vmsumubm v9, v2, v2, v9 ;# square every byte and accumulate into the word lanes of v9 | |
| 220 | |
| 221 bdnz mse16x16_loop | |
| 222 | |
| 223 vsumsws v9, v9, v7 ;# fold the four word lanes; the total lands in the last word | |
| 224 | |
| 225 ;# Spill once, through the upper quadword of the frame (r10 = 16), so the | |
| 226 ;# back-chain word written by stwu at 0(r1) stays intact. | |
| 227 stvx v9, r10, r1 | |
| 228 lwz r3, 28(r1) ;# r3 = sse (last word of the spilled vector) | |
| 229 | |
| 230 ;# hand the result back through the pointer as well as in r3 | |
| 231 stw r3, 0(r7) ;# sse | |
| 232 | |
| 233 epilogue | |
| 234 | |
| 235 blr | |
| 236 ;# vp8_variance16x16_ppc: 16x16 block. Stores sse through r7 and returns sse - ((sum*sum) >> 8). | |
| 237 .align 2 | |
| 238 ;# r3 unsigned char *src_ptr | |
| 239 ;# r4 int source_stride | |
| 240 ;# r5 unsigned char *ref_ptr | |
| 241 ;# r6 int recon_stride | |
| 242 ;# r7 unsigned int *sse | |
| 243 ;# | |
| 244 ;# r3 return value | |
| 245 vp8_variance16x16_ppc: | |
| 246 | |
| 247 prologue | |
| 248 | |
| 249 mtctr r10 ;# r10 is 16 after prologue: one pass per row | |
| 250 ;# DS = 8, store_sum = 0 (the sum is not written out) | |
| 251 variance_16 8, variance16x16_loop, 0 | |
| 252 | |
| 253 epilogue | |
| 254 | |
| 255 blr | |
| 256 ;# vp8_variance16x8_ppc: 16 bytes wide, 8 rows. Stores sse through r7 and returns sse - ((sum*sum) >> 7). | |
| 257 .align 2 | |
| 258 ;# r3 unsigned char *src_ptr | |
| 259 ;# r4 int source_stride | |
| 260 ;# r5 unsigned char *ref_ptr | |
| 261 ;# r6 int recon_stride | |
| 262 ;# r7 unsigned int *sse | |
| 263 ;# | |
| 264 ;# r3 return value | |
| 265 vp8_variance16x8_ppc: | |
| 266 | |
| 267 prologue | |
| 268 | |
| 269 li r9, 8 ;# 8 passes, one row each | |
| 270 mtctr r9 | |
| 271 ;# DS = 7, store_sum = 0 (the sum is not written out) | |
| 272 variance_16 7, variance16x8_loop, 0 | |
| 273 | |
| 274 epilogue | |
| 275 | |
| 276 blr | |
| 277 ;# vp8_variance8x16_ppc: 8 bytes wide, 16 rows. Stores sse through r7 and returns sse - ((sum*sum) >> 7). | |
| 278 .align 2 | |
| 279 ;# r3 unsigned char *src_ptr | |
| 280 ;# r4 int source_stride | |
| 281 ;# r5 unsigned char *ref_ptr | |
| 282 ;# r6 int recon_stride | |
| 283 ;# r7 unsigned int *sse | |
| 284 ;# | |
| 285 ;# r3 return value | |
| 286 vp8_variance8x16_ppc: | |
| 287 | |
| 288 prologue | |
| 289 | |
| 290 li r9, 8 ;# 8 passes: variance_8 consumes two rows per pass | |
| 291 mtctr r9 | |
| 292 ;# DS = 7, store_sum = 0 (the sum is not written out) | |
| 293 variance_8 7, variance8x16_loop, 0 | |
| 294 | |
| 295 epilogue | |
| 296 | |
| 297 blr | |
| 298 ;# vp8_variance8x8_ppc: 8x8 block. Stores sse through r7 and returns sse - ((sum*sum) >> 6). | |
| 299 .align 2 | |
| 300 ;# r3 unsigned char *src_ptr | |
| 301 ;# r4 int source_stride | |
| 302 ;# r5 unsigned char *ref_ptr | |
| 303 ;# r6 int recon_stride | |
| 304 ;# r7 unsigned int *sse | |
| 305 ;# | |
| 306 ;# r3 return value | |
| 307 vp8_variance8x8_ppc: | |
| 308 | |
| 309 prologue | |
| 310 | |
| 311 li r9, 4 ;# 4 passes: variance_8 consumes two rows per pass | |
| 312 mtctr r9 | |
| 313 ;# DS = 6, store_sum = 0 (the sum is not written out) | |
| 314 variance_8 6, variance8x8_loop, 0 | |
| 315 | |
| 316 epilogue | |
| 317 | |
| 318 blr | |
| 319 ;# transfer_4x4 I P: copy four 4-byte rows, starting at \I with stride \P, into the scratch quadword at 16..31(r1). Clobbers r0, r8, r9, r10; \I is left pointing at the fourth row. | |
| 320 .macro transfer_4x4 I P | |
| 321 lwz r0, 0(\I) | |
| 322 add \I, \I, \P | |
| 323 | |
| 324 lwz r10,0(\I) | |
| 325 add \I, \I, \P | |
| 326 | |
| 327 lwz r8, 0(\I) | |
| 328 add \I, \I, \P | |
| 329 | |
| 330 lwz r9, 0(\I) | |
| 331 ;# pack the rows back to back in the upper quadword of the frame; the back-chain word at 0(r1) is left alone | |
| 332 stw r0, 16(r1) | |
| 333 stw r10, 20(r1) | |
| 334 stw r8, 24(r1) | |
| 335 stw r9, 28(r1) | |
| 336 .endm | |
| 337 ;# vp8_variance4x4_ppc: 4x4 block, handled as a single 16-byte vector per input. Stores sse through r7 and returns sse - ((sum*sum) >> 4). | |
| 338 .align 2 | |
| 339 ;# r3 unsigned char *src_ptr | |
| 340 ;# r4 int source_stride | |
| 341 ;# r5 unsigned char *ref_ptr | |
| 342 ;# r6 int recon_stride | |
| 343 ;# r7 unsigned int *sse | |
| 344 ;# | |
| 345 ;# r3 return value | |
| 346 vp8_variance4x4_ppc: | |
| 347 | |
| 348 prologue | |
| 349 li r12, 16 ;# offset of the scratch quadword; r10 cannot be used because transfer_4x4 overwrites it | |
| 350 transfer_4x4 r3, r4 | |
| 351 lvx v4, r12, r1 ;# v4 = the 16 bytes gathered via r3 | |
| 352 | |
| 353 transfer_4x4 r5, r6 | |
| 354 lvx v5, r12, r1 ;# v5 = the 16 bytes gathered via r5 | |
| 355 | |
| 356 compute_sum_sse | |
| 357 | |
| 358 vsumsws v8, v8, v7 ;# fold the four word lanes; the total lands in the last word | |
| 359 vsumsws v9, v9, v7 | |
| 360 | |
| 361 stvx v8, r12, r1 | |
| 362 lwz r3, 28(r1) ;# r3 = sum (last word of the spilled vector) | |
| 363 | |
| 364 stvx v9, r12, r1 | |
| 365 lwz r4, 28(r1) ;# r4 = sse | |
| 366 | |
| 367 stw r4, 0(r7) ;# sse | |
| 368 | |
| 369 mullw r3, r3, r3 ;# sum*sum | |
| 370 srlwi r3, r3, 4 ;# (sum*sum) >> 4 | |
| 371 subf r3, r3, r4 ;# sse - ((sum*sum) >> 4) | |
| 372 | |
| 373 epilogue | |
| 374 | |
| 375 blr | |
| OLD | NEW |