OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 |
| 12 .globl vp8_sad16x16_ppc |
| 13 .globl vp8_sad16x8_ppc |
| 14 .globl vp8_sad8x16_ppc |
| 15 .globl vp8_sad8x8_ppc |
| 16 .globl vp8_sad4x4_ppc |
| 17 |
| 18 .macro load_aligned_16 V R O |
| 19 lvsl v3, 0, \R ;# permute value for alignment |
| 20 |
| 21 lvx v1, 0, \R |
| 22 lvx v2, \O, \R |
| 23 |
| 24 vperm \V, v1, v2, v3 |
| 25 .endm |
| 26 |
| 27 .macro prologue |
| 28 mfspr r11, 256 ;# get old VRSAVE |
| 29 oris r12, r11, 0xffc0 |
| 30 mtspr 256, r12 ;# set VRSAVE |
| 31 |
| 32 stwu r1, -32(r1) ;# create space on the stack |
| 33 |
| 34 li r10, 16 ;# load offset and loop counter |
| 35 |
| 36 vspltisw v8, 0 ;# zero out total to start |
| 37 .endm |
| 38 |
| 39 .macro epilogue |
| 40 addi r1, r1, 32 ;# recover stack |
| 41 |
| 42 mtspr 256, r11 ;# reset old VRSAVE |
| 43 .endm |
| 44 |
| 45 .macro SAD_16 |
| 46 ;# v6 = abs (v4 - v5) |
| 47 vsububs v6, v4, v5 |
| 48 vsububs v7, v5, v4 |
| 49 vor v6, v6, v7 |
| 50 |
| 51 ;# v8 += abs (v4 - v5) |
| 52 vsum4ubs v8, v6, v8 |
| 53 .endm |
| 54 |
| 55 .macro sad_16_loop loop_label |
| 56 lvsl v3, 0, r5 ;# only needs to be done once per block |
| 57 |
| 58 ;# preload a line of data before getting into the loop |
| 59 lvx v4, 0, r3 |
| 60 lvx v1, 0, r5 |
| 61 lvx v2, r10, r5 |
| 62 |
| 63 add r5, r5, r6 |
| 64 add r3, r3, r4 |
| 65 |
| 66 vperm v5, v1, v2, v3 |
| 67 |
| 68 .align 4 |
| 69 \loop_label: |
| 70 ;# compute difference on first row |
| 71 vsububs v6, v4, v5 |
| 72 vsububs v7, v5, v4 |
| 73 |
| 74 ;# load up next set of data |
| 75 lvx v9, 0, r3 |
| 76 lvx v1, 0, r5 |
| 77 lvx v2, r10, r5 |
| 78 |
| 79 ;# perform abs() of difference |
| 80 vor v6, v6, v7 |
| 81 add r3, r3, r4 |
| 82 |
| 83 ;# add to the running tally |
| 84 vsum4ubs v8, v6, v8 |
| 85 |
| 86 ;# now onto the next line |
| 87 vperm v5, v1, v2, v3 |
| 88 add r5, r5, r6 |
| 89 lvx v4, 0, r3 |
| 90 |
| 91 ;# compute difference on second row |
| 92 vsububs v6, v9, v5 |
| 93 lvx v1, 0, r5 |
| 94 vsububs v7, v5, v9 |
| 95 lvx v2, r10, r5 |
| 96 vor v6, v6, v7 |
| 97 add r3, r3, r4 |
| 98 vsum4ubs v8, v6, v8 |
| 99 vperm v5, v1, v2, v3 |
| 100 add r5, r5, r6 |
| 101 |
| 102 bdnz \loop_label |
| 103 |
| 104 vspltisw v7, 0 |
| 105 |
| 106 vsumsws v8, v8, v7 |
| 107 |
| 108 stvx v8, 0, r1 |
| 109 lwz r3, 12(r1) |
| 110 .endm |
| 111 |
| 112 .macro sad_8_loop loop_label |
| 113 .align 4 |
| 114 \loop_label: |
| 115 ;# only one of the inputs should need to be aligned. |
| 116 load_aligned_16 v4, r3, r10 |
| 117 load_aligned_16 v5, r5, r10 |
| 118 |
| 119 ;# move onto the next line |
| 120 add r3, r3, r4 |
| 121 add r5, r5, r6 |
| 122 |
| 123 ;# only one of the inputs should need to be aligned. |
| 124 load_aligned_16 v6, r3, r10 |
| 125 load_aligned_16 v7, r5, r10 |
| 126 |
| 127 ;# move onto the next line |
| 128 add r3, r3, r4 |
| 129 add r5, r5, r6 |
| 130 |
| 131 vmrghb v4, v4, v6 |
| 132 vmrghb v5, v5, v7 |
| 133 |
| 134 SAD_16 |
| 135 |
| 136 bdnz \loop_label |
| 137 |
| 138 vspltisw v7, 0 |
| 139 |
| 140 vsumsws v8, v8, v7 |
| 141 |
| 142 stvx v8, 0, r1 |
| 143 lwz r3, 12(r1) |
| 144 .endm |
| 145 |
| 146 .align 2 |
| 147 ;# r3 unsigned char *src_ptr |
| 148 ;# r4 int src_stride |
| 149 ;# r5 unsigned char *ref_ptr |
| 150 ;# r6 int ref_stride |
| 151 ;# |
| 152 ;# r3 return value |
| 153 vp8_sad16x16_ppc: |
| 154 |
| 155 prologue |
| 156 |
| 157 li r9, 8 |
| 158 mtctr r9 |
| 159 |
| 160 sad_16_loop sad16x16_loop |
| 161 |
| 162 epilogue |
| 163 |
| 164 blr |
| 165 |
| 166 .align 2 |
| 167 ;# r3 unsigned char *src_ptr |
| 168 ;# r4 int src_stride |
| 169 ;# r5 unsigned char *ref_ptr |
| 170 ;# r6 int ref_stride |
| 171 ;# |
| 172 ;# r3 return value |
| 173 vp8_sad16x8_ppc: |
| 174 |
| 175 prologue |
| 176 |
| 177 li r9, 4 |
| 178 mtctr r9 |
| 179 |
| 180 sad_16_loop sad16x8_loop |
| 181 |
| 182 epilogue |
| 183 |
| 184 blr |
| 185 |
| 186 .align 2 |
| 187 ;# r3 unsigned char *src_ptr |
| 188 ;# r4 int src_stride |
| 189 ;# r5 unsigned char *ref_ptr |
| 190 ;# r6 int ref_stride |
| 191 ;# |
| 192 ;# r3 return value |
| 193 vp8_sad8x16_ppc: |
| 194 |
| 195 prologue |
| 196 |
| 197 li r9, 8 |
| 198 mtctr r9 |
| 199 |
| 200 sad_8_loop sad8x16_loop |
| 201 |
| 202 epilogue |
| 203 |
| 204 blr |
| 205 |
| 206 .align 2 |
| 207 ;# r3 unsigned char *src_ptr |
| 208 ;# r4 int src_stride |
| 209 ;# r5 unsigned char *ref_ptr |
| 210 ;# r6 int ref_stride |
| 211 ;# |
| 212 ;# r3 return value |
| 213 vp8_sad8x8_ppc: |
| 214 |
| 215 prologue |
| 216 |
| 217 li r9, 4 |
| 218 mtctr r9 |
| 219 |
| 220 sad_8_loop sad8x8_loop |
| 221 |
| 222 epilogue |
| 223 |
| 224 blr |
| 225 |
| 226 .macro transfer_4x4 I P |
| 227 lwz r0, 0(\I) |
| 228 add \I, \I, \P |
| 229 |
| 230 lwz r7, 0(\I) |
| 231 add \I, \I, \P |
| 232 |
| 233 lwz r8, 0(\I) |
| 234 add \I, \I, \P |
| 235 |
| 236 lwz r9, 0(\I) |
| 237 |
| 238 stw r0, 0(r1) |
| 239 stw r7, 4(r1) |
| 240 stw r8, 8(r1) |
| 241 stw r9, 12(r1) |
| 242 .endm |
| 243 |
| 244 .align 2 |
| 245 ;# r3 unsigned char *src_ptr |
| 246 ;# r4 int src_stride |
| 247 ;# r5 unsigned char *ref_ptr |
| 248 ;# r6 int ref_stride |
| 249 ;# |
| 250 ;# r3 return value |
| 251 vp8_sad4x4_ppc: |
| 252 |
| 253 prologue |
| 254 |
| 255 transfer_4x4 r3, r4 |
| 256 lvx v4, 0, r1 |
| 257 |
| 258 transfer_4x4 r5, r6 |
| 259 lvx v5, 0, r1 |
| 260 |
| 261 vspltisw v8, 0 ;# zero out total to start |
| 262 |
| 263 ;# v6 = abs (v4 - v5) |
| 264 vsububs v6, v4, v5 |
| 265 vsububs v7, v5, v4 |
| 266 vor v6, v6, v7 |
| 267 |
| 268 ;# v8 += abs (v4 - v5) |
| 269 vsum4ubs v7, v6, v8 |
| 270 vsumsws v7, v7, v8 |
| 271 |
| 272 stvx v7, 0, r1 |
| 273 lwz r3, 12(r1) |
| 274 |
| 275 epilogue |
| 276 |
| 277 blr |
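
Every routine in this file computes a sum of absolute differences (SAD) between a source block and a reference block, with the arguments documented above each entry point (src_ptr/src_stride in r3/r4, ref_ptr/ref_stride in r5/r6) and the total returned in r3. The 8-pixel-wide variants pack two rows into one vector with vmrghb before reusing SAD_16, and the 4x4 case stages its rows through the stack with transfer_4x4, but the arithmetic is the same. As a reading aid, here is a minimal plain-C sketch of that contract; the helper name sad_wxh is illustrative, not the libvpx C API.

    #include <stdlib.h>

    /* Sum of absolute differences over a width x height block. */
    static unsigned int sad_wxh(const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                int width, int height)
    {
        unsigned int sad = 0;
        int r, c;

        for (r = 0; r < height; r++) {
            for (c = 0; c < width; c++)
                sad += (unsigned int)abs(src[c] - ref[c]);
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }

The five entry points correspond to sad_wxh with (width, height) of (16, 16), (16, 8), (8, 16), (8, 8) and (4, 4). For instance, a single 4-wide row with src = {1, 2, 3, 4} and ref = {4, 2, 0, 4} contributes |1-4| + |2-2| + |3-0| + |4-4| = 6 to the total.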
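The vector sequence itself maps closely onto AltiVec C intrinsics. The sketch below mirrors one row of the load_aligned_16 + SAD_16 pattern; the function name sad16_rows_altivec is illustrative, and it assumes <altivec.h>, a PowerPC build with -maltivec, and (like the assembly's bare lvx on r3) a 16-byte-aligned src pointer. It is a reading aid, not the project's implementation.

    #include <altivec.h>

    static unsigned int sad16_rows_altivec(const unsigned char *src, int src_stride,
                                           const unsigned char *ref, int ref_stride,
                                           int rows)
    {
        /* Running per-word totals, like v8 in the assembly. */
        vector unsigned int acc = vec_splat_u32(0);
        union { vector signed int v; signed int w[4]; } out;
        int i;

        for (i = 0; i < rows; i++) {
            /* load_aligned_16: merge two aligned loads into one possibly
               unaligned row (lvsl + lvx + lvx + vperm).  Like the assembly,
               this may touch up to 15 bytes past the nominal end of the row. */
            vector unsigned char perm = vec_lvsl(0, ref);
            vector unsigned char r = vec_perm(vec_ld(0, ref), vec_ld(16, ref), perm);

            /* The assembly loads src with a bare lvx, i.e. it assumes src is
               16-byte aligned; the same assumption is made here. */
            vector unsigned char s = vec_ld(0, src);

            /* SAD_16: |s - r| as two saturating subtracts OR'd together
               (vsububs, vsububs, vor). */
            vector unsigned char d = vec_or(vec_subs(s, r), vec_subs(r, s));

            /* vsum4ubs: fold the 16 byte differences into 4 word partial sums. */
            acc = vec_sum4s(d, acc);

            src += src_stride;
            ref += ref_stride;
        }

        /* vsumsws leaves the grand total in element 3, which is why the
           assembly stores the vector and reloads it with lwz r3, 12(r1). */
        out.v = vec_sums((vector signed int)acc, vec_splat_s32(0));
        return (unsigned int)out.w[3];
    }

Reinterpreting the vsum4ubs accumulator as signed words for vsumsws is safe here: the largest possible total, 255 * 256 for a 16x16 block, is far below the signed 32-bit limit.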