;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

    .globl vp8_get8x8var_ppc
    .globl vp8_get16x16var_ppc
    .globl vp8_mse16x16_ppc
    .globl vp8_variance16x16_ppc
    .globl vp8_variance16x8_ppc
    .globl vp8_variance8x16_ppc
    .globl vp8_variance8x8_ppc
    .globl vp8_variance4x4_ppc

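;# AltiVec implementations of the VP8 variance and MSE helpers.  The
;# variance routines return
;#     sse - ((sum * sum) >> log2(number of pixels))
;# in r3 and write the raw sse through *sse; the get*var entry points
;# also store the signed sum, and mse16x16 simply returns the sum of
;# squared differences.
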
.macro load_aligned_16 V R O
    lvsl    v3,  0, \R          ;# permute control vector for alignment

    lvx     v1,  0, \R
    lvx     v2, \O, \R

    vperm   \V, v1, v2, v3
.endm
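
;# load_aligned_16 is the usual AltiVec idiom for a possibly unaligned
;# 16-byte load: lvsl derives a permute control vector from the low bits
;# of the address, the two lvx instructions fetch the aligned quadwords
;# straddling the data, and vperm shifts the wanted 16 bytes into \V.
;# The offset register \O is expected to hold 16 so the second lvx reads
;# the next aligned quadword.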

.macro prologue
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    li      r10, 16             ;# load offset and loop counter

    vspltisw v7, 0              ;# zero for merging
    vspltisw v8, 0              ;# zero out total to start
    vspltisw v9, 0              ;# zero out total for dif^2
.endm

.macro epilogue
    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE
.endm
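
;# The prologue flags the vector registers used here as live in VRSAVE,
;# opens a 32-byte stack area used as scratch space for moving results
;# between the vector and integer registers, loads the constant 16 into
;# r10, and clears the sum (v8) and sse (v9) accumulators.  The epilogue
;# releases the stack space and restores the caller's VRSAVE.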

.macro compute_sum_sse
    ;# Compute sum first.  Unpack so that a signed subtract can be used;
    ;# only a halfword signed subtract is available.  Do high, then low.
    vmrghb  v2, v7, v4
    vmrghb  v3, v7, v5
    vsubshs v2, v2, v3
    vsum4shs v8, v2, v8

    vmrglb  v2, v7, v4
    vmrglb  v3, v7, v5
    vsubshs v2, v2, v3
    vsum4shs v8, v2, v8

    ;# Now compute sse.
    vsububs v2, v4, v5
    vsububs v3, v5, v4
    vor     v2, v2, v3

    vmsumubm v9, v2, v2, v9
.endm
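
;# For the 16 source bytes in v4 and the 16 reference bytes in v5 this
;# accumulates
;#     sum += src[i] - ref[i]        (word lanes of v8)
;#     sse += (src[i] - ref[i])^2    (word lanes of v9)
;# The bytes are zero-extended to halfwords against the zero vector v7
;# before the subtract, and the absolute difference |src - ref| is built
;# from two saturating unsigned subtracts and an OR so vmsumubm can
;# square it and sum the products.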

.macro variance_16 DS loop_label store_sum
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    compute_sum_sse

    bdnz    \loop_label

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

.if \store_sum
    stw     r3, 0(r8)           ;# sum
.endif
    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm
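
;# variance_16 processes one 16-pixel row per iteration, with the CTR
;# register supplying the row count.  After the loop the partial sums
;# are reduced with vsumsws and bounced through the stack scratch area
;# into r3 (sum) and r4 (sse), and the return value follows the usual
;# integer variance formula
;#     variance = sse - ((sum * sum) >> DS)
;# where DS is log2 of the number of pixels in the block (8 for 16x16,
;# 7 for 16x8).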

.macro variance_8 DS loop_label store_sum
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v6, r3, r10
    load_aligned_16 v0, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    vmrghb  v4, v4, v6
    vmrghb  v5, v5, v0

    compute_sum_sse

    bdnz    \loop_label

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

.if \store_sum
    stw     r3, 0(r8)           ;# sum
.endif
    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm
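
;# variance_8 handles blocks that are 8 pixels wide by loading two rows
;# per iteration and interleaving the first eight bytes of each row into
;# one 16-byte vector with vmrghb, so compute_sum_sse can be reused
;# unchanged.  The CTR register therefore holds half the number of rows.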

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *SSE
;# r8 int *Sum
;#
;# r3 return value
vp8_get8x8var_ppc:

    prologue

    li      r9, 4
    mtctr   r9

    variance_8 6, get8x8var_loop, 1

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *SSE
;# r8 int *Sum
;#
;# r3 return value
vp8_get16x16var_ppc:

    prologue

    mtctr   r10

    variance_16 8, get16x16var_loop, 1

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_mse16x16_ppc:
    prologue

    mtctr   r10

mse16x16_loop:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# Now compute sse.
    vsububs v2, v4, v5
    vsububs v3, v5, v4
    vor     v2, v2, v3

    vmsumubm v9, v2, v2, v9

    bdnz    mse16x16_loop

    vsumsws v9, v9, v7

    stvx    v9, 0, r1
    lwz     r3, 12(r1)

    stw     r3, 0(r7)           ;# sse

    epilogue

    blr
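
;# mse16x16 is the same row loop as variance_16 but skips the sum
;# accumulation: only the squared differences are gathered in v9, and
;# that total is both stored through *sse and returned in r3.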

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance16x16_ppc:

    prologue

    mtctr   r10

    variance_16 8, variance16x16_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance16x8_ppc:

    prologue

    li      r9, 8
    mtctr   r9

    variance_16 7, variance16x8_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance8x16_ppc:

    prologue

    li      r9, 8
    mtctr   r9

    variance_8 7, variance8x16_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance8x8_ppc:

    prologue

    li      r9, 4
    mtctr   r9

    variance_8 6, variance8x8_loop, 0

    epilogue

    blr

.macro transfer_4x4 I P
    lwz     r0, 0(\I)
    add     \I, \I, \P

    lwz     r10, 0(\I)
    add     \I, \I, \P

    lwz     r8, 0(\I)
    add     \I, \I, \P

    lwz     r9, 0(\I)

    stw     r0,  0(r1)
    stw     r10, 4(r1)
    stw     r8,  8(r1)
    stw     r9, 12(r1)
.endm
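
;# transfer_4x4 gathers a 4x4 block, one 4-byte row at a time, into the
;# 16-byte stack scratch area so a single lvx can pull all 16 pixels
;# into a vector register; the row pointer in \I is advanced as a side
;# effect.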

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance4x4_ppc:

    prologue

    transfer_4x4 r3, r4
    lvx     v4, 0, r1

    transfer_4x4 r5, r6
    lvx     v5, 0, r1

    compute_sum_sse

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srlwi   r3, r3, 4           ;# (sum*sum) >> 4
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)

    epilogue

    blr