Chromium Code Reviews

Side by Side Diff: source/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm

Issue 1124333011: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: only update to last night's LKGR Created 5 years, 7 months ago
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 .globl vp8_sub_pixel_variance4x4_ppc
13 .globl vp8_sub_pixel_variance8x8_ppc
14 .globl vp8_sub_pixel_variance8x16_ppc
15 .globl vp8_sub_pixel_variance16x8_ppc
16 .globl vp8_sub_pixel_variance16x16_ppc
17
18 .macro load_c V, LABEL, OFF, R0, R1
19 lis \R0, \LABEL@ha
20 la \R1, \LABEL@l(\R0)
21 lvx \V, \OFF, \R1
22 .endm
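;# load_c materializes the absolute address of LABEL (via its @ha/@l
;# halves) and does an indexed 16-byte vector load from it. A hedged
;# AltiVec-intrinsics sketch of the same load (illustrative only):
;#
;#   vector unsigned char V = vec_ld(OFF, (const unsigned char *)&LABEL);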
23
24 .macro load_vfilter V0, V1
25 load_c \V0, vfilter_b, r6, r12, r10
26
27 addi r6, r6, 16
28 lvx \V1, r6, r10
29 .endm
30
31 .macro HProlog jump_label
32 ;# load up horizontal filter
33 slwi. r5, r5, 4 ;# index into horizontal filter array
34
35 ;# index to the next set of vectors in the row.
36 li r10, 16
37
38 ;# downshift by 7 ( divide by 128 ) at the end
39 vspltish v19, 7
40
41 ;# If there isn't any filtering to be done for the horizontal, then
42 ;# just skip to the second pass.
43 beq \jump_label
44
45 load_c v20, hfilter_b, r5, r12, r0
46
47 ;# setup constants
48 ;# v14 permutation value for alignment
49 load_c v28, b_hperm_b, 0, r12, r0
50
51 ;# index to the next set of vectors in the row.
52 li r12, 32
53
54 ;# rounding added in on the multiply
55 vspltisw v21, 8
56 vspltisw v18, 3
57 vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
58
59 slwi. r6, r6, 5 ;# index into vertical filter array
60 .endm
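;# HProlog sets up one invocation of VP8's two-pass bilinear sub-pixel
;# filter: r5 = xoffset*16 indexes a row of hfilter_b, r6 = yoffset*32
;# indexes a pair of rows in vfilter_b, and the beq skips the horizontal
;# pass entirely when xoffset is zero. An illustrative C sketch of the
;# filter selection (names are not from this file):
;#
;#   const int8_t  *hf = hfilter_b + xoffset * 16;   /* slwi. r5, r5, 4 */
;#   const uint8_t *vf = vfilter_b + yoffset * 32;   /* slwi. r6, r6, 5 */
;#   if (xoffset == 0) goto second_pass;             /* beq jump_label  */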
61
62 ;# Filters a horizontal line
63 ;# expects:
64 ;# r3 src_ptr
65 ;# r4 pitch
66 ;# r10 16
67 ;# r12 32
68 ;# v17 perm input
69 ;# v18 rounding
70 ;# v19 shift
71 ;# v20 filter taps
72 ;# v21 tmp
73 ;# v22 tmp
74 ;# v23 tmp
75 ;# v24 tmp
76 ;# v25 tmp
77 ;# v26 tmp
78 ;# v27 tmp
79 ;# v28 perm output
80 ;#
81
82 .macro hfilter_8 V, hp, lp, increment_counter
83 lvsl v17, 0, r3 ;# permutation vector for alignment
84
85 ;# input to filter is 9 bytes wide, output is 8 bytes.
86 lvx v21, 0, r3
87 lvx v22, r10, r3
88
89 .if \increment_counter
90 add r3, r3, r4
91 .endif
92 vperm v21, v21, v22, v17
93
94 vperm v24, v21, v21, \hp ;# v24 = 0123 1234 2345 3456
95 vperm v25, v21, v21, \lp ;# v25 = 4567 5678 6789 789A
96
97 vmsummbm v24, v20, v24, v18
98 vmsummbm v25, v20, v25, v18
99
100 vpkswus v24, v24, v25 ;# v24 = 0 1 2 3 4 5 6 7 (16-bit)
101
102 vsrh v24, v24, v19 ;# divide by 128
103
104 vpkuhus \V, v24, v24 ;# \V = 8-bit result, pixels 0-7 in order
105 .endm
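;# Per output pixel, hfilter_8 is a 2-tap horizontal bilinear filter in
;# 7-bit fixed point with round-to-nearest. An illustrative C sketch
;# (f0/f1 are the two taps from the selected hfilter_b row):
;#
;#   for (i = 0; i < 8; i++)
;#       out[i] = (uint8_t)((f0 * src[i] + f1 * src[i + 1] + 64) >> 7);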
106
107 .macro vfilter_16 P0 P1
108 vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
109 vadduhm v22, v18, v22
110 vmuloub v23, \P0, v20
111 vadduhm v23, v18, v23
112
113 vmuleub v24, \P1, v21
114 vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
115 vmuloub v25, \P1, v21
116 vadduhm v23, v23, v25 ;# Ro = odds
117
118 vsrh v22, v22, v19 ;# divide by 128
119 vsrh v23, v23, v19 ;# v22 v23 = evens, odds
120 vmrghh \P0, v22, v23 ;# \P0 v23 = 16-bit result in order
121 vmrglh v23, v22, v23
122 vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
123 .endm
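;# vfilter_16 is the matching 2-tap vertical pass over two rows produced
;# by the first pass. Roughly, in C (a/b are consecutive rows, f0/f1 the
;# two splatted vfilter_b taps held in v20/v21):
;#
;#   for (i = 0; i < 16; i++)
;#       out[i] = (uint8_t)((f0 * a[i] + f1 * b[i] + 64) >> 7);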
124
125 .macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
126 ;# Compute sum first. Unpack so a signed subtract
127 ;# can be used; only a halfword signed
128 ;# subtract is available. Do high, then low.
129 vmrghb \t1, \z0, \src
130 vmrghb \t2, \z0, \ref
131 vsubshs \t1, \t1, \t2
132 vsum4shs \sum, \t1, \sum
133
134 vmrglb \t1, \z0, \src
135 vmrglb \t2, \z0, \ref
136 vsubshs \t1, \t1, \t2
137 vsum4shs \sum, \t1, \sum
138
139 ;# Now compute sse.
140 vsububs \t1, \src, \ref
141 vsububs \t2, \ref, \src
142 vor \t1, \t1, \t2
143
144 vmsumubm \sse, \t1, \t1, \sse
145 .endm
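;# Per 16-byte block, compute_sum_sse accumulates the signed sum of
;# differences and the sum of squared differences (|a-b| squared equals
;# (a-b) squared, which is why the unsigned-saturating trick works).
;# An illustrative C sketch:
;#
;#   for (i = 0; i < 16; i++) {
;#       int d = src[i] - ref[i];
;#       sum += d;
;#       sse += d * d;
;#   }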
146
147 .macro variance_final sum, sse, z0, DS
148 vsumsws \sum, \sum, \z0
149 vsumsws \sse, \sse, \z0
150
151 stvx \sum, 0, r1
152 lwz r3, 12(r1)
153
154 stvx \sse, 0, r1
155 lwz r4, 12(r1)
156
157 stw r4, 0(r9) ;# sse
158
159 mullw r3, r3, r3 ;# sum*sum
160 srlwi r3, r3, \DS ;# (sum*sum) >> DS
161 subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
162 .endm
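;# variance_final folds the vector accumulators to scalars through the
;# stack and applies the usual variance formula. Roughly, in C (DS is
;# log2 of the block's pixel count: 4, 6, 7 or 8 in the callers below):
;#
;#   *sse_ptr = sse;                       /* stw r4, 0(r9)              */
;#   return sse - ((sum * sum) >> DS);     /* sse minus mean correction  */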
163
164 .macro compute_sum_sse_16 V, increment_counter
165 load_and_align_16 v16, r7, r8, \increment_counter
166 compute_sum_sse \V, v16, v18, v19, v20, v21, v23
167 .endm
168
169 .macro load_and_align_16 V, R, P, increment_counter
170 lvsl v17, 0, \R ;# permutation vector for alignment
171
172 ;# input is 16 bytes wide and need not be aligned.
173 ;# input can span two vectors if not aligned correctly.
174 lvx v21, 0, \R
175 lvx v22, r10, \R
176
177 .if \increment_counter
178 add \R, \R, \P
179 .endif
180
181 vperm \V, v21, v22, v17
182 .endm
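;# lvsl + two lvx + vperm is the standard AltiVec unaligned-load idiom:
;# the two aligned quadwords straddling the address are merged through a
;# permute. A hedged intrinsics sketch (assuming <altivec.h>):
;#
;#   vector unsigned char perm = vec_lvsl(0, ptr);
;#   vector unsigned char lo   = vec_ld(0, ptr);
;#   vector unsigned char hi   = vec_ld(16, ptr);
;#   vector unsigned char val  = vec_perm(lo, hi, perm);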
183
184 .align 2
185 ;# r3 unsigned char *src_ptr
186 ;# r4 int src_pixels_per_line
187 ;# r5 int xoffset
188 ;# r6 int yoffset
189 ;# r7 unsigned char *dst_ptr
190 ;# r8 int dst_pixels_per_line
191 ;# r9 unsigned int *sse
192 ;#
193 ;# r3 return value
194 vp8_sub_pixel_variance4x4_ppc:
195 mfspr r11, 256 ;# get old VRSAVE
196 oris r12, r11, 0xf830
197 ori r12, r12, 0xfff8
198 mtspr 256, r12 ;# set VRSAVE
199
200 stwu r1,-32(r1) ;# create space on the stack
201
202 HProlog second_pass_4x4_pre_copy_b
203
204 ;# Load up permutation constants
205 load_c v10, b_0123_b, 0, r12, r0
206 load_c v11, b_4567_b, 0, r12, r0
207
208 hfilter_8 v0, v10, v11, 1
209 hfilter_8 v1, v10, v11, 1
210 hfilter_8 v2, v10, v11, 1
211 hfilter_8 v3, v10, v11, 1
212
213 ;# Finished filtering main horizontal block. If there is no
214 ;# vertical filtering, jump straight to the sum/sse step. Otherwise
215 ;# load up and filter the additional line that is needed
216 ;# for the vertical filter.
217 beq compute_sum_sse_4x4_b
218
219 hfilter_8 v4, v10, v11, 0
220
221 b second_pass_4x4_b
222
223 second_pass_4x4_pre_copy_b:
224 slwi r6, r6, 5 ;# index into vertical filter array
225
226 load_and_align_16 v0, r3, r4, 1
227 load_and_align_16 v1, r3, r4, 1
228 load_and_align_16 v2, r3, r4, 1
229 load_and_align_16 v3, r3, r4, 1
230 load_and_align_16 v4, r3, r4, 0
231
232 second_pass_4x4_b:
233 vspltish v20, 8
234 vspltish v18, 3
235 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
236
237 load_vfilter v20, v21
238
239 vfilter_16 v0, v1
240 vfilter_16 v1, v2
241 vfilter_16 v2, v3
242 vfilter_16 v3, v4
243
244 compute_sum_sse_4x4_b:
245 vspltish v18, 0 ;# sum
246 vspltish v19, 0 ;# sse
247 vspltish v23, 0 ;# unpack
248 li r10, 16
249
250 load_and_align_16 v4, r7, r8, 1
251 load_and_align_16 v5, r7, r8, 1
252 load_and_align_16 v6, r7, r8, 1
253 load_and_align_16 v7, r7, r8, 1
254
255 vmrghb v0, v0, v1
256 vmrghb v1, v2, v3
257
258 vmrghb v2, v4, v5
259 vmrghb v3, v6, v7
260
261 load_c v10, b_hilo_b, 0, r12, r0
262
263 vperm v0, v0, v1, v10
264 vperm v1, v2, v3, v10
265
266 compute_sum_sse v0, v1, v18, v19, v20, v21, v23
267
268 variance_final v18, v19, v23, 4
269
270 addi r1, r1, 32 ;# recover stack
271 mtspr 256, r11 ;# reset old VRSAVE
272
273 blr
274
275 .align 2
276 ;# r3 unsigned char *src_ptr
277 ;# r4 int src_pixels_per_line
278 ;# r5 int xoffset
279 ;# r6 int yoffset
280 ;# r7 unsigned char *dst_ptr
281 ;# r8 int dst_pixels_per_line
282 ;# r9 unsigned int *sse
283 ;#
284 ;# r3 return value
285 vp8_sub_pixel_variance8x8_ppc:
286 mfspr r11, 256 ;# get old VRSAVE
287 oris r12, r11, 0xfff0
288 ori r12, r12, 0xffff
289 mtspr 256, r12 ;# set VRSAVE
290
291 stwu r1,-32(r1) ;# create space on the stack
292
293 HProlog second_pass_8x8_pre_copy_b
294
295 ;# Load up permutation constants
296 load_c v10, b_0123_b, 0, r12, r0
297 load_c v11, b_4567_b, 0, r12, r0
298
299 hfilter_8 v0, v10, v11, 1
300 hfilter_8 v1, v10, v11, 1
301 hfilter_8 v2, v10, v11, 1
302 hfilter_8 v3, v10, v11, 1
303 hfilter_8 v4, v10, v11, 1
304 hfilter_8 v5, v10, v11, 1
305 hfilter_8 v6, v10, v11, 1
306 hfilter_8 v7, v10, v11, 1
307
308 ;# Finished filtering main horizontal block. If there is no
309 ;# vertical filtering, jump straight to the sum/sse step. Otherwise
310 ;# load up and filter the additional line that is needed
311 ;# for the vertical filter.
312 beq compute_sum_sse_8x8_b
313
314 hfilter_8 v8, v10, v11, 0
315
316 b second_pass_8x8_b
317
318 second_pass_8x8_pre_copy_b:
319 slwi. r6, r6, 5 ;# index into vertical filter array
320
321 load_and_align_16 v0, r3, r4, 1
322 load_and_align_16 v1, r3, r4, 1
323 load_and_align_16 v2, r3, r4, 1
324 load_and_align_16 v3, r3, r4, 1
325 load_and_align_16 v4, r3, r4, 1
326 load_and_align_16 v5, r3, r4, 1
327 load_and_align_16 v6, r3, r4, 1
328 load_and_align_16 v7, r3, r4, 1
329 load_and_align_16 v8, r3, r4, 0
330
331 beq compute_sum_sse_8x8_b
332
333 second_pass_8x8_b:
334 vspltish v20, 8
335 vspltish v18, 3
336 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
337
338 load_vfilter v20, v21
339
340 vfilter_16 v0, v1
341 vfilter_16 v1, v2
342 vfilter_16 v2, v3
343 vfilter_16 v3, v4
344 vfilter_16 v4, v5
345 vfilter_16 v5, v6
346 vfilter_16 v6, v7
347 vfilter_16 v7, v8
348
349 compute_sum_sse_8x8_b:
350 vspltish v18, 0 ;# sum
351 vspltish v19, 0 ;# sse
352 vspltish v23, 0 ;# unpack
353 li r10, 16
354
355 vmrghb v0, v0, v1
356 vmrghb v1, v2, v3
357 vmrghb v2, v4, v5
358 vmrghb v3, v6, v7
359
360 load_and_align_16 v4, r7, r8, 1
361 load_and_align_16 v5, r7, r8, 1
362 load_and_align_16 v6, r7, r8, 1
363 load_and_align_16 v7, r7, r8, 1
364 load_and_align_16 v8, r7, r8, 1
365 load_and_align_16 v9, r7, r8, 1
366 load_and_align_16 v10, r7, r8, 1
367 load_and_align_16 v11, r7, r8, 0
368
369 vmrghb v4, v4, v5
370 vmrghb v5, v6, v7
371 vmrghb v6, v8, v9
372 vmrghb v7, v10, v11
373
374 compute_sum_sse v0, v4, v18, v19, v20, v21, v23
375 compute_sum_sse v1, v5, v18, v19, v20, v21, v23
376 compute_sum_sse v2, v6, v18, v19, v20, v21, v23
377 compute_sum_sse v3, v7, v18, v19, v20, v21, v23
378
379 variance_final v18, v19, v23, 6
380
381 addi r1, r1, 32 ;# recover stack
382 mtspr 256, r11 ;# reset old VRSAVE
383 blr
384
385 .align 2
386 ;# r3 unsigned char *src_ptr
387 ;# r4 int src_pixels_per_line
388 ;# r5 int xoffset
389 ;# r6 int yoffset
390 ;# r7 unsigned char *dst_ptr
391 ;# r8 int dst_pixels_per_line
392 ;# r9 unsigned int *sse
393 ;#
394 ;# r3 return value
395 vp8_sub_pixel_variance8x16_ppc:
396 mfspr r11, 256 ;# get old VRSAVE
397 oris r12, r11, 0xffff
398 ori r12, r12, 0xfffc
399 mtspr 256, r12 ;# set VRSAVE
400
401 stwu r1,-32(r1) ;# create space on the stack
402
403 HProlog second_pass_8x16_pre_copy_b
404
405 ;# Load up permutation constants
406 load_c v29, b_0123_b, 0, r12, r0
407 load_c v30, b_4567_b, 0, r12, r0
408
409 hfilter_8 v0, v29, v30, 1
410 hfilter_8 v1, v29, v30, 1
411 hfilter_8 v2, v29, v30, 1
412 hfilter_8 v3, v29, v30, 1
413 hfilter_8 v4, v29, v30, 1
414 hfilter_8 v5, v29, v30, 1
415 hfilter_8 v6, v29, v30, 1
416 hfilter_8 v7, v29, v30, 1
417 hfilter_8 v8, v29, v30, 1
418 hfilter_8 v9, v29, v30, 1
419 hfilter_8 v10, v29, v30, 1
420 hfilter_8 v11, v29, v30, 1
421 hfilter_8 v12, v29, v30, 1
422 hfilter_8 v13, v29, v30, 1
423 hfilter_8 v14, v29, v30, 1
424 hfilter_8 v15, v29, v30, 1
425
426 ;# Finished filtering main horizontal block. If there is no
427 ;# vertical filtering, jump straight to the sum/sse step. Otherwise
428 ;# load up and filter the additional line that is needed
429 ;# for the vertical filter.
430 beq compute_sum_sse_8x16_b
431
432 hfilter_8 v16, v29, v30, 0
433
434 b second_pass_8x16_b
435
436 second_pass_8x16_pre_copy_b:
437 slwi. r6, r6, 5 ;# index into vertical filter array
438
439 load_and_align_16 v0, r3, r4, 1
440 load_and_align_16 v1, r3, r4, 1
441 load_and_align_16 v2, r3, r4, 1
442 load_and_align_16 v3, r3, r4, 1
443 load_and_align_16 v4, r3, r4, 1
444 load_and_align_16 v5, r3, r4, 1
445 load_and_align_16 v6, r3, r4, 1
446 load_and_align_16 v7, r3, r4, 1
447 load_and_align_16 v8, r3, r4, 1
448 load_and_align_16 v9, r3, r4, 1
449 load_and_align_16 v10, r3, r4, 1
450 load_and_align_16 v11, r3, r4, 1
451 load_and_align_16 v12, r3, r4, 1
452 load_and_align_16 v13, r3, r4, 1
453 load_and_align_16 v14, r3, r4, 1
454 load_and_align_16 v15, r3, r4, 1
455 load_and_align_16 v16, r3, r4, 0
456
457 beq compute_sum_sse_8x16_b
458
459 second_pass_8x16_b:
460 vspltish v20, 8
461 vspltish v18, 3
462 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
463
464 load_vfilter v20, v21
465
466 vfilter_16 v0, v1
467 vfilter_16 v1, v2
468 vfilter_16 v2, v3
469 vfilter_16 v3, v4
470 vfilter_16 v4, v5
471 vfilter_16 v5, v6
472 vfilter_16 v6, v7
473 vfilter_16 v7, v8
474 vfilter_16 v8, v9
475 vfilter_16 v9, v10
476 vfilter_16 v10, v11
477 vfilter_16 v11, v12
478 vfilter_16 v12, v13
479 vfilter_16 v13, v14
480 vfilter_16 v14, v15
481 vfilter_16 v15, v16
482
483 compute_sum_sse_8x16_b:
484 vspltish v18, 0 ;# sum
485 vspltish v19, 0 ;# sse
486 vspltish v23, 0 ;# unpack
487 li r10, 16
488
489 vmrghb v0, v0, v1
490 vmrghb v1, v2, v3
491 vmrghb v2, v4, v5
492 vmrghb v3, v6, v7
493 vmrghb v4, v8, v9
494 vmrghb v5, v10, v11
495 vmrghb v6, v12, v13
496 vmrghb v7, v14, v15
497
498 load_and_align_16 v8, r7, r8, 1
499 load_and_align_16 v9, r7, r8, 1
500 load_and_align_16 v10, r7, r8, 1
501 load_and_align_16 v11, r7, r8, 1
502 load_and_align_16 v12, r7, r8, 1
503 load_and_align_16 v13, r7, r8, 1
504 load_and_align_16 v14, r7, r8, 1
505 load_and_align_16 v15, r7, r8, 1
506
507 vmrghb v8, v8, v9
508 vmrghb v9, v10, v11
509 vmrghb v10, v12, v13
510 vmrghb v11, v14, v15
511
512 compute_sum_sse v0, v8, v18, v19, v20, v21, v23
513 compute_sum_sse v1, v9, v18, v19, v20, v21, v23
514 compute_sum_sse v2, v10, v18, v19, v20, v21, v23
515 compute_sum_sse v3, v11, v18, v19, v20, v21, v23
516
517 load_and_align_16 v8, r7, r8, 1
518 load_and_align_16 v9, r7, r8, 1
519 load_and_align_16 v10, r7, r8, 1
520 load_and_align_16 v11, r7, r8, 1
521 load_and_align_16 v12, r7, r8, 1
522 load_and_align_16 v13, r7, r8, 1
523 load_and_align_16 v14, r7, r8, 1
524 load_and_align_16 v15, r7, r8, 0
525
526 vmrghb v8, v8, v9
527 vmrghb v9, v10, v11
528 vmrghb v10, v12, v13
529 vmrghb v11, v14, v15
530
531 compute_sum_sse v4, v8, v18, v19, v20, v21, v23
532 compute_sum_sse v5, v9, v18, v19, v20, v21, v23
533 compute_sum_sse v6, v10, v18, v19, v20, v21, v23
534 compute_sum_sse v7, v11, v18, v19, v20, v21, v23
535
536 variance_final v18, v19, v23, 7
537
538 addi r1, r1, 32 ;# recover stack
539 mtspr 256, r11 ;# reset old VRSAVE
540 blr
541
542 ;# Filters a horizontal line
543 ;# expects:
544 ;# r3 src_ptr
545 ;# r4 pitch
546 ;# r10 16
547 ;# r12 32
548 ;# v17 perm input
549 ;# v18 rounding
550 ;# v19 shift
551 ;# v20 filter taps
552 ;# v21 tmp
553 ;# v22 tmp
554 ;# v23 tmp
555 ;# v24 tmp
556 ;# v25 tmp
557 ;# v26 tmp
558 ;# v27 tmp
559 ;# v28 perm output
560 ;#
561 .macro hfilter_16 V, increment_counter
562
563 lvsl v17, 0, r3 ;# permutation vector for alignment
564
565 ;# input to filter is 21 bytes wide, output is 16 bytes.
566 ;# input can span three vectors if not aligned correctly.
567 lvx v21, 0, r3
568 lvx v22, r10, r3
569 lvx v23, r12, r3
570
571 .if \increment_counter
572 add r3, r3, r4
573 .endif
574 vperm v21, v21, v22, v17
575 vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
576
577 ;# set 0
578 vmsummbm v24, v20, v21, v18 ;# taps times elements
579
580 ;# set 1
581 vsldoi v23, v21, v22, 1
582 vmsummbm v25, v20, v23, v18
583
584 ;# set 2
585 vsldoi v23, v21, v22, 2
586 vmsummbm v26, v20, v23, v18
587
588 ;# set 3
589 vsldoi v23, v21, v22, 3
590 vmsummbm v27, v20, v23, v18
591
592 vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
593 vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
594
595 vsrh v24, v24, v19 ;# divide by 128
596 vsrh v25, v25, v19
597
598 vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
599 vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
600 .endm
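;# hfilter_16 is the 16-pixel-wide variant of hfilter_8: the same 2-tap
;# filter, but vmsummbm evaluates pixels four apart, so the packed bytes
;# come out interleaved (0 4 8 C 1 5 9 D 2 6 A E 3 7 B F) and the final
;# vperm with b_hperm_b restores raster order; illustratively,
;#
;#   out[j] = scrambled[(j & 3) * 4 + (j >> 2)];   /* j = 0..15 */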
601
602 .align 2
603 ;# r3 unsigned char *src_ptr
604 ;# r4 int src_pixels_per_line
605 ;# r5 int xoffset
606 ;# r6 int yoffset
607 ;# r7 unsigned char *dst_ptr
608 ;# r8 int dst_pixels_per_line
609 ;# r9 unsigned int *sse
610 ;#
611 ;# r3 return value
612 vp8_sub_pixel_variance16x8_ppc:
613 mfspr r11, 256 ;# get old VRSAVE
614 oris r12, r11, 0xffff
615 ori r12, r12, 0xfff8
616 mtspr 256, r12 ;# set VRSAVE
617
618 stwu r1, -32(r1) ;# create space on the stack
619
620 HProlog second_pass_16x8_pre_copy_b
621
622 hfilter_16 v0, 1
623 hfilter_16 v1, 1
624 hfilter_16 v2, 1
625 hfilter_16 v3, 1
626 hfilter_16 v4, 1
627 hfilter_16 v5, 1
628 hfilter_16 v6, 1
629 hfilter_16 v7, 1
630
631 ;# Finished filtering main horizontal block. If there is no
632 ;# vertical filtering, jump straight to the sum/sse step. Otherwise
633 ;# load up and filter the additional line that is needed
634 ;# for the vertical filter.
635 beq compute_sum_sse_16x8_b
636
637 hfilter_16 v8, 0
638
639 b second_pass_16x8_b
640
641 second_pass_16x8_pre_copy_b:
642 slwi. r6, r6, 5 ;# index into vertical filter array
643
644 load_and_align_16 v0, r3, r4, 1
645 load_and_align_16 v1, r3, r4, 1
646 load_and_align_16 v2, r3, r4, 1
647 load_and_align_16 v3, r3, r4, 1
648 load_and_align_16 v4, r3, r4, 1
649 load_and_align_16 v5, r3, r4, 1
650 load_and_align_16 v6, r3, r4, 1
651 load_and_align_16 v7, r3, r4, 1
652 load_and_align_16 v8, r3, r4, 1
653
654 beq compute_sum_sse_16x8_b
655
656 second_pass_16x8_b:
657 vspltish v20, 8
658 vspltish v18, 3
659 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
660
661 load_vfilter v20, v21
662
663 vfilter_16 v0, v1
664 vfilter_16 v1, v2
665 vfilter_16 v2, v3
666 vfilter_16 v3, v4
667 vfilter_16 v4, v5
668 vfilter_16 v5, v6
669 vfilter_16 v6, v7
670 vfilter_16 v7, v8
671
672 compute_sum_sse_16x8_b:
673 vspltish v18, 0 ;# sum
674 vspltish v19, 0 ;# sse
675 vspltish v23, 0 ;# unpack
676 li r10, 16
677
678 compute_sum_sse_16 v0, 1
679 compute_sum_sse_16 v1, 1
680 compute_sum_sse_16 v2, 1
681 compute_sum_sse_16 v3, 1
682 compute_sum_sse_16 v4, 1
683 compute_sum_sse_16 v5, 1
684 compute_sum_sse_16 v6, 1
685 compute_sum_sse_16 v7, 0
686
687 variance_final v18, v19, v23, 7
688
689 addi r1, r1, 32 ;# recover stack
690
691 mtspr 256, r11 ;# reset old VRSAVE
692
693 blr
694
695 .align 2
696 ;# r3 unsigned char *src_ptr
697 ;# r4 int src_pixels_per_line
698 ;# r5 int xoffset
699 ;# r6 int yoffset
700 ;# r7 unsigned char *dst_ptr
701 ;# r8 int dst_pixels_per_line
702 ;# r9 unsigned int *sse
703 ;#
704 ;# r3 return value
705 vp8_sub_pixel_variance16x16_ppc:
706 mfspr r11, 256 ;# get old VRSAVE
707 oris r12, r11, 0xffff
708 ori r12, r12, 0xfff8
709 mtspr 256, r12 ;# set VRSAVE
710
711 stwu r1, -32(r1) ;# create space on the stack
712
713 HProlog second_pass_16x16_pre_copy_b
714
715 hfilter_16 v0, 1
716 hfilter_16 v1, 1
717 hfilter_16 v2, 1
718 hfilter_16 v3, 1
719 hfilter_16 v4, 1
720 hfilter_16 v5, 1
721 hfilter_16 v6, 1
722 hfilter_16 v7, 1
723 hfilter_16 v8, 1
724 hfilter_16 v9, 1
725 hfilter_16 v10, 1
726 hfilter_16 v11, 1
727 hfilter_16 v12, 1
728 hfilter_16 v13, 1
729 hfilter_16 v14, 1
730 hfilter_16 v15, 1
731
732 ;# Finished filtering main horizontal block. If there is no
733 ;# vertical filtering, jump straight to the sum/sse step. Otherwise
734 ;# load up and filter the additional line that is needed
735 ;# for the vertical filter.
736 beq compute_sum_sse_16x16_b
737
738 hfilter_16 v16, 0
739
740 b second_pass_16x16_b
741
742 second_pass_16x16_pre_copy_b:
743 slwi. r6, r6, 5 ;# index into vertical filter array
744
745 load_and_align_16 v0, r3, r4, 1
746 load_and_align_16 v1, r3, r4, 1
747 load_and_align_16 v2, r3, r4, 1
748 load_and_align_16 v3, r3, r4, 1
749 load_and_align_16 v4, r3, r4, 1
750 load_and_align_16 v5, r3, r4, 1
751 load_and_align_16 v6, r3, r4, 1
752 load_and_align_16 v7, r3, r4, 1
753 load_and_align_16 v8, r3, r4, 1
754 load_and_align_16 v9, r3, r4, 1
755 load_and_align_16 v10, r3, r4, 1
756 load_and_align_16 v11, r3, r4, 1
757 load_and_align_16 v12, r3, r4, 1
758 load_and_align_16 v13, r3, r4, 1
759 load_and_align_16 v14, r3, r4, 1
760 load_and_align_16 v15, r3, r4, 1
761 load_and_align_16 v16, r3, r4, 0
762
763 beq compute_sum_sse_16x16_b
764
765 second_pass_16x16_b:
766 vspltish v20, 8
767 vspltish v18, 3
768 vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
769
770 load_vfilter v20, v21
771
772 vfilter_16 v0, v1
773 vfilter_16 v1, v2
774 vfilter_16 v2, v3
775 vfilter_16 v3, v4
776 vfilter_16 v4, v5
777 vfilter_16 v5, v6
778 vfilter_16 v6, v7
779 vfilter_16 v7, v8
780 vfilter_16 v8, v9
781 vfilter_16 v9, v10
782 vfilter_16 v10, v11
783 vfilter_16 v11, v12
784 vfilter_16 v12, v13
785 vfilter_16 v13, v14
786 vfilter_16 v14, v15
787 vfilter_16 v15, v16
788
789 compute_sum_sse_16x16_b:
790 vspltish v18, 0 ;# sum
791 vspltish v19, 0 ;# sse
792 vspltish v23, 0 ;# unpack
793 li r10, 16
794
795 compute_sum_sse_16 v0, 1
796 compute_sum_sse_16 v1, 1
797 compute_sum_sse_16 v2, 1
798 compute_sum_sse_16 v3, 1
799 compute_sum_sse_16 v4, 1
800 compute_sum_sse_16 v5, 1
801 compute_sum_sse_16 v6, 1
802 compute_sum_sse_16 v7, 1
803 compute_sum_sse_16 v8, 1
804 compute_sum_sse_16 v9, 1
805 compute_sum_sse_16 v10, 1
806 compute_sum_sse_16 v11, 1
807 compute_sum_sse_16 v12, 1
808 compute_sum_sse_16 v13, 1
809 compute_sum_sse_16 v14, 1
810 compute_sum_sse_16 v15, 0
811
812 variance_final v18, v19, v23, 8
813
814 addi r1, r1, 32 ;# recover stack
815
816 mtspr 256, r11 ;# reset old VRSAVE
817
818 blr
819
820 .data
821
822 .align 4
823 hfilter_b:
824 .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
825 .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
826 .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
827 .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
828 .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
829 .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
830 .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
831 .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
832
833 .align 4
834 vfilter_b:
835 .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
836 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
837 .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
838 .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
839 .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
840 .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
841 .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
842 .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
843 .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
844 .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
845 .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
846 .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
847 .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
848 .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
849 .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
850 .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
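;# Both tap tables encode the eight sub-pixel positions of VP8's bilinear
;# filter: for offset k the taps are (128 - 16*k, 16*k), which always sum
;# to 128 and so pair with the +64 round and >>7 used above. An
;# illustrative generator for the tap pairs:
;#
;#   for (k = 0; k < 8; k++) {
;#       taps[k][0] = 128 - 16 * k;
;#       taps[k][1] = 16 * k;
;#   }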
851
852 .align 4
853 b_hperm_b:
854 .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
855
856 .align 4
857 b_0123_b:
858 .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
859
860 .align 4
861 b_4567_b:
862 .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
863
864 b_hilo_b:
865 .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23