Chromium Code Reviews

Side by Side Diff: source/libvpx/vp8/common/ppc/filter_altivec.asm

Issue 1124333011: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: only update to last night's LKGR. Created 5 years, 7 months ago
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl sixtap_predict_ppc
    .globl sixtap_predict8x4_ppc
    .globl sixtap_predict8x8_ppc
    .globl sixtap_predict16x16_ppc

.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm

.macro load_hfilter V0, V1
    load_c \V0, HFilter, r5, r9, r10

    addi    r5,  r5, 16
    lvx     \V1, r5, r10
.endm

;# Vertical filtering
.macro Vprolog
    load_c v0, VFilter, r6, r3, r10

    vspltish v5, 8
    vspltish v6, 3
    vslh    v6, v5, v6      ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v1, v0, 1
    vspltb  v2, v0, 2
    vspltb  v3, v0, 3
    vspltb  v4, v0, 4
    vspltb  v5, v0, 5
    vspltb  v0, v0, 0
.endm

.macro vpre_load
    Vprolog
    li      r10,  16
    lvx     v10,   0, r9    ;# v10..v14 = first 5 rows
    lvx     v11, r10, r9
    addi    r9,   r9, 32
    lvx     v12,   0, r9
    lvx     v13, r10, r9
    addi    r9,   r9, 32
    lvx     v14,   0, r9
.endm

.macro Msum Re, Ro, V, T, TMP
    ;# (Re,Ro) += (V*T)
    vmuleub \TMP, \V, \T    ;# trashes v8
    vadduhm \Re, \Re, \TMP  ;# Re = evens, saturation unnecessary
    vmuloub \TMP, \V, \T
    vadduhm \Ro, \Ro, \TMP  ;# Ro = odds
.endm
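
Msum is the widening multiply-accumulate at the heart of the vertical interpolator: vmuleub/vmuloub multiply the even- and odd-indexed bytes of \V and \T into 16-bit products, and vadduhm adds them (modulo) into separate even and odd accumulators. A minimal scalar model in C, with hypothetical names, for illustration only:

    /* Scalar model of Msum: (Re,Ro) += V * T, evens and odds kept apart. */
    static void msum_model(unsigned short re[8], unsigned short ro[8],
                           const unsigned char v[16], const unsigned char t[16]) {
        for (int i = 0; i < 8; i++) {
            re[i] = (unsigned short)(re[i] + v[2 * i]     * t[2 * i]);     /* vmuleub + vadduhm */
            ro[i] = (unsigned short)(ro[i] + v[2 * i + 1] * t[2 * i + 1]); /* vmuloub + vadduhm */
        }
    }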

.macro vinterp_no_store P0 P1 P2 P3 P4 P5
    vmuleub  v8, \P0, v0    ;# 64 + 4 positive taps
    vadduhm v16,  v6, v8
    vmuloub  v8, \P0, v0
    vadduhm v17,  v6, v8
    Msum v16, v17, \P2, v2, v8
    Msum v16, v17, \P3, v3, v8
    Msum v16, v17, \P5, v5, v8

    vmuleub v18, \P1, v1    ;# 2 negative taps
    vmuloub v19, \P1, v1
    Msum v18, v19, \P4, v4, v8

    vsubuhs v16, v16, v18   ;# subtract neg from pos
    vsubuhs v17, v17, v19
    vsrh    v16, v16, v7    ;# divide by 128
    vsrh    v17, v17, v7    ;# v16 v17 = evens, odds
    vmrghh  v18, v16, v17   ;# v18 v19 = 16-bit result in order
    vmrglh  v19, v16, v17
    vpkuhus \P0, v18, v19   ;# P0 = 8-bit result
.endm
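
Spelled out per pixel, the macro evaluates the 6-tap filter with the rounding constant 64 pre-added and the two negative taps (positions 1 and 4, stored as magnitudes in VFilter) subtracted with saturation. A scalar sketch of one output pixel, illustrative only:

    /* Scalar model of vinterp_no_store for one pixel: p[0..5] are the six
       input rows' bytes, f[0..5] the VFilter magnitudes for this phase. */
    static unsigned char vfilter_pixel(const unsigned char p[6],
                                       const unsigned char f[6]) {
        unsigned int pos = 64 + p[0] * f[0] + p[2] * f[2] + p[3] * f[3] + p[5] * f[5];
        unsigned int neg = p[1] * f[1] + p[4] * f[4];
        unsigned int v = pos > neg ? (pos - neg) >> 7 : 0;  /* vsubuhs saturates at 0 */
        return v > 255 ? 255 : (unsigned char)v;            /* vpkuhus saturates at 255 */
    }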

.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
    vmuleub v24, \P0, v13   ;# 64 + 4 positive taps
    vadduhm v21, v20, v24
    vmuloub v24, \P0, v13
    vadduhm v22, v20, v24
    Msum v21, v22, \P2, v15, v25
    Msum v21, v22, \P3, v16, v25
    Msum v21, v22, \P5, v18, v25

    vmuleub v23, \P1, v14   ;# 2 negative taps
    vmuloub v24, \P1, v14
    Msum v23, v24, \P4, v17, v25

    vsubuhs v21, v21, v23   ;# subtract neg from pos
    vsubuhs v22, v22, v24
    vsrh    v21, v21, v19   ;# divide by 128
    vsrh    v22, v22, v19   ;# v21 v22 = evens, odds
    vmrghh  v23, v21, v22   ;# v23 v24 = 16-bit result in order
    vmrglh  v24, v21, v22
    vpkuhus \P0, v23, v24   ;# P0 = 8-bit result
.endm


.macro Vinterp P0 P1 P2 P3 P4 P5
    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
    stvx    \P0, 0, r7
    add     r7, r7, r8      ;# 33 ops per 16 pels
.endm


.macro luma_v P0, P1, P2, P3, P4, P5
    addi    r9, r9, 16      ;# P5 = newest input row
    lvx     \P5, 0, r9
    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
.endm

.macro luma_vtwo
    luma_v v10, v11, v12, v13, v14, v15
    luma_v v11, v12, v13, v14, v15, v10
.endm

.macro luma_vfour
    luma_vtwo
    luma_v v12, v13, v14, v15, v10, v11
    luma_v v13, v14, v15, v10, v11, v12
.endm

.macro luma_vsix
    luma_vfour
    luma_v v14, v15, v10, v11, v12, v13
    luma_v v15, v10, v11, v12, v13, v14
.endm
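
The rotated register lists in luma_vtwo/vfour/vsix unroll a six-row sliding window: v10..v15 act as a ring buffer, and each luma_v call loads one new row into the register that just aged out. A scalar model of the same structure, with a hypothetical filter6 callback; a sketch, not the shipped code:

    /* Scalar model of the vpre_load + luma_v* pattern.  src points at the
       first of the five leading rows; height+5 rows must be readable. */
    static void vertical_pass_model(unsigned char *dst, int dst_pitch,
                                    const unsigned char *src, int src_pitch,
                                    int width, int height,
                                    unsigned char (*filter6)(const unsigned char taps[6])) {
        const unsigned char *ring[6];
        for (int i = 0; i < 5; i++)                  /* vpre_load: first five rows */
            ring[i] = src + i * src_pitch;
        for (int row = 0; row < height; row++) {
            ring[(row + 5) % 6] = src + (row + 5) * src_pitch;  /* luma_v: newest row */
            for (int x = 0; x < width; x++) {
                unsigned char taps[6];
                for (int k = 0; k < 6; k++)
                    taps[k] = ring[(row + k) % 6][x];
                dst[row * dst_pitch + x] = filter6(taps);
            }
        }
    }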

.macro Interp4 R I I4
    vmsummbm \R, v13, \I,  v15
    vmsummbm \R, v14, \I4, \R
.endm

.macro Read8x8 VD, RS, RP, increment_counter
    lvsl    v21, 0, \RS     ;# permute vector for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;# input can span three vectors if not aligned correctly.
    lvx     \VD, 0, \RS
    lvx     v20, r10, \RS

.if \increment_counter
    add     \RS, \RS, \RP
.endif

    vperm   \VD, \VD, v20, v21
.endm
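
Read8x8 is the classic AltiVec unaligned-load idiom: two aligned 16-byte loads bracket the row, and the permute vector produced by lvsl shifts the wanted bytes into place. The same idiom with C intrinsics (assumes <altivec.h>; a sketch for illustration):

    #include <altivec.h>

    /* Load 16 bytes from a possibly unaligned address, left-justified. */
    static vector unsigned char read_unaligned(const unsigned char *src) {
        vector unsigned char shift = vec_lvsl(0, src);  /* lvsl    */
        vector unsigned char lo    = vec_ld(0, src);    /* lvx 0   */
        vector unsigned char hi    = vec_ld(16, src);   /* lvx r10 */
        return vec_perm(lo, hi, shift);                 /* vperm   */
    }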

.macro interp_8x8 R
    vperm   v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
    vperm   v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
    Interp4 v20, v20, v21       ;# v20 = result 0 1 2 3
    vperm   \R, \R, \R, v18     ;# R = 89AB 9ABC ABCx BCxx
    Interp4 v21, v21, \R        ;# v21 = result 4 5 6 7

    vpkswus \R, v20, v21        ;# R = 0 1 2 3 4 5 6 7
    vsrh    \R, \R, v19

    vpkuhus \R, \R, \R          ;# saturate and pack

.endm
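
interp_8x8 evaluates the horizontal 6-tap filter as two vmsummbm dot products per pixel: the first four signed taps against bytes i..i+3 and the last two taps, zero-padded to four, against bytes i+4..i+7, with the rounding constant 64 pre-loaded into the accumulator. One pixel of that arithmetic in scalar C (a sketch; the shipped code saturates with vpkswus before the shift, which is equivalent over these ranges):

    /* p points two bytes before the output pixel; f is one 8-byte slice of
       an HFilter row (6 signed taps followed by 2 zero pads). */
    static unsigned char hfilter_pixel(const unsigned char *p,
                                       const signed char f[8]) {
        int acc = 64;               /* rounding, pre-added via v15 */
        for (int k = 0; k < 8; k++)
            acc += f[k] * p[k];     /* the two vmsummbm dot products */
        int v = acc >> 7;           /* vsrh: divide by 128 */
        if (v < 0)   v = 0;         /* vpkswus / vpkuhus saturation */
        if (v > 255) v = 255;
        return (unsigned char)v;
    }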

.macro Read4x4 VD, RS, RP, increment_counter
    lvsl    v21, 0, \RS     ;# permute vector for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;# input can span three vectors if not aligned correctly.
    lvx     v20, 0, \RS

.if \increment_counter
    add     \RS, \RS, \RP
.endif

    vperm   \VD, v20, v20, v21
.endm
    .text

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
sixtap_predict_ppc:
    mfspr   r11, 256        ;# get old VRSAVE
    oris    r12, r11, 0xff87
    ori     r12, r12, 0xffc0
    mtspr   256, r12        ;# set VRSAVE

    stwu    r1,-32(r1)      ;# create space on the stack

    slwi.   r5, r5, 5       ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    vertical_only_4x4

    ;# load up horizontal filter
    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15   ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi.   r6, r6, 4       ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block. If there is no
    ;#  vertical filtering, jump to storing the data. Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_4x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0, r9, r4, 1
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b       second_pass_4x4

vertical_only_4x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0, r3, r4, 1
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    slwi    r6, r6, 4       ;# index into vertical filter array

second_pass_4x4:
    load_c v20, b_hilo_4x4, 0, r9, r10
    load_c v21, b_hilo, 0, r9, r10

    ;# reposition input so that it can go through the
    ;# filtering phase with one pass.
    vperm   v0, v0, v1, v20 ;# 0 1 x x
    vperm   v2, v2, v3, v20 ;# 2 3 x x
    vperm   v4, v4, v5, v20 ;# 4 5 x x
    vperm   v6, v6, v7, v20 ;# 6 7 x x

    vperm   v0, v0, v2, v21 ;# 0 1 2 3
    vperm   v4, v4, v6, v21 ;# 4 5 6 7

    vsldoi  v1, v0, v4, 4
    vsldoi  v2, v0, v4, 8
    vsldoi  v3, v0, v4, 12

    vsldoi  v5, v4, v8, 4

    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5

    stvx    v0, 0, r1

    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 4(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 8(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 12(r1)
    stw     r0, 0(r7)

    b       exit_4x4

store_4x4:

    stvx    v2, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v3, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v4, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v5, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)

exit_4x4:

    addi    r1, r1, 32      ;# recover stack

    mtspr   256, r11        ;# reset old VRSAVE

    blr
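
The register comments above match the six integer arguments of the common vp8 sixtap prediction prototype (r3..r8 carry the first six arguments on this ABI). Stated as a C declaration, under the assumption that the calling C code declares it this way:

    /* 4x4 six-tap prediction entry point (assumed declaration). */
    extern void sixtap_predict_ppc(unsigned char *src_ptr, int src_pixels_per_line,
                                   int xoffset, int yoffset,
                                   unsigned char *dst_ptr, int dst_pitch);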

.macro w_8x8 V, D, R, P
    stvx    \V, 0, r1
    lwz     \R, 0(r1)
    stw     \R, 0(r7)
    lwz     \R, 4(r1)
    stw     \R, 4(r7)
    add     \D, \D, \P
.endm
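
w_8x8 works around the lack of an 8-byte vector store on classic AltiVec: the vector is spilled to an aligned stack slot and its first eight bytes are copied out as two 32-bit words. In C terms (a sketch of the effect, not the shipped code):

    #include <string.h>

    /* Store the low 8 bytes of a spilled 16-byte vector image to dst. */
    static void store_low8(unsigned char *dst, const unsigned char spill[16]) {
        memcpy(dst, spill, 4);          /* lwz r0, 0(r1) / stw r0, 0(r7) */
        memcpy(dst + 4, spill + 4, 4);  /* lwz r0, 4(r1) / stw r0, 4(r7) */
    }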

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

sixtap_predict8x4_ppc:
    mfspr   r11, 256        ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffc0
    mtspr   256, r12        ;# set VRSAVE

    stwu    r1,-32(r1)      ;# create space on the stack

    slwi.   r5, r5, 5       ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    second_pass_pre_copy_8x4

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15   ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi.   r6, r6, 4       ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block. If there is no
    ;#  vertical filtering, jump to storing the data. Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_8x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0, r9, r4, 1
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b       second_pass_8x4

second_pass_pre_copy_8x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0, r3, r4, 1
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1

    slwi    r6, r6, 4       ;# index into vertical filter array

second_pass_8x4:
    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
    vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
    vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
    vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8

    cmpi    cr0, r8, 8
    beq     cr0, store_aligned_8x4

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8

    b       exit_8x4

store_aligned_8x4:

    load_c v10, b_hilo, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7

    b       exit_8x4

store_8x4:
    cmpi    cr0, r8, 8
    beq     cr0, store_aligned2_8x4

    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8

    b       exit_8x4

store_aligned2_8x4:
    load_c v10, b_hilo, 0, r9, r10

    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10

    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7

exit_8x4:

    addi    r1, r1, 32      ;# recover stack

    mtspr   256, r11        ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Because the width that needs to be filtered will fit in a single AltiVec
;#  register, there is no need to loop. Everything can stay in registers.
sixtap_predict8x8_ppc:
    mfspr   r11, 256        ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffc0
    mtspr   256, r12        ;# set VRSAVE

    stwu    r1,-32(r1)      ;# create space on the stack

    slwi.   r5, r5, 5       ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    second_pass_pre_copy_8x8

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15   ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1
    Read8x8 v9, r3, r4, 1

    slwi.   r6, r6, 4       ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8
    interp_8x8 v9

    ;# Finished filtering main horizontal block. If there is no
    ;#  vertical filtering, jump to storing the data. Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_8x8

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0,  r9, r4, 1
    Read8x8 v1,  r9, r4, 0
    Read8x8 v10, r3, r4, 1
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v10
    interp_8x8 v11
    interp_8x8 v12

    b       second_pass_8x8

second_pass_pre_copy_8x8:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0,  r3, r4, 1
    Read8x8 v1,  r3, r4, 1
    Read8x8 v2,  r3, r4, 1
    Read8x8 v3,  r3, r4, 1
    Read8x8 v4,  r3, r4, 1
    Read8x8 v5,  r3, r4, 1
    Read8x8 v6,  r3, r4, 1
    Read8x8 v7,  r3, r4, 1
    Read8x8 v8,  r3, r4, 1
    Read8x8 v9,  r3, r4, 1
    Read8x8 v10, r3, r4, 1
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    slwi    r6, r6, 4       ;# index into vertical filter array

second_pass_8x8:
    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
    vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
    vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
    vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
    vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
    vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
    vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12

    cmpi    cr0, r8, 8
    beq     cr0, store_aligned_8x8

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8

    b       exit_8x8

store_aligned_8x8:

    load_c v10, b_hilo, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7

    b       exit_8x8

store_8x8:
    cmpi    cr0, r8, 8
    beq     cr0, store_aligned2_8x8

    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8
    w_8x8   v8, r7, r0, r8
    w_8x8   v9, r7, r0, r8

    b       exit_8x8

store_aligned2_8x8:
    load_c v10, b_hilo, 0, r9, r10

    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10
    vperm   v8, v8, v9, v10

    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7
    addi    r7, r7, 16
    stvx    v8, 0, r7

exit_8x8:

    addi    r1, r1, 32      ;# recover stack

    mtspr   256, r11        ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Two-pass filtering. The first pass filters horizontal edges, the second
;#  vertical edges. One of the filters can be null, but both won't be.
;#  Needs to use a temporary buffer because the source buffer can't be
;#  modified and the buffer for the destination is not large enough to
;#  hold the temporary data.
sixtap_predict16x16_ppc:
    mfspr   r11, 256        ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xf000
    mtspr   256, r12        ;# set VRSAVE

    stwu    r1,-416(r1)     ;# create space on the stack

    ;# Three possibilities
    ;#  1. First filter is null. Don't use a temp buffer.
    ;#  2. Second filter is null. Don't use a temp buffer.
    ;#  3. Neither is null; use a temp buffer.

    ;# First Pass (horizontal edge)
    ;#  setup pointers for src
    ;#  if possibility (1), then set up the src pointer to be the original
    ;#  and jump to the second pass. This is decided by whether x_offset is 0.

    ;# load up horizontal filter
    slwi.   r5, r5, 5       ;# index into horizontal filter array

    load_hfilter v4, v5

    beq-    copy_horizontal_16x21

    ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
    addi    r3, r3, -2

    slwi.   r6, r6, 4       ;# index into vertical filter array

    ;# setup constants
    ;# v14 permutation value for alignment
    load_c v14, b_hperm, 0, r9, r10

    ;# These statements assume there won't be a second pass;
    ;#  if there is one, they are overwritten before the bypass label.
    li      r0, 16          ;# prepare for no vertical filter

    ;# Change the output pointer and pitch to be the actual
    ;#  destination instead of a temporary buffer.
    addi    r9, r7, 0
    addi    r5, r8, 0

    ;# no vertical filter, so write the output from the first pass
    ;#  directly into the output buffer.
    beq-    no_vertical_filter_bypass

    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4

    ;# setup counter for the number of lines that are going to be filtered
    li      r0, 21

    ;# use the stack as temporary storage
    la      r9, 48(r1)
    li      r5, 16

no_vertical_filter_bypass:

    mtctr   r0

    ;# rounding added in on the multiply
    vspltisw v10, 8
    vspltisw v12, 3
    vslw    v12, v10, v12   ;# 0x00000040000000400000004000000040

    ;# downshift by 7 (divide by 128) at the end
    vspltish v13, 7

    ;# index to the next set of vectors in the row.
    li      r10, 16
    li      r12, 32

horizontal_loop_16x16:

    lvsl    v15, 0, r3      ;# permute vector for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     v1,   0, r3
    lvx     v2, r10, r3
    lvx     v3, r12, r3

    vperm   v8, v1, v2, v15
    vperm   v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified

    vsldoi  v11, v8, v9, 4

    ;# set 0
    vmsummbm v6, v4, v8, v12    ;# taps times elements
    vmsummbm v0, v5, v11, v6

    ;# set 1
    vsldoi  v10, v8, v9, 1
    vsldoi  v11, v8, v9, 5

    vmsummbm v6, v4, v10, v12
    vmsummbm v1, v5, v11, v6

    ;# set 2
    vsldoi  v10, v8, v9, 2
    vsldoi  v11, v8, v9, 6

    vmsummbm v6, v4, v10, v12
    vmsummbm v2, v5, v11, v6

    ;# set 3
    vsldoi  v10, v8, v9, 3
    vsldoi  v11, v8, v9, 7

    vmsummbm v6, v4, v10, v12
    vmsummbm v3, v5, v11, v6

    vpkswus v0, v0, v1      ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v1, v2, v3      ;# v1 = 2 6 A E 3 7 B F

    vsrh    v0, v0, v13     ;# divide v0, v1 by 128
    vsrh    v1, v1, v13

    vpkuhus v0, v0, v1      ;# v0 = scrambled 8-bit result
    vperm   v0, v0, v0, v14 ;# v0 = correctly-ordered result

    stvx    v0, 0, r9
    add     r9, r9, r5

    add     r3, r3, r4

    bdnz    horizontal_loop_16x16

    ;# check again to see if vertical filter needs to be done.
    cmpi    cr0, r6, 0
    beq     cr0, end_16x16

    ;# yes there is, so go to the second pass
    b       second_pass_16x16

copy_horizontal_16x21:
    li      r10, 21
    mtctr   r10

    li      r10, 16

    sub     r3, r3, r4
    sub     r3, r3, r4

    ;# this is done above if there is a horizontal filter,
    ;#  if not it needs to be done down here.
    slwi    r6, r6, 4       ;# index into vertical filter array

    ;# always write to the stack when doing a horizontal copy
    la      r9, 48(r1)

copy_horizontal_loop_16x21:
    lvsl    v15, 0, r3      ;# permute vector for alignment

    lvx     v1,   0, r3
    lvx     v2, r10, r3

    vperm   v8, v1, v2, v15

    stvx    v8, 0, r9
    addi    r9, r9, 16

    add     r3, r3, r4

    bdnz    copy_horizontal_loop_16x21

second_pass_16x16:

    ;# always read from the stack when doing a vertical filter
    la      r9, 48(r1)

    ;# downshift by 7 (divide by 128) at the end
    vspltish v7, 7

    vpre_load

    luma_vsix
    luma_vsix
    luma_vfour

end_16x16:

    addi    r1, r1, 416     ;# recover stack

    mtspr   256, r11        ;# reset old VRSAVE

    blr
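
The three cases called out at the top of this function reduce to the control flow below. This C sketch uses hypothetical hfilter_16/vfilter_16 helpers; in the shipped code the 16x21 intermediate block lives on the stack, and the vertical-only case copies rows to the stack first so the second pass always reads aligned data:

    /* Hypothetical helpers: filter `rows` rows of 16 pixels.  For the
       vertical filter, src points two rows above the first output row. */
    void hfilter_16(unsigned char *dst, int dst_pitch,
                    const unsigned char *src, int src_pitch,
                    int rows, int x_offset);
    void vfilter_16(unsigned char *dst, int dst_pitch,
                    const unsigned char *src, int src_pitch,
                    int rows, int y_offset);

    void sixtap_16x16_model(const unsigned char *src, int src_pitch,
                            int x_offset, int y_offset,
                            unsigned char *dst, int dst_pitch) {
        unsigned char tmp[16 * 21]; /* 16 output rows + 5 extra for the vertical taps */
        if (x_offset == 0) {        /* case 1: vertical filter only */
            vfilter_16(dst, dst_pitch, src - 2 * src_pitch, src_pitch, 16, y_offset);
        } else if (y_offset == 0) { /* case 2: horizontal only, straight to dst */
            hfilter_16(dst, dst_pitch, src - 2, src_pitch, 16, x_offset);
        } else {                    /* case 3: both passes via the temp buffer */
            hfilter_16(tmp, 16, src - 2 - 2 * src_pitch, src_pitch, 21, x_offset);
            vfilter_16(dst, dst_pitch, tmp, 16, 16, y_offset);
        }
    }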

    .data

    .align 4
HFilter:
    .byte     0,  0,128,  0,   0,  0,128,  0,   0,  0,128,  0,   0,  0,128,  0
    .byte     0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     0, -6,123, 12,   0, -6,123, 12,   0, -6,123, 12,   0, -6,123, 12
    .byte    -1,  0,  0,  0,  -1,  0,  0,  0,  -1,  0,  0,  0,  -1,  0,  0,  0
    .byte     2,-11,108, 36,   2,-11,108, 36,   2,-11,108, 36,   2,-11,108, 36
    .byte    -8,  1,  0,  0,  -8,  1,  0,  0,  -8,  1,  0,  0,  -8,  1,  0,  0
    .byte     0, -9, 93, 50,   0, -9, 93, 50,   0, -9, 93, 50,   0, -9, 93, 50
    .byte    -6,  0,  0,  0,  -6,  0,  0,  0,  -6,  0,  0,  0,  -6,  0,  0,  0
    .byte     3,-16, 77, 77,   3,-16, 77, 77,   3,-16, 77, 77,   3,-16, 77, 77
    .byte   -16,  3,  0,  0, -16,  3,  0,  0, -16,  3,  0,  0, -16,  3,  0,  0
    .byte     0, -6, 50, 93,   0, -6, 50, 93,   0, -6, 50, 93,   0, -6, 50, 93
    .byte    -9,  0,  0,  0,  -9,  0,  0,  0,  -9,  0,  0,  0,  -9,  0,  0,  0
    .byte     1, -8, 36,108,   1, -8, 36,108,   1, -8, 36,108,   1, -8, 36,108
    .byte   -11,  2,  0,  0, -11,  2,  0,  0, -11,  2,  0,  0, -11,  2,  0,  0
    .byte     0, -1, 12,123,   0, -1, 12,123,   0, -1, 12,123,   0, -1, 12,123
    .byte    -6,  0,  0,  0,  -6,  0,  0,  0,  -6,  0,  0,  0,  -6,  0,  0,  0

    .align 4
VFilter:
    .byte   0,  0,128,  0,   0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte   0,  6,123, 12,   1,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte   2, 11,108, 36,   8,  1,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte   0,  9, 93, 50,   6,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte   3, 16, 77, 77,  16,  3,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte   0,  6, 50, 93,   9,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte   1,  8, 36,108,  11,  2,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte   0,  1, 12,123,   6,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0

    .align 4
b_hperm:
    .byte   0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15

    .align 4
B_0123:
    .byte   0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6

    .align 4
B_4567:
    .byte   4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10

    .align 4
B_89AB:
    .byte   8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14

    .align 4
b_hilo:
    .byte   0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23

    .align 4
b_hilo_4x4:
    .byte   0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
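
For reference, the slwi shifts at the top of each function map the sub-pel offsets onto these tables: x_offset selects a 32-byte HFilter row (shift by 5) and y_offset a 16-byte VFilter row (shift by 4). As C-level accessors (hypothetical views of the tables, matching the .byte rows above):

    /* 8 sub-pel phases; HFilter taps are signed, VFilter stores magnitudes. */
    extern const signed char   HFilter[8][32];  /* slwi r5, r5, 5 */
    extern const unsigned char VFilter[8][16];  /* slwi r6, r6, 4 */

    static const signed char   *hfilter_row(int x_offset) { return HFilter[x_offset & 7]; }
    static const unsigned char *vfilter_row(int y_offset) { return VFilter[y_offset & 7]; }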