;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    .globl sixtap_predict_ppc
    .globl sixtap_predict8x4_ppc
    .globl sixtap_predict8x8_ppc
    .globl sixtap_predict16x16_ppc

.macro load_c V, LABEL, OFF, R0, R1
    lis \R0, \LABEL@ha
    la \R1, \LABEL@l(\R0)
    lvx \V, \OFF, \R1
.endm

.macro load_hfilter V0, V1
    load_c \V0, HFilter, r5, r9, r10

    addi r5, r5, 16
    lvx \V1, r5, r10
.endm
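
;# Callers scale x_offset by 32 (slwi r5, r5, 5) before using load_hfilter:
;# each HFilter entry is 32 bytes, split across two vectors so the 4-tap and
;# 2-tap groups can be fed to vmsummbm separately.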

;# Vertical filtering
.macro Vprolog
    load_c v0, VFilter, r6, r3, r10

    vspltish v5, 8
    vspltish v6, 3
    vslh v6, v5, v6    ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb v1, v0, 1
    vspltb v2, v0, 2
    vspltb v3, v0, 3
    vspltb v4, v0, 4
    vspltb v5, v0, 5
    vspltb v0, v0, 0
.endm
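
;# After Vprolog, v0..v5 hold the six vertical taps splatted across every
;# byte lane and v6 holds the 0x0040 rounding bias added before the final
;# shift right by 7.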

.macro vpre_load
    Vprolog
    li r10, 16
    lvx v10, 0, r9    ;# v10..v14 = first 5 rows
    lvx v11, r10, r9
    addi r9, r9, 32
    lvx v12, 0, r9
    lvx v13, r10, r9
    addi r9, r9, 32
    lvx v14, 0, r9
.endm
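
;# vpre_load expects r9 to point at the temporary row buffer filled by the
;# first pass (rows stored 16 bytes apart); it loads the first five rows into
;# v10..v14 and leaves r9 on the last row loaded so luma_v can step forward.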

.macro Msum Re, Ro, V, T, TMP
    ;# (Re,Ro) += (V*T)
    vmuleub \TMP, \V, \T    ;# \TMP is clobbered
    vadduhm \Re, \Re, \TMP    ;# Re = evens, saturation unnecessary
    vmuloub \TMP, \V, \T
    vadduhm \Ro, \Ro, \TMP    ;# Ro = odds
.endm
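
;# The vertical filter keeps even- and odd-numbered pixels in separate 16-bit
;# accumulators (vmuleub/vmuloub produce the even- and odd-element products);
;# the two halves are re-interleaved with vmrghh/vmrglh after the taps are
;# summed.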

.macro vinterp_no_store P0 P1 P2 P3 P4 P5
    vmuleub v8, \P0, v0    ;# 64 + 4 positive taps
    vadduhm v16, v6, v8
    vmuloub v8, \P0, v0
    vadduhm v17, v6, v8
    Msum v16, v17, \P2, v2, v8
    Msum v16, v17, \P3, v3, v8
    Msum v16, v17, \P5, v5, v8

    vmuleub v18, \P1, v1    ;# 2 negative taps
    vmuloub v19, \P1, v1
    Msum v18, v19, \P4, v4, v8

    vsubuhs v16, v16, v18    ;# subtract neg from pos
    vsubuhs v17, v17, v19
    vsrh v16, v16, v7    ;# divide by 128
    vsrh v17, v17, v7    ;# v16 v17 = evens, odds
    vmrghh v18, v16, v17    ;# v18 v19 = 16-bit result in order
    vmrglh v19, v16, v17
    vpkuhus \P0, v18, v19    ;# P0 = 8-bit result
.endm

.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
    vmuleub v24, \P0, v13    ;# 64 + 4 positive taps
    vadduhm v21, v20, v24
    vmuloub v24, \P0, v13
    vadduhm v22, v20, v24
    Msum v21, v22, \P2, v15, v25
    Msum v21, v22, \P3, v16, v25
    Msum v21, v22, \P5, v18, v25

    vmuleub v23, \P1, v14    ;# 2 negative taps
    vmuloub v24, \P1, v14
    Msum v23, v24, \P4, v17, v25

    vsubuhs v21, v21, v23    ;# subtract neg from pos
    vsubuhs v22, v22, v24
    vsrh v21, v21, v19    ;# divide by 128
    vsrh v22, v22, v19    ;# v21 v22 = evens, odds
    vmrghh v23, v21, v22    ;# v23 v24 = 16-bit result in order
    vmrglh v24, v21, v22
    vpkuhus \P0, v23, v24    ;# P0 = 8-bit result
.endm


.macro Vinterp P0 P1 P2 P3 P4 P5
    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
    stvx \P0, 0, r7
    add r7, r7, r8    ;# 33 ops per 16 pels
.endm


.macro luma_v P0, P1, P2, P3, P4, P5
    addi r9, r9, 16    ;# P5 = newest input row
    lvx \P5, 0, r9
    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
.endm

.macro luma_vtwo
    luma_v v10, v11, v12, v13, v14, v15
    luma_v v11, v12, v13, v14, v15, v10
.endm

.macro luma_vfour
    luma_vtwo
    luma_v v12, v13, v14, v15, v10, v11
    luma_v v13, v14, v15, v10, v11, v12
.endm

.macro luma_vsix
    luma_vfour
    luma_v v14, v15, v10, v11, v12, v13
    luma_v v15, v10, v11, v12, v13, v14
.endm

.macro Interp4 R I I4
    vmsummbm \R, v13, \I, v15
    vmsummbm \R, v14, \I4, \R
.endm
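
;# Interp4 computes four 32-bit dot products at once: vmsummbm multiplies the
;# signed taps in v13/v14 by the unsigned pixel windows in \I and \I4 and sums
;# them into each word, starting from the 0x40 rounding bias kept in v15.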

.macro Read8x8 VD, RS, RP, increment_counter
    lvsl v21, 0, \RS    ;# permute value for alignment

    ;# the source row is not necessarily aligned, so gather it from the two
    ;# 16-byte vectors it can span and left-justify it with vperm.
    lvx \VD, 0, \RS
    lvx v20, r10, \RS

.if \increment_counter
    add \RS, \RS, \RP
.endif

    vperm \VD, \VD, v20, v21
.endm

.macro interp_8x8 R
    vperm v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
    vperm v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
    Interp4 v20, v20, v21    ;# v20 = result 0 1 2 3
    vperm \R, \R, \R, v18    ;# R = 89AB 9ABC ABCx BCxx
    Interp4 v21, v21, \R    ;# v21 = result 4 5 6 7

    vpkswus \R, v20, v21    ;# R = 0 1 2 3 4 5 6 7
    vsrh \R, \R, v19

    vpkuhus \R, \R, \R    ;# saturate and pack
.endm
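
;# interp_8x8 horizontally filters one row of eight output pixels: the
;# B_0123/B_4567/B_89AB permutes in v16/v17/v18 build the sliding 4-byte
;# windows each pixel needs, Interp4 applies the taps, and the results are
;# packed, shifted right by 7 (v19) and saturated back to bytes.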

.macro Read4x4 VD, RS, RP, increment_counter
    lvsl v21, 0, \RS    ;# permute value for alignment

    ;# left-justify the (possibly unaligned) source bytes within the
    ;# 16-byte vector that holds them.
    lvx v20, 0, \RS

.if \increment_counter
    add \RS, \RS, \RP
.endif

    vperm \VD, v20, v20, v21
.endm
    .text

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
sixtap_predict_ppc:
    mfspr r11, 256    ;# get old VRSAVE
    oris r12, r11, 0xff87
    ori r12, r12, 0xffc0
    mtspr 256, r12    ;# set VRSAVE

    stwu r1, -32(r1)    ;# create space on the stack

    slwi. r5, r5, 5    ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;# just skip to the second pass.
    beq- vertical_only_4x4

    ;# load up horizontal filter
    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw v15, v16, v15    ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
    addi r3, r3, -2

    addi r9, r3, 0
    li r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi. r6, r6, 4    ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block. If there is no
    ;# vertical filtering, jump to storing the data. Otherwise
    ;# load up and filter the additional 5 lines that are needed
    ;# for the vertical filter.
    beq- store_4x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub r9, r9, r4
    sub r9, r9, r4

    Read8x8 v0, r9, r4, 1
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b second_pass_4x4

vertical_only_4x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub r3, r3, r4
    sub r3, r3, r4
    li r10, 16

    Read8x8 v0, r3, r4, 1
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    slwi r6, r6, 4    ;# index into vertical filter array

second_pass_4x4:
    load_c v20, b_hilo_4x4, 0, r9, r10
    load_c v21, b_hilo, 0, r9, r10

    ;# reposition input so that it can go through the
    ;# filtering phase with one pass.
    vperm v0, v0, v1, v20    ;# 0 1 x x
    vperm v2, v2, v3, v20    ;# 2 3 x x
    vperm v4, v4, v5, v20    ;# 4 5 x x
    vperm v6, v6, v7, v20    ;# 6 7 x x

    vperm v0, v0, v2, v21    ;# 0 1 2 3
    vperm v4, v4, v6, v21    ;# 4 5 6 7

    vsldoi v1, v0, v4, 4
    vsldoi v2, v0, v4, 8
    vsldoi v3, v0, v4, 12

    vsldoi v5, v4, v8, 4

    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh v20, v15, v20    ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb v14, v13, 1
    vspltb v15, v13, 2
    vspltb v16, v13, 3
    vspltb v17, v13, 4
    vspltb v18, v13, 5
    vspltb v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5

    stvx v0, 0, r1

    lwz r0, 0(r1)
    stw r0, 0(r7)
    add r7, r7, r8

    lwz r0, 4(r1)
    stw r0, 0(r7)
    add r7, r7, r8

    lwz r0, 8(r1)
    stw r0, 0(r7)
    add r7, r7, r8

    lwz r0, 12(r1)
    stw r0, 0(r7)

    b exit_4x4

store_4x4:

    stvx v2, 0, r1
    lwz r0, 0(r1)
    stw r0, 0(r7)
    add r7, r7, r8

    stvx v3, 0, r1
    lwz r0, 0(r1)
    stw r0, 0(r7)
    add r7, r7, r8

    stvx v4, 0, r1
    lwz r0, 0(r1)
    stw r0, 0(r7)
    add r7, r7, r8

    stvx v5, 0, r1
    lwz r0, 0(r1)
    stw r0, 0(r7)

exit_4x4:

    addi r1, r1, 32    ;# recover stack

    mtspr 256, r11    ;# reset old VRSAVE

    blr

.macro w_8x8 V, D, R, P
    stvx \V, 0, r1
    lwz \R, 0(r1)
    stw \R, 0(r7)
    lwz \R, 4(r1)
    stw \R, 4(r7)
    add \D, \D, \P
.endm
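
;# w_8x8 writes the first 8 bytes of \V to the current destination row: the
;# vector is staged through the stack and copied as two word stores, so the
;# destination does not have to be 16-byte aligned; the row pointer is then
;# advanced by the pitch.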

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

sixtap_predict8x4_ppc:
    mfspr r11, 256    ;# get old VRSAVE
    oris r12, r11, 0xffff
    ori r12, r12, 0xffc0
    mtspr 256, r12    ;# set VRSAVE

    stwu r1, -32(r1)    ;# create space on the stack

    slwi. r5, r5, 5    ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;# just skip to the second pass.
    beq- second_pass_pre_copy_8x4

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw v15, v16, v15    ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
    addi r3, r3, -2

    addi r9, r3, 0
    li r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi. r6, r6, 4    ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block. If there is no
    ;# vertical filtering, jump to storing the data. Otherwise
    ;# load up and filter the additional 5 lines that are needed
    ;# for the vertical filter.
    beq- store_8x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub r9, r9, r4
    sub r9, r9, r4

    Read8x8 v0, r9, r4, 1
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b second_pass_8x4

second_pass_pre_copy_8x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub r3, r3, r4
    sub r3, r3, r4
    li r10, 16

    Read8x8 v0, r3, r4, 1
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1

    slwi r6, r6, 4    ;# index into vertical filter array

second_pass_8x4:
    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh v20, v15, v20    ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb v14, v13, 1
    vspltb v15, v13, 2
    vspltb v16, v13, 3
    vspltb v17, v13, 4
    vspltb v18, v13, 5
    vspltb v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
    vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
    vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
    vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8

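    ;# if the destination pitch is 8, two consecutive 8-byte rows are
    ;# contiguous, so pairs of rows can be merged with the b_hilo permute and
    ;# written with single 16-byte stores.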
    cmpi cr0, r8, 8
    beq cr0, store_aligned_8x4

    w_8x8 v0, r7, r0, r8
    w_8x8 v1, r7, r0, r8
    w_8x8 v2, r7, r0, r8
    w_8x8 v3, r7, r0, r8

    b exit_8x4

store_aligned_8x4:

    load_c v10, b_hilo, 0, r9, r10

    vperm v0, v0, v1, v10
    vperm v2, v2, v3, v10

    stvx v0, 0, r7
    addi r7, r7, 16
    stvx v2, 0, r7

    b exit_8x4

store_8x4:
    cmpi cr0, r8, 8
    beq cr0, store_aligned2_8x4

    w_8x8 v2, r7, r0, r8
    w_8x8 v3, r7, r0, r8
    w_8x8 v4, r7, r0, r8
    w_8x8 v5, r7, r0, r8

    b exit_8x4

store_aligned2_8x4:
    load_c v10, b_hilo, 0, r9, r10

    vperm v2, v2, v3, v10
    vperm v4, v4, v5, v10

    stvx v2, 0, r7
    addi r7, r7, 16
    stvx v4, 0, r7

exit_8x4:

    addi r1, r1, 32    ;# recover stack

    mtspr 256, r11    ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Because the width that needs to be filtered will fit in a single AltiVec
;# register, there is no need to loop. Everything can stay in registers.
sixtap_predict8x8_ppc:
    mfspr r11, 256    ;# get old VRSAVE
    oris r12, r11, 0xffff
    ori r12, r12, 0xffc0
    mtspr 256, r12    ;# set VRSAVE

    stwu r1, -32(r1)    ;# create space on the stack

    slwi. r5, r5, 5    ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;# just skip to the second pass.
    beq- second_pass_pre_copy_8x8

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw v15, v16, v15    ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
    addi r3, r3, -2

    addi r9, r3, 0
    li r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1
    Read8x8 v9, r3, r4, 1

    slwi. r6, r6, 4    ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8
    interp_8x8 v9

    ;# Finished filtering main horizontal block. If there is no
    ;# vertical filtering, jump to storing the data. Otherwise
    ;# load up and filter the additional 5 lines that are needed
    ;# for the vertical filter.
    beq- store_8x8

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub r9, r9, r4
    sub r9, r9, r4

    Read8x8 v0, r9, r4, 1
    Read8x8 v1, r9, r4, 0
    Read8x8 v10, r3, r4, 1
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v10
    interp_8x8 v11
    interp_8x8 v12

    b second_pass_8x8

second_pass_pre_copy_8x8:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub r3, r3, r4
    sub r3, r3, r4
    li r10, 16

    Read8x8 v0, r3, r4, 1
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1
    Read8x8 v9, r3, r4, 1
    Read8x8 v10, r3, r4, 1
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    slwi r6, r6, 4    ;# index into vertical filter array

second_pass_8x8:
    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh v20, v15, v20    ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb v14, v13, 1
    vspltb v15, v13, 2
    vspltb v16, v13, 3
    vspltb v17, v13, 4
    vspltb v18, v13, 5
    vspltb v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
    vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
    vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
    vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
    vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
    vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
    vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12

    cmpi cr0, r8, 8
    beq cr0, store_aligned_8x8

    w_8x8 v0, r7, r0, r8
    w_8x8 v1, r7, r0, r8
    w_8x8 v2, r7, r0, r8
    w_8x8 v3, r7, r0, r8
    w_8x8 v4, r7, r0, r8
    w_8x8 v5, r7, r0, r8
    w_8x8 v6, r7, r0, r8
    w_8x8 v7, r7, r0, r8

    b exit_8x8

store_aligned_8x8:

    load_c v10, b_hilo, 0, r9, r10

    vperm v0, v0, v1, v10
    vperm v2, v2, v3, v10
    vperm v4, v4, v5, v10
    vperm v6, v6, v7, v10

    stvx v0, 0, r7
    addi r7, r7, 16
    stvx v2, 0, r7
    addi r7, r7, 16
    stvx v4, 0, r7
    addi r7, r7, 16
    stvx v6, 0, r7

    b exit_8x8

store_8x8:
    cmpi cr0, r8, 8
    beq cr0, store_aligned2_8x8

    w_8x8 v2, r7, r0, r8
    w_8x8 v3, r7, r0, r8
    w_8x8 v4, r7, r0, r8
    w_8x8 v5, r7, r0, r8
    w_8x8 v6, r7, r0, r8
    w_8x8 v7, r7, r0, r8
    w_8x8 v8, r7, r0, r8
    w_8x8 v9, r7, r0, r8

    b exit_8x8

store_aligned2_8x8:
    load_c v10, b_hilo, 0, r9, r10

    vperm v2, v2, v3, v10
    vperm v4, v4, v5, v10
    vperm v6, v6, v7, v10
    vperm v8, v8, v9, v10

    stvx v2, 0, r7
    addi r7, r7, 16
    stvx v4, 0, r7
    addi r7, r7, 16
    stvx v6, 0, r7
    addi r7, r7, 16
    stvx v8, 0, r7

exit_8x8:

    addi r1, r1, 32    ;# recover stack

    mtspr 256, r11    ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Two pass filtering. First pass is Horizontal edges, second pass is vertical
;# edges. One of the filters can be null, but both won't be. Needs to use a
;# temporary buffer because the source buffer can't be modified and the buffer
;# for the destination is not large enough to hold the temporary data.
sixtap_predict16x16_ppc:
    mfspr r11, 256    ;# get old VRSAVE
    oris r12, r11, 0xffff
    ori r12, r12, 0xf000
    mtspr 256, r12    ;# set VRSAVE

    stwu r1, -416(r1)    ;# create space on the stack

    ;# Three possibilities
    ;# 1. First filter is null. Don't use a temp buffer.
    ;# 2. Second filter is null. Don't use a temp buffer.
    ;# 3. Neither is null; use the temp buffer.

    ;# First Pass (horizontal edge)
    ;# set up pointers for src
    ;# if possibility (1), set up the src pointer to be the original and jump
    ;# to the second pass. This is based on whether x_offset is 0.

    ;# load up horizontal filter
    slwi. r5, r5, 5    ;# index into horizontal filter array

    load_hfilter v4, v5

    beq- copy_horizontal_16x21

    ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
    addi r3, r3, -2

    slwi. r6, r6, 4    ;# index into vertical filter array

    ;# set up constants
    ;# v14 permutation value for alignment
    load_c v14, b_hperm, 0, r9, r10

    ;# These statements assume there won't be a second pass; if there is one,
    ;# they are overridden by the block just before no_vertical_filter_bypass.
    li r0, 16    ;# prepare for no vertical filter

    ;# Change the output pointer and pitch to be the actual
    ;# destination instead of a temporary buffer.
    addi r9, r7, 0
    addi r5, r8, 0

    ;# no vertical filter, so write the output from the first pass
    ;# directly into the output buffer.
    beq- no_vertical_filter_bypass

    ;# if the second filter is not null then need to back off by 2*pitch
    sub r3, r3, r4
    sub r3, r3, r4

    ;# set up the counter for the number of lines that are going to be filtered
    li r0, 21

    ;# use the stack as temporary storage
    la r9, 48(r1)
    li r5, 16

no_vertical_filter_bypass:

    mtctr r0

    ;# rounding added in on the multiply
    vspltisw v10, 8
    vspltisw v12, 3
    vslw v12, v10, v12    ;# 0x00000040000000400000004000000040

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v13, 7

    ;# index to the next set of vectors in the row.
    li r10, 16
    li r12, 32

horizontal_loop_16x16:

    lvsl v15, 0, r3    ;# permute value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;# input can span three vectors if not aligned correctly.
    lvx v1, 0, r3
    lvx v2, r10, r3
    lvx v3, r12, r3

    vperm v8, v1, v2, v15
    vperm v9, v2, v3, v15    ;# v8 v9 = 21 input pixels left-justified

    vsldoi v11, v8, v9, 4

    ;# set 0
    vmsummbm v6, v4, v8, v12    ;# taps times elements
    vmsummbm v0, v5, v11, v6

    ;# set 1
    vsldoi v10, v8, v9, 1
    vsldoi v11, v8, v9, 5

    vmsummbm v6, v4, v10, v12
    vmsummbm v1, v5, v11, v6

    ;# set 2
    vsldoi v10, v8, v9, 2
    vsldoi v11, v8, v9, 6

    vmsummbm v6, v4, v10, v12
    vmsummbm v2, v5, v11, v6

    ;# set 3
    vsldoi v10, v8, v9, 3
    vsldoi v11, v8, v9, 7

    vmsummbm v6, v4, v10, v12
    vmsummbm v3, v5, v11, v6

    vpkswus v0, v0, v1    ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v1, v2, v3    ;# v1 = 2 6 A E 3 7 B F

    vsrh v0, v0, v13    ;# divide v0, v1 by 128
    vsrh v1, v1, v13

    vpkuhus v0, v0, v1    ;# v0 = scrambled 8-bit result
    vperm v0, v0, v0, v14    ;# v0 = correctly-ordered result

    stvx v0, 0, r9
    add r9, r9, r5

    add r3, r3, r4

    bdnz horizontal_loop_16x16

    ;# check again to see if the vertical filter needs to be done.
    cmpi cr0, r6, 0
    beq cr0, end_16x16

    ;# yes there is, so go to the second pass
    b second_pass_16x16

copy_horizontal_16x21:
    li r10, 21
    mtctr r10

    li r10, 16

    sub r3, r3, r4
    sub r3, r3, r4

    ;# this is done above if there is a horizontal filter,
    ;# if not, it needs to be done down here.
    slwi r6, r6, 4    ;# index into vertical filter array

    ;# always write to the stack when doing a horizontal copy
    la r9, 48(r1)

copy_horizontal_loop_16x21:
    lvsl v15, 0, r3    ;# permute value for alignment

    lvx v1, 0, r3
    lvx v2, r10, r3

    vperm v8, v1, v2, v15

    stvx v8, 0, r9
    addi r9, r9, 16

    add r3, r3, r4

    bdnz copy_horizontal_loop_16x21

second_pass_16x16:

    ;# always read from the stack when doing a vertical filter
    la r9, 48(r1)

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v7, 7

    vpre_load

    luma_vsix
    luma_vsix
    luma_vfour

end_16x16:

    addi r1, r1, 416    ;# recover stack

    mtspr 256, r11    ;# reset old VRSAVE

    blr

    .data

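;# HFilter holds one 32-byte entry per x_offset: the first 16 bytes repeat
;# taps 0-3 four times and the next 16 bytes repeat taps 4-5, matching the two
;# vmsummbm passes in Interp4 (negative taps are stored negative because
;# vmsummbm treats the filter operand as signed).
;# VFilter holds one 16-byte entry per y_offset with the six taps in bytes
;# 0-5, magnitudes only, since the vertical path subtracts the negative-tap
;# contributions explicitly in vinterp_no_store.
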
    .align 4
HFilter:
    .byte 0, 0, 128, 0, 0, 0, 128, 0, 0, 0, 128, 0, 0, 0, 128, 0
    .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 0, -6, 123, 12, 0, -6, 123, 12, 0, -6, 123, 12, 0, -6, 123, 12
    .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0
    .byte 2, -11, 108, 36, 2, -11, 108, 36, 2, -11, 108, 36, 2, -11, 108, 36
    .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0
    .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50
    .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
    .byte 3, -16, 77, 77, 3, -16, 77, 77, 3, -16, 77, 77, 3, -16, 77, 77
    .byte -16, 3, 0, 0, -16, 3, 0, 0, -16, 3, 0, 0, -16, 3, 0, 0
    .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93
    .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0
    .byte 1, -8, 36, 108, 1, -8, 36, 108, 1, -8, 36, 108, 1, -8, 36, 108
    .byte -11, 2, 0, 0, -11, 2, 0, 0, -11, 2, 0, 0, -11, 2, 0, 0
    .byte 0, -1, 12, 123, 0, -1, 12, 123, 0, -1, 12, 123, 0, -1, 12, 123
    .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0

    .align 4
VFilter:
    .byte 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 0, 6, 123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 2, 11, 108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 1, 8, 36, 108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    .byte 0, 1, 12, 123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

    .align 4
b_hperm:
    .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15

    .align 4
B_0123:
    .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6

    .align 4
B_4567:
    .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10

    .align 4
B_89AB:
    .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14

    .align 4
b_hilo:
    .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23

    .align 4
b_hilo_4x4:
    .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0