Chromium Code Reviews

Unified Diff: source/libvpx/vp9/common/ppc/vp9_loopfilter_filters_altivec.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 8 years ago
Index: source/libvpx/vp9/common/ppc/vp9_loopfilter_filters_altivec.asm
===================================================================
--- source/libvpx/vp9/common/ppc/vp9_loopfilter_filters_altivec.asm (revision 0)
+++ source/libvpx/vp9/common/ppc/vp9_loopfilter_filters_altivec.asm (revision 0)
@@ -0,0 +1,1253 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl mbloop_filter_horizontal_edge_y_ppc
+ .globl loop_filter_horizontal_edge_y_ppc
+ .globl mbloop_filter_vertical_edge_y_ppc
+ .globl loop_filter_vertical_edge_y_ppc
+
+ .globl mbloop_filter_horizontal_edge_uv_ppc
+ .globl loop_filter_horizontal_edge_uv_ppc
+ .globl mbloop_filter_vertical_edge_uv_ppc
+ .globl loop_filter_vertical_edge_uv_ppc
+
+ .globl loop_filter_simple_horizontal_edge_ppc
+ .globl loop_filter_simple_vertical_edge_ppc
+
+ .text
+;# We often need to perform transposes (and other transpose-like operations)
+;# on matrices of data. This is simplified by the fact that we usually
+;# operate on hunks of data whose dimensions are powers of 2, or at least
+;# divisible by highish powers of 2.
+;#
+;# These operations can be very confusing. They become more straightforward
+;# when we think of them as permutations of address bits: Concatenate a
+;# group of vector registers and think of it as occupying a block of
+;# memory beginning at address zero. The low four bits 0...3 of the
+;# address then correspond to position within a register, the higher-order
+;# address bits select the register.
+;#
+;# Although register selection, at the code level, is arbitrary, things
+;# are simpler if we use contiguous ranges of register numbers, simpler
+;# still if the low-order bits of the register number correspond to
+;# conceptual address bits. We do this whenever reasonable.
+;#
+;# A 16x16 transpose can then be thought of as an operation on
+;# a 256-element block of memory. It takes 8 bits 0...7 to address this
+;# memory and the effect of a transpose is to interchange address bit
+;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the
+;# column, which is interchanged with the row addressed by bits 4..7.
+;#
+;# The altivec merge instructions provide a rapid means of effecting
+;# many of these transforms. They operate at three widths (8,16,32).
+;# Writing V(x) for vector register #x, paired merges permute address
+;# indices as follows.
+;#
+;# 0->1 1->2 2->3 3->(4+d) (4+s)->0:
+;#
+;# vmrghb V( x), V( y), V( y + (1<<s))
+;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# =0= 1->2 2->3 3->(4+d) (4+s)->1:
+;#
+;# vmrghh V( x), V( y), V( y + (1<<s))
+;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# =0= =1= 2->3 3->(4+d) (4+s)->2:
+;#
+;# vmrghw V( x), V( y), V( y + (1<<s))
+;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# Unfortunately, there is no doubleword merge instruction.
+;# The following sequence uses "vperm" as a substitute.
+;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
+;# are in registers Vhihi and Vlolo, we can also effect the permutation
+;#
+;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence:
+;#
+;# vperm V( x), V( y), V( y + (1<<s)), Vhihi
+;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
+;#
+;#
+;# Except for bits s and d, the other relationships between register
+;# number (= high-order part of address) bits are at the disposal of
+;# the programmer.
+;#
+
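+;# As a sanity check of the address-bit model, the following scalar C
+;# sketch models one vmrghb/vmrglb pair with s = d = 0 (two source
+;# registers, two destination registers) and verifies that the pair
+;# performs exactly the bit permutation 0->1 1->2 2->3 3->4 4->0 on the
+;# five address bits involved. The C names are illustrative stand-ins
+;# only.
+;#
+;#   #include <assert.h>
+;#   #include <stdint.h>
+;#
+;#   static void vmrghb(uint8_t *d, const uint8_t *a, const uint8_t *b) {
+;#       for (int i = 0; i < 8; i++) { d[2*i] = a[i]; d[2*i+1] = b[i]; }
+;#   }
+;#   static void vmrglb(uint8_t *d, const uint8_t *a, const uint8_t *b) {
+;#       for (int i = 0; i < 8; i++) { d[2*i] = a[8+i]; d[2*i+1] = b[8+i]; }
+;#   }
+;#   /* address bits: 0->1 1->2 2->3 3->4 4->0 */
+;#   static unsigned permute(unsigned a) {
+;#       return ((a & 0x0f) << 1) | ((a >> 4) & 1);
+;#   }
+;#   int main(void) {
+;#       uint8_t src[32], dst[32];          /* V(y),V(y+1) -> V(x),V(x+1) */
+;#       for (int i = 0; i < 32; i++) src[i] = (uint8_t)i;
+;#       vmrghb(dst,      src, src + 16);
+;#       vmrglb(dst + 16, src, src + 16);
+;#       for (unsigned a = 0; a < 32; a++)
+;#           assert(dst[permute(a)] == a);  /* byte at a lands at permute(a) */
+;#       return 0;
+;#   }
+;#
+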
+;# To avoid excess transposes, we filter all 3 vertical luma subblock
+;# edges together. This requires a single 16x16 transpose, which, in
+;# the above language, amounts to the following permutation of address
+;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by
+;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
+;#
+;# Except for the fact that the destination registers get written
+;# before we are done referencing the old contents, the cyclic transform
+;# is effected by
+;#
+;# x = 0; do {
+;# vmrghb V(2x), V(x), V(x+8);
+;# vmrglb V(2x+1), V(x), V(x+8);
+;# } while( ++x < 8);
+;#
+;# For clarity, and because we can afford it, we do this transpose
+;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
+;# leaving the final result in 16 .. 31, as the lower registers are
+;# used in the filtering itself.
+;#
+.macro Tpair A, B, X, Y
+ vmrghb \A, \X, \Y
+ vmrglb \B, \X, \Y
+.endm
+
+;# Each step takes 8*2 = 16 instructions
+
+.macro t16_even
+ Tpair v16,v17, v0,v8
+ Tpair v18,v19, v1,v9
+ Tpair v20,v21, v2,v10
+ Tpair v22,v23, v3,v11
+ Tpair v24,v25, v4,v12
+ Tpair v26,v27, v5,v13
+ Tpair v28,v29, v6,v14
+ Tpair v30,v31, v7,v15
+.endm
+
+.macro t16_odd
+ Tpair v0,v1, v16,v24
+ Tpair v2,v3, v17,v25
+ Tpair v4,v5, v18,v26
+ Tpair v6,v7, v19,v27
+ Tpair v8,v9, v20,v28
+ Tpair v10,v11, v21,v29
+ Tpair v12,v13, v22,v30
+ Tpair v14,v15, v23,v31
+.endm
+
+;# Whole transpose takes 4*16 = 64 instructions
+
+.macro t16_full
+ t16_odd
+ t16_even
+ t16_odd
+ t16_even
+.endm
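+
+;# For reference, a scalar C sketch of the same four rounds, checking that
+;# the pairing V(x),V(x+8) -> V(2x),V(2x+1) really transposes a 16x16
+;# block of bytes when applied four times (the separate destination array
+;# plays the role of the alternate register bank). Illustrative only.
+;#
+;#   #include <assert.h>
+;#   #include <stdint.h>
+;#   #include <string.h>
+;#
+;#   static void round16(uint8_t d[16][16], const uint8_t s[16][16]) {
+;#       for (int x = 0; x < 8; x++)
+;#           for (int i = 0; i < 8; i++) {
+;#               d[2*x][2*i]     = s[x][i];      /* vmrghb V(2x),   V(x), V(x+8) */
+;#               d[2*x][2*i+1]   = s[x+8][i];
+;#               d[2*x+1][2*i]   = s[x][8+i];    /* vmrglb V(2x+1), V(x), V(x+8) */
+;#               d[2*x+1][2*i+1] = s[x+8][8+i];
+;#           }
+;#   }
+;#   int main(void) {
+;#       uint8_t a[16][16], b[16][16];
+;#       for (int r = 0; r < 16; r++)
+;#           for (int c = 0; c < 16; c++) a[r][c] = (uint8_t)(16*r + c);
+;#       for (int k = 0; k < 4; k++) { round16(b, a); memcpy(a, b, sizeof a); }
+;#       for (int r = 0; r < 16; r++)
+;#           for (int c = 0; c < 16; c++)
+;#               assert(a[r][c] == (uint8_t)(16*c + r));  /* transposed */
+;#       return 0;
+;#   }
+;#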
+
+;# Vertical edge filtering requires transposes. For the simple filter,
+;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
+;# each. Writing 0 ... 63 for the pixel indices, the desired result is:
+;#
+;# v0 = 0 1 ... 14 15
+;# v1 = 16 17 ... 30 31
+;# v2 = 32 33 ... 46 47
+;# v3 = 48 49 ... 62 63
+;#
+;# In frame-buffer memory, the layout is:
+;#
+;# 0 16 32 48
+;# 1 17 33 49
+;# ...
+;# 15 31 47 63.
+;#
+;# We begin by reading the data 32 bits at a time (using scalar operations)
+;# into a temporary array, reading the rows of the array into vector registers,
+;# with the following layout:
+;#
+;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
+;# v1 = 1 17 33 49 5 21 ... 45 61
+;# v2 = 2 18 ... 46 62
+;# v3 = 3 19 ... 47 63
+;#
+;# From the "address-bit" perspective discussed above, we simply need to
+;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
+;# In other words, we transpose each of the four 4x4 submatrices.
+;#
+;# This transformation is its own inverse, and we need to perform it
+;# again before writing the pixels back into the frame buffer.
+;#
+;# It acts in place on registers v0...v3, uses v4...v7 as temporaries,
+;# and takes the b_hihi/b_lolo selectors defined above as its two
+;# arguments. We think of both groups of 4 registers as having
+;# "addresses" {0,1,2,3} * 16.
+;#
+.macro Transpose4times4x4 Vlo, Vhi
+
+ ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5=
+
+ vmrghb v4, v0, v1
+ vmrglb v5, v0, v1
+ vmrghb v6, v2, v3
+ vmrglb v7, v2, v3
+
+ ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1
+
+ vmrghh v0, v4, v6
+ vmrglh v1, v4, v6
+ vmrghh v2, v5, v7
+ vmrglh v3, v5, v7
+
+ ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5=
+
+ vmrghw v4, v0, v1
+ vmrglw v5, v0, v1
+ vmrghw v6, v2, v3
+ vmrglw v7, v2, v3
+
+ ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3
+
+ vperm v0, v4, v6, \Vlo
+ vperm v1, v4, v6, \Vhi
+ vperm v2, v5, v7, \Vlo
+ vperm v3, v5, v7, \Vhi
+.endm
+;# end Transpose4times4x4
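+
+;# The claim above (swapping address bits 0<->4 and 1<->5 transposes each
+;# of the four 4x4 submatrices) can be checked with a small scalar model;
+;# the code below is illustrative only.
+;#
+;#   #include <assert.h>
+;#
+;#   static unsigned swap04_15(unsigned a) {          /* bits 2,3 unchanged */
+;#       return (a & 0x0c) | ((a & 0x03) << 4) | ((a >> 4) & 0x03);
+;#   }
+;#   int main(void) {
+;#       unsigned char m[4][16], t[4][16];
+;#       for (unsigned a = 0; a < 64; a++)   /* bits 5..4 = row, 3..0 = column */
+;#           m[a >> 4][a & 15] = (unsigned char)a;
+;#       for (unsigned a = 0; a < 64; a++) {
+;#           unsigned b = swap04_15(a);
+;#           t[b >> 4][b & 15] = m[a >> 4][a & 15];
+;#       }
+;#       for (int blk = 0; blk < 4; blk++)   /* each 4x4 block is transposed */
+;#           for (int r = 0; r < 4; r++)
+;#               for (int c = 0; c < 4; c++)
+;#                   assert(t[r][4*blk + c] == m[c][4*blk + r]);
+;#       return 0;
+;#   }
+;#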
+
+
+;# Normal mb vertical edge filter transpose.
+;#
+;# We read 8 columns of data, initially in the following pattern:
+;#
+;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1)
+;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3)
+;# ...
+;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
+;#
+;# and wish to convert to:
+;#
+;# (0,0) ... (0,15)
+;# (1,0) ... (1,15)
+;# ...
+;# (7,0) ... (7,15).
+;#
+;# In "address bit" language, we wish to map
+;#
+;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7.
+;#
+;# This can be accomplished by 4 iterations of the cyclic transform
+;#
+;# I -> (I+1) mod 7;
+;#
+;# each iteration can be realized by (d=0, s=2):
+;#
+;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4);
+;#
+;# The input/output is in registers v0...v7. We use v10...v17 as mirrors;
+;# preserving v8 = sign converter.
+;#
+;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the
+;# result lands in the "mirror" registers v10...v17
+;#
+.macro t8x16_odd
+ Tpair v10, v11, v0, v4
+ Tpair v12, v13, v1, v5
+ Tpair v14, v15, v2, v6
+ Tpair v16, v17, v3, v7
+.endm
+
+.macro t8x16_even
+ Tpair v0, v1, v10, v14
+ Tpair v2, v3, v11, v15
+ Tpair v4, v5, v12, v16
+ Tpair v6, v7, v13, v17
+.endm
+
+.macro transpose8x16_fwd
+ t8x16_odd
+ t8x16_even
+ t8x16_odd
+ t8x16_even
+.endm
+
+.macro transpose8x16_inv
+ t8x16_odd
+ t8x16_even
+ t8x16_odd
+.endm
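+
+;# A quick scalar check of the bookkeeping above: the forward transpose
+;# rotates the 7 address bits by 4 positions (4 rounds of I -> (I+1) mod 7)
+;# and the inverse rotates by 3 more, which composes to the identity.
+;# Illustrative only.
+;#
+;#   #include <assert.h>
+;#
+;#   static unsigned rot7(unsigned a, int k) {   /* bit i -> bit (i+k) mod 7 */
+;#       unsigned r = 0;
+;#       for (int i = 0; i < 7; i++)
+;#           if (a & (1u << i)) r |= 1u << ((i + k) % 7);
+;#       return r;
+;#   }
+;#   int main(void) {
+;#       for (unsigned a = 0; a < 128; a++)      /* 8 regs x 16 bytes */
+;#           assert(rot7(rot7(a, 4), 3) == a);   /* fwd then inv = identity */
+;#       return 0;
+;#   }
+;#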
+
+.macro Transpose16x16
+ vmrghb v0, v16, v24
+ vmrglb v1, v16, v24
+ vmrghb v2, v17, v25
+ vmrglb v3, v17, v25
+ vmrghb v4, v18, v26
+ vmrglb v5, v18, v26
+ vmrghb v6, v19, v27
+ vmrglb v7, v19, v27
+ vmrghb v8, v20, v28
+ vmrglb v9, v20, v28
+ vmrghb v10, v21, v29
+ vmrglb v11, v21, v29
+ vmrghb v12, v22, v30
+ vmrglb v13, v22, v30
+ vmrghb v14, v23, v31
+ vmrglb v15, v23, v31
+ vmrghb v16, v0, v8
+ vmrglb v17, v0, v8
+ vmrghb v18, v1, v9
+ vmrglb v19, v1, v9
+ vmrghb v20, v2, v10
+ vmrglb v21, v2, v10
+ vmrghb v22, v3, v11
+ vmrglb v23, v3, v11
+ vmrghb v24, v4, v12
+ vmrglb v25, v4, v12
+ vmrghb v26, v5, v13
+ vmrglb v27, v5, v13
+ vmrghb v28, v6, v14
+ vmrglb v29, v6, v14
+ vmrghb v30, v7, v15
+ vmrglb v31, v7, v15
+ vmrghb v0, v16, v24
+ vmrglb v1, v16, v24
+ vmrghb v2, v17, v25
+ vmrglb v3, v17, v25
+ vmrghb v4, v18, v26
+ vmrglb v5, v18, v26
+ vmrghb v6, v19, v27
+ vmrglb v7, v19, v27
+ vmrghb v8, v20, v28
+ vmrglb v9, v20, v28
+ vmrghb v10, v21, v29
+ vmrglb v11, v21, v29
+ vmrghb v12, v22, v30
+ vmrglb v13, v22, v30
+ vmrghb v14, v23, v31
+ vmrglb v15, v23, v31
+ vmrghb v16, v0, v8
+ vmrglb v17, v0, v8
+ vmrghb v18, v1, v9
+ vmrglb v19, v1, v9
+ vmrghb v20, v2, v10
+ vmrglb v21, v2, v10
+ vmrghb v22, v3, v11
+ vmrglb v23, v3, v11
+ vmrghb v24, v4, v12
+ vmrglb v25, v4, v12
+ vmrghb v26, v5, v13
+ vmrglb v27, v5, v13
+ vmrghb v28, v6, v14
+ vmrglb v29, v6, v14
+ vmrghb v30, v7, v15
+ vmrglb v31, v7, v15
+.endm
+
+;# load_g loads a global vector (whose address is in the local variable Gptr)
+;# into vector register Vreg. Trashes r0
+.macro load_g Vreg, Gptr
+ lwz r0, \Gptr
+ lvx \Vreg, 0, r0
+.endm
+
+;# Exploit saturation here: if a difference is negative it is
+;# clamped to 0, and ORing 0 with the positive difference
+;# yields the absolute value.
+;# RES = abs( A-B), trashes TMP
+.macro Abs RES, TMP, A, B
+ vsububs \RES, \A, \B
+ vsububs \TMP, \B, \A
+ vor \RES, \RES, \TMP
+.endm
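+
+;# A scalar model of the trick, checked over all byte pairs: one of the
+;# two saturating differences is always 0, so their OR is |A - B|.
+;# Illustrative only.
+;#
+;#   #include <assert.h>
+;#   #include <stdint.h>
+;#
+;#   static uint8_t subs(uint8_t a, uint8_t b) { return a > b ? a - b : 0; }
+;#   int main(void) {
+;#       for (int a = 0; a < 256; a++)
+;#           for (int b = 0; b < 256; b++)
+;#               assert((subs(a, b) | subs(b, a)) == (a > b ? a - b : b - a));
+;#       return 0;
+;#   }
+;#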
+
+;# RES = Max( RES, abs( A-B)), trashes TMP
+.macro max_abs RES, TMP, A, B
+ vsububs \TMP, \A, \B
+ vmaxub \RES, \RES, \TMP
+ vsububs \TMP, \B, \A
+ vmaxub \RES, \RES, \TMP
+.endm
+
+.macro Masks
+ ;# build masks
+ ;# input is all 8 bit unsigned (0-255). need to
+ ;# do abs(vala-valb) > limit. but no need to compare each
+ ;# value to the limit. find the max of the absolute differences
+ ;# and compare that to the limit.
+ ;# First hev
+ Abs v14, v13, v2, v3 ;# |P1 - P0|
+ max_abs v14, v13, v5, v4 ;# |Q1 - Q0|
+
+ vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded
+
+ ;# Next limit
+ max_abs v14, v13, v0, v1 ;# |P3 - P2|
+ max_abs v14, v13, v1, v2 ;# |P2 - P1|
+ max_abs v14, v13, v6, v5 ;# |Q2 - Q1|
+ max_abs v14, v13, v7, v6 ;# |Q3 - Q2|
+
+ vcmpgtub v9, v14, v9 ;# R = true if limit exceeded
+
+ ;# flimit
+ Abs v14, v13, v3, v4 ;# |P0 - Q0|
+
+ vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded
+
+ vor v8, v8, v9 ;# R = true if flimit or limit exceeded
+ ;# done building masks
+.endm
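+
+;# The per-lane logic of Masks, written out as scalar C (booleans stand in
+;# for the 0x00/0xff compare results; names are illustrative only):
+;#
+;#   #include <stdbool.h>
+;#   #include <stdint.h>
+;#
+;#   static uint8_t absdiff(uint8_t a, uint8_t b) { return a > b ? a - b : b - a; }
+;#   static uint8_t max8(uint8_t a, uint8_t b)    { return a > b ? a : b; }
+;#
+;#   /* p[0..3] = P3..P0, q[0..3] = Q0..Q3, all unsigned pels. */
+;#   static void build_masks(const uint8_t p[4], const uint8_t q[4],
+;#                           uint8_t flimit, uint8_t limit, uint8_t thresh,
+;#                           bool *hev, bool *filter_off) {
+;#       uint8_t m = max8(absdiff(p[2], p[3]), absdiff(q[1], q[0]));  /* |P1-P0|,|Q1-Q0| */
+;#       *hev = m > thresh;                       /* high edge variance */
+;#       m = max8(m, absdiff(p[0], p[1]));        /* |P3-P2| */
+;#       m = max8(m, absdiff(p[1], p[2]));        /* |P2-P1| */
+;#       m = max8(m, absdiff(q[2], q[1]));        /* |Q2-Q1| */
+;#       m = max8(m, absdiff(q[3], q[2]));        /* |Q3-Q2| */
+;#       *filter_off = (m > limit) || (absdiff(p[3], q[0]) > flimit);
+;#   }
+;#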
+
+.macro build_constants RFL, RLI, RTH, FL, LI, TH
+ ;# build constants
+ lvx \FL, 0, \RFL ;# flimit
+ lvx \LI, 0, \RLI ;# limit
+ lvx \TH, 0, \RTH ;# thresh
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+.endm
+
+.macro load_data_y
+ ;# setup strides/pointers to be able to access
+ ;# all of the data
+ add r5, r4, r4 ;# r5 = 2 * stride
+ sub r6, r3, r5 ;# r6 -> 2 rows back
+ neg r7, r4 ;# r7 = -stride
+
+ ;# load 16 pixels worth of data to work on
+ sub r0, r6, r5 ;# r0 -> 4 rows back (temp)
+ lvx v0, 0, r0 ;# P3 (read only)
+ lvx v1, r7, r6 ;# P2
+ lvx v2, 0, r6 ;# P1
+ lvx v3, r7, r3 ;# P0
+ lvx v4, 0, r3 ;# Q0
+ lvx v5, r4, r3 ;# Q1
+ lvx v6, r5, r3 ;# Q2
+ add r0, r3, r5 ;# r0 -> 2 rows fwd (temp)
+ lvx v7, r4, r0 ;# Q3 (read only)
+.endm
+
+;# Expects
+;# v10 == HEV
+;# v13 == tmp
+;# v14 == tmp
+.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
+ vxor \P1, \P1, v11 ;# SP1
+ vxor \P0, \P0, v11 ;# SP0
+ vxor \Q0, \Q0, v11 ;# SQ0
+ vxor \Q1, \Q1, v11 ;# SQ1
+
+ vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1)
+.if \HEV_PRESENT
+ vand v13, v13, v10 ;# f &= hev
+.endif
+ vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+
+ vandc v13, v13, v8 ;# f &= ~mask (zero f where limits exceeded)
+
+ vspltisb v8, 3
+ vspltisb v9, 4
+
+ vaddsbs v14, v13, v9 ;# f1 = c (f+4)
+ vaddsbs v15, v13, v8 ;# f2 = c (f+3)
+
+ vsrab v13, v14, v8 ;# f1 >>= 3
+ vsrab v15, v15, v8 ;# f2 >>= 3
+
+ vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1)
+ vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2)
+.endm
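+
+;# A simplified per-pixel C model of common_adjust (sign bias via XOR 0x80,
+;# then saturating signed arithmetic). The register-level macro additionally
+;# leaves the signed P1/Q1 and the intermediate f1 behind for its callers;
+;# this sketch only shows the P0/Q0 update. Names are illustrative only.
+;#
+;#   #include <stdbool.h>
+;#   #include <stdint.h>
+;#
+;#   static int8_t sat8(int v) { return v > 127 ? 127 : v < -128 ? -128 : (int8_t)v; }
+;#
+;#   static void adjust_p0q0(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
+;#                           bool use_hev, bool hev, bool filter_off) {
+;#       int8_t sp1 = (int8_t)(*p1 ^ 0x80), sp0 = (int8_t)(*p0 ^ 0x80);
+;#       int8_t sq0 = (int8_t)(*q0 ^ 0x80), sq1 = (int8_t)(*q1 ^ 0x80);
+;#       int8_t f = sat8(sp1 - sq1);                   /* c(P1 - Q1) */
+;#       if (use_hev && !hev) f = 0;                   /* f &= hev */
+;#       int8_t d = sat8(sq0 - sp0);                   /* c(Q0 - P0) */
+;#       f = sat8(f + d); f = sat8(f + d); f = sat8(f + d);
+;#       if (filter_off) f = 0;                        /* f &= ~mask */
+;#       int8_t f1 = (int8_t)(sat8(f + 4) >> 3);       /* arithmetic shifts, */
+;#       int8_t f2 = (int8_t)(sat8(f + 3) >> 3);       /* as in vsrab */
+;#       *q0 = (uint8_t)sat8(sq0 - f1) ^ 0x80;         /* back to unsigned pels */
+;#       *p0 = (uint8_t)sat8(sp0 + f2) ^ 0x80;
+;#   }
+;#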
+
+.macro vp8_mbfilter
+ Masks
+
+ ;# start the filtering here
+ vxor v1, v1, v11 ;# SP2
+ vxor v2, v2, v11 ;# SP1
+ vxor v3, v3, v11 ;# SP0
+ vxor v4, v4, v11 ;# SQ0
+ vxor v5, v5, v11 ;# SQ1
+ vxor v6, v6, v11 ;# SQ2
+
+ ;# add outer taps if we have high edge variance
+ vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1)
+
+ vsubsbs v14, v4, v3 ;# SQ0-SP0
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0))
+
+ vandc v13, v13, v8 ;# f &= ~mask (zero f where limits exceeded)
+ vand v15, v13, v10 ;# f2 = f & hev
+
+ ;# save bottom 3 bits so that we round one side +4 and the other +3
+ vspltisb v8, 3
+ vspltisb v9, 4
+
+ vaddsbs v14, v15, v9 ;# f1 = c (f+4)
+ vaddsbs v15, v15, v8 ;# f2 = c (f+3)
+
+ vsrab v14, v14, v8 ;# f1 >>= 3
+ vsrab v15, v15, v8 ;# f2 >>= 3
+
+ vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1)
+ vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2)
+
+ ;# only apply wider filter if not high edge variance
+ vandc v13, v13, v10 ;# f &= ~hev
+
+ vspltisb v9, 2
+ vnor v8, v8, v8
+ vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
+ vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f
+ vspltisb v8, 9
+
+ ;# roughly 1/7th difference across boundary
+ vspltish v10, 7
+ vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v8, v13
+ vaddshs v14, v14, v9 ;# += 63
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v6, v6, v10 ;# subtract from Q and add to P
+ vaddsbs v1, v1, v10
+
+ vxor v6, v6, v11
+ vxor v1, v1, v11
+
+ ;# roughly 2/7th difference across boundary
+ vspltish v10, 7
+ vaddubm v12, v8, v8
+ vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v12, v13
+ vaddshs v14, v14, v9
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v5, v5, v10 ;# subtract from Q and add to P
+ vaddsbs v2, v2, v10
+
+ vxor v5, v5, v11
+ vxor v2, v2, v11
+
+ ;# roughly 3/7th difference across boundary
+ vspltish v10, 7
+ vaddubm v12, v12, v8
+ vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v12, v13
+ vaddshs v14, v14, v9
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v4, v4, v10 ;# subtract from Q and add to P
+ vaddsbs v3, v3, v10
+
+ vxor v4, v4, v11
+ vxor v3, v3, v11
+.endm
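+
+;# The three "wide filter" taps above all have the same per-lane shape:
+;# multiply the masked filter value w by 9, 18 or 27 in 16 bits, add 63,
+;# shift right 7 and saturate back to a byte, then subtract the result
+;# from the Q pel and add it to the matching P pel. A scalar sketch with
+;# illustrative names:
+;#
+;#   #include <stdint.h>
+;#
+;#   static int8_t sat8(int v) { return v > 127 ? 127 : v < -128 ? -128 : (int8_t)v; }
+;#
+;#   /* k = 1, 2, 3 for the P2/Q2, P1/Q1 and P0/Q0 pairs respectively. */
+;#   static int8_t wide_tap(int8_t w, int k) {
+;#       int16_t prod = (int16_t)(9 * k * w);     /* vmulosb/vmulesb */
+;#       return sat8((prod + 63) >> 7);           /* vaddshs, vsrah, vpkshss */
+;#   }
+;#   /* e.g.  sq2 = sat8(sq2 - wide_tap(w, 1));  sp2 = sat8(sp2 + wide_tap(w, 1)); */
+;#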
+
+.macro SBFilter
+ Masks
+
+ common_adjust v3, v4, v2, v5, 1
+
+ ;# outer tap adjustments
+ vspltisb v8, 1
+
+ vaddubm v13, v13, v8 ;# f += 1
+ vsrab v13, v13, v8 ;# f >>= 1
+
+ vandc v13, v13, v10 ;# f &= ~hev
+
+ vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f)
+ vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f)
+
+ vxor v2, v2, v11
+ vxor v3, v3, v11
+ vxor v4, v4, v11
+ vxor v5, v5, v11
+.endm
+
+ .align 2
+mbloop_filter_horizontal_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ load_data_y
+
+ vp8_mbfilter
+
+ stvx v1, r7, r6 ;# P2
+ stvx v2, 0, r6 ;# P1
+ stvx v3, r7, r3 ;# P0
+ stvx v4, 0, r3 ;# Q0
+ stvx v5, r4, r3 ;# Q1
+ stvx v6, r5, r3 ;# Q2
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+loop_filter_horizontal_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ load_data_y
+
+ SBFilter
+
+ stvx v2, 0, r6 ;# P1
+ stvx v3, r7, r3 ;# P0
+ stvx v4, 0, r3 ;# Q0
+ stvx v5, r4, r3 ;# Q1
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# Filtering a vertical mb. Each mb is aligned on a 16-byte boundary, so an
+;# entire mb can be read in aligned. However, filtering the mb edge is a
+;# problem: the loop filter needs 4 bytes before the mb and 4 after, for a
+;# total of 8 bytes, and reading 16 bytes in order to get 4 is wasteful. So
+;# this is an admittedly ugly way to get around that: words are read in
+;# through the general-purpose register file and stored back to memory to
+;# align and order them, and only then are they read in through the
+;# vector register file.
+.macro RLVmb V, R
+ lwzux r0, r3, r4
+ stw r0, 4(\R)
+ lwz r0,-4(r3)
+ stw r0, 0(\R)
+ lwzux r0, r3, r4
+ stw r0,12(\R)
+ lwz r0,-4(r3)
+ stw r0, 8(\R)
+ lvx \V, 0, \R
+.endm
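+
+;# A scalar sketch of what one RLVmb invocation gathers: two rows of the
+;# 8 pels straddling the edge (4 before, 4 after) copied into an aligned
+;# 16-byte scratch area that a single lvx can then load. The helper below
+;# is hypothetical, for illustration only.
+;#
+;#   #include <stdint.h>
+;#   #include <string.h>
+;#
+;#   /* s points at the edge column of row "row"; stride is the pitch p. */
+;#   static void gather_two_rows(uint8_t dst16[16], const uint8_t *s,
+;#                               int stride, int row) {
+;#       memcpy(dst16 + 0, s + (row    ) * stride - 4, 8);
+;#       memcpy(dst16 + 8, s + (row + 1) * stride - 4, 8);
+;#   }
+;#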
+
+.macro WLVmb V, R
+ stvx \V, 0, \R
+ lwz r0,12(\R)
+ stwux r0, r3, r4
+ lwz r0, 8(\R)
+ stw r0,-4(r3)
+ lwz r0, 4(\R)
+ stwux r0, r3, r4
+ lwz r0, 0(\R)
+ stw r0,-4(r3)
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+mbloop_filter_vertical_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+ sub r3, r3, r4
+
+ RLVmb v0, r9
+ RLVmb v1, r9
+ RLVmb v2, r9
+ RLVmb v3, r9
+ RLVmb v4, r9
+ RLVmb v5, r9
+ RLVmb v6, r9
+ RLVmb v7, r9
+
+ transpose8x16_fwd
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ vp8_mbfilter
+
+ transpose8x16_inv
+
+ add r3, r3, r4
+ neg r4, r4
+
+ WLVmb v17, r9
+ WLVmb v16, r9
+ WLVmb v15, r9
+ WLVmb v14, r9
+ WLVmb v13, r9
+ WLVmb v12, r9
+ WLVmb v11, r9
+ WLVmb v10, r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro RL V, R, P
+ lvx \V, 0, \R
+ add \R, \R, \P
+.endm
+
+.macro WL V, R, P
+ stvx \V, 0, \R
+ add \R, \R, \P
+.endm
+
+.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
+ ;# K = |P0-P1| already
+ Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1|
+ vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|)
+ vcmpgtub v10, v14, v0
+
+ Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1|
+
+ max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|)
+ max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|)
+ max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|)
+
+ vmaxub v14, v14, v4 ;# M = max interior abs diff
+ vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded
+
+ Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0)
+ vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded
+ vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded
+
+ ;# replace P1,Q1 w/signed versions
+ common_adjust \P0, \Q0, \P1, \Q1, 1
+
+ vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant
+ vsrab v13, v13, v1
+ vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev
+ vsubsbs \Q1, \Q1, v13
+ vaddsbs \P1, \P1, v13
+
+ vxor \P1, \P1, v11 ;# P1
+ vxor \P0, \P0, v11 ;# P0
+ vxor \Q0, \Q0, v11 ;# Q0
+ vxor \Q1, \Q1, v11 ;# Q1
+.endm
+
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+loop_filter_vertical_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ addi r9, r3, 0
+ RL v16, r9, r4
+ RL v17, r9, r4
+ RL v18, r9, r4
+ RL v19, r9, r4
+ RL v20, r9, r4
+ RL v21, r9, r4
+ RL v22, r9, r4
+ RL v23, r9, r4
+ RL v24, r9, r4
+ RL v25, r9, r4
+ RL v26, r9, r4
+ RL v27, r9, r4
+ RL v28, r9, r4
+ RL v29, r9, r4
+ RL v30, r9, r4
+ lvx v31, 0, r9
+
+ Transpose16x16
+
+ vspltisb v1, 1
+
+ build_constants r5, r6, r7, v3, v2, v0
+
+ Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1|
+
+ Fil v16, v17, v18, v19, v20, v21, v22, v23
+ Fil v20, v21, v22, v23, v24, v25, v26, v27
+ Fil v24, v25, v26, v27, v28, v29, v30, v31
+
+ Transpose16x16
+
+ addi r9, r3, 0
+ WL v16, r9, r4
+ WL v17, r9, r4
+ WL v18, r9, r4
+ WL v19, r9, r4
+ WL v20, r9, r4
+ WL v21, r9, r4
+ WL v22, r9, r4
+ WL v23, r9, r4
+ WL v24, r9, r4
+ WL v25, r9, r4
+ WL v26, r9, r4
+ WL v27, r9, r4
+ WL v28, r9, r4
+ WL v29, r9, r4
+ WL v30, r9, r4
+ stvx v31, 0, r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+.macro active_chroma_sel V
+ andi. r7, r3, 8 ;# row origin modulo 16
+ add r7, r7, r7 ;# selects selectors
+ lis r12, _chromaSelectors@ha
+ la r0, _chromaSelectors@l(r12)
+ lwzux r0, r7, r0 ;# leave selector addr in r7
+
+ lvx \V, 0, r0 ;# mask to concatenate active U,V pels
+.endm
+
+.macro hread_uv Dest, U, V, Offs, VMask
+ lvx \U, \Offs, r3
+ lvx \V, \Offs, r4
+ vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V
+.endm
+
+.macro hwrite_uv New, U, V, Offs, Umask, Vmask
+ vperm \U, \New, \U, \Umask ;# Combine new pels with siblings
+ vperm \V, \New, \V, \Vmask
+ stvx \U, \Offs, r3 ;# Write to frame buffer
+ stvx \V, \Offs, r4
+.endm
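+
+;# Conceptually, hread_uv packs the active 8 U pels and 8 V pels of one
+;# row side by side into a single register so U and V are filtered in one
+;# pass, and hwrite_uv scatters the two halves back without disturbing the
+;# sibling pels. A scalar sketch, with off = 0 or 8 standing in for the
+;# _B_hihi/_B_lolo and _B_Ures0/_B_Vres0, _B_Ures8/_B_Vres8 selectors
+;# (illustrative only):
+;#
+;#   #include <stdint.h>
+;#   #include <string.h>
+;#
+;#   static void read_uv(uint8_t d[16], const uint8_t u[16],
+;#                       const uint8_t v[16], int off) {
+;#       memcpy(d,     u + off, 8);      /* active half of U */
+;#       memcpy(d + 8, v + off, 8);      /* active half of V */
+;#   }
+;#   static void write_uv(const uint8_t d[16], uint8_t u[16],
+;#                        uint8_t v[16], int off) {
+;#       memcpy(u + off, d,     8);      /* filtered halves go back, */
+;#       memcpy(v + off, d + 8, 8);      /* siblings stay untouched  */
+;#   }
+;#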
+
+;# Process U,V in parallel.
+.macro load_chroma_h
+ neg r9, r5 ;# r9 = -1 * stride
+ add r8, r9, r9 ;# r8 = -2 * stride
+ add r10, r5, r5 ;# r10 = 2 * stride
+
+ active_chroma_sel v12
+
+ ;# P3, Q3 are read-only; need not save addresses or sibling pels
+ add r6, r8, r8 ;# r6 = -4 * stride
+ hread_uv v0, v14, v15, r6, v12
+ add r6, r10, r5 ;# r6 = 3 * stride
+ hread_uv v7, v14, v15, r6, v12
+
+ ;# Others are read/write; save addresses and sibling pels
+
+ add r6, r8, r9 ;# r6 = -3 * stride
+ hread_uv v1, v16, v17, r6, v12
+ hread_uv v2, v18, v19, r8, v12
+ hread_uv v3, v20, v21, r9, v12
+ hread_uv v4, v22, v23, 0, v12
+ hread_uv v5, v24, v25, r5, v12
+ hread_uv v6, v26, v27, r10, v12
+.endm
+
+.macro uresult_sel V
+ load_g \V, 4(r7)
+.endm
+
+.macro vresult_sel V
+ load_g \V, 8(r7)
+.endm
+
+;# always write P1,P0,Q0,Q1
+.macro store_chroma_h
+ uresult_sel v11
+ vresult_sel v12
+ hwrite_uv v2, v18, v19, r8, v11, v12
+ hwrite_uv v3, v20, v21, r9, v11, v12
+ hwrite_uv v4, v22, v23, 0, v11, v12
+ hwrite_uv v5, v24, v25, r5, v11, v12
+.endm
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+mbloop_filter_horizontal_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ load_chroma_h
+
+ vp8_mbfilter
+
+ store_chroma_h
+
+ hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2
+ hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+loop_filter_horizontal_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ load_chroma_h
+
+ SBFilter
+
+ store_chroma_h
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro R V, R
+ lwzux r0, r3, r5
+ stw r0, 4(\R)
+ lwz r0,-4(r3)
+ stw r0, 0(\R)
+ lwzux r0, r4, r5
+ stw r0,12(\R)
+ lwz r0,-4(r4)
+ stw r0, 8(\R)
+ lvx \V, 0, \R
+.endm
+
+
+.macro W V, R
+ stvx \V, 0, \R
+ lwz r0,12(\R)
+ stwux r0, r4, r5
+ lwz r0, 8(\R)
+ stw r0,-4(r4)
+ lwz r0, 4(\R)
+ stwux r0, r3, r5
+ lwz r0, 0(\R)
+ stw r0,-4(r3)
+.endm
+
+.macro chroma_vread R
+ sub r3, r3, r5 ;# back up one line for simplicity
+ sub r4, r4, r5
+
+ R v0, \R
+ R v1, \R
+ R v2, \R
+ R v3, \R
+ R v4, \R
+ R v5, \R
+ R v6, \R
+ R v7, \R
+
+ transpose8x16_fwd
+.endm
+
+.macro chroma_vwrite R
+
+ transpose8x16_inv
+
+ add r3, r3, r5
+ add r4, r4, r5
+ neg r5, r5 ;# Write rows back in reverse order
+
+ W v17, \R
+ W v16, \R
+ W v15, \R
+ W v14, \R
+ W v13, \R
+ W v12, \R
+ W v11, \R
+ W v10, \R
+.endm
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+mbloop_filter_vertical_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+
+ chroma_vread r9
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ vp8_mbfilter
+
+ chroma_vwrite r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+loop_filter_vertical_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+
+ chroma_vread r9
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ SBFilter
+
+ chroma_vwrite r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+.macro vp8_simple_filter
+ Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0)
+ vcmpgtub v8, v14, v8 ;# v8 = true if _over_ limit
+
+ ;# preserve unsigned v0 and v3
+ common_adjust v1, v2, v0, v3, 0
+
+ vxor v1, v1, v11
+ vxor v2, v2, v11 ;# cvt Q0, P0 back to pels
+.endm
+
+.macro simple_vertical
+ addi r8, 0, 16
+ addi r7, r5, 32
+
+ lvx v0, 0, r5
+ lvx v1, r8, r5
+ lvx v2, 0, r7
+ lvx v3, r8, r7
+
+ lis r12, _B_hihi@ha
+ la r0, _B_hihi@l(r12)
+ lvx v16, 0, r0
+
+ lis r12, _B_lolo@ha
+ la r0, _B_lolo@l(r12)
+ lvx v17, 0, r0
+
+ Transpose4times4x4 v16, v17
+ vp8_simple_filter
+
+ vxor v0, v0, v11
+ vxor v3, v3, v11 ;# cvt Q0, P0 back to pels
+
+ Transpose4times4x4 v16, v17
+
+ stvx v0, 0, r5
+ stvx v1, r8, r5
+ stvx v2, 0, r7
+ stvx v3, r8, r7
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+loop_filter_simple_horizontal_edge_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ ;# build constants
+ lvx v8, 0, r5 ;# flimit
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+
+ neg r5, r4 ;# r5 = -1 * stride
+ add r6, r5, r5 ;# r6 = -2 * stride
+
+ lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge
+ lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge
+ lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge
+ lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge
+
+ vp8_simple_filter
+
+ stvx v1, r5, r3 ;# store P0
+ stvx v2, 0, r3 ;# store Q0
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro RLV Offs
+ stw r0, (\Offs*4)(r5)
+ lwzux r0, r7, r4
+.endm
+
+.macro WLV Offs
+ lwz r0, (\Offs*4)(r5)
+ stwux r0, r7, r4
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+loop_filter_simple_vertical_edge_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ ;# build constants
+ lvx v8, 0, r5 ;# flimit
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+
+ la r5, -96(r1) ;# temporary space for reading in vectors
+
+ ;# Store 4 pels at word "Offs" in temp array, then advance r7
+ ;# to next row and read another 4 pels from the frame buffer.
+
+ subi r7, r3, 2 ;# r7 -> 2 pels before start
+ lwzx r0, 0, r7 ;# read first 4 pels
+
+ ;# 16 unaligned word accesses
+ RLV 0
+ RLV 4
+ RLV 8
+ RLV 12
+ RLV 1
+ RLV 5
+ RLV 9
+ RLV 13
+ RLV 2
+ RLV 6
+ RLV 10
+ RLV 14
+ RLV 3
+ RLV 7
+ RLV 11
+
+ stw r0, (15*4)(r5) ;# write last 4 pels
+
+ simple_vertical
+
+ ;# Read temp array, write frame buffer.
+ subi r7, r3, 2 ;# r7 -> 2 pels before start
+ lwzx r0, 0, r5 ;# read/write first 4 pels
+ stwx r0, 0, r7
+
+ WLV 4
+ WLV 8
+ WLV 12
+ WLV 1
+ WLV 5
+ WLV 9
+ WLV 13
+ WLV 2
+ WLV 6
+ WLV 10
+ WLV 14
+ WLV 3
+ WLV 7
+ WLV 11
+ WLV 15
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+_chromaSelectors:
+ .long _B_hihi
+ .long _B_Ures0
+ .long _B_Vres0
+ .long 0
+ .long _B_lolo
+ .long _B_Ures8
+ .long _B_Vres8
+ .long 0
+
+ .align 4
+_B_Vres8:
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15
+
+ .align 4
+_B_Ures8:
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7
+
+ .align 4
+_B_lolo:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+
+ .align 4
+_B_Vres0:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+ .align 4
+_B_Ures0:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+
+ .align 4
+_B_hihi:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
