Index: source/libvpx/vp9/encoder/ppc/vp9_variance_altivec.asm
===================================================================
--- source/libvpx/vp9/encoder/ppc/vp9_variance_altivec.asm (revision 0)
+++ source/libvpx/vp9/encoder/ppc/vp9_variance_altivec.asm (revision 0)
@@ -0,0 +1,397 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp8_get8x8var_ppc
+    .globl vp8_get16x16var_ppc
+    .globl vp8_mse16x16_ppc
+    .globl vp9_variance16x16_ppc
+    .globl vp9_variance16x8_ppc
+    .globl vp9_variance8x16_ppc
+    .globl vp9_variance8x8_ppc
+    .globl vp9_variance4x4_ppc
+
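+;# Load 16 bytes from the possibly unaligned address in R into V,
+;# with O holding the offset 16. lvx ignores the low four address
+;# bits, so two aligned loads plus a vperm recover the unaligned
+;# data.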
+.macro load_aligned_16 V R O
+    lvsl v3, 0, \R ;# permute vector for alignment
+
+    lvx v1, 0, \R
+    lvx v2, \O, \R
+
+    vperm \V, v1, v2, v3
+.endm
+
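+;# Mark v0-v9 as live in VRSAVE, reserve 32 bytes of stack for a
+;# vector-to-GPR scratch buffer, and zero the accumulators.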
+.macro prologue
+    mfspr r11, 256 ;# get old VRSAVE
+    oris r12, r11, 0xffc0
+    mtspr 256, r12 ;# set VRSAVE
+
+    stwu r1, -32(r1) ;# create space on the stack
+
+    li r10, 16 ;# load offset and loop counter
+
+    vspltisw v7, 0 ;# zero for merging
+    vspltisw v8, 0 ;# zero out total to start
+    vspltisw v9, 0 ;# zero out total for dif^2
+.endm
+
+.macro epilogue
+    addi r1, r1, 32 ;# recover stack
+
+    mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
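+;# Accumulate sum (v8) and sse (v9) for one 16-byte row, with the
+;# source bytes in v4 and the reference bytes in v5. The pair of
+;# saturating subtracts OR'd together yields |v4 - v5|, which
+;# vmsumubm squares and accumulates.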
+.macro compute_sum_sse
+    ;# Compute the sum first. Unpack so a signed subtract
+    ;# can be used; only a halfword signed subtract is
+    ;# available. Do high, then low.
+    vmrghb v2, v7, v4
+    vmrghb v3, v7, v5
+    vsubshs v2, v2, v3
+    vsum4shs v8, v2, v8
+
+    vmrglb v2, v7, v4
+    vmrglb v3, v7, v5
+    vsubshs v2, v2, v3
+    vsum4shs v8, v2, v8
+
+    ;# Now compute sse.
+    vsububs v2, v4, v5
+    vsububs v3, v5, v4
+    vor v2, v2, v3
+
+    vmsumubm v9, v2, v2, v9
+.endm
+
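+;# Variance loop over 16-byte rows. CTR must hold the row count on
+;# entry. DS is log2(w*h), the shift applied to sum*sum; when
+;# store_sum is nonzero the signed sum is also written to 0(r8).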
+.macro variance_16 DS loop_label store_sum
+\loop_label:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add r3, r3, r4
+    add r5, r5, r6
+
+    compute_sum_sse
+
+    bdnz \loop_label
+
+    vsumsws v8, v8, v7
+    vsumsws v9, v9, v7
+
+    stvx v8, 0, r1
+    lwz r3, 12(r1)
+
+    stvx v9, 0, r1
+    lwz r4, 12(r1)
+
+.if \store_sum
+    stw r3, 0(r8) ;# sum
+.endif
+    stw r4, 0(r7) ;# sse
+
+    mullw r3, r3, r3 ;# sum*sum
+    srawi r3, r3, \DS ;# (sum*sum) >> DS
+    subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
+.endm
+
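+;# 8-byte-wide variant: two rows are loaded per iteration and merged
+;# into one 16-byte vector with vmrghb, so CTR holds half the row
+;# count.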
+.macro variance_8 DS loop_label store_sum
+\loop_label:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add r3, r3, r4
+    add r5, r5, r6
+
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v6, r3, r10
+    load_aligned_16 v0, r5, r10
+
+    ;# move onto the next line
+    add r3, r3, r4
+    add r5, r5, r6
+
+    vmrghb v4, v4, v6
+    vmrghb v5, v5, v0
+
+    compute_sum_sse
+
+    bdnz \loop_label
+
+    vsumsws v8, v8, v7
+    vsumsws v9, v9, v7
+
+    stvx v8, 0, r1
+    lwz r3, 12(r1)
+
+    stvx v9, 0, r1
+    lwz r4, 12(r1)
+
+.if \store_sum
+    stw r3, 0(r8) ;# sum
+.endif
+    stw r4, 0(r7) ;# sse
+
+    mullw r3, r3, r3 ;# sum*sum
+    srawi r3, r3, \DS ;# (sum*sum) >> DS
+    subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
+.endm
+
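+;# The variance functions return sse - ((sum*sum) >> log2(w*h)) in
+;# r3; vp8_mse16x16_ppc instead returns the sse itself.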
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get8x8var_ppc:
+
+    prologue
+
+    li r9, 4
+    mtctr r9
+
+    variance_8 6, get8x8var_loop, 1
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get16x16var_ppc:
+
+    prologue
+
+    mtctr r10
+
+    variance_16 8, get16x16var_loop, 1
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_mse16x16_ppc:
+    prologue
+
+    mtctr r10
+
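+;# MSE needs only the sse half of compute_sum_sse, so the loop
+;# inlines just the absolute-difference and multiply-sum steps.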
+mse16x16_loop:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add r3, r3, r4
+    add r5, r5, r6
+
+    ;# Now compute sse.
+    vsububs v2, v4, v5
+    vsububs v3, v5, v4
+    vor v2, v2, v3
+
+    vmsumubm v9, v2, v2, v9
+
+    bdnz mse16x16_loop
+
+    vsumsws v9, v9, v7
+
+    stvx v9, 0, r1
+    lwz r3, 12(r1)
+
+    stw r3, 0(r7) ;# sse
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance16x16_ppc:
+
+    prologue
+
+    mtctr r10
+
+    variance_16 8, variance16x16_loop, 0
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance16x8_ppc:
+
+    prologue
+
+    li r9, 8
+    mtctr r9
+
+    variance_16 7, variance16x8_loop, 0
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance8x16_ppc:
+
+    prologue
+
+    li r9, 8
+    mtctr r9
+
+    variance_8 7, variance8x16_loop, 0
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance8x8_ppc:
+
+    prologue
+
+    li r9, 4
+    mtctr r9
+
+    variance_8 6, variance8x8_loop, 0
+
+    epilogue
+
+    blr
+
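+;# Gather four 4-byte rows (stride P) from I into the 16-byte stack
+;# buffer so a single lvx can load the whole 4x4 block. Clobbers r0
+;# and r8-r10.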
+.macro transfer_4x4 I P
+    lwz r0, 0(\I)
+    add \I, \I, \P
+
+    lwz r10, 0(\I)
+    add \I, \I, \P
+
+    lwz r8, 0(\I)
+    add \I, \I, \P
+
+    lwz r9, 0(\I)
+
+    stw r0, 0(r1)
+    stw r10, 4(r1)
+    stw r8, 8(r1)
+    stw r9, 12(r1)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance4x4_ppc:
+
+    prologue
+
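+    ;# a 4x4 block is exactly 16 bytes, so once both blocks are
+    ;# packed into vectors one compute_sum_sse pass covers everything.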
+    transfer_4x4 r3, r4
+    lvx v4, 0, r1
+
+    transfer_4x4 r5, r6
+    lvx v5, 0, r1
+
+    compute_sum_sse
+
+    vsumsws v8, v8, v7
+    vsumsws v9, v9, v7
+
+    stvx v8, 0, r1
+    lwz r3, 12(r1)
+
+    stvx v9, 0, r1
+    lwz r4, 12(r1)
+
+    stw r4, 0(r7) ;# sse
+
+    mullw r3, r3, r3 ;# sum*sum
+    srawi r3, r3, 4 ;# (sum*sum) >> 4
+    subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
+
+    epilogue
+
+    blr