;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    .globl vp8_short_fdct4x4_ppc
    .globl vp8_short_fdct8x4_ppc

.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm
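
;# load_c materializes the absolute address of LABEL with a high/low
;# relocation pair (lis/la) and then loads the 16-byte vector at byte
;# offset OFF from it.  A rough C-style sketch of the effect (illustrative
;# only, not part of this source):
;#     R1 = (char *)&LABEL;
;#     V  = *(vector signed short *)(R1 + OFF);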

;# Forward and inverse DCTs are nearly identical; the only differences are
;# in normalization (fwd is twice unitary, inv is half unitary)
;# and that they are of course transposes of each other.
;#
;# The following three accomplish most of the implementation and
;# are used only by ppc_idct.c and ppc_fdct.c.
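;#
;# A small worked sketch of that relationship (reasoning only, not code from
;# this file): if the unitary 1-D transform is y = C*x with C orthonormal,
;# the forward pass behaves like 2*C*x and the inverse like (1/2)*C'*y, so
;# chaining the two still recovers x because C'*C is the identity.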
.macro prologue
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfffc
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    li      r6, 16

    load_c v0, dct_tab, 0, r9, r10
    lvx     v1, r6, r10
    addi    r10, r10, 32
    lvx     v2,  0, r10
    lvx     v3, r6, r10

    load_c v4, ppc_dctperm_tab,  0, r9, r10
    load_c v5, ppc_dctperm_tab, r6, r9, r10

    load_c v6, round_tab, 0, r10, r9
.endm
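
;# After prologue runs: r6 = 16 (the vector stride), v0..v3 hold the four
;# dct_tab coefficient rows, v4/v5 hold the two ppc_dctperm_tab permute
;# vectors, v6 holds the first round_tab row (Hround), and r9 points at
;# round_tab so the Vround row can be reloaded later.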

.macro epilogue
    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE
.endm

;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
;# a/A are the even rows 0,2; b/B are the odd rows 1,3.
;# For fwd transform, indices are horizontal positions, then frequencies.
;# For inverse transform, frequencies then positions.
;# The two resulting A0..A3  B0..B3 are later combined
;# and vertically transformed.

.macro two_rows_horiz Dst
    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1

    vmsumshm v10, v0, v8, v6
    vmsumshm v10, v1, v9, v10
    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1

    vmsumshm v11, v2, v8, v6
    vmsumshm v11, v3, v9, v11
    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3

    vpkuwum v10, v10, v11       ;# v10 = A0 A1  B0 B1  A2 A3  B2 B3
    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
.endm
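
;# Sketch of what each 32-bit lane accumulates above (illustrative names,
;# not literal source): vmsumshm adds two 16x16 products plus the previous
;# lane value, so the pair of vmsumshm steps yields roughly
;#     acc = c0*x0 + c1*x1 + c2*x2 + c3*x3 + round_bias
;# a four-tap dot product of one input row against a dct_tab row, which
;# vsraw then scales back down by the shift count held in v7.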

;# Vertical xf on two rows. DCT values in comments are for inverse transform;
;# forward transform uses transpose.

.macro two_rows_vert Ceven, Codd
    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10 or c02 c12 four times
    vspltw  v9, \Codd,  0       ;# v9 = c20 c30 or c22 c32 ""
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v10, v8, v7

    vspltw  v8, \Codd,  1       ;# v8 = c01 c11 or c03 c13
    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31 or c23 c33
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v8, v8, v7

    vpkuwum v8, v10, v8         ;# v8 = rows 0,1 or 2,3
.endm
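
;# Descriptive note: relative to two_rows_horiz the roles are swapped here.
;# The coefficients are splatted across the lanes with vspltw while v12/v13
;# carry the horizontally transformed rows; at the call sites v6 holds
;# Vround and v7 the 16-bit shift, so each lane ends up as a column-wise
;# dot product of the four H rows.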

.macro two_rows_h Dest
    stw     r0,  0(r8)
    lwz     r0,  4(r3)
    stw     r0,  4(r8)
    lwzux   r0, r3, r5
    stw     r0,  8(r8)
    lwz     r0,  4(r3)
    stw     r0, 12(r8)
    lvx     v8,  0, r8
    two_rows_horiz \Dest
.endm
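
;# Descriptive note: two_rows_h stages two pitch-separated 8-byte rows
;# through the 16-byte aligned scratch slot at r8 so that a single lvx can
;# pick them up as one vector (lvx ignores the low four address bits), then
;# hands the packed vector to two_rows_horiz.  Callers preload r0 with the
;# first word of the row pair before invoking the macro.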

    .align 2
;# r3 short *input
;# r4 short *output
;# r5 int pitch
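;#
;# A plausible C prototype implied by the register notes above (an
;# assumption, not stated in this file):
;#     void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch);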
vp8_short_fdct4x4_ppc:

    prologue

    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
    addi    r8, r1, 0

    lwz     r0, 0(r3)
    two_rows_h v12              ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13              ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4
    two_rows_vert v2, v3
    stvx    v8, r6, r4

    epilogue

    blr

    .align 2
;# r3 short *input
;# r4 short *output
;# r5 int pitch
vp8_short_fdct8x4_ppc:
    prologue

    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
    addi    r8, r1, 0
    addi    r10, r3, 0

    lwz     r0, 0(r3)
    two_rows_h v12              ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13              ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4
    two_rows_vert v2, v3
    stvx    v8, r6, r4

    ;# Next block
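    ;# (r10 still holds the saved input pointer, so r3 advances 8 bytes,
    ;# i.e. four shorts, to the right-hand 4x4 block, and r4 advances
    ;# 32 bytes to the next 16 output coefficients.)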
    addi    r3, r10, 8
    addi    r4, r4, 32
    lvx     v6, 0, r9           ;# v6 = Hround

    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
    addi    r8, r1, 0

    lwz     r0, 0(r3)
    two_rows_h v12              ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13              ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4
    two_rows_vert v2, v3
    stvx    v8, r6, r4

    epilogue

    blr

    .data
    .align 4
ppc_dctperm_tab:
    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15

    .align 4
dct_tab:
    .short  23170,  23170, -12540, -30274,  23170,  23170, -12540, -30274
    .short  23170,  23170,  30274,  12540,  23170,  23170,  30274,  12540

    .short  23170, -23170,  30274, -12540,  23170, -23170,  30274, -12540
    .short -23170,  23170,  12540, -30274, -23170,  23170,  12540, -30274
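
;# The coefficients above look like Q15 fixed-point trig values (an
;# observation, not stated in the source):
;#     23170 ~= 32768/sqrt(2),  30274 ~= 32768*cos(pi/8),
;#     12540 ~= 32768*sin(pi/8)
;# which matches the usual 4-point DCT butterfly constants.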

    .align 4
round_tab:
    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
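
;# Descriptive note: these are the rounding biases for the two arithmetic
;# shifts used above, 1<<13 paired with the >>14 horizontal shift (Hround)
;# and 1<<15 paired with the >>16 vertical shift (Vround), i.e.
;# round-to-nearest before each right shift.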