source/libvpx/vp8/common/ppc/idctllm_altivec.asm - Issue 1124333011: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp8/common/ppc/idctllm_altivec.asm

Issue 1124333011: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: only update to last night's LKGR Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 ;

2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 ;

4 ; Use of this source code is governed by a BSD-style license

5 ; that can be found in the LICENSE file in the root of the source

6 ; tree. An additional intellectual property rights grant can be found

7 ; in the file PATENTS. All contributing project authors may

8 ; be found in the AUTHORS file in the root of the source tree.

9 ;

10

11

12 .globl short_idct4x4llm_ppc

13

;# load_c V, LABEL, OFF, R0, R1
;#   Load the 16-byte vector stored at LABEL (indexed by OFF) into V.
;#     V     - destination vector register
;#     LABEL - symbol of the constant; addressed absolutely via @ha/@l
;#     OFF   - index operand handed to lvx (every use visible here passes 0)
;#     R0    - scratch GPR, receives the high-adjusted half of LABEL's address
;#     R1    - scratch GPR, receives the complete address of LABEL
;#   R0 and R1 are both overwritten. R0 must not be r0: la is a form of addi,
;#   which reads register 0 as the constant zero.
;#   lvx ignores the low four bits of the effective address, so LABEL is
;#   expected to be 16-byte aligned.
14 .macro load_c V, LABEL, OFF, R0, R1

15 lis \R0, \LABEL@ha ;# R0 = high-adjusted upper 16 bits of LABEL

16 la \R1, \LABEL@l(\R0) ;# R1 = R0 + low 16 bits = address of LABEL

17 lvx \V, \OFF, \R1 ;# V = 16 bytes at (OFF + R1), low 4 address bits ignored

18 .endm

19

;# ---------------------------------------------------------------------
;# short_idct4x4llm_ppc
;#   4x4 inverse transform of 16 signed 16-bit coefficients, computed as
;#   two 1-D butterfly passes with a transpose in between, followed by a
;#   (x + 4) >> 3 rounding step.
;#
;#   In:    r3 = input  (32 bytes, read with lvx at r3 and r3 + 16, so the
;#               low four address bits are ignored by the hardware)
;#          r4 = output (four rows of four shorts, written with stw)
;#          r5 = pitch, added directly to r4 between rows (i.e. in bytes)
;#   Out:   nothing in registers; 4 x 8 bytes stored through r4
;#   Clobb: r4, r6, r9, r10, r11, r12, v0-v12 (VRSAVE is restored on exit)
;#   Stack: a temporary 416-byte frame; only bytes 16..31 of it are used,
;#          as a spill slot for moving vector results to GPRs.  The slot
;#          is deliberately NOT at 0(r1): that word is the back chain
;#          written by stwu and must stay intact for stack walkers.
;#   Needs: the load_c macro and the constant tables sinpi8sqrt2,
;#          cospi8sqrt2minus1, hi_hi, lo_lo and shift_16.
;# ---------------------------------------------------------------------
20 ;# r3 short *input

21 ;# r4 short *output

22 ;# r5 int pitch

23 .align 2

24 short_idct4x4llm_ppc:

25 mfspr r11, 256 ;# get old VRSAVE

26 oris r12, r11, 0xfff8 ;# mark v0-v12 as in use

27 mtspr 256, r12 ;# set VRSAVE

28

29 load_c v8, sinpi8sqrt2, 0, r9, r10

30 load_c v9, cospi8sqrt2minus1, 0, r9, r10

31 load_c v10, hi_hi, 0, r9, r10

32 load_c v11, lo_lo, 0, r9, r10

33 load_c v12, shift_16, 0, r9, r10

34

35 li r10, 16 ;# r10 = 16: lvx index here, spill-slot offset later

36 lvx v0, 0, r3 ;# input ip[0], ip[ 4]

37 lvx v1, r10, r3 ;# input ip[8], ip[12]

38

39 ;# first pass

40 vupkhsh v2, v0 ;# v2 = ip[0..3] sign-extended to words

41 vupkhsh v3, v1 ;# v3 = ip[8..11] sign-extended to words

42 vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]

43 vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]

44

45 vupklsh v0, v0 ;# v0 = ip[4..7] sign-extended to words

46 vmulosh v4, v0, v8 ;# signed multiply of the low halfword of each word

47 vsraw v4, v4, v12

48 vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)

49

50 vupklsh v1, v1 ;# v1 = ip[12..15] sign-extended to words

51 vmulosh v5, v1, v9

52 vsraw v5, v5, v12

53 vaddsws v5, v5, v1 ;# ip[12] * cos(pi/8) * sqrt(2)

54

55 vsubsws v4, v4, v5 ;# c1

56

57 vmulosh v3, v1, v8

58 vsraw v3, v3, v12

59 vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)

60

61 vmulosh v5, v0, v9

62 vsraw v5, v5, v12

63 vaddsws v5, v5, v0 ;# ip[ 4] * cos(pi/8) * sqrt(2)

64

65 vaddsws v3, v3, v5 ;# d1

66

67 vaddsws v0, v6, v3 ;# a1 + d1

68 vsubsws v3, v6, v3 ;# a1 - d1

69

70 vaddsws v1, v7, v4 ;# b1 + c1

71 vsubsws v2, v7, v4 ;# b1 - c1

72

73 ;# transpose input

74 vmrghw v4, v0, v1 ;# a0 b0 a1 b1

75 vmrghw v5, v2, v3 ;# c0 d0 c1 d1

76

77 vmrglw v6, v0, v1 ;# a2 b2 a3 b3

78 vmrglw v7, v2, v3 ;# c2 d2 c3 d3

79

80 vperm v0, v4, v5, v10 ;# a0 b0 c0 d0

81 vperm v1, v4, v5, v11 ;# a1 b1 c1 d1

82

83 vperm v2, v6, v7, v10 ;# a2 b2 c2 d2

84 vperm v3, v6, v7, v11 ;# a3 b3 c3 d3

85

86 ;# second pass (same butterfly, now on the transposed vectors v0..v3)

87 vaddsws v6, v0, v2 ;# a1 = v0 + v2

88 vsubsws v7, v0, v2 ;# b1 = v0 - v2

89

90 vmulosh v4, v1, v8 ;# only the low halfword of each word is multiplied

91 vsraw v4, v4, v12

92 vaddsws v4, v4, v1 ;# v1 * sin(pi/8) * sqrt(2)

93

94 vmulosh v5, v3, v9

95 vsraw v5, v5, v12

96 vaddsws v5, v5, v3 ;# v3 * cos(pi/8) * sqrt(2)

97

98 vsubsws v4, v4, v5 ;# c1

99

100 vmulosh v2, v3, v8

101 vsraw v2, v2, v12

102 vaddsws v2, v2, v3 ;# v3 * sin(pi/8) * sqrt(2)

103

104 vmulosh v5, v1, v9

105 vsraw v5, v5, v12

106 vaddsws v5, v5, v1 ;# v1 * cos(pi/8) * sqrt(2)

107

108 vaddsws v3, v2, v5 ;# d1

109

110 vaddsws v0, v6, v3 ;# a1 + d1

111 vsubsws v3, v6, v3 ;# a1 - d1

112

113 vaddsws v1, v7, v4 ;# b1 + c1

114 vsubsws v2, v7, v4 ;# b1 - c1

115

116 vspltish v6, 4 ;# rounding constant

117 vspltish v7, 3 ;# final shift count

118

119 vpkswss v0, v0, v1 ;# words -> halfwords with signed saturation

120 vpkswss v1, v2, v3

121

122 vaddshs v0, v0, v6 ;# + 4 (saturating)

123 vaddshs v1, v1, v6

124

125 vsrah v0, v0, v7 ;# >> 3 (arithmetic)

126 vsrah v1, v1, v7

127

128 ;# transpose output

129 vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3

130 vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3

131

132 vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1

133 vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3

134

135 stwu r1,-416(r1) ;# create space on the stack

136

;# Spill each result vector to 16(r1) and copy it out one word at a time.
;# r10 still holds 16 from the input load above.  Using 16(r1) instead of
;# 0(r1) leaves the back-chain word that stwu just stored untouched.
137 stvx v0, r10, r1 ;# rows 0 and 1 -> 16..31(r1)

138 lwz r6, 16(r1)

139 stw r6, 0(r4)

140 lwz r6, 20(r1)

141 stw r6, 4(r4)

142

143 add r4, r4, r5 ;# next output row

144

145 lwz r6, 24(r1)

146 stw r6, 0(r4)

147 lwz r6, 28(r1)

148 stw r6, 4(r4)

149

150 add r4, r4, r5 ;# next output row

151

152 stvx v1, r10, r1 ;# rows 2 and 3 -> 16..31(r1)

153 lwz r6, 16(r1)

154 stw r6, 0(r4)

155 lwz r6, 20(r1)

156 stw r6, 4(r4)

157

158 add r4, r4, r5 ;# next output row

159

160 lwz r6, 24(r1)

161 stw r6, 0(r4)

162 lwz r6, 28(r1)

163 stw r6, 4(r4)

164

165 addi r1, r1, 416 ;# recover stack

166

167 mtspr 256, r11 ;# reset old VRSAVE

168

169 blr

170

;# Constant vectors loaded with load_c by short_idct4x4llm_ppc.
;# Each table is 16 bytes and is preceded by its own .align 4.

;# 35468 = sin(pi/8) * sqrt(2) in 16.16 fixed point, replicated in all
;# eight halfwords.  The value does not fit in a signed halfword, so the
;# signed multiply (vmulosh) sees 35468 - 65536; the code compensates by
;# adding the multiplicand back after the >> 16.
171 .align 4

172 sinpi8sqrt2:

173 .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468

174

;# 20091 = cos(pi/8) * sqrt(2) - 1 in 16.16 fixed point, replicated in all
;# eight halfwords.  The "- 1" is restored by the add that follows each
;# multiply/shift pair in the code.
175 .align 4

176 cospi8sqrt2minus1:

177 .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091

178

;# Per-word shift count (16) used as the vsraw operand.
179 .align 4

180 shift_16:

181 .long 16, 16, 16, 16

182

;# vperm control: bytes 0-7 of the first source, then bytes 0-7 of the
;# second source (indices 16-23).
183 .align 4

184 hi_hi:

185 .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23

186

;# vperm control: bytes 8-15 of the first source, then bytes 8-15 of the
;# second source (indices 24-31).
187 .align 4

188 lo_lo:

189 .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31

OLD	NEW

« no previous file with comments | « source/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm ('k') | source/libvpx/vp8/common/ppc/loopfilter_altivec.c » ('j') | no next file with comments »