source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm - Issue 812033011: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm

Issue 812033011: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 ;

2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.

3 ;

4 ; Use of this source code is governed by a BSD-style license

5 ; that can be found in the LICENSE file in the root of the source

6 ; tree. An additional intellectual property rights grant can be found

7 ; in the file PATENTS. All contributing project authors may

8 ; be found in the AUTHORS file in the root of the source tree.

9 ;

10

11 EXPORT \|vp9_iht4x4_16_add_neon\| ; make the routine's symbol visible outside this object file

12 ARM ; the code below is ARM-state (not Thumb) instructions

13 REQUIRE8 ; this code requires 8-byte stack alignment from its callers

14 PRESERVE8 ; this code keeps the stack 8-byte aligned

15

16 AREA \|\|.text\|\|, CODE, READONLY, ALIGN=2 ; read-only code section, aligned to 2^2 = 4 bytes

17

18 ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are

19 ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain

20 ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back

21 ; into d16-d19 registers. This macro will touch q10- q15 registers and use

22 ; them as buffer during calculation.

23 MACRO

24 IDCT4x4_1D

; In:      d16-d19 = input[0..3] (four 16-bit lanes each),
;          d0 = cospi_8_64, d1 = cospi_16_64, d2 = cospi_24_64.
; Out:     d16-d19 = output[0..3].
; Scratch: d23, d24, q10, q13, q14, q15.

25 ; stage 1

26 vadd.s16 d23, d16, d18 ; (input[0] + input[2])

27 vsub.s16 d24, d16, d18 ; (input[0] - input[2])

28

; widening 16x16 -> 32-bit multiplies; the two multiply-accumulates below
; finish the step[2] / step[3] sums in q15 / q10
29 vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64

30 vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64

31 vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64

32 vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64

33 vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64

34 vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64

35

36 ; dct_const_round_shift: rounding, saturating narrow by 14 bits back to 16 bit.
; The destinations are chosen so that q13 = {step[0], step[1]} and
; q14 = {step[3], step[2]}, which lets stage 2 run as two q-register ops.

37 vqrshrn.s32 d26, q13, #14 ; d26 = step[0]

38 vqrshrn.s32 d27, q14, #14 ; d27 = step[1]

39 vqrshrn.s32 d29, q15, #14 ; d29 = step[2]

40 vqrshrn.s32 d28, q10, #14 ; d28 = step[3]

41

42 ; stage 2

43 ; output[0] = step[0] + step[3];

44 ; output[1] = step[1] + step[2];

45 ; output[3] = step[0] - step[3];

46 ; output[2] = step[1] - step[2];

47 vadd.s16 q8, q13, q14 ; d16 = output[0], d17 = output[1]

48 vsub.s16 q9, q13, q14 ; d18 = output[3], d19 = output[2]

49 vswp d18, d19 ; put output[2] in d18 and output[3] in d19

50 MEND

51

52 ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which is

53 ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.

54 ; d5 must contain sinpi_4_9. d6 and r0 must contain sinpi_3_9. The output

55 ; will be stored back into d16-d19 registers. This macro will touch q10,

56 ; q11,q12,q13,q14,q15 registers and use them as buffer during calculation.

57 MACRO

58 IADST4x4_1D

; In:      d16-d19 = x0..x3 (four 16-bit lanes each),
;          d3 = sinpi_1_9, d4 = sinpi_2_9, d5 = sinpi_4_9, d6 = sinpi_3_9,
;          r0 = sinpi_3_9 (read by the vdup.32 below).
; Out:     d16-d19 = the four output rows.
; Scratch: q10-q15; q8/q9 are also reused as temporaries once the inputs
;          they hold have been consumed.

59 vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0

60 vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0

61 vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1

62 vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2

63 vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2

64 vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit

65 vaddw.s16 q15, q15, d19 ; x0 + x3

66 vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3 (overwrites x0/x1, which are no longer needed)

67 vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2

68 vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3 (overwrites x2/x3, last use of both)

69

70 vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5

71 vadd.s32 q10, q10, q8 ; (adds the s5 term; q8 is free afterwards)

72 vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6

73 vdup.32 q8, r0 ; duplicate sinpi_3_9 from r0 into every 32-bit lane of q8

74 vsub.s32 q11, q11, q9 ; (subtracts the s6 term)

75 vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7

76

; In the comments below "x3" refers to q12, i.e. s2 = sinpi_3_9 * x1.
77 vadd.s32 q13, q10, q12 ; s0 = x0 + x3

78 vadd.s32 q10, q10, q11 ; x0 + x1

79 vadd.s32 q14, q11, q12 ; s1 = x1 + x3

80 vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3

81

82 ; dct_const_round_shift: rounding, saturating narrow by 14 bits back to 16 bit

83 vqrshrn.s32 d16, q13, #14 ; d16 = round(s0)

84 vqrshrn.s32 d17, q14, #14 ; d17 = round(s1)

85 vqrshrn.s32 d18, q15, #14 ; d18 = round(x2)

86 vqrshrn.s32 d19, q10, #14 ; d19 = round(s3)

87 MEND

88

89 ; Generate cosine constants in d0 - d2 for the IDCT

90 MACRO

91 GENERATE_COSINE_CONSTANTS

; Out:      d0 = cospi_8_64, d1 = cospi_16_64, d2 = cospi_24_64, each
;           replicated into all four 16-bit lanes.
; Clobbers: r0, r3, r12. r3 carries tx_type on entry to
;           vp9_iht4x4_16_add_neon, so this macro may only be used once
;           tx_type is no longer needed.
; Each constant is assembled as high byte (mov) plus low byte (add).

92 ; cospi_8_64 = 15137 = 0x3b21

93 mov r0, #0x3b00

94 add r0, #0x21

95 ; cospi_16_64 = 11585 = 0x2d41

96 mov r3, #0x2d00

97 add r3, #0x41

98 ; cospi_24_64 = 6270 = 0x187e

99 mov r12, #0x1800

100 add r12, #0x7e

101

102 ; generate constant vectors

103 vdup.16 d0, r0 ; duplicate cospi_8_64

104 vdup.16 d1, r3 ; duplicate cospi_16_64

105 vdup.16 d2, r12 ; duplicate cospi_24_64

106 MEND

107

108 ; Generate sine constants in d3 - d6 for the IADST.

109 MACRO

110 GENERATE_SINE_CONSTANTS

; Out:      d3 = sinpi_1_9, d4 = sinpi_2_9, d5 = sinpi_4_9 and
;           q3 (d6 and d7) = sinpi_3_9, each replicated into every 16-bit lane.
;           r0 is left holding sinpi_3_9, which IADST4x4_1D reads.
; Clobbers: r0, r3, r12. r3 carries tx_type on entry to
;           vp9_iht4x4_16_add_neon, so this macro may only be used once
;           tx_type is no longer needed.
; Each constant is assembled as high byte (mov) plus low byte (add).

111 ; sinpi_1_9 = 5283 = 0x14A3

112 mov r0, #0x1400

113 add r0, #0xa3

114 ; sinpi_2_9 = 9929 = 0x26C9

115 mov r3, #0x2600

116 add r3, #0xc9

117 ; sinpi_4_9 = 15212 = 0x3B6C

118 mov r12, #0x3b00

119 add r12, #0x6c

120

121 ; generate constant vectors

122 vdup.16 d3, r0 ; duplicate sinpi_1_9 (r0 is reused for sinpi_3_9 right after)

123

124 ; sinpi_3_9 = 13377 = 0x3441

125 mov r0, #0x3400

126 add r0, #0x41

127

128 vdup.16 d4, r3 ; duplicate sinpi_2_9

129 vdup.16 d5, r12 ; duplicate sinpi_4_9

130 vdup.16 q3, r0 ; duplicate sinpi_3_9 into all of q3 (d6 and d7)

131 MEND

132

133 ; Transpose a 4x4 16bits data matrix. The data is loaded in d16-d19.

134 MACRO

135 TRANSPOSE4X4

; In-place transpose of the 4x4 matrix of 16-bit elements whose rows are
; d16, d17, d18, d19. No other registers are touched.

136 vtrn.16 d16, d17 ; transpose the 2x2 blocks of 16-bit elements in rows 0 and 1

137 vtrn.16 d18, d19 ; same for rows 2 and 3

138 vtrn.32 q8, q9 ; exchange 32-bit pairs between {d16,d17} and {d18,d19} to finish the transpose

139 MEND

140

141 AREA Block, CODE, READONLY ; name this block of code

142 ;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,

143 ; int dest_stride, int tx_type)

144 ;

145 ; r0 int16_t *input

146 ; r1 uint8_t *dest

147 ; r2 int dest_stride

148 ; r3 int tx_type

149 ; This function will only handle tx_type of 1,2,3.

150 \|vp9_iht4x4_16_add_neon\| PROC

; Runs two 1D transform passes (IDCT or IADST each, selected by tx_type in
; r3) over the 4x4 block of 16-bit coefficients at r0, rounds the result by
; 4 bits, adds it to the 4x4 block of 8-bit pixels at r1 (row pitch r2) and
; writes the clipped pixels back to r1.
; Overwrites r0, r1, r2, r3, r12 and the NEON registers used by the macros.

151

152 ; load the inputs into d16-d19

153 vld1.s16 {q8,q9}, [r0]! ; r0 is advanced here, then overwritten by the constant macros

154

155 ; transpose the input data

156 TRANSPOSE4X4

157

158 ; decide the type of transform

159 cmp r3, #2

160 beq idct_iadst

161 cmp r3, #3

162 beq iadst_iadst

163

164 iadst_idct ; reached by fall-through for every tx_type other than 2 or 3

165 ; generate constants

; The sine constants are generated last so that r0 still holds sinpi_3_9
; when IADST4x4_1D reads it.
166 GENERATE_COSINE_CONSTANTS

167 GENERATE_SINE_CONSTANTS

168

169 ; first transform rows

170 IDCT4x4_1D

171

172 ; transpose the matrix

173 TRANSPOSE4X4

174

175 ; then transform columns

176 IADST4x4_1D

177

178 b end_vp9_iht4x4_16_add_neon

179

180 idct_iadst ; tx_type == 2

181 ; generate constants

; The sine constants are generated last so that r0 still holds sinpi_3_9
; when IADST4x4_1D reads it.
182 GENERATE_COSINE_CONSTANTS

183 GENERATE_SINE_CONSTANTS

184

185 ; first transform rows

186 IADST4x4_1D

187

188 ; transpose the matrix

189 TRANSPOSE4X4

190

191 ; then transform columns

192 IDCT4x4_1D

193

194 b end_vp9_iht4x4_16_add_neon

195

196 iadst_iadst ; tx_type == 3

197 ; generate constants (only the sine set is needed; no IDCT pass here)

198 GENERATE_SINE_CONSTANTS

199

200 ; first transform rows

201 IADST4x4_1D

202

203 ; transpose the matrix

204 TRANSPOSE4X4

205

206 ; then transform columns

207 IADST4x4_1D

208

209 end_vp9_iht4x4_16_add_neon ; common tail: d16-d19 hold the four transformed rows

210 ; ROUND_POWER_OF_TWO(temp_out[j], 4)

211 vrshr.s16 q8, q8, #4

212 vrshr.s16 q9, q9, #4

213

; Load four destination rows of 4 bytes each; r1 advances by dest_stride
; after each of the first three loads and is left pointing at the last row.
214 vld1.32 {d26[0]}, [r1], r2

215 vld1.32 {d26[1]}, [r1], r2

216 vld1.32 {d27[0]}, [r1], r2

217 vld1.32 {d27[1]}, [r1]

218

219 ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]

220 vaddw.u8 q8, q8, d26 ; rows 0-1: widen the 8-bit pixels and add to the 16-bit residual

221 vaddw.u8 q9, q9, d27 ; rows 2-3

222

223 ; clip_pixel: saturating narrow from signed 16 bit to unsigned 8 bit

224 vqmovun.s16 d26, q8

225 vqmovun.s16 d27, q9

226

227 ; do the stores in reverse order with negative post-increment, by changing

228 ; the sign of the stride

229 rsb r2, r2, #0 ; r2 = -dest_stride

230 vst1.32 {d27[1]}, [r1], r2

231 vst1.32 {d27[0]}, [r1], r2

232 vst1.32 {d26[1]}, [r1], r2

233 vst1.32 {d26[0]}, [r1] ; no post-increment

234 bx lr

235 ENDP ; \|vp9_iht4x4_16_add_neon\|

236

237 END

OLD	NEW