Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(9)

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm

Issue 812033011: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 EXPORT |vp9_iht4x4_16_add_neon| ; make the entry point visible to the linker
12 ARM ; assemble what follows as ARM (not Thumb) instructions
13 REQUIRE8 ; this file requires 8-byte alignment of the stack
14 PRESERVE8 ; this file preserves 8-byte alignment of the stack
15
16 AREA ||.text||, CODE, READONLY, ALIGN=2 ; read-only code section, aligned to 2^2 = 4 bytes
17
18 ; Parallel 1D IDCT on all four columns of a 4x4 matrix of 16-bit values held
19 ; in d16-d19 (d16 = input[0], d17 = input[1], d18 = input[2], d19 = input[3]).
20 ; In: d0 = cospi_8_64, d1 = cospi_16_64, d2 = cospi_24_64.
21 ; Out: d16-d19 = output[0]..output[3], overwriting the input.
22 ; Scratch: q10-q15 are clobbered as intermediate buffers.
23 MACRO
24 IDCT4x4_1D
25 ; stage 1: sum/difference of input[0] and input[2], rotation of input[1] and input[3]
26 vadd.s16 d23, d16, d18 ; (input[0] + input[2])
27 vsub.s16 d24, d16, d18 ; (input[0] - input[2])
28
29 vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64 (widened to 32-bit lanes)
30 vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64
31 vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64
32 vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64
33 vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64
34 vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64
35
36 ; dct_const_round_shift: rounding, saturating narrow by 14 bits back to 16-bit lanes
37 vqrshrn.s32 d26, q13, #14 ; step[0], packed into the low half of q13
38 vqrshrn.s32 d27, q14, #14 ; step[1], packed into the high half of q13
39 vqrshrn.s32 d29, q15, #14 ; step[2], packed into the high half of q14
40 vqrshrn.s32 d28, q10, #14 ; step[3], packed into the low half of q14
41
42 ; stage 2: with q13 = {step[0], step[1]} and q14 = {step[3], step[2]}
43 ; output[0] = step[0] + step[3];
44 ; output[1] = step[1] + step[2];
45 ; output[3] = step[0] - step[3];
46 ; output[2] = step[1] - step[2];
47 vadd.s16 q8, q13, q14 ; d16 = output[0], d17 = output[1]
48 vsub.s16 q9, q13, q14 ; d18 = output[3], d19 = output[2]
49 vswp d18, d19 ; reorder so that d18 = output[2], d19 = output[3]
50 MEND
51
52 ; Parallel 1D IADST on all four columns of a 4x4 matrix of 16-bit values held
53 ; in d16-d19 (x0 = d16, x1 = d17, x2 = d18, x3 = d19).
54 ; In: d3 = sinpi_1_9, d4 = sinpi_2_9, d5 = sinpi_4_9, d6 = sinpi_3_9, and r0
55 ; must also hold sinpi_3_9 (it is broadcast into 32-bit lanes inside the macro).
56 ; Out: d16-d19, overwriting the input. Scratch: q10-q15 are clobbered.
57 MACRO
58 IADST4x4_1D
59 vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0
60 vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0
61 vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1
62 vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2
63 vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2
64 vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit
65 vaddw.s16 q15, q15, d19 ; x0 + x3
66 vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3 (overwrites d16/d17: x0 and x1 are not read again)
67 vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2
68 vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3 (overwrites d18/d19: last read of x2 and x3)
69
70 vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5
71 vadd.s32 q10, q10, q8
72 vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6
73 vdup.32 q8, r0 ; broadcast sinpi_3_9 from r0 into 32-bit lanes (s5 in q8 is already consumed)
74 vsub.s32 q11, q11, q9
75 vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7
76
77 vadd.s32 q13, q10, q12 ; s0 = x0 + x3 (x3 is the s2 product held in q12)
78 vadd.s32 q10, q10, q11 ; x0 + x1
79 vadd.s32 q14, q11, q12 ; s1 = x1 + x3
80 vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3
81
82 ; dct_const_round_shift: rounding, saturating narrow by 14 bits back to 16-bit lanes
83 vqrshrn.s32 d16, q13, #14 ; d16 = s0
84 vqrshrn.s32 d17, q14, #14 ; d17 = s1
85 vqrshrn.s32 d18, q15, #14 ; d18 = x2
86 vqrshrn.s32 d19, q10, #14 ; d19 = s3
87 MEND
88
89 ; Generate cosine constants in d0 - d2 for the IDCT. Clobbers r0, r3 and r12.
90 MACRO
91 GENERATE_COSINE_CONSTANTS
92 ; cospi_8_64 = 15137 = 0x3b21
93 mov r0, #0x3b00 ; each constant is built in two steps: high byte first
94 add r0, #0x21 ; then the low byte is added
95 ; cospi_16_64 = 11585 = 0x2d41
96 mov r3, #0x2d00
97 add r3, #0x41
98 ; cospi_24_64 = 6270 = 0x187e
99 mov r12, #0x1800
100 add r12, #0x7e
101
102 ; generate constant vectors: copy each value into every 16-bit lane
103 vdup.16 d0, r0 ; duplicate cospi_8_64
104 vdup.16 d1, r3 ; duplicate cospi_16_64
105 vdup.16 d2, r12 ; duplicate cospi_24_64
106 MEND
107
108 ; Generate sine constants in d3 - d6 for the IADST. Clobbers r0, r3 and r12.
109 MACRO
110 GENERATE_SINE_CONSTANTS
111 ; sinpi_1_9 = 5283 = 0x14A3
112 mov r0, #0x1400
113 add r0, #0xa3
114 ; sinpi_2_9 = 9929 = 0x26C9
115 mov r3, #0x2600
116 add r3, #0xc9
117 ; sinpi_4_9 = 15212 = 0x3B6C
118 mov r12, #0x3b00
119 add r12, #0x6c
120
121 ; generate constant vectors: copy each value into every 16-bit lane
122 vdup.16 d3, r0 ; duplicate sinpi_1_9
123
124 ; sinpi_3_9 = 13377 = 0x3441 (r0 is reused and still holds this value when the macro ends)
125 mov r0, #0x3400
126 add r0, #0x41
127
128 vdup.16 d4, r3 ; duplicate sinpi_2_9
129 vdup.16 d5, r12 ; duplicate sinpi_4_9
130 vdup.16 q3, r0 ; duplicate sinpi_3_9 into q3, i.e. both d6 and d7
131 MEND
132
133 ; Transpose a 4x4 matrix of 16-bit values in place. The data is held in d16-d19.
134 MACRO
135 TRANSPOSE4X4
136 vtrn.16 d16, d17 ; transpose 16-bit elements between d16 and d17
137 vtrn.16 d18, d19 ; transpose 16-bit elements between d18 and d19
138 vtrn.32 q8, q9 ; transpose 32-bit element pairs between {d16,d17} and {d18,d19}
139 MEND
140
141 AREA Block, CODE, READONLY ; name this block of code
142 ;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
143 ; int dest_stride, int tx_type)
144 ; Runs a row pass and a column pass (IDCT or IADST, chosen by tx_type) on the 4x4 input, then adds the rounded result to dest with clipping.
145 ; r0 int16_t *input
146 ; r1 uint8_t *dest
147 ; r2 int dest_stride
148 ; r3 int tx_type
149 ; This function will only handle tx_type of 1,2,3: 2 and 3 are tested explicitly, any other value takes the iadst_idct path.
150 |vp9_iht4x4_16_add_neon| PROC
151
152 ; load the inputs (16 coefficients) into d16-d19; r0 is not used as a pointer again
153 vld1.s16 {q8,q9}, [r0]!
154
155 ; transpose the input data
156 TRANSPOSE4X4
157
158 ; decide the type of transform; r3 is tested here because the constant macros overwrite it
159 cmp r3, #2
160 beq idct_iadst
161 cmp r3, #3
162 beq iadst_iadst
163
164 iadst_idct
165 ; fall-through path; generate constants (sine macro last, so r0 is left holding sinpi_3_9 for IADST4x4_1D)
166 GENERATE_COSINE_CONSTANTS
167 GENERATE_SINE_CONSTANTS
168
169 ; first transform rows
170 IDCT4x4_1D
171
172 ; transpose the matrix
173 TRANSPOSE4X4
174
175 ; then transform columns
176 IADST4x4_1D
177
178 b end_vp9_iht4x4_16_add_neon
179
180 idct_iadst
181 ; tx_type == 2; generate constants (sine macro last, so r0 is left holding sinpi_3_9 for IADST4x4_1D)
182 GENERATE_COSINE_CONSTANTS
183 GENERATE_SINE_CONSTANTS
184
185 ; first transform rows
186 IADST4x4_1D
187
188 ; transpose the matrix
189 TRANSPOSE4X4
190
191 ; then transform columns
192 IDCT4x4_1D
193
194 b end_vp9_iht4x4_16_add_neon
195
196 iadst_iadst
197 ; tx_type == 3; generate constants (only the sine set is needed)
198 GENERATE_SINE_CONSTANTS
199
200 ; first transform rows
201 IADST4x4_1D
202
203 ; transpose the matrix
204 TRANSPOSE4X4
205
206 ; then transform columns
207 IADST4x4_1D
208
209 end_vp9_iht4x4_16_add_neon
210 ; ROUND_POWER_OF_TWO(temp_out[j], 4)
211 vrshr.s16 q8, q8, #4
212 vrshr.s16 q9, q9, #4
213
214 vld1.32 {d26[0]}, [r1], r2 ; 4 bytes of dest row 0, then r1 += dest_stride
215 vld1.32 {d26[1]}, [r1], r2 ; dest row 1
216 vld1.32 {d27[0]}, [r1], r2 ; dest row 2
217 vld1.32 {d27[1]}, [r1] ; dest row 3; no post-increment, so r1 stays on row 3
218
219 ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
220 vaddw.u8 q8, q8, d26 ; rows 0 and 1: dest bytes are widened to 16 bits before the add
221 vaddw.u8 q9, q9, d27 ; rows 2 and 3
222
223 ; clip_pixel: saturating narrow from signed 16-bit to unsigned 8-bit
224 vqmovun.s16 d26, q8
225 vqmovun.s16 d27, q9
226
227 ; do the stores in reverse order with negative post-increment, by changing
228 ; the sign of the stride
229 rsb r2, r2, #0 ; r2 = -dest_stride
230 vst1.32 {d27[1]}, [r1], r2 ; row 3, then r1 -= dest_stride
231 vst1.32 {d27[0]}, [r1], r2 ; row 2
232 vst1.32 {d26[1]}, [r1], r2 ; row 1
233 vst1.32 {d26[0]}, [r1] ; no post-increment
234 bx lr
235 ENDP ; |vp9_iht4x4_16_add_neon|
236
237 END
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698