Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(74)

Side by Side Diff: source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm

Issue 7671004: Update libvpx snapshot to v0.9.7-p1 (Cayuga). (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: '' Created 9 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLD NEW
1 ; 1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
11 11
12 EXPORT |vp8_fast_quantize_b_neon_func| 12 EXPORT |vp8_fast_quantize_b_neon|
13 EXPORT |vp8_fast_quantize_b_pair_neon|
14
15 INCLUDE asm_enc_offsets.asm
13 16
14 ARM 17 ARM
15 REQUIRE8 18 REQUIRE8
16 PRESERVE8 19 PRESERVE8
17 20
18 AREA ||.text||, CODE, READONLY, ALIGN=2 21 AREA ||.text||, CODE, READONLY, ALIGN=4
19 22
20 ; r0 short *coeff_ptr 23 ;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
21 ; r1 short *zbin_ptr 24 |vp8_fast_quantize_b_pair_neon| PROC
22 ; r2 short *qcoeff_ptr 25
23 ; r3 short *dqcoeff_ptr 26 stmfd sp!, {r4-r9}
24 ; stack short *dequant_ptr 27 vstmdb sp!, {q4-q7}
25 ; stack short *scan_mask 28
26 ; stack short *round_ptr 29 ldr r4, [r0, #vp8_block_coeff]
27 ; stack short *quant_ptr 30 ldr r5, [r0, #vp8_block_quant_fast]
28 31 ldr r6, [r0, #vp8_block_round]
29 ; return int * eob 32
30 |vp8_fast_quantize_b_neon_func| PROC 33 vld1.16 {q0, q1}, [r4@128] ; load z
31 vld1.16 {q0, q1}, [r0] ;load z 34
32 vld1.16 {q10, q11}, [r1] ;load zbin 35 ldr r7, [r2, #vp8_blockd_qcoeff]
33 36
34 vabs.s16 q4, q0 ;calculate x = abs(z) 37 vabs.s16 q4, q0 ; calculate x = abs(z)
35 vabs.s16 q5, q1 38 vabs.s16 q5, q1
36 39
37 vcge.s16 q10, q4, q10 ;x>=zbin
38 vcge.s16 q11, q5, q11
39
40 ;if x<zbin (q10 & q11 are all 0), go to zero_output
41 vorr.s16 q6, q10, q11
42 vorr.s16 d12, d12, d13
43 vmov r0, r1, d12
44 orr r0, r0, r1
45 cmp r0, #0
46 beq zero_output
47
48 ldr r0, [sp, #8] ;load round_ptr
49 ldr r12, [sp, #12] ;load quant_ptr
50
51 ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative 40 ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
52 vshr.s16 q2, q0, #15 ; sz 41 vshr.s16 q2, q0, #15 ; sz
53 vshr.s16 q3, q1, #15 42 vshr.s16 q3, q1, #15
54 43
55 vld1.s16 {q6, q7}, [r0] ;load round_ptr [0-15] 44 vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
56 vld1.s16 {q8, q9}, [r12] ;load quant_ptr [0-15] 45 vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
57 46
58 vadd.s16 q4, q6 ;x + Round 47 ldr r4, [r1, #vp8_block_coeff]
48
49 vadd.s16 q4, q6 ; x + Round
59 vadd.s16 q5, q7 50 vadd.s16 q5, q7
60 51
61 ldr r0, [sp, #4] ;load rvsplus1_scan_order ptr 52 vld1.16 {q0, q1}, [r4@128] ; load z2
62 53
63 vqdmulh.s16 q4, q8 ;y = ((Round + abs(z)) * Quant) >> 16 54 vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16
64 vqdmulh.s16 q5, q9 55 vqdmulh.s16 q5, q9
65 56
66 vld1.16 {q0, q1}, [r0] ;load rvsplus1_scan_order 57 vabs.s16 q10, q0 ; calculate x2 = abs(z_2)
67 vceq.s16 q8, q8 ;set q8 to all 1 58 vabs.s16 q11, q1
68 59 vshr.s16 q12, q0, #15 ; sz2
69 vshr.s16 q4, #1 ;right shift 1 after vqdmulh 60 vshr.s16 q13, q1, #15
61
62 ;modify data to have its original sign
63 veor.s16 q4, q2 ; y^sz
64 veor.s16 q5, q3
65
66 vadd.s16 q10, q6 ; x2 + Round
67 vadd.s16 q11, q7
68
69 ldr r8, [r2, #vp8_blockd_dequant]
70
71 vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16
72 vqdmulh.s16 q11, q9
73
74 vshr.s16 q4, #1 ; right shift 1 after vqdmulh
70 vshr.s16 q5, #1 75 vshr.s16 q5, #1
71 76
77 vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]
78
79 vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
80 vsub.s16 q5, q3
81
82 vshr.s16 q10, #1 ; right shift 1 after vqdmulh
83 vshr.s16 q11, #1
84
85 ldr r9, [r2, #vp8_blockd_dqcoeff]
86
87 veor.s16 q10, q12 ; y2^sz2
88 veor.s16 q11, q13
89
90 vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1
91
92
93 vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
94 vsub.s16 q11, q13
95
96 ldr r6, [r3, #vp8_blockd_qcoeff]
97
98 vmul.s16 q2, q6, q4 ; x * Dequant
99 vmul.s16 q3, q7, q5
100
101 ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table
102
103 vceq.s16 q8, q8 ; set q8 to all 1
104
105 vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2
106
107 vmul.s16 q12, q6, q10 ; x2 * Dequant
108 vmul.s16 q13, q7, q11
109
110 vld1.16 {q6, q7}, [r0@128] ; load inverse scan order
111
112 vtst.16 q14, q4, q8 ; now find eob
113 vtst.16 q15, q5, q8 ; non-zero element is set to all 1
114
115 vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant
116
117 ldr r7, [r3, #vp8_blockd_dqcoeff]
118
119 vand q0, q6, q14 ; get all valid numbers from scan array
120 vand q1, q7, q15
121
122 vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant
123
124 vtst.16 q2, q10, q8 ; now find eob
125 vtst.16 q3, q11, q8 ; non-zero element is set to all 1
126
127 vmax.u16 q0, q0, q1 ; find maximum value in q0, q1
128
129 vand q10, q6, q2 ; get all valid numbers from scan array
130 vand q11, q7, q3
131 vmax.u16 q10, q10, q11 ; find maximum value in q10, q11
132
133 vmax.u16 d0, d0, d1
134 vmax.u16 d20, d20, d21
135 vmovl.u16 q0, d0
136 vmovl.u16 q10, d20
137
138
139 vmax.u32 d0, d0, d1
140 vmax.u32 d20, d20, d21
141 vpmax.u32 d0, d0, d0
142 vpmax.u32 d20, d20, d20
143
144 add r4, r2, #vp8_blockd_eob
145 add r5, r3, #vp8_blockd_eob
146
147 vst1.32 {d0[0]}, [r4@32]
148 vst1.32 {d20[0]}, [r5@32]
149
150 vldmia sp!, {q4-q7}
151 ldmfd sp!, {r4-r9}
152 bx lr
153
154 ENDP
155
156 ;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
157 |vp8_fast_quantize_b_neon| PROC
158
159 stmfd sp!, {r4-r7}
160
161 ldr r3, [r0, #vp8_block_coeff]
162 ldr r4, [r0, #vp8_block_quant_fast]
163 ldr r5, [r0, #vp8_block_round]
164
165 vld1.16 {q0, q1}, [r3@128] ; load z
166 vorr.s16 q14, q0, q1 ; check if all zero (step 1)
167 ldr r6, [r1, #vp8_blockd_qcoeff]
168 ldr r7, [r1, #vp8_blockd_dqcoeff]
169 vorr.s16 d28, d28, d29 ; check if all zero (step 2)
170
171 vabs.s16 q12, q0 ; calculate x = abs(z)
172 vabs.s16 q13, q1
173
174 ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
175 vshr.s16 q2, q0, #15 ; sz
176 vmov r2, r3, d28 ; check if all zero (step 3)
177 vshr.s16 q3, q1, #15
178
179 vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15]
180 vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15]
181
182 vadd.s16 q12, q14 ; x + Round
183 vadd.s16 q13, q15
184
185 ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table
186
187 vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16
188 vqdmulh.s16 q13, q9
189
190 vld1.16 {q10, q11}, [r0@128]; load inverse scan order
191
192 vceq.s16 q8, q8 ; set q8 to all 1
193
194 ldr r4, [r1, #vp8_blockd_dequant]
195
196 vshr.s16 q12, #1 ; right shift 1 after vqdmulh
197 vshr.s16 q13, #1
198
199 orr r2, r2, r3 ; check if all zero (step 4)
200 cmp r2, #0 ; check if all zero (step 5)
201 beq zero_output ; check if all zero (step 6)
202
72 ;modify data to have its original sign 203 ;modify data to have its original sign
73 veor.s16 q4, q2 ; y^sz 204 veor.s16 q12, q2 ; y^sz
74 veor.s16 q5, q3 205 veor.s16 q13, q3
75 206
76 ldr r12, [sp] ;load dequant_ptr 207 vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
77 208 vsub.s16 q13, q3
78 vsub.s16 q4, q2 ; x1 = (y^sz) - sz = (y^sz) - (-1) (two's complement) 209
79 vsub.s16 q5, q3 210 vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i]
80 211
81 vand.s16 q4, q10 ;mask off x1 elements 212 vtst.16 q14, q12, q8 ; now find eob
82 vand.s16 q5, q11 213 vtst.16 q15, q13, q8 ; non-zero element is set to all 1
83 214
84 vld1.s16 {q6, q7}, [r12] ;load dequant_ptr[i] 215 vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1
85 216
86 vtst.16 q14, q4, q8 ;now find eob 217 vand q10, q10, q14 ; get all valid numbers from scan array
87 vtst.16 q15, q5, q8 ;non-zero element is set to all 1 in q4, q5 218 vand q11, q11, q15
88 219
89 vst1.s16 {q4, q5}, [r2] ;store: qcoeff = x1 220
90 221 vmax.u16 q0, q10, q11 ; find maximum value in q0, q1
91 vand q0, q0, q14 ;get all valid number from rvsplus1_scan_order array
92 vand q1, q1, q15
93
94 vmax.u16 q0, q0, q1 ;find maximum value in q0, q1
95 vmax.u16 d0, d0, d1 222 vmax.u16 d0, d0, d1
96 vmovl.u16 q0, d0 223 vmovl.u16 q0, d0
97 224
98 vmul.s16 q6, q4 ;x * Dequant 225 vmul.s16 q2, q12 ; x * Dequant
99 vmul.s16 q7, q5 226 vmul.s16 q3, q13
100 227
101 vmax.u32 d0, d0, d1 228 vmax.u32 d0, d0, d1
102 vpmax.u32 d0, d0, d0 229 vpmax.u32 d0, d0, d0
103 230
104 vst1.s16 {q6, q7}, [r3] ;store dqcoeff = x * Dequant 231 vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
105 232
106 vmov.32 r0, d0[0] 233 add r4, r1, #vp8_blockd_eob
234 vst1.32 {d0[0]}, [r4@32]
235
236 ldmfd sp!, {r4-r7}
107 bx lr 237 bx lr
108 238
109 zero_output 239 zero_output
110 vst1.s16 {q10, q11}, [r2] ; qcoeff = 0 240 str r2, [r1, #vp8_blockd_eob]
111 vst1.s16 {q10, q11}, [r3] ; dqcoeff = 0 241 vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0
112 mov r0, #0 242 vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0
113 243
244 ldmfd sp!, {r4-r7}
114 bx lr 245 bx lr
115 246
116 ENDP 247 ENDP
117 248
249 ; default inverse zigzag table is defined in vp8/common/entropy.c
250 _inv_zig_zag_
251 DCD inv_zig_zag
252
253 ALIGN 16 ; enable use of @128 bit aligned loads
254 inv_zig_zag
255 DCW 0x0001, 0x0002, 0x0006, 0x0007
256 DCW 0x0003, 0x0005, 0x0008, 0x000d
257 DCW 0x0004, 0x0009, 0x000c, 0x000e
258 DCW 0x000a, 0x000b, 0x000f, 0x0010
259
118 END 260 END
261
OLD NEW

Powered by Google App Engine
This is Rietveld 408576698