Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(534)

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm

Issue 812033011: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 EXPORT |vp9_lpf_horizontal_4_neon|
12 EXPORT |vp9_lpf_vertical_4_neon|
13 ARM
14
15 AREA ||.text||, CODE, READONLY, ALIGN=2
16
17 ; Currently vp9 filters 8 pixels per iteration (one unit of count). The vp8
18 ; loop filter works on 16 pixels at a time.
19 ; TODO(fgalligan): See about removing the count code as this function is only
20 ; called with a count of 1.
21 ;
22 ; void vp9_lpf_horizontal_4_neon(uint8_t *s,
23 ; int p /* pitch */,
24 ; const uint8_t *blimit,
25 ; const uint8_t *limit,
26 ; const uint8_t *thresh,
27 ; int count)
28 ;
29 ; r0 uint8_t *s,
30 ; r1 int p, /* pitch */
31 ; r2 const uint8_t *blimit,
32 ; r3 const uint8_t *limit,
33 ; sp const uint8_t *thresh,
34 ; sp+4 int count
35 |vp9_lpf_horizontal_4_neon| PROC
36 push {lr} ; lr is clobbered by the bl below; stack args move up by 4
37
38 vld1.8 {d0[]}, [r2] ; d0 = *blimit replicated to all 8 lanes
39 ldr r2, [sp, #4] ; r2 = thresh pointer (first stack arg, above saved lr)
40 ldr r12, [sp, #8] ; r12 = count (second stack arg)
41 mov r1, r1, lsl #1 ; r1 = 2 * pitch, rows are read through two pointers
42
43 cmp r12, #0
44 beq lf_h4_done ; count == 0: nothing to filter
45
46 vld1.8 {d2[]}, [r2] ; d2 = *thresh replicated to all lanes
47 vld1.8 {d1[]}, [r3] ; d1 = *limit replicated to all lanes
48
49 lf_h4_block
50 sub r2, r0, r1, lsl #1 ; r2 = s - 4 * pitch (row p3)
51 add r3, r2, r1, lsr #1 ; r3 = s - 3 * pitch (row p2)
52
53 vld1.u8 {d3}, [r2@64], r1 ; p3 (r2 walks rows -4, -2, 0, +2)
54 vld1.u8 {d4}, [r3@64], r1 ; p2 (r3 walks rows -3, -1, +1, +3)
55 vld1.u8 {d5}, [r2@64], r1 ; p1
56 vld1.u8 {d6}, [r3@64], r1 ; p0
57 vld1.u8 {d7}, [r2@64], r1 ; q0
58 vld1.u8 {d16}, [r3@64], r1 ; q1
59 vld1.u8 {d17}, [r2@64] ; q2 (no write-back: r2 stays on row +2)
60 vld1.u8 {d18}, [r3@64] ; q3 (no write-back: r3 stays on row +3)
61
62 sub r3, r3, r1, lsl #1 ; r3 = s - pitch (row p0)
63 sub r2, r2, r1, lsl #1 ; r2 = s - 2 * pitch (row p1)
64
65 bl vp9_loop_filter_neon ; in: d0-d7, d16-d18; out: d4-d7
66
67 vst1.u8 {d4}, [r2@64], r1 ; op1 -> row -2
68 vst1.u8 {d5}, [r3@64], r1 ; op0 -> row -1
69 vst1.u8 {d6}, [r2@64], r1 ; oq0 -> row 0
70 vst1.u8 {d7}, [r3@64], r1 ; oq1 -> row +1
71
72 subs r12, r12, #1 ; one block done; Z set when none remain
73 add r0, r0, #8 ; advance s by 8 columns (does not touch flags)
74 bne lf_h4_block
75
76 lf_h4_done
77 pop {pc} ; return through the saved lr
78 ENDP ; |vp9_lpf_horizontal_4_neon|
79
80 ; Currently vp9 filters 8 pixels per iteration (one unit of count). The vp8
81 ; loop filter works on 16 pixels at a time.
82 ; TODO(fgalligan): See about removing the count code as this function is only
83 ; called with a count of 1.
84 ;
85 ; void vp9_lpf_vertical_4_neon(uint8_t *s,
86 ; int p /* pitch */,
87 ; const uint8_t *blimit,
88 ; const uint8_t *limit,
89 ; const uint8_t *thresh,
90 ; int count)
91 ;
92 ; r0 uint8_t *s,
93 ; r1 int p, /* pitch */
94 ; r2 const uint8_t *blimit,
95 ; r3 const uint8_t *limit,
96 ; sp const uint8_t *thresh,
97 ; sp+4 int count
98 |vp9_lpf_vertical_4_neon| PROC
99 push {lr} ; lr is clobbered by the bl below; stack args move up by 4
100 ; Filters a vertical edge: 8 rows per count, 4 pixels on each side of column s.
101 vld1.8 {d0[]}, [r2] ; duplicate *blimit
102 ldr r12, [sp, #8] ; load count (second stack arg, after the push)
103 vld1.8 {d1[]}, [r3] ; duplicate *limit
104
105 ldr r3, [sp, #4] ; load thresh (first stack arg, after the push)
106 sub r2, r0, #4 ; r2 = s - 4: leftmost of the 8 columns to read
107 cmp r12, #0
108 beq end_vp9_lf_v_edge ; count == 0: nothing to filter
109
110 vld1.8 {d2[]}, [r3] ; duplicate *thresh (r3 is free from here on)
111
112 count_lf_v_loop
113 vld1.u8 {d3}, [r2], r1 ; load 8 rows of 8 pixels, one row per register
114 vld1.u8 {d4}, [r2], r1
115 vld1.u8 {d5}, [r2], r1
116 vld1.u8 {d6}, [r2], r1
117 vld1.u8 {d7}, [r2], r1
118 vld1.u8 {d16}, [r2], r1
119 vld1.u8 {d17}, [r2], r1
120 vld1.u8 {d18}, [r2]
121
122 ; transpose the 8x8 byte block: d3..d7, d16..d18 become columns p3..q3
123 vtrn.32 d3, d7
124 vtrn.32 d4, d16
125 vtrn.32 d5, d17
126 vtrn.32 d6, d18
127
128 vtrn.16 d3, d5
129 vtrn.16 d4, d6
130 vtrn.16 d7, d17
131 vtrn.16 d16, d18
132
133 vtrn.8 d3, d4
134 vtrn.8 d5, d6
135 vtrn.8 d7, d16
136 vtrn.8 d17, d18
137
138 bl vp9_loop_filter_neon ; in: d0-d7, d16-d18; out: d4-d7
139
140 sub r3, r0, #2 ; r3 = s - 2: store pointer, so r0 keeps pointing at s
141
142 ;store op1, op0, oq0, oq1 (lane n of d4-d7 -> 4 bytes of row n)
143 vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r3], r1
144 vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r3], r1
145 vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r3], r1
146 vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r3], r1
147 vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r3], r1
148 vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r3], r1
149 vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r3], r1
150 vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r3]
151 ; r0 is still s here; the stores used to walk r0, which made the add below overshoot.
152 add r0, r0, r1, lsl #3 ; s += pitch * 8
153 subs r12, r12, #1
154 subne r2, r0, #4 ; next block: r2 = s - 4 (load pointer)
155 bne count_lf_v_loop
156
157 end_vp9_lf_v_edge
158 pop {pc} ; return through the saved lr
159 ENDP ; |vp9_lpf_vertical_4_neon|
160
161 ; void vp9_loop_filter_neon();
162 ; This is a helper function for the loopfilters. The individual functions do the
163 ; necessary load, transpose (if necessary) and store. The function does not use
164 ; registers d8-d15.
165 ;
166 ; Inputs:
167 ; r0-r3, r12 PRESERVE
168 ; d0 blimit
169 ; d1 limit
170 ; d2 thresh
171 ; d3 p3
172 ; d4 p2
173 ; d5 p1
174 ; d6 p0
175 ; d7 q0
176 ; d16 q1
177 ; d17 q2
178 ; d18 q3
179 ;
180 ; Outputs:
181 ; d4 op1
182 ; d5 op0
183 ; d6 oq0
184 ; d7 oq1
185 |vp9_loop_filter_neon| PROC ; register-only helper: touches no memory and no core registers
186 ; filter_mask: absolute differences between neighbouring taps, 8 lanes at once
187 vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
188 vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
189 vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
190 vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
191 vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) (overwrites p3, no longer needed)
192 vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) (overwrites p2, no longer needed)
193
194 ; only compare the largest value to limit
195 vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
196 vmax.u8 d20, d21, d22 ; m2 = max(m3, m4); d21/d22 stay live for hev
197
198 vabd.u8 d17, d6, d7 ; abs(p0 - q0) (overwrites q2)
199
200 vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
201
202 vmov.u8 d18, #0x80 ; sign-bias constant used for unsigned <-> signed conversion (overwrites q3)
203
204 vmax.u8 d23, d19, d20 ; d23 = max(m1, m2)
205
206 ; hevmask: all-ones lanes where an inner difference exceeds thresh
207 vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
208 vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
209 vmax.u8 d23, d23, d3 ; d23 = max of all six differences
210
211 vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
212 vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 (saturating)
213
214 veor d7, d7, d18 ; qs0 = q0 ^ 0x80
215
216 vcge.u8 d23, d1, d23 ; all-ones where limit >= max difference
217
218 ; filter() function
219 ; convert to signed
220
221 vshr.u8 d28, d28, #1 ; a = a / 2
222 veor d6, d6, d18 ; ps0 = p0 ^ 0x80
223
224 veor d5, d5, d18 ; ps1 = p1 ^ 0x80
225 vqadd.u8 d17, d17, d28 ; d17 = b + a = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 (saturating)
226
227 veor d16, d16, d18 ; qs1 = q1 ^ 0x80
228
229 vmov.u8 d19, #3 ; constant 3: multiplier below and filter2 rounding term
230
231 vsub.s8 d28, d7, d6 ; ( qs0 - ps0), 8-bit wrapping. NOTE(review): presumably only matters in lanes the mask keeps - confirm
232
233 vcge.u8 d17, d0, d17 ; all-ones where blimit >= abs(p0 - q0) * 2 + abs(p1 - q1) / 2
234
235 vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
236 vorr d22, d21, d22 ; hevmask = hev(p side) | hev(q side)
237
238 vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0), widened to 16 bits (q12 = d24:d25)
239
240 vand d27, d27, d22 ; filter &= hev
241 vand d23, d23, d17 ; filter_mask = limit test & blimit test
242
243 vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0), still 16 bits wide
244
245 vmov.u8 d17, #4 ; constant 4: filter1 rounding term
246
247 ; filter = clamp(filter + 3 * ( qs0 - ps0))
248 vqmovn.s16 d27, q12 ; saturating narrow back to signed 8 bits
249
250 vand d27, d27, d23 ; filter &= mask
251
252 vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
253 vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
254 vshr.s8 d28, d28, #3 ; filter2 >>= 3
255 vshr.s8 d27, d27, #3 ; filter1 >>= 3
256
257 vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
258 vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
259
260 ; outer tap adjustments
261 vrshr.s8 d27, d27, #1 ; filter = (filter1 + 1) >> 1 (rounding shift)
262
263 veor d6, d26, d18 ; *oq0 = u^0x80
264
265 vbic d27, d27, d22 ; filter &= ~hev
266
267 vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
268 vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
269
270 veor d5, d19, d18 ; *op0 = u^0x80
271 veor d4, d21, d18 ; *op1 = u^0x80
272 veor d7, d20, d18 ; *oq1 = u^0x80
273 ; Clobbered: d3, d16-d28 (q12 = d24:d25). d0-d2 are read only.
274 bx lr ; return; results are in d4-d7
275 ENDP ; |vp9_loop_filter_neon|
276
277 END
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon.c ('k') | source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698