Chromium Code Reviews

Diff: source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
--- source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ source/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -1,307 +1,156 @@
 ;
 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ;
 ; Use of this source code is governed by a BSD-style license
 ; that can be found in the LICENSE file in the root of the source
 ; tree. An additional intellectual property rights grant can be found
 ; in the file PATENTS. All contributing project authors may
 ; be found in the AUTHORS file in the root of the source tree.
 ;
 
-%define program_name vpx
-
 %include "third_party/x86inc/x86inc.asm"
 
 SECTION .text
 
 %macro convolve_fn 1
 INIT_XMM sse2
-cglobal convolve_%1, 4, 7, 8, src, src_stride, dst, dst_stride, \
+cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                      fx, fxs, fy, fys, w, h
   mov r4d, dword wm
   cmp r4d, 4
   je .w4
   cmp r4d, 8
   je .w8
   cmp r4d, 16
   je .w16
   cmp r4d, 32
   je .w32
 
-  ; 64xh
-  mov r4d, dword hm
-  shr r4d, 1 ; ASSUMPTION: hm is at least EVEN
-  sub r4d, 1
-
-  movu m0, [srcq]
-  movu m4, [srcq+src_strideq]
-  movu m1, [srcq+16]
-  movu m5, [srcq+src_strideq+16]
-  movu m2, [srcq+32]
-  movu m6, [srcq+src_strideq+32]
-  movu m3, [srcq+48]
-  movu m7, [srcq+src_strideq+48]
-
+  mov r4d, dword hm
 .loop64:
-  prefetcht0 [srcq+64 ]
-  prefetcht0 [srcq+src_strideq+64]
-
-  lea srcq, [srcq+src_strideq*2]
-
-%ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+16]
-
-  mova [dstq ], m0
-  movu m0, [srcq]
-
-  mova [dstq+16], m1
-  movu m1, [srcq+16]
-
-  pavgb m2, [dstq+32]
-  mova [dstq+32], m2
-  movu m2, [srcq+32]
-  pavgb m3, [dstq+48]
-  mova [dstq+48], m3
-  movu m3, [srcq+48]
-  pavgb m4, [dstq+dst_strideq]
-
-  mova [dstq+dst_strideq], m4
-  movu m4, [srcq+src_strideq]
-
-  pavgb m5, [dstq+dst_strideq+16]
-  mova [dstq+dst_strideq+16], m5
-  movu m5, [srcq+src_strideq+16]
-  pavgb m6, [dstq+dst_strideq+32]
-  mova [dstq+dst_strideq+32], m6
-  movu m6, [srcq+src_strideq+32]
-  pavgb m7, [dstq+dst_strideq+48]
-  mova [dstq+dst_strideq+48], m7
-  movu m7, [srcq+src_strideq+48]
-
-  lea dstq, [dstq+dst_strideq*2]
-%else
-  mova [dstq ], m0
-  movu m0, [srcq]
-
-  mova [dstq+16], m1
-  movu m1, [srcq+16]
-  mova [dstq+32], m2
-  movu m2, [srcq+32]
-  mova [dstq+48], m3
-  movu m3, [srcq+48]
-
-  mova [dstq+dst_strideq], m4
-  movu m4, [srcq+src_strideq]
-
-  mova [dstq+dst_strideq+16], m5
-  movu m5, [srcq+src_strideq+16]
-  mova [dstq+dst_strideq+32], m6
-  movu m6, [srcq+src_strideq+32]
-  mova [dstq+dst_strideq+48], m7
-  movu m7, [srcq+src_strideq+48]
-
-  lea dstq, [dstq+dst_strideq*2]
-%endif
-  dec r4d
+  movu m0, [srcq]
+  movu m1, [srcq+16]
+  movu m2, [srcq+32]
+  movu m3, [srcq+48]
+  add srcq, src_strideq
+%ifidn %1, avg
+  pavgb m0, [dstq]
+  pavgb m1, [dstq+16]
+  pavgb m2, [dstq+32]
+  pavgb m3, [dstq+48]
+%endif
+  mova [dstq ], m0
+  mova [dstq+16], m1
+  mova [dstq+32], m2
+  mova [dstq+48], m3
+  add dstq, dst_strideq
+  dec r4d
   jnz .loop64
-
-%ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+16]
-  pavgb m2, [dstq+32]
-  pavgb m3, [dstq+48]
-  pavgb m4, [dstq+dst_strideq]
-  pavgb m5, [dstq+dst_strideq+16]
-  pavgb m6, [dstq+dst_strideq+32]
-  pavgb m7, [dstq+dst_strideq+48]
-%endif
-  mova [dstq ], m0
-  mova [dstq+16], m1
-  mova [dstq+32], m2
-  mova [dstq+48], m3
-
-  mova [dstq+dst_strideq ], m4
-  mova [dstq+dst_strideq+16], m5
-  mova [dstq+dst_strideq+32], m6
-  mova [dstq+dst_strideq+48], m7
-
   RET
 
 .w32:
   mov r4d, dword hm
-  sub r4d, 2
-
-  movu m0, [srcq]
-  movu m1, [srcq+16]
-  movu m2, [srcq+src_strideq]
-  movu m3, [srcq+src_strideq+16]
-
 .loop32:
-  prefetcht0 [srcq+64]
-  prefetcht0 [srcq+src_strideq+64]
-
-  lea srcq, [srcq+src_strideq*2]
+  movu m0, [srcq]
+  movu m1, [srcq+16]
+  movu m2, [srcq+src_strideq]
+  movu m3, [srcq+src_strideq+16]
+  lea srcq, [srcq+src_strideq*2]
 %ifidn %1, avg
   pavgb m0, [dstq]
-  pavgb m1, [dstq+16]
+  pavgb m1, [dstq +16]
   pavgb m2, [dstq+dst_strideq]
   pavgb m3, [dstq+dst_strideq+16]
 %endif
-  mova [dstq], m0
-  movu m0, [srcq]
-
-  mova [dstq+16], m1
-  movu m1, [srcq+16]
-
-  mova [dstq+dst_strideq], m2
-  movu m2, [srcq+src_strideq]
-
-  mova [dstq+dst_strideq+16], m3
-  movu m3, [srcq+src_strideq+16]
-
-  lea dstq, [dstq+dst_strideq*2]
-
-  sub r4d, 2
+  mova [dstq ], m0
+  mova [dstq +16], m1
+  mova [dstq+dst_strideq ], m2
+  mova [dstq+dst_strideq+16], m3
+  lea dstq, [dstq+dst_strideq*2]
+  sub r4d, 2
   jnz .loop32
-
-%ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+16]
-  pavgb m2, [dstq+dst_strideq]
-  pavgb m3, [dstq+dst_strideq+16]
-%endif
-  mova [dstq ], m0
-  mova [dstq+16], m1
-
-  mova [dstq+dst_strideq ], m2
-  mova [dstq+dst_strideq+16], m3
-
   RET
 
 .w16:
   mov r4d, dword hm
-  sub r4d, 4
-
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-
+  lea r5q, [src_strideq*3]
+  lea r6q, [dst_strideq*3]
 .loop16:
-  lea srcq, [srcq+src_strideq]
-  prefetcht0 [srcq+src_strideq*4]
-  lea srcq, [srcq+src_strideq]
-  prefetcht0 [srcq+src_strideq*2]
+  movu m0, [srcq]
+  movu m1, [srcq+src_strideq]
+  movu m2, [srcq+src_strideq*2]
+  movu m3, [srcq+r5q]
+  lea srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
   pavgb m0, [dstq]
   pavgb m1, [dstq+dst_strideq]
+  pavgb m2, [dstq+dst_strideq*2]
+  pavgb m3, [dstq+r6q]
 %endif
   mova [dstq ], m0
-  mova [dstq+dst_strideq], m1
-
-  lea dstq, [dstq+dst_strideq*2]
-
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-
-  sub r4d, 2
+  mova [dstq+dst_strideq ], m1
+  mova [dstq+dst_strideq*2], m2
+  mova [dstq+r6q ], m3
+  lea dstq, [dstq+dst_strideq*4]
+  sub r4d, 4
   jnz .loop16
-
-  lea srcq, [srcq+src_strideq*2]
-%ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+dst_strideq]
-%endif
-  mova [dstq ], m0
-  mova [dstq+dst_strideq], m1
-
-  lea dstq, [dstq+dst_strideq*2]
-
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-
-%ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+dst_strideq]
-%endif
-
-  mova [dstq ], m0
-  mova [dstq+dst_strideq], m1
-
   RET
 
 INIT_MMX sse
 .w8:
   mov r4d, dword hm
-  sub r4d, 2
-
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-
+  lea r5q, [src_strideq*3]
+  lea r6q, [dst_strideq*3]
.loop8:
-  lea srcq, [srcq+src_strideq]
-  prefetcht0 [srcq+src_strideq*4]
-  lea srcq, [srcq+src_strideq]
-  prefetcht0 [srcq+src_strideq*2]
-
+  movu m0, [srcq]
+  movu m1, [srcq+src_strideq]
+  movu m2, [srcq+src_strideq*2]
+  movu m3, [srcq+r5q]
+  lea srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
   pavgb m0, [dstq]
   pavgb m1, [dstq+dst_strideq]
+  pavgb m2, [dstq+dst_strideq*2]
+  pavgb m3, [dstq+r6q]
 %endif
   mova [dstq ], m0
-  mova [dstq+dst_strideq], m1
-
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-
-  lea dstq, [dstq+dst_strideq*2]
-
-  sub r4d, 2
+  mova [dstq+dst_strideq ], m1
+  mova [dstq+dst_strideq*2], m2
+  mova [dstq+r6q ], m3
+  lea dstq, [dstq+dst_strideq*4]
+  sub r4d, 4
   jnz .loop8
-
-%ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+dst_strideq]
-%endif
-  mova [dstq ], m0
-  mova [dstq+dst_strideq], m1
-
   RET
 
 .w4:
   mov r4d, dword hm
-
   lea r5q, [src_strideq*3]
   lea r6q, [dst_strideq*3]
-
 .loop4:
   movh m0, [srcq]
   movh m1, [srcq+src_strideq]
   movh m2, [srcq+src_strideq*2]
   movh m3, [srcq+r5q]
-
   lea srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
   movh m4, [dstq]
   movh m5, [dstq+dst_strideq]
   movh m6, [dstq+dst_strideq*2]
   movh m7, [dstq+r6q]
-
   pavgb m0, m4
   pavgb m1, m5
   pavgb m2, m6
   pavgb m3, m7
 %endif
   movh [dstq ], m0
   movh [dstq+dst_strideq ], m1
   movh [dstq+dst_strideq*2], m2
   movh [dstq+r6q ], m3
-
   lea dstq, [dstq+dst_strideq*4]
-
   sub r4d, 4
   jnz .loop4
   RET
 %endmacro
 
 convolve_fn copy
 convolve_fn avg
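
Note on what these kernels compute: both variants of convolve_%1 ignore the filter arguments (fx/fxs/fy/fys) and simply copy, or byte-average, a w x h block; pavgb is the rounding average (a + b + 1) >> 1. The patch replaces the old software-pipelined loops (next-iteration loads interleaved with stores, plus prefetcht0) with plain load/(average)/store loops, which is why the XMM register count in the cglobal line drops from 8 to 4. Below is a minimal scalar sketch of the two behaviors; the function names and the simplified parameter list are illustrative, not the library's exact prototypes.

#include <stddef.h>
#include <stdint.h>

/* Copy an unfiltered w x h block of bytes, one row at a time. */
static void convolve_copy_ref(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              int w, int h) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) dst[x] = src[x];
    src += src_stride;
    dst += dst_stride;
  }
}

/* Average the source block into dst with the same rounding as pavgb:
 * (a + b + 1) >> 1. */
static void convolve_avg_ref(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             int w, int h) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) dst[x] = (dst[x] + src[x] + 1) >> 1;
    src += src_stride;
    dst += dst_stride;
  }
}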