OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
11 %define program_name vpx | |
12 | |
13 %include "third_party/x86inc/x86inc.asm" | 11 %include "third_party/x86inc/x86inc.asm" |
14 | 12 |
15 SECTION .text | 13 SECTION .text |
16 | 14 |
17 %macro convolve_fn 1 | 15 %macro convolve_fn 1 |
18 INIT_XMM sse2 | 16 INIT_XMM sse2 |
19 cglobal convolve_%1, 4, 7, 8, src, src_stride, dst, dst_stride, \ | 17 cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \ |
20 fx, fxs, fy, fys, w, h | 18 fx, fxs, fy, fys, w, h |
21 mov r4d, dword wm | 19 mov r4d, dword wm |
22 cmp r4d, 4 | 20 cmp r4d, 4 |
23 je .w4 | 21 je .w4 |
24 cmp r4d, 8 | 22 cmp r4d, 8 |
25 je .w8 | 23 je .w8 |
26 cmp r4d, 16 | 24 cmp r4d, 16 |
27 je .w16 | 25 je .w16 |
28 cmp r4d, 32 | 26 cmp r4d, 32 |
29 je .w32 | 27 je .w32 |
30 | 28 |
31 ; 64xh | 29 mov r4d, dword hm |
32 mov r4d, dword hm | |
33 shr r4d, 1 ; ASSUMPTION: hm is at least EVEN | |
34 sub r4d, 1 | |
35 | |
36 movu m0, [srcq] | |
37 movu m4, [srcq+src_strideq] | |
38 movu m1, [srcq+16] | |
39 movu m5, [srcq+src_strideq+16] | |
40 movu m2, [srcq+32] | |
41 movu m6, [srcq+src_strideq+32] | |
42 movu m3, [srcq+48] | |
43 movu m7, [srcq+src_strideq+48] | |
44 | |
45 .loop64: | 30 .loop64: |
46 prefetcht0 [srcq+64 ] | 31 movu m0, [srcq] |
47 prefetcht0 [srcq+src_strideq+64] | 32 movu m1, [srcq+16] |
48 | 33 movu m2, [srcq+32] |
49 lea srcq, [srcq+src_strideq*2] | 34 movu m3, [srcq+48] |
50 | 35 add srcq, src_strideq |
51 %ifidn %1, avg | 36 %ifidn %1, avg |
52 pavgb m0, [dstq] | 37 pavgb m0, [dstq] |
53 pavgb m1, [dstq+16] | 38 pavgb m1, [dstq+16] |
54 | 39 pavgb m2, [dstq+32] |
55 mova [dstq ], m0 | 40 pavgb m3, [dstq+48] |
56 movu m0, [srcq] | |
57 | |
58 mova [dstq+16], m1 | |
59 movu m1, [srcq+16] | |
60 | |
61 pavgb m2, [dstq+32] | |
62 mova [dstq+32], m2 | |
63 movu m2, [srcq+32] | |
64 pavgb m3, [dstq+48] | |
65 mova [dstq+48], m3 | |
66 movu m3, [srcq+48] | |
67 pavgb m4, [dstq+dst_strideq] | |
68 | |
69 mova [dstq+dst_strideq], m4 | |
70 movu m4, [srcq+src_strideq] | |
71 | |
72 pavgb m5, [dstq+dst_strideq+16] | |
73 mova [dstq+dst_strideq+16], m5 | |
74 movu m5, [srcq+src_strideq+16] | |
75 pavgb m6, [dstq+dst_strideq+32] | |
76 mova [dstq+dst_strideq+32], m6 | |
77 movu m6, [srcq+src_strideq+32] | |
78 pavgb m7, [dstq+dst_strideq+48] | |
79 mova [dstq+dst_strideq+48], m7 | |
80 movu m7, [srcq+src_strideq+48] | |
81 | |
82 lea dstq, [dstq+dst_strideq*2] | |
83 %else | |
84 mova [dstq ], m0 | |
85 movu m0, [srcq] | |
86 | |
87 mova [dstq+16], m1 | |
88 movu m1, [srcq+16] | |
89 mova [dstq+32], m2 | |
90 movu m2, [srcq+32] | |
91 mova [dstq+48], m3 | |
92 movu m3, [srcq+48] | |
93 | |
94 mova [dstq+dst_strideq], m4 | |
95 movu m4, [srcq+src_strideq] | |
96 | |
97 mova [dstq+dst_strideq+16], m5 | |
98 movu m5, [srcq+src_strideq+16] | |
99 mova [dstq+dst_strideq+32], m6 | |
100 movu m6, [srcq+src_strideq+32] | |
101 mova [dstq+dst_strideq+48], m7 | |
102 movu m7, [srcq+src_strideq+48] | |
103 | |
104 lea dstq, [dstq+dst_strideq*2] | |
105 %endif | 41 %endif |
106 dec r4d | 42 mova [dstq ], m0 |
| 43 mova [dstq+16], m1 |
| 44 mova [dstq+32], m2 |
| 45 mova [dstq+48], m3 |
| 46 add dstq, dst_strideq |
| 47 dec r4d |
107 jnz .loop64 | 48 jnz .loop64 |
108 | |
109 %ifidn %1, avg | |
110 pavgb m0, [dstq] | |
111 pavgb m1, [dstq+16] | |
112 pavgb m2, [dstq+32] | |
113 pavgb m3, [dstq+48] | |
114 pavgb m4, [dstq+dst_strideq] | |
115 pavgb m5, [dstq+dst_strideq+16] | |
116 pavgb m6, [dstq+dst_strideq+32] | |
117 pavgb m7, [dstq+dst_strideq+48] | |
118 %endif | |
119 mova [dstq ], m0 | |
120 mova [dstq+16], m1 | |
121 mova [dstq+32], m2 | |
122 mova [dstq+48], m3 | |
123 | |
124 mova [dstq+dst_strideq ], m4 | |
125 mova [dstq+dst_strideq+16], m5 | |
126 mova [dstq+dst_strideq+32], m6 | |
127 mova [dstq+dst_strideq+48], m7 | |
128 | |
129 RET | 49 RET |
130 | 50 |
131 .w32: | 51 .w32: |
132 mov r4d, dword hm | 52 mov r4d, dword hm |
133 sub r4d, 2 | |
134 | |
135 movu m0, [srcq] | |
136 movu m1, [srcq+16] | |
137 movu m2, [srcq+src_strideq] | |
138 movu m3, [srcq+src_strideq+16] | |
139 | |
140 .loop32: | 53 .loop32: |
141 prefetcht0 [srcq+64] | 54 movu m0, [srcq] |
142 prefetcht0 [srcq+src_strideq+64] | 55 movu m1, [srcq+16] |
143 | 56 movu m2, [srcq+src_strideq] |
144 lea srcq, [srcq+src_strideq*2] | 57 movu m3, [srcq+src_strideq+16] |
| 58 lea srcq, [srcq+src_strideq*2] |
145 %ifidn %1, avg | 59 %ifidn %1, avg |
146 pavgb m0, [dstq] | 60 pavgb m0, [dstq] |
147 pavgb m1, [dstq+16] | 61 pavgb m1, [dstq +16] |
148 pavgb m2, [dstq+dst_strideq] | 62 pavgb m2, [dstq+dst_strideq] |
149 pavgb m3, [dstq+dst_strideq+16] | 63 pavgb m3, [dstq+dst_strideq+16] |
150 %endif | 64 %endif |
151 mova [dstq], m0 | 65 mova [dstq ], m0 |
152 movu m0, [srcq] | 66 mova [dstq +16], m1 |
153 | 67 mova [dstq+dst_strideq ], m2 |
154 mova [dstq+16], m1 | 68 mova [dstq+dst_strideq+16], m3 |
155 movu m1, [srcq+16] | 69 lea dstq, [dstq+dst_strideq*2] |
156 | 70 sub r4d, 2 |
157 mova [dstq+dst_strideq], m2 | |
158 movu m2, [srcq+src_strideq] | |
159 | |
160 mova [dstq+dst_strideq+16], m3 | |
161 movu m3, [srcq+src_strideq+16] | |
162 | |
163 lea dstq, [dstq+dst_strideq*2] | |
164 | |
165 sub r4d, 2 | |
166 jnz .loop32 | 71 jnz .loop32 |
167 | |
168 %ifidn %1, avg | |
169 pavgb m0, [dstq] | |
170 pavgb m1, [dstq+16] | |
171 pavgb m2, [dstq+dst_strideq] | |
172 pavgb m3, [dstq+dst_strideq+16] | |
173 %endif | |
174 mova [dstq ], m0 | |
175 mova [dstq+16], m1 | |
176 | |
177 mova [dstq+dst_strideq ], m2 | |
178 mova [dstq+dst_strideq+16], m3 | |
179 | |
180 RET | 72 RET |
181 | 73 |
182 .w16: | 74 .w16: |
183 mov r4d, dword hm | 75 mov r4d, dword hm |
184 sub r4d, 4 | 76 lea r5q, [src_strideq*3] |
185 | 77 lea r6q, [dst_strideq*3] |
186 movu m0, [srcq] | |
187 movu m1, [srcq+src_strideq] | |
188 | |
189 .loop16: | 78 .loop16: |
190 lea srcq, [srcq+src_strideq] | 79 movu m0, [srcq] |
191 prefetcht0 [srcq+src_strideq*4] | 80 movu m1, [srcq+src_strideq] |
192 lea srcq, [srcq+src_strideq] | 81 movu m2, [srcq+src_strideq*2] |
193 prefetcht0 [srcq+src_strideq*2] | 82 movu m3, [srcq+r5q] |
| 83 lea srcq, [srcq+src_strideq*4] |
194 %ifidn %1, avg | 84 %ifidn %1, avg |
195 pavgb m0, [dstq] | 85 pavgb m0, [dstq] |
196 pavgb m1, [dstq+dst_strideq] | 86 pavgb m1, [dstq+dst_strideq] |
| 87 pavgb m2, [dstq+dst_strideq*2] |
| 88 pavgb m3, [dstq+r6q] |
197 %endif | 89 %endif |
198 mova [dstq ], m0 | 90 mova [dstq ], m0 |
199 mova [dstq+dst_strideq], m1 | 91 mova [dstq+dst_strideq ], m1 |
200 | 92 mova [dstq+dst_strideq*2], m2 |
201 lea dstq, [dstq+dst_strideq*2] | 93 mova [dstq+r6q ], m3 |
202 | 94 lea dstq, [dstq+dst_strideq*4] |
203 movu m0, [srcq] | 95 sub r4d, 4 |
204 movu m1, [srcq+src_strideq] | |
205 | |
206 sub r4d, 2 | |
207 jnz .loop16 | 96 jnz .loop16 |
208 | |
209 lea srcq, [srcq+src_strideq*2] | |
210 %ifidn %1, avg | |
211 pavgb m0, [dstq] | |
212 pavgb m1, [dstq+dst_strideq] | |
213 %endif | |
214 mova [dstq ], m0 | |
215 mova [dstq+dst_strideq], m1 | |
216 | |
217 lea dstq, [dstq+dst_strideq*2] | |
218 | |
219 movu m0, [srcq] | |
220 movu m1, [srcq+src_strideq] | |
221 | |
222 %ifidn %1, avg | |
223 pavgb m0, [dstq] | |
224 pavgb m1, [dstq+dst_strideq] | |
225 %endif | |
226 | |
227 mova [dstq ], m0 | |
228 mova [dstq+dst_strideq], m1 | |
229 | |
230 RET | 97 RET |
231 | 98 |
232 INIT_MMX sse | 99 INIT_MMX sse |
233 .w8: | 100 .w8: |
234 mov r4d, dword hm | 101 mov r4d, dword hm |
235 sub r4d, 2 | 102 lea r5q, [src_strideq*3] |
236 | 103 lea r6q, [dst_strideq*3] |
237 movu m0, [srcq] | |
238 movu m1, [srcq+src_strideq] | |
239 | |
240 .loop8: | 104 .loop8: |
241 lea srcq, [srcq+src_strideq] | 105 movu m0, [srcq] |
242 prefetcht0 [srcq+src_strideq*4] | 106 movu m1, [srcq+src_strideq] |
243 lea srcq, [srcq+src_strideq] | 107 movu m2, [srcq+src_strideq*2] |
244 prefetcht0 [srcq+src_strideq*2] | 108 movu m3, [srcq+r5q] |
245 | 109 lea srcq, [srcq+src_strideq*4] |
246 %ifidn %1, avg | 110 %ifidn %1, avg |
247 pavgb m0, [dstq] | 111 pavgb m0, [dstq] |
248 pavgb m1, [dstq+dst_strideq] | 112 pavgb m1, [dstq+dst_strideq] |
| 113 pavgb m2, [dstq+dst_strideq*2] |
| 114 pavgb m3, [dstq+r6q] |
249 %endif | 115 %endif |
250 mova [dstq ], m0 | 116 mova [dstq ], m0 |
251 mova [dstq+dst_strideq], m1 | 117 mova [dstq+dst_strideq ], m1 |
252 | 118 mova [dstq+dst_strideq*2], m2 |
253 movu m0, [srcq] | 119 mova [dstq+r6q ], m3 |
254 movu m1, [srcq+src_strideq] | 120 lea dstq, [dstq+dst_strideq*4] |
255 | 121 sub r4d, 4 |
256 lea dstq, [dstq+dst_strideq*2] | |
257 | |
258 sub r4d, 2 | |
259 jnz .loop8 | 122 jnz .loop8 |
260 | |
261 %ifidn %1, avg | |
262 pavgb m0, [dstq] | |
263 pavgb m1, [dstq+dst_strideq] | |
264 %endif | |
265 mova [dstq ], m0 | |
266 mova [dstq+dst_strideq], m1 | |
267 | |
268 RET | 123 RET |
269 | 124 |
270 .w4: | 125 .w4: |
271 mov r4d, dword hm | 126 mov r4d, dword hm |
272 | 127 lea r5q, [src_strideq*3] |
273 lea r5q, [src_strideq*3] | 128 lea r6q, [dst_strideq*3] |
274 lea r6q, [dst_strideq*3] | |
275 | |
276 .loop4: | 129 .loop4: |
277 movh m0, [srcq] | 130 movh m0, [srcq] |
278 movh m1, [srcq+src_strideq] | 131 movh m1, [srcq+src_strideq] |
279 movh m2, [srcq+src_strideq*2] | 132 movh m2, [srcq+src_strideq*2] |
280 movh m3, [srcq+r5q] | 133 movh m3, [srcq+r5q] |
281 | 134 lea srcq, [srcq+src_strideq*4] |
282 lea srcq, [srcq+src_strideq*4] | |
283 %ifidn %1, avg | 135 %ifidn %1, avg |
284 movh m4, [dstq] | 136 movh m4, [dstq] |
285 movh m5, [dstq+dst_strideq] | 137 movh m5, [dstq+dst_strideq] |
286 movh m6, [dstq+dst_strideq*2] | 138 movh m6, [dstq+dst_strideq*2] |
287 movh m7, [dstq+r6q] | 139 movh m7, [dstq+r6q] |
288 | 140 pavgb m0, m4 |
289 pavgb m0, m4 | 141 pavgb m1, m5 |
290 pavgb m1, m5 | 142 pavgb m2, m6 |
291 pavgb m2, m6 | 143 pavgb m3, m7 |
292 pavgb m3, m7 | |
293 %endif | 144 %endif |
294 movh [dstq ], m0 | 145 movh [dstq ], m0 |
295 movh [dstq+dst_strideq ], m1 | 146 movh [dstq+dst_strideq ], m1 |
296 movh [dstq+dst_strideq*2], m2 | 147 movh [dstq+dst_strideq*2], m2 |
297 movh [dstq+r6q ], m3 | 148 movh [dstq+r6q ], m3 |
298 | 149 lea dstq, [dstq+dst_strideq*4] |
299 lea dstq, [dstq+dst_strideq*4] | 150 sub r4d, 4 |
300 | |
301 sub r4d, 4 | |
302 jnz .loop4 | 151 jnz .loop4 |
303 RET | 152 RET |
304 %endmacro | 153 %endmacro |
305 | 154 |
306 convolve_fn copy | 155 convolve_fn copy |
307 convolve_fn avg | 156 convolve_fn avg |
OLD | NEW |