;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

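; HIGH_SAD_FN %1: block width, %2: block height, %3: number of general-purpose
; registers to request from cglobal (7 only when the *3-stride helpers are
; needed, as in the 8-wide functions), %4: 0 for plain SAD, 1 for the _avg
; variant, which pavgw-averages ref with second_pred before the difference.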
%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                                   src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
                                           second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
                                                     ref, ref_stride, \
                                                     second_pred, \
                                                     src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
  shl                 srcq, 1
  shl                 refq, 1
%if %4 == 1
  shl         second_predq, 1
%endif
%endmacro
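
; For reference, a minimal C sketch of what the HIGH_SAD*XN macros below
; compute (a hypothetical helper, not the project's C code; in this model
; the byte pointers have already been doubled into uint16_t* sample
; pointers, which is what the shl-by-1 above does; strides are in samples):
;
;   #include <stdlib.h>  /* abs */
;   static unsigned int highbd_sad_ref(const uint16_t *src, int src_stride,
;                                      const uint16_t *ref, int ref_stride,
;                                      int width, int height) {
;     unsigned int sad = 0;
;     for (int y = 0; y < height; y++) {
;       for (int x = 0; x < width; x++)
;         sad += abs(src[x] - ref[x]);
;       src += src_stride;
;       ref += ref_stride;
;     }
;     return sad;
;   }
;
; The _avg variants first replace each ref sample with the rounded average
; (ref[x] + second_pred[x] + 1) >> 1, matching pavgw, and then take the
; absolute difference against src.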

; unsigned int vp9_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
%macro HIGH_SAD64XN 1-2 0
  HIGH_SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
  pxor                  m6, m6
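  ; m0 accumulates dword partial sums; m6 stays zero for widening words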

.loop:
  ; first half of each row
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
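  ; |src - ref| per word: psubusw saturates negative results to zero, so
  ; OR-ing the two one-sided differences yields the absolute difference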
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
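  ; fold the eight word sums per register down to four, widen them to
  ; dwords against the zero in m6, and accumulate into m0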
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  paddd                 m0, m1
  paddd                 m0, m3
  ; second half of each row
  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq+64]
  psubusw               m5, m1
  psubusw               m1, [srcq+64]
  por                   m1, m5
  mova                  m5, [srcq+80]
  psubusw               m5, m2
  psubusw               m2, [srcq+80]
  por                   m2, m5
  mova                  m5, [srcq+96]
  psubusw               m5, m3
  psubusw               m3, [srcq+96]
  por                   m3, m5
  mova                  m5, [srcq+112]
  psubusw               m5, m4
  psubusw               m4, [srcq+112]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3

  dec              n_rowsd
  jg .loop

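  ; horizontal add of the four dword partial sums down to a scalar in eax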
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2


; unsigned int vp9_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                               uint8_t *ref, int ref_stride);
%macro HIGH_SAD32XN 1-2 0
  HIGH_SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2

; unsigned int vp9_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                              uint8_t *ref, int ref_stride);
%macro HIGH_SAD16XN 1-2 0
  HIGH_SAD_FN 16, %1, 5, %2
  mov              n_rowsd, %1/2
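  ; each iteration handles two 16-wide rows, hence %1/2 iterations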
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_strideq*2+16]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*2]
  por                   m3, m5
  mova                  m5, [srcq+src_strideq*2+16]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_strideq*2+16]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2


; unsigned int vp9_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
%macro HIGH_SAD8XN 1-2 0
  HIGH_SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4
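  ; each iteration handles four 8-wide rows, hence %1/4 iterations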
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq*2]
  movu                  m3, [refq+ref_strideq*4]
  movu                  m4, [refq+ref_stride3q*2]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m2
  psubusw               m2, [srcq+src_strideq*2]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*4]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*4]
  por                   m3, m5
  mova                  m5, [srcq+src_stride3q*2]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_stride3q*2]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*8]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*8]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2