Chromium Code Reviews

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm

Issue 756673003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years ago
(Empty: no previous version of this file)
1 ;
2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 %include "third_party/x86inc/x86inc.asm"
12
13 SECTION_RODATA
14 pw_8: times 8 dw 8
15 bilin_filter_m_sse2: times 8 dw 16
16 times 8 dw 0
17 times 8 dw 15
18 times 8 dw 1
19 times 8 dw 14
20 times 8 dw 2
21 times 8 dw 13
22 times 8 dw 3
23 times 8 dw 12
24 times 8 dw 4
25 times 8 dw 11
26 times 8 dw 5
27 times 8 dw 10
28 times 8 dw 6
29 times 8 dw 9
30 times 8 dw 7
31 times 16 dw 8
32 times 8 dw 7
33 times 8 dw 9
34 times 8 dw 6
35 times 8 dw 10
36 times 8 dw 5
37 times 8 dw 11
38 times 8 dw 4
39 times 8 dw 12
40 times 8 dw 3
41 times 8 dw 13
42 times 8 dw 2
43 times 8 dw 14
44 times 8 dw 1
45 times 8 dw 15
46
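The table above stores, for each of the 16 sub-sample offsets k (offset 8 is the half-sample case handled separately below), the weight pair (16-k, k) as two 8-word vectors; together with pw_8 and the final psrlw by 4 this gives rounded bilinear weighting. A scalar sketch of one filtered sample under that convention (the helper name is made up):

    /* Scalar model of the bilinear step driven by bilin_filter_m_sse2 (illustrative). */
    static unsigned bilin(unsigned a, unsigned b, int k) {
      /* k is the sub-sample offset, 0 <= k < 16 */
      return ((16 - k) * a + k * b + 8) >> 4;  /* +8 = pw_8 rounding, >>4 = psrlw 4 */
    }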
47 SECTION .text
48
49 ; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
50 ; int x_offset, int y_offset,
51 ; const uint8_t *dst, ptrdiff_t dst_stride,
52 ; int height, unsigned int *sse);
53 ;
54 ; This function returns the sum of differences (SE) and stores the SSE in the given pointer.
55
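For context, the kernel itself only produces the SE (returned in rax) and the SSE (written through the pointer); the variance is formed on the C side. A hedged sketch of that caller-side step, assuming an invented wrapper name and the usual definition variance = SSE - SE^2/N:

    #include <stdint.h>
    /* Illustrative caller-side step (not part of this file): variance from SE and SSE. */
    static unsigned int variance_from_se_sse(int se, unsigned int sse, int w, int h) {
      return sse - (unsigned int)(((int64_t)se * se) / (w * h));
    }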
56 %macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
57 psubw %3, %4
58 psubw %1, %2
59 mova %4, %3 ; make copies to manipulate to calc sum
60 mova %2, %1 ; use originals for calc sse
61 pmaddwd %3, %3
62 paddw %4, %2
63 pmaddwd %1, %1
64 movhlps %2, %4
65 paddd %6, %3
66 paddw %4, %2
67 pxor %2, %2
68 pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
69 punpcklwd %4, %2 ; sign-extend word to dword
70 paddd %6, %1
71 paddd %5, %4
72
73 %endmacro
74
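In scalar terms, each SUM_SSE invocation folds two 8-word source/destination row fragments into the running sum and SSE accumulators. A C model of what it adds per call (names are illustrative; the high-bitdepth data here is 16-bit):

    #include <stdint.h>
    /* Scalar model of one SUM_SSE invocation over two 8-word rows (illustrative). */
    static void sum_sse_8x2(const uint16_t *src1, const uint16_t *dst1,
                            const uint16_t *src2, const uint16_t *dst2,
                            int *sum, unsigned int *sse) {
      for (int i = 0; i < 8; i++) {
        const int d1 = src1[i] - dst1[i];
        const int d2 = src2[i] - dst2[i];
        *sum += d1 + d2;
        *sse += (unsigned int)(d1 * d1 + d2 * d2);
      }
    }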
75 %macro STORE_AND_RET 0
76 %if mmsize == 16
77 ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
78 ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
79 ; We have to sign-extend it before adding the words within the register
80 ; and outputting to a dword.
81 movhlps m3, m7
82 movhlps m4, m6
83 paddd m7, m3
84 paddd m6, m4
85 pshufd m3, m7, 0x1
86 pshufd m4, m6, 0x1
87 paddd m7, m3
88 paddd m6, m4
89 mov r1, ssem ; r1 = unsigned int *sse
90 movd [r1], m7 ; store sse
91 movd rax, m6 ; store sum as return value
92 %endif
93 RET
94 %endmacro
95
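The movhlps/pshufd pairs above are a horizontal add that folds the four dword lanes of the sum and SSE registers down to a single value each. A scalar equivalent, just for reference:

    /* Scalar model of the final horizontal add in STORE_AND_RET (illustrative). */
    static int hadd_4x32(const int v[4]) {
      return (v[0] + v[2]) + (v[1] + v[3]);  /* movhlps/paddd, then pshufd/paddd */
    }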
96 %macro INC_SRC_BY_SRC_STRIDE 0
97 %if ARCH_X86=1 && CONFIG_PIC=1
98 lea srcq, [srcq + src_stridemp*2]
99 %else
100 lea srcq, [srcq + src_strideq*2]
101 %endif
102 %endmacro
103
104 %macro INC_SRC_BY_SRC_2STRIDE 0
105 %if ARCH_X86=1 && CONFIG_PIC=1
106 lea srcq, [srcq + src_stridemp*4]
107 %else
108 lea srcq, [srcq + src_strideq*4]
109 %endif
110 %endmacro
111
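Both macros step srcq by one or two rows; the *2 and *4 scaling appears because the strides are given in samples while the high-bitdepth buffers hold 16-bit samples, so the byte offset is twice the stride. An illustrative pointer-arithmetic equivalent (names invented):

    #include <stddef.h>
    #include <stdint.h>
    /* Illustrative equivalent of INC_SRC_BY_SRC_STRIDE / _2STRIDE for 16-bit data. */
    static const uint16_t *inc_src(const uint16_t *src, ptrdiff_t src_stride, int rows) {
      return src + (ptrdiff_t)rows * src_stride;  /* rows = 1 or 2 */
    }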
112 %macro SUBPEL_VARIANCE 1-2 0 ; W
113 %define bilin_filter_m bilin_filter_m_sse2
114 %define filter_idx_shift 5
115
116
117 %ifdef PIC ; 64bit PIC
118 %if %2 == 1 ; avg
119 cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
120 x_offset, y_offset, \
121 dst, dst_stride, \
122 sec, sec_stride, height, sse
123 %define sec_str sec_strideq
124 %else
125 cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
126 y_offset, dst, dst_stride, height, sse
127 %endif
128 %define h heightd
129 %define bilin_filter sseq
130 %else
131 %if ARCH_X86=1 && CONFIG_PIC=1
132 %if %2 == 1 ; avg
133 cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
134 x_offset, y_offset, \
135 dst, dst_stride, \
136 sec, sec_stride, \
137 height, sse, g_bilin_filter, g_pw_8
138 %define h dword heightm
139 %define sec_str sec_stridemp
140
141 ; Store the bilin_filter and pw_8 locations on the stack
142 GET_GOT eax
143 add esp, 4 ; restore esp
144
145 lea ecx, [GLOBAL(bilin_filter_m)]
146 mov g_bilin_filterm, ecx
147
148 lea ecx, [GLOBAL(pw_8)]
149 mov g_pw_8m, ecx
150
151 LOAD_IF_USED 0, 1 ; load eax, ecx back
152 %else
153 cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
154 x_offset, y_offset, dst, dst_stride, height, \
155 sse, g_bilin_filter, g_pw_8
156 %define h heightd
157
158 ; Store the bilin_filter and pw_8 locations on the stack
159 GET_GOT eax
160 add esp, 4 ; restore esp
161
162 lea ecx, [GLOBAL(bilin_filter_m)]
163 mov g_bilin_filterm, ecx
164
165 lea ecx, [GLOBAL(pw_8)]
166 mov g_pw_8m, ecx
167
168 LOAD_IF_USED 0, 1 ; load eax, ecx back
169 %endif
170 %else
171 %if %2 == 1 ; avg
172 cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
173 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
174 x_offset, y_offset, \
175 dst, dst_stride, \
176 sec, sec_stride, \
177 height, sse
178 %if ARCH_X86_64
179 %define h heightd
180 %define sec_str sec_strideq
181 %else
182 %define h dword heightm
183 %define sec_str sec_stridemp
184 %endif
185 %else
186 cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
187 x_offset, y_offset, dst, dst_stride, height, sse
188 %define h heightd
189 %endif
190
191 %define bilin_filter bilin_filter_m
192 %endif
193 %endif
194
195 ASSERT %1 <= 16 ; m6 overflows if w > 16
196 pxor m6, m6 ; sum
197 pxor m7, m7 ; sse
198
199 %if %1 < 16
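; NOTE: blocks narrower than 16 process two rows per loop iteration below, so the row count is halved here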
200 sar h, 1
201 %endif
202
203 ; FIXME(rbultje) replace by jumptable?
204 test x_offsetd, x_offsetd
205 jnz .x_nonzero
206 ; x_offset == 0
207 test y_offsetd, y_offsetd
208 jnz .x_zero_y_nonzero
209
210 ; x_offset == 0 && y_offset == 0
211 .x_zero_y_zero_loop:
212 %if %1 == 16
213 movu m0, [srcq]
214 movu m2, [srcq + 16]
215 mova m1, [dstq]
216 mova m3, [dstq + 16]
217 %if %2 == 1 ; avg
218 pavgw m0, [secq]
219 pavgw m2, [secq+16]
220 %endif
221 SUM_SSE m0, m1, m2, m3, m6, m7
222
223 lea srcq, [srcq + src_strideq*2]
224 lea dstq, [dstq + dst_strideq*2]
225 %if %2 == 1 ; avg
226 lea secq, [secq + sec_str*2]
227 %endif
228 %else ; %1 < 16
229 movu m0, [srcq]
230 movu m2, [srcq + src_strideq*2]
231 mova m1, [dstq]
232 mova m3, [dstq + dst_strideq*2]
233 %if %2 == 1 ; avg
234 pavgw m0, [secq]
235 pavgw m2, [secq + sec_str*2]
236 %endif
237 SUM_SSE m0, m1, m2, m3, m6, m7
238
239 lea srcq, [srcq + src_strideq*4]
240 lea dstq, [dstq + dst_strideq*4]
241 %if %2 == 1 ; avg
242 lea secq, [secq + sec_str*4]
243 %endif
244 %endif
245 dec h
246 jg .x_zero_y_zero_loop
247 STORE_AND_RET
248
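The x_offset == 0 && y_offset == 0 case above is plain whole-sample variance accumulation. As a readable cross-check, a scalar reference of that case might look like the following (the function name and signature are illustrative, not the project's API):

    #include <stddef.h>
    #include <stdint.h>
    /* Scalar reference for the x_offset == 0 && y_offset == 0 path (illustrative). */
    static int full_pel_var(const uint16_t *src, ptrdiff_t src_stride,
                            const uint16_t *dst, ptrdiff_t dst_stride,
                            int w, int h, unsigned int *sse) {
      int sum = 0;
      *sse = 0;
      for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
          const int d = src[x] - dst[x];
          sum += d;
          *sse += (unsigned int)(d * d);
        }
        src += src_stride;
        dst += dst_stride;
      }
      return sum;
    }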
249 .x_zero_y_nonzero:
250 cmp y_offsetd, 8
251 jne .x_zero_y_nonhalf
252
253 ; x_offset == 0 && y_offset == 0.5
254 .x_zero_y_half_loop:
255 %if %1 == 16
256 movu m0, [srcq]
257 movu m1, [srcq+16]
258 movu m4, [srcq+src_strideq*2]
259 movu m5, [srcq+src_strideq*2+16]
260 mova m2, [dstq]
261 mova m3, [dstq+16]
262 pavgw m0, m4
263 pavgw m1, m5
264 %if %2 == 1 ; avg
265 pavgw m0, [secq]
266 pavgw m1, [secq+16]
267 %endif
268 SUM_SSE m0, m2, m1, m3, m6, m7
269
270 lea srcq, [srcq + src_strideq*2]
271 lea dstq, [dstq + dst_strideq*2]
272 %if %2 == 1 ; avg
273 lea secq, [secq + sec_str*2]
274 %endif
275 %else ; %1 < 16
276 movu m0, [srcq]
277 movu m1, [srcq+src_strideq*2]
278 movu m5, [srcq+src_strideq*4]
279 mova m2, [dstq]
280 mova m3, [dstq+dst_strideq*2]
281 pavgw m0, m1
282 pavgw m1, m5
283 %if %2 == 1 ; avg
284 pavgw m0, [secq]
285 pavgw m1, [secq+sec_str*2]
286 %endif
287 SUM_SSE m0, m2, m1, m3, m6, m7
288
289 lea srcq, [srcq + src_strideq*4]
290 lea dstq, [dstq + dst_strideq*4]
291 %if %2 == 1 ; avg
292 lea secq, [secq + sec_str*4]
293 %endif
294 %endif
295 dec h
296 jg .x_zero_y_half_loop
297 STORE_AND_RET
298
299 .x_zero_y_nonhalf:
300 ; x_offset == 0 && y_offset == bilin interpolation
301 %ifdef PIC
302 lea bilin_filter, [bilin_filter_m]
303 %endif
304 shl y_offsetd, filter_idx_shift
305 %if ARCH_X86_64 && mmsize == 16
306 mova m8, [bilin_filter+y_offsetq]
307 mova m9, [bilin_filter+y_offsetq+16]
308 mova m10, [pw_8]
309 %define filter_y_a m8
310 %define filter_y_b m9
311 %define filter_rnd m10
312 %else ; x86-32 or mmx
313 %if ARCH_X86=1 && CONFIG_PIC=1
314 ; x_offset == 0, reuse x_offset reg
315 %define tempq x_offsetq
316 add y_offsetq, g_bilin_filterm
317 %define filter_y_a [y_offsetq]
318 %define filter_y_b [y_offsetq+16]
319 mov tempq, g_pw_8m
320 %define filter_rnd [tempq]
321 %else
322 add y_offsetq, bilin_filter
323 %define filter_y_a [y_offsetq]
324 %define filter_y_b [y_offsetq+16]
325 %define filter_rnd [pw_8]
326 %endif
327 %endif
328
329 .x_zero_y_other_loop:
330 %if %1 == 16
331 movu m0, [srcq]
332 movu m1, [srcq + 16]
333 movu m4, [srcq+src_strideq*2]
334 movu m5, [srcq+src_strideq*2+16]
335 mova m2, [dstq]
336 mova m3, [dstq+16]
337 ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
338 ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). The total number of
339 ; instructions is the same (5), but it uses 1 mul instead of 2, so it might be
340 ; slightly faster because of pmullw latency. It would also cut our rodata
341 ; tables in half for this function, and save 1-2 registers on x86-64.
342 pmullw m1, filter_y_a
343 pmullw m5, filter_y_b
344 paddw m1, filter_rnd
345 pmullw m0, filter_y_a
346 pmullw m4, filter_y_b
347 paddw m0, filter_rnd
348 paddw m1, m5
349 paddw m0, m4
350 psrlw m1, 4
351 psrlw m0, 4
352 %if %2 == 1 ; avg
353 pavgw m0, [secq]
354 pavgw m1, [secq+16]
355 %endif
356 SUM_SSE m0, m2, m1, m3, m6, m7
357
358 lea srcq, [srcq + src_strideq*2]
359 lea dstq, [dstq + dst_strideq*2]
360 %if %2 == 1 ; avg
361 lea secq, [secq + sec_str*2]
362 %endif
363 %else ; %1 < 16
364 movu m0, [srcq]
365 movu m1, [srcq+src_strideq*2]
366 movu m5, [srcq+src_strideq*4]
367 mova m4, m1
368 mova m2, [dstq]
369 mova m3, [dstq+dst_strideq*2]
370 pmullw m1, filter_y_a
371 pmullw m5, filter_y_b
372 paddw m1, filter_rnd
373 pmullw m0, filter_y_a
374 pmullw m4, filter_y_b
375 paddw m0, filter_rnd
376 paddw m1, m5
377 paddw m0, m4
378 psrlw m1, 4
379 psrlw m0, 4
380 %if %2 == 1 ; avg
381 pavgw m0, [secq]
382 pavgw m1, [secq+sec_str*2]
383 %endif
384 SUM_SSE m0, m2, m1, m3, m6, m7
385
386 lea srcq, [srcq + src_strideq*4]
387 lea dstq, [dstq + dst_strideq*4]
388 %if %2 == 1 ; avg
389 lea secq, [secq + sec_str*4]
390 %endif
391 %endif
392 dec h
393 jg .x_zero_y_other_loop
394 %undef filter_y_a
395 %undef filter_y_b
396 %undef filter_rnd
397 STORE_AND_RET
398
399 .x_nonzero:
400 cmp x_offsetd, 8
401 jne .x_nonhalf
402 ; x_offset == 0.5
403 test y_offsetd, y_offsetd
404 jnz .x_half_y_nonzero
405
406 ; x_offset == 0.5 && y_offset == 0
407 .x_half_y_zero_loop:
408 %if %1 == 16
409 movu m0, [srcq]
410 movu m1, [srcq + 16]
411 movu m4, [srcq + 2]
412 movu m5, [srcq + 18]
413 mova m2, [dstq]
414 mova m3, [dstq + 16]
415 pavgw m0, m4
416 pavgw m1, m5
417 %if %2 == 1 ; avg
418 pavgw m0, [secq]
419 pavgw m1, [secq+16]
420 %endif
421 SUM_SSE m0, m2, m1, m3, m6, m7
422
423 lea srcq, [srcq + src_strideq*2]
424 lea dstq, [dstq + dst_strideq*2]
425 %if %2 == 1 ; avg
426 lea secq, [secq + sec_str*2]
427 %endif
428 %else ; %1 < 16
429 movu m0, [srcq]
430 movu m1, [srcq + src_strideq*2]
431 movu m4, [srcq + 2]
432 movu m5, [srcq + src_strideq*2 + 2]
433 mova m2, [dstq]
434 mova m3, [dstq + dst_strideq*2]
435 pavgw m0, m4
436 pavgw m1, m5
437 %if %2 == 1 ; avg
438 pavgw m0, [secq]
439 pavgw m1, [secq+sec_str*2]
440 %endif
441 SUM_SSE m0, m2, m1, m3, m6, m7
442
443 lea srcq, [srcq + src_strideq*4]
444 lea dstq, [dstq + dst_strideq*4]
445 %if %2 == 1 ; avg
446 lea secq, [secq + sec_str*4]
447 %endif
448 %endif
449 dec h
450 jg .x_half_y_zero_loop
451 STORE_AND_RET
452
453 .x_half_y_nonzero:
454 cmp y_offsetd, 8
455 jne .x_half_y_nonhalf
456
457 ; x_offset == 0.5 && y_offset == 0.5
458 %if %1 == 16
459 movu m0, [srcq]
460 movu m1, [srcq+16]
461 movu m2, [srcq+2]
462 movu m3, [srcq+18]
463 lea srcq, [srcq + src_strideq*2]
464 pavgw m0, m2
465 pavgw m1, m3
466 .x_half_y_half_loop:
467 movu m2, [srcq]
468 movu m3, [srcq + 16]
469 movu m4, [srcq + 2]
470 movu m5, [srcq + 18]
471 pavgw m2, m4
472 pavgw m3, m5
473 pavgw m0, m2
474 pavgw m1, m3
475 mova m4, [dstq]
476 mova m5, [dstq + 16]
477 %if %2 == 1 ; avg
478 pavgw m0, [secq]
479 pavgw m1, [secq+16]
480 %endif
481 SUM_SSE m0, m4, m1, m5, m6, m7
482 mova m0, m2
483 mova m1, m3
484
485 lea srcq, [srcq + src_strideq*2]
486 lea dstq, [dstq + dst_strideq*2]
487 %if %2 == 1 ; avg
488 lea secq, [secq + sec_str*2]
489 %endif
490 %else ; %1 < 16
491 movu m0, [srcq]
492 movu m2, [srcq+2]
493 lea srcq, [srcq + src_strideq*2]
494 pavgw m0, m2
495 .x_half_y_half_loop:
496 movu m2, [srcq]
497 movu m3, [srcq + src_strideq*2]
498 movu m4, [srcq + 2]
499 movu m5, [srcq + src_strideq*2 + 2]
500 pavgw m2, m4
501 pavgw m3, m5
502 pavgw m0, m2
503 pavgw m2, m3
504 mova m4, [dstq]
505 mova m5, [dstq + dst_strideq*2]
506 %if %2 == 1 ; avg
507 pavgw m0, [secq]
508 pavgw m2, [secq+sec_str*2]
509 %endif
510 SUM_SSE m0, m4, m2, m5, m6, m7
511 mova m0, m3
512
513 lea srcq, [srcq + src_strideq*4]
514 lea dstq, [dstq + dst_strideq*4]
515 %if %2 == 1 ; avg
516 lea secq, [secq + sec_str*4]
517 %endif
518 %endif
519 dec h
520 jg .x_half_y_half_loop
521 STORE_AND_RET
522
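The half/half path above is built entirely from pavgw, which computes the rounding average (a+b+1)>>1; each output is the vertical average of a horizontally averaged pair from two adjacent rows. A scalar sketch of that per-sample computation (helper name invented):

    /* Scalar model of the x == 0.5 && y == 0.5 filtering: two rounding averages. */
    static unsigned half_half(unsigned tl, unsigned tr, unsigned bl, unsigned br) {
      const unsigned top = (tl + tr + 1) >> 1;   /* pavgw within row y   */
      const unsigned bot = (bl + br + 1) >> 1;   /* pavgw within row y+1 */
      return (top + bot + 1) >> 1;               /* pavgw of the two rows */
    }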
523 .x_half_y_nonhalf:
524 ; x_offset == 0.5 && y_offset == bilin interpolation
525 %ifdef PIC
526 lea bilin_filter, [bilin_filter_m]
527 %endif
528 shl y_offsetd, filter_idx_shift
529 %if ARCH_X86_64 && mmsize == 16
530 mova m8, [bilin_filter+y_offsetq]
531 mova m9, [bilin_filter+y_offsetq+16]
532 mova m10, [pw_8]
533 %define filter_y_a m8
534 %define filter_y_b m9
535 %define filter_rnd m10
536 %else ; x86_32
537 %if ARCH_X86=1 && CONFIG_PIC=1
538 ; x_offset == 0.5. We can reuse x_offset reg
539 %define tempq x_offsetq
540 add y_offsetq, g_bilin_filterm
541 %define filter_y_a [y_offsetq]
542 %define filter_y_b [y_offsetq+16]
543 mov tempq, g_pw_8m
544 %define filter_rnd [tempq]
545 %else
546 add y_offsetq, bilin_filter
547 %define filter_y_a [y_offsetq]
548 %define filter_y_b [y_offsetq+16]
549 %define filter_rnd [pw_8]
550 %endif
551 %endif
552
553 %if %1 == 16
554 movu m0, [srcq]
555 movu m1, [srcq+16]
556 movu m2, [srcq+2]
557 movu m3, [srcq+18]
558 lea srcq, [srcq + src_strideq*2]
559 pavgw m0, m2
560 pavgw m1, m3
561 .x_half_y_other_loop:
562 movu m2, [srcq]
563 movu m3, [srcq+16]
564 movu m4, [srcq+2]
565 movu m5, [srcq+18]
566 pavgw m2, m4
567 pavgw m3, m5
568 mova m4, m2
569 mova m5, m3
570 pmullw m1, filter_y_a
571 pmullw m3, filter_y_b
572 paddw m1, filter_rnd
573 paddw m1, m3
574 pmullw m0, filter_y_a
575 pmullw m2, filter_y_b
576 paddw m0, filter_rnd
577 psrlw m1, 4
578 paddw m0, m2
579 mova m2, [dstq]
580 psrlw m0, 4
581 mova m3, [dstq+16]
582 %if %2 == 1 ; avg
583 pavgw m0, [secq]
584 pavgw m1, [secq+16]
585 %endif
586 SUM_SSE m0, m2, m1, m3, m6, m7
587 mova m0, m4
588 mova m1, m5
589
590 lea srcq, [srcq + src_strideq*2]
591 lea dstq, [dstq + dst_strideq*2]
592 %if %2 == 1 ; avg
593 lea secq, [secq + sec_str*2]
594 %endif
595 %else ; %1 < 16
596 movu m0, [srcq]
597 movu m2, [srcq+2]
598 lea srcq, [srcq + src_strideq*2]
599 pavgw m0, m2
600 .x_half_y_other_loop:
601 movu m2, [srcq]
602 movu m3, [srcq+src_strideq*2]
603 movu m4, [srcq+2]
604 movu m5, [srcq+src_strideq*2+2]
605 pavgw m2, m4
606 pavgw m3, m5
607 mova m4, m2
608 mova m5, m3
609 pmullw m4, filter_y_a
610 pmullw m3, filter_y_b
611 paddw m4, filter_rnd
612 paddw m4, m3
613 pmullw m0, filter_y_a
614 pmullw m2, filter_y_b
615 paddw m0, filter_rnd
616 psrlw m4, 4
617 paddw m0, m2
618 mova m2, [dstq]
619 psrlw m0, 4
620 mova m3, [dstq+dst_strideq*2]
621 %if %2 == 1 ; avg
622 pavgw m0, [secq]
623 pavgw m4, [secq+sec_str*2]
624 %endif
625 SUM_SSE m0, m2, m4, m3, m6, m7
626 mova m0, m5
627
628 lea srcq, [srcq + src_strideq*4]
629 lea dstq, [dstq + dst_strideq*4]
630 %if %2 == 1 ; avg
631 lea secq, [secq + sec_str*4]
632 %endif
633 %endif
634 dec h
635 jg .x_half_y_other_loop
636 %undef filter_y_a
637 %undef filter_y_b
638 %undef filter_rnd
639 STORE_AND_RET
640
641 .x_nonhalf:
642 test y_offsetd, y_offsetd
643 jnz .x_nonhalf_y_nonzero
644
645 ; x_offset == bilin interpolation && y_offset == 0
646 %ifdef PIC
647 lea bilin_filter, [bilin_filter_m]
648 %endif
649 shl x_offsetd, filter_idx_shift
650 %if ARCH_X86_64 && mmsize == 16
651 mova m8, [bilin_filter+x_offsetq]
652 mova m9, [bilin_filter+x_offsetq+16]
653 mova m10, [pw_8]
654 %define filter_x_a m8
655 %define filter_x_b m9
656 %define filter_rnd m10
657 %else ; x86-32
658 %if ARCH_X86=1 && CONFIG_PIC=1
659 ; y_offset == 0. We can reuse y_offset reg.
660 %define tempq y_offsetq
661 add x_offsetq, g_bilin_filterm
662 %define filter_x_a [x_offsetq]
663 %define filter_x_b [x_offsetq+16]
664 mov tempq, g_pw_8m
665 %define filter_rnd [tempq]
666 %else
667 add x_offsetq, bilin_filter
668 %define filter_x_a [x_offsetq]
669 %define filter_x_b [x_offsetq+16]
670 %define filter_rnd [pw_8]
671 %endif
672 %endif
673
674 .x_other_y_zero_loop:
675 %if %1 == 16
676 movu m0, [srcq]
677 movu m1, [srcq+16]
678 movu m2, [srcq+2]
679 movu m3, [srcq+18]
680 mova m4, [dstq]
681 mova m5, [dstq+16]
682 pmullw m1, filter_x_a
683 pmullw m3, filter_x_b
684 paddw m1, filter_rnd
685 pmullw m0, filter_x_a
686 pmullw m2, filter_x_b
687 paddw m0, filter_rnd
688 paddw m1, m3
689 paddw m0, m2
690 psrlw m1, 4
691 psrlw m0, 4
692 %if %2 == 1 ; avg
693 pavgw m0, [secq]
694 pavgw m1, [secq+16]
695 %endif
696 SUM_SSE m0, m4, m1, m5, m6, m7
697
698 lea srcq, [srcq+src_strideq*2]
699 lea dstq, [dstq+dst_strideq*2]
700 %if %2 == 1 ; avg
701 lea secq, [secq + sec_str*2]
702 %endif
703 %else ; %1 < 16
704 movu m0, [srcq]
705 movu m1, [srcq+src_strideq*2]
706 movu m2, [srcq+2]
707 movu m3, [srcq+src_strideq*2+2]
708 mova m4, [dstq]
709 mova m5, [dstq+dst_strideq*2]
710 pmullw m1, filter_x_a
711 pmullw m3, filter_x_b
712 paddw m1, filter_rnd
713 pmullw m0, filter_x_a
714 pmullw m2, filter_x_b
715 paddw m0, filter_rnd
716 paddw m1, m3
717 paddw m0, m2
718 psrlw m1, 4
719 psrlw m0, 4
720 %if %2 == 1 ; avg
721 pavgw m0, [secq]
722 pavgw m1, [secq+sec_str*2]
723 %endif
724 SUM_SSE m0, m4, m1, m5, m6, m7
725
726 lea srcq, [srcq+src_strideq*4]
727 lea dstq, [dstq+dst_strideq*4]
728 %if %2 == 1 ; avg
729 lea secq, [secq + sec_str*4]
730 %endif
731 %endif
732 dec h
733 jg .x_other_y_zero_loop
734 %undef filter_x_a
735 %undef filter_x_b
736 %undef filter_rnd
737 STORE_AND_RET
738
739 .x_nonhalf_y_nonzero:
740 cmp y_offsetd, 8
741 jne .x_nonhalf_y_nonhalf
742
743 ; x_offset == bilin interpolation && y_offset == 0.5
744 %ifdef PIC
745 lea bilin_filter, [bilin_filter_m]
746 %endif
747 shl x_offsetd, filter_idx_shift
748 %if ARCH_X86_64 && mmsize == 16
749 mova m8, [bilin_filter+x_offsetq]
750 mova m9, [bilin_filter+x_offsetq+16]
751 mova m10, [pw_8]
752 %define filter_x_a m8
753 %define filter_x_b m9
754 %define filter_rnd m10
755 %else ; x86-32
756 %if ARCH_X86=1 && CONFIG_PIC=1
757 ; y_offset == 0.5. We can reuse y_offset reg.
758 %define tempq y_offsetq
759 add x_offsetq, g_bilin_filterm
760 %define filter_x_a [x_offsetq]
761 %define filter_x_b [x_offsetq+16]
762 mov tempq, g_pw_8m
763 %define filter_rnd [tempq]
764 %else
765 add x_offsetq, bilin_filter
766 %define filter_x_a [x_offsetq]
767 %define filter_x_b [x_offsetq+16]
768 %define filter_rnd [pw_8]
769 %endif
770 %endif
771
772 %if %1 == 16
773 movu m0, [srcq]
774 movu m1, [srcq+16]
775 movu m2, [srcq+2]
776 movu m3, [srcq+18]
777 pmullw m0, filter_x_a
778 pmullw m2, filter_x_b
779 paddw m0, filter_rnd
780 pmullw m1, filter_x_a
781 pmullw m3, filter_x_b
782 paddw m1, filter_rnd
783 paddw m0, m2
784 paddw m1, m3
785 psrlw m0, 4
786 psrlw m1, 4
787 lea srcq, [srcq+src_strideq*2]
788 .x_other_y_half_loop:
789 movu m2, [srcq]
790 movu m3, [srcq+16]
791 movu m4, [srcq+2]
792 movu m5, [srcq+18]
793 pmullw m2, filter_x_a
794 pmullw m4, filter_x_b
795 paddw m2, filter_rnd
796 pmullw m3, filter_x_a
797 pmullw m5, filter_x_b
798 paddw m3, filter_rnd
799 paddw m2, m4
800 paddw m3, m5
801 mova m4, [dstq]
802 mova m5, [dstq+16]
803 psrlw m2, 4
804 psrlw m3, 4
805 pavgw m0, m2
806 pavgw m1, m3
807 %if %2 == 1 ; avg
808 pavgw m0, [secq]
809 pavgw m1, [secq+16]
810 %endif
811 SUM_SSE m0, m4, m1, m5, m6, m7
812 mova m0, m2
813 mova m1, m3
814
815 lea srcq, [srcq+src_strideq*2]
816 lea dstq, [dstq+dst_strideq*2]
817 %if %2 == 1 ; avg
818 lea secq, [secq + sec_str*2]
819 %endif
820 %else ; %1 < 16
821 movu m0, [srcq]
822 movu m2, [srcq+2]
823 pmullw m0, filter_x_a
824 pmullw m2, filter_x_b
825 paddw m0, filter_rnd
826 paddw m0, m2
827 psrlw m0, 4
828 lea srcq, [srcq+src_strideq*2]
829 .x_other_y_half_loop:
830 movu m2, [srcq]
831 movu m3, [srcq+src_strideq*2]
832 movu m4, [srcq+2]
833 movu m5, [srcq+src_strideq*2+2]
834 pmullw m2, filter_x_a
835 pmullw m4, filter_x_b
836 paddw m2, filter_rnd
837 pmullw m3, filter_x_a
838 pmullw m5, filter_x_b
839 paddw m3, filter_rnd
840 paddw m2, m4
841 paddw m3, m5
842 mova m4, [dstq]
843 mova m5, [dstq+dst_strideq*2]
844 psrlw m2, 4
845 psrlw m3, 4
846 pavgw m0, m2
847 pavgw m2, m3
848 %if %2 == 1 ; avg
849 pavgw m0, [secq]
850 pavgw m2, [secq+sec_str*2]
851 %endif
852 SUM_SSE m0, m4, m2, m5, m6, m7
853 mova m0, m3
854
855 lea srcq, [srcq+src_strideq*4]
856 lea dstq, [dstq+dst_strideq*4]
857 %if %2 == 1 ; avg
858 lea secq, [secq + sec_str*4]
859 %endif
860 %endif
861 dec h
862 jg .x_other_y_half_loop
863 %undef filter_x_a
864 %undef filter_x_b
865 %undef filter_rnd
866 STORE_AND_RET
867
868 .x_nonhalf_y_nonhalf:
869 ; load the filters - this is the same as in the 8-bit version
870 %ifdef PIC
871 lea bilin_filter, [bilin_filter_m]
872 %endif
873 shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
874 shl y_offsetd, filter_idx_shift
875 %if ARCH_X86_64 && mmsize == 16
876 mova m8, [bilin_filter+x_offsetq]
877 mova m9, [bilin_filter+x_offsetq+16]
878 mova m10, [bilin_filter+y_offsetq]
879 mova m11, [bilin_filter+y_offsetq+16]
880 mova m12, [pw_8]
881 %define filter_x_a m8
882 %define filter_x_b m9
883 %define filter_y_a m10
884 %define filter_y_b m11
885 %define filter_rnd m12
886 %else ; x86-32
887 %if ARCH_X86=1 && CONFIG_PIC=1
888 ; In this case there is NO unused register, so the src_stride register is
889 ; reused; src_stride must later be reloaded from the stack when it is needed.
890 %define tempq src_strideq
891 mov tempq, g_bilin_filterm
892 add x_offsetq, tempq
893 add y_offsetq, tempq
894 %define filter_x_a [x_offsetq]
895 %define filter_x_b [x_offsetq+16]
896 %define filter_y_a [y_offsetq]
897 %define filter_y_b [y_offsetq+16]
898
899 mov tempq, g_pw_8m
900 %define filter_rnd [tempq]
901 %else
902 add x_offsetq, bilin_filter
903 add y_offsetq, bilin_filter
904 %define filter_x_a [x_offsetq]
905 %define filter_x_b [x_offsetq+16]
906 %define filter_y_a [y_offsetq]
907 %define filter_y_b [y_offsetq+16]
908 %define filter_rnd [pw_8]
909 %endif
910 %endif
911 ; end of load filter
912
913 ; x_offset == bilin interpolation && y_offset == bilin interpolation
914 %if %1 == 16
915 movu m0, [srcq]
916 movu m2, [srcq+2]
917 movu m1, [srcq+16]
918 movu m3, [srcq+18]
919 pmullw m0, filter_x_a
920 pmullw m2, filter_x_b
921 paddw m0, filter_rnd
922 pmullw m1, filter_x_a
923 pmullw m3, filter_x_b
924 paddw m1, filter_rnd
925 paddw m0, m2
926 paddw m1, m3
927 psrlw m0, 4
928 psrlw m1, 4
929
930 INC_SRC_BY_SRC_STRIDE
931
932 .x_other_y_other_loop:
933 movu m2, [srcq]
934 movu m4, [srcq+2]
935 movu m3, [srcq+16]
936 movu m5, [srcq+18]
937 pmullw m2, filter_x_a
938 pmullw m4, filter_x_b
939 paddw m2, filter_rnd
940 pmullw m3, filter_x_a
941 pmullw m5, filter_x_b
942 paddw m3, filter_rnd
943 paddw m2, m4
944 paddw m3, m5
945 psrlw m2, 4
946 psrlw m3, 4
947 mova m4, m2
948 mova m5, m3
949 pmullw m0, filter_y_a
950 pmullw m2, filter_y_b
951 paddw m0, filter_rnd
952 pmullw m1, filter_y_a
953 pmullw m3, filter_y_b
954 paddw m0, m2
955 paddw m1, filter_rnd
956 mova m2, [dstq]
957 paddw m1, m3
958 psrlw m0, 4
959 psrlw m1, 4
960 mova m3, [dstq+16]
961 %if %2 == 1 ; avg
962 pavgw m0, [secq]
963 pavgw m1, [secq+16]
964 %endif
965 SUM_SSE m0, m2, m1, m3, m6, m7
966 mova m0, m4
967 mova m1, m5
968
969 INC_SRC_BY_SRC_STRIDE
970 lea dstq, [dstq + dst_strideq * 2]
971 %if %2 == 1 ; avg
972 lea secq, [secq + sec_str*2]
973 %endif
974 %else ; %1 < 16
975 movu m0, [srcq]
976 movu m2, [srcq+2]
977 pmullw m0, filter_x_a
978 pmullw m2, filter_x_b
979 paddw m0, filter_rnd
980 paddw m0, m2
981 psrlw m0, 4
982
983 INC_SRC_BY_SRC_STRIDE
984
985 .x_other_y_other_loop:
986 movu m2, [srcq]
987 movu m4, [srcq+2]
988 movu m3, [srcq+src_strideq*2]
989 movu m5, [srcq+src_strideq*2+2]
990 pmullw m2, filter_x_a
991 pmullw m4, filter_x_b
992 paddw m2, filter_rnd
993 pmullw m3, filter_x_a
994 pmullw m5, filter_x_b
995 paddw m3, filter_rnd
996 paddw m2, m4
997 paddw m3, m5
998 psrlw m2, 4
999 psrlw m3, 4
1000 mova m4, m2
1001 mova m5, m3
1002 pmullw m0, filter_y_a
1003 pmullw m2, filter_y_b
1004 paddw m0, filter_rnd
1005 pmullw m4, filter_y_a
1006 pmullw m3, filter_y_b
1007 paddw m0, m2
1008 paddw m4, filter_rnd
1009 mova m2, [dstq]
1010 paddw m4, m3
1011 psrlw m0, 4
1012 psrlw m4, 4
1013 mova m3, [dstq+dst_strideq*2]
1014 %if %2 == 1 ; avg
1015 pavgw m0, [secq]
1016 pavgw m4, [secq+sec_str*2]
1017 %endif
1018 SUM_SSE m0, m2, m4, m3, m6, m7
1019 mova m0, m5
1020
1021 INC_SRC_BY_SRC_2STRIDE
1022 lea dstq, [dstq + dst_strideq * 4]
1023 %if %2 == 1 ; avg
1024 lea secq, [secq + sec_str*4]
1025 %endif
1026 %endif
1027 dec h
1028 jg .x_other_y_other_loop
1029 %undef filter_x_a
1030 %undef filter_x_b
1031 %undef filter_y_a
1032 %undef filter_y_b
1033 %undef filter_rnd
1034 STORE_AND_RET
1035 %endmacro
1036
1037 INIT_XMM sse2
1038 SUBPEL_VARIANCE 8
1039 SUBPEL_VARIANCE 16
1040
1041 INIT_XMM sse2
1042 SUBPEL_VARIANCE 8, 1
1043 SUBPEL_VARIANCE 16, 1
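The four instantiations above generate the 8-wide and 16-wide SSE2 kernels, plus the averaging (%2 == 1) variants that fold in a second predictor via pavgw before accumulation. As a behavioural cross-check for the general bilinear path, a scalar reference might look like this (all names are illustrative, and the 16-offset weight convention is taken from the table at the top of the file):

    #include <stddef.h>
    #include <stdint.h>
    /* Scalar reference for the general x/y bilinear path (illustrative; names invented).
     * xo/yo are the sub-sample offsets (0..15); the asm keeps the (16-o, o) weight
     * pairs in bilin_filter_m_sse2. */
    static int subpel_var_ref(const uint16_t *src, ptrdiff_t src_stride,
                              int xo, int yo,
                              const uint16_t *dst, ptrdiff_t dst_stride,
                              int w, int h, unsigned int *sse) {
      int sum = 0;
      *sse = 0;
      for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
          /* first pass: horizontal filter on rows y and y+1 */
          const unsigned a = ((16 - xo) * src[y * src_stride + x] +
                              xo * src[y * src_stride + x + 1] + 8) >> 4;
          const unsigned b = ((16 - xo) * src[(y + 1) * src_stride + x] +
                              xo * src[(y + 1) * src_stride + x + 1] + 8) >> 4;
          /* second pass: vertical filter, then accumulate SE/SSE */
          const int p = (int)(((16 - yo) * a + yo * b + 8) >> 4);
          const int d = p - dst[y * dst_stride + x];
          sum += d;
          *sse += (unsigned int)(d * d);
        }
      }
      return sum;
    }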