OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
65 movq rax, m4 | 65 movq rax, m4 |
66 movq [sszq], m6 | 66 movq [sszq], m6 |
67 %else | 67 %else |
68 mov eax, sszm | 68 mov eax, sszm |
69 pshufd m5, m4, 0x1 | 69 pshufd m5, m4, 0x1 |
70 movq [eax], m6 | 70 movq [eax], m6 |
71 movd eax, m4 | 71 movd eax, m4 |
72 movd edx, m5 | 72 movd edx, m5 |
73 %endif | 73 %endif |
74 RET | 74 RET |
| 75 |
| 76 ; Compute the sum of squared differences between two int16_t vectors and return it as a 64-bit value. |
| 77 ; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff, |
| 78 ; intptr_t block_size) ; both pointers are advanced by block_size int16_t elements and then indexed with a negative count that runs up to zero; the loop body always executes at least once |
| 79 |
| 80 INIT_XMM sse2 |
| 81 cglobal block_error_fp, 3, 3, 6, uqc, dqc, size |
| 82 pxor m4, m4 ; sse accumulator (holds two 64-bit partial sums, combined after the loop) |
| 83 pxor m5, m5 ; dedicated zero register, interleaved with dwords below to zero-extend them to qwords |
| 84 lea uqcq, [uqcq+sizeq*2] |
| 85 lea dqcq, [dqcq+sizeq*2] |
| 86 neg sizeq |
| 87 .loop: |
| 88 mova m2, [uqcq+sizeq*2] |
| 89 mova m0, [dqcq+sizeq*2] |
| 90 mova m3, [uqcq+sizeq*2+mmsize] |
| 91 mova m1, [dqcq+sizeq*2+mmsize] |
| 92 psubw m0, m2 |
| 93 psubw m1, m3 |
| 94 ; individual errors are max. 15bit+sign, so squares are 30bit, and |
| 95 ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit); pmaddwd of a register with itself squares each word and adds adjacent pairs into dwords |
| 96 pmaddwd m0, m0 |
| 97 pmaddwd m1, m1 |
| 98 ; accumulate in 64bit: each dword sum is widened by unpacking it against the zero register (m5), then added into m4 with paddq; m3 is reused as scratch |
| 99 punpckldq m3, m0, m5 |
| 100 punpckhdq m0, m5 |
| 101 paddq m4, m3 |
| 102 punpckldq m3, m1, m5 |
| 103 paddq m4, m0 |
| 104 punpckhdq m1, m5 |
| 105 paddq m4, m3 |
| 106 paddq m4, m1 |
| 107 add sizeq, mmsize |
| 108 jl .loop |
| 109 |
| 110 ; accumulate horizontally and store in return value: the high qword of m4 is added to the low qword, then returned in rax on x86-64 or split across edx:eax otherwise (m5 is no longer needed as zero here) |
| 111 movhlps m5, m4 |
| 112 paddq m4, m5 |
| 113 %if ARCH_X86_64 |
| 114 movq rax, m4 |
| 115 %else |
| 116 pshufd m5, m4, 0x1 |
| 117 movd eax, m4 |
| 118 movd edx, m5 |
| 119 %endif |
| 120 RET |
OLD | NEW |