;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

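; SAD_FN emits the shared prologue for the SAD kernels below:
;   %1/%2 - block width/height (also used to name the symbol)
;   %3    - GPR count passed to cglobal: 5 when only the two strides are
;           needed, 7 when src_stride3/ref_stride3 (3*stride) are set up
;   %4    - 0 for plain SAD, 1 for the _avg variant, which takes an extra
;           second_pred argument that is averaged into the reference
;           before the SAD is taken
; In the _avg, 7-register case x86-32 is short on GPRs, so the row counter
; is kept in the stack slot of argument 0 (r0m) instead of a register.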
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vp9_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
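; One 64-pixel row per iteration, read as four unaligned 16-byte loads.
; psadbw leaves a 16-bit partial sum in each 64-bit lane; the lanes are
; accumulated in m0 and folded together with movhlps+paddd before the
; result is returned in eax. In the _avg variant the reference is first
; averaged with second_pred, which advances by 64 bytes per row.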
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2

; unsigned int vp9_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                        uint8_t *ref, int ref_stride);
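; Two 32-pixel rows per iteration (the second addressed via ref_strideq/
; src_strideq), so the counter runs %1/2 times and the pointers advance
; by two strides per pass.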
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1/2
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2

; unsigned int vp9_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                       uint8_t *ref, int ref_stride);
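; Four 16-pixel rows per iteration, addressed as stride*{0,1,2,3} using
; the extra stride3 registers from the 7-GPR SAD_FN form; the counter
; runs %1/4 times and the pointers advance by four strides per pass.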
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN 8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2

; unsigned int vp9_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
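; Rows are only 8 bytes wide, so two rows are packed into one xmm register
; with movh/movhps (four rows per iteration). The source rows are packed
; the same way, keeping each psadbw comparison row-against-row within its
; 64-bit lane.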
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
SAD8XN 4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2

; unsigned int vp9_sad4x{4,8}_sse(uint8_t *src, int src_stride,
;                                 uint8_t *ref, int ref_stride);
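; 4-byte rows, built up with movd+punpckldq so each 64-bit mm register
; holds two rows (note INIT_MMX below: pavgb/psadbw are used in their
; SSE forms on mm registers). With a single 64-bit accumulator there is
; no upper half to fold, so the sum is read back directly with movd.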
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m3, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m6, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m6
  psadbw                m1, m2
  psadbw                m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movd                 eax, m0
  RET
%endmacro

INIT_MMX sse
SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse