;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
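; Computes the SADs of two 4-pixel source rows against the matching rows
; of four reference blocks. %1 is 1 on the first call, which initializes
; the running sums instead of accumulating into them; %2/%4 are the byte
; offsets of the two source rows and %3/%5 of the two reference rows; the
; optional %6 advances all five pointers by two rows when set. The sums
; are kept packed as dwords: ref1/ref2 in m6, ref3/ref4 in m7.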
%macro PROCESS_4x2x4 5-6 0
  movd                  m0, [srcq +%2]
%if %1 == 1
  movd                  m6, [ref1q+%3]
  movd                  m4, [ref2q+%3]
  movd                  m7, [ref3q+%3]
  movd                  m5, [ref4q+%3]
  punpckldq             m0, [srcq +%4]
  punpckldq             m6, [ref1q+%5]
  punpckldq             m4, [ref2q+%5]
  punpckldq             m7, [ref3q+%5]
  punpckldq             m5, [ref4q+%5]
  psadbw                m6, m0
  psadbw                m4, m0
  psadbw                m7, m0
  psadbw                m5, m0
  punpckldq             m6, m4
  punpckldq             m7, m5
%else
  movd                  m1, [ref1q+%3]
  movd                  m2, [ref2q+%3]
  movd                  m3, [ref3q+%3]
  movd                  m4, [ref4q+%3]
  punpckldq             m0, [srcq +%4]
  punpckldq             m1, [ref1q+%5]
  punpckldq             m2, [ref2q+%5]
  punpckldq             m3, [ref3q+%5]
  punpckldq             m4, [ref4q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  psadbw                m4, m0
  punpckldq             m1, m2
  punpckldq             m3, m4
  paddd                 m6, m1
  paddd                 m7, m3
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
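; Same contract as PROCESS_4x2x4, but for 8-pixel rows: the two rows are
; packed into the low and high halves of one register with movh/movhps,
; and the running sums for ref1..ref4 live in m4..m7 respectively.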
%macro PROCESS_8x2x4 5-6 0
  movh                  m0, [srcq +%2]
%if %1 == 1
  movh                  m4, [ref1q+%3]
  movh                  m5, [ref2q+%3]
  movh                  m6, [ref3q+%3]
  movh                  m7, [ref4q+%3]
  movhps                m0, [srcq +%4]
  movhps                m4, [ref1q+%5]
  movhps                m5, [ref2q+%5]
  movhps                m6, [ref3q+%5]
  movhps                m7, [ref4q+%5]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movh                  m1, [ref1q+%3]
  movh                  m2, [ref2q+%3]
  movh                  m3, [ref3q+%3]
  movhps                m0, [srcq +%4]
  movhps                m1, [ref1q+%5]
  movhps                m2, [ref2q+%5]
  movhps                m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movh                  m1, [ref4q+%3]
  movhps                m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
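; Same contract, for 16-pixel rows. Each row fills a whole register, so
; the two rows are processed back to back; the source is loaded aligned
; (mova), the references unaligned (movu), and the running sums for
; ref1..ref4 live in m4..m7.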
%macro PROCESS_16x2x4 5-6 0
  ; 1st 16 px
  mova                  m0, [srcq +%2]
%if %1 == 1
  movu                  m4, [ref1q+%3]
  movu                  m5, [ref2q+%3]
  movu                  m6, [ref3q+%3]
  movu                  m7, [ref4q+%3]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movu                  m1, [ref1q+%3]
  movu                  m2, [ref2q+%3]
  movu                  m3, [ref3q+%3]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%3]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif

  ; 2nd 16 px
  mova                  m0, [srcq +%4]
  movu                  m1, [ref1q+%5]
  movu                  m2, [ref2q+%5]
  movu                  m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
  psadbw                m1, m0
  paddd                 m7, m1
%endmacro

; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
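; Rows wider than 16 pixels are handled as 16-pixel halves; only the last
; sub-call may advance the pointers. PROCESS_64x2x4 below splits each
; 64-pixel row into two 32s the same way.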
%macro PROCESS_32x2x4 5-6 0
  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
%endmacro

; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_64x2x4 5-6 0
  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro

; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         unsigned int res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
; 8x16, 8x8, 8x4, 4x8 or 4x4 (the 4xN variants are built with MMX
; registers and are suffixed _sse rather than _sse2)
%macro SADNXN4D 2
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

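  ; The first row pair initializes the accumulators, %rep covers the
  ; middle of the block, and the final pair skips the pointer advance.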
  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

%if mmsize == 16
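  ; Each accumulator holds two partial sums, one per 64-bit lane. Shift
  ; the ref2/ref4 sums left by one dword and OR them in so m4 pairs
  ; ref1/ref2 and m6 pairs ref3/ref4, then add the low and high qwords
  ; to leave the four per-reference totals as dwords in m4.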
  pslldq                m5, 4
  pslldq                m7, 4
  por                   m4, m5
  por                   m6, m7
  mova                  m5, m4
  mova                  m7, m6
  punpcklqdq            m4, m6
  punpckhqdq            m5, m7
  movifnidn             r4, r4mp
  paddd                 m4, m5
  movu                [r4], m4
  RET
%else
  movifnidn             r4, r4mp
  movq              [r4+0], m6
  movq              [r4+8], m7
  RET
%endif
%endmacro

INIT_XMM sse2
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
SADNXN4D 32, 32
SADNXN4D 32, 16
SADNXN4D 16, 32
SADNXN4D 16, 16
SADNXN4D 16,  8
SADNXN4D  8, 16
SADNXN4D  8,  8
SADNXN4D  8,  4

INIT_MMX sse
SADNXN4D  4,  8
SADNXN4D  4,  4