Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(332)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm

Issue 756673003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 %include "third_party/x86inc/x86inc.asm"
12
13 SECTION .text
14
; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Adds the sum of absolute differences of two 4-pixel rows of 16-bit samples
; against the matching rows of four reference blocks.
;   %1  first          1 = overwrite the accumulators, 0 = add to them
;   %2  off_first_src  element offset of the first source row from srcq
;   %3  off_first_ref  element offset of the first row from ref{1..4}q
;   %4  off_second_src element offset of the second source row
;   %5  off_second_ref element offset of the second reference row
;   %6  advance_at_end (default 0) 1 = advance all five pointers by two rows
; Offsets are in 16-bit elements; they are scaled by 2 when forming addresses.
; In:       m1 = word multiplier for pmaddwd (HIGH_SADNXN4D sets it to
;           0x00010001 in every dword)
; Out:      m4, m5, m6, m7 = dword partial sums for ref1, ref2, ref3, ref4
; Clobbers: m0, m2, m3
;
; Every row is loaded with movh (low 8 bytes) + movhps (high 8 bytes), so
; exactly 4 samples per row are read from both the source and the references.
%macro HIGH_PROCESS_4x2x4 5-6 0
  ; m0 = [ first source row | second source row ]
  movh                  m0, [srcq +%2*2]
  movhps                m0, [srcq +%4*2]
%if %1 == 1
  ; m4..m7 = [ first ref row | second ref row ] for ref1..ref4
  movh                  m4, [ref1q+%3*2]
  movh                  m5, [ref2q+%3*2]
  movh                  m6, [ref3q+%3*2]
  movh                  m7, [ref4q+%3*2]
  movhps                m4, [ref1q+%5*2]
  movhps                m5, [ref2q+%5*2]
  movhps                m6, [ref3q+%5*2]
  movhps                m7, [ref4q+%5*2]

  ; ref1 / ref2: |src - ref| = sat(src - ref) | sat(ref - src)
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1          ; pair-sum words into dwords
  pmaddwd               m5, m1

  ; ref3 / ref4
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  ; ref1
  movh                  m2, [ref1q+%3*2]
  movhps                m2, [ref1q+%5*2]
  mova                  m3, m0
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m4, m2

  ; ref2
  movh                  m2, [ref2q+%3*2]
  movhps                m2, [ref2q+%5*2]
  mova                  m3, m0
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m5, m2

  ; ref3
  movh                  m2, [ref3q+%3*2]
  movhps                m2, [ref3q+%5*2]
  mova                  m3, m0
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m6, m2

  ; ref4
  movh                  m2, [ref4q+%3*2]
  movhps                m2, [ref4q+%5*2]
  mova                  m3, m0
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif
%if %6 == 1
  ; two rows of 2-byte samples: stride * 2 rows * 2 bytes
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro
94
; HIGH_SAD8_SET acc, ref_ptr, ref_off
; acc = absolute differences between the 8 words in m0 and the 8 words at
; [ref_ptr + ref_off*2], pair-summed into 4 dwords by pmaddwd with m1.
; Clobbers m3.
%macro HIGH_SAD8_SET 3
  movu                  %1, [%2+(%3)*2]
  mova                  m3, m0
  psubusw               m3, %1          ; sat(src - ref)
  psubusw               %1, m0          ; sat(ref - src)
  por                   %1, m3          ; one side is zero -> |src - ref|
  pmaddwd               %1, m1
%endmacro

; HIGH_SAD8_ADD acc, ref_ptr, ref_off
; Same computation as HIGH_SAD8_SET, but the result is added to acc.
; Clobbers m2, m3.
%macro HIGH_SAD8_ADD 3
  movu                  m2, [%2+(%3)*2]
  mova                  m3, m0
  psubusw               m3, m2          ; sat(src - ref)
  psubusw               m2, m0          ; sat(ref - src)
  por                   m2, m3          ; |src - ref|
  pmaddwd               m2, m1
  paddd                 %1, m2
%endmacro

; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Handles two groups of 8 16-bit samples against four reference blocks.
;   %1  first          1 = the first group overwrites m4..m7, 0 = it adds
;   %2, %3             element offsets of the first group (src, ref)
;   %4, %5             element offsets of the second group (src, ref)
;   %6  advance_at_end (default 0) 1 = advance all five pointers by two rows
; The source is read with aligned loads (mova), the references with movu.
; In:       m1 = word multiplier for pmaddwd
; Out:      m4, m5, m6, m7 = dword partial sums for ref1, ref2, ref3, ref4
; Clobbers: m0, m2, m3
%macro HIGH_PROCESS_8x2x4 5-6 0
  ; 1st 8 px
  mova                  m0, [srcq +(%2)*2]
%if %1 == 1
  HIGH_SAD8_SET         m4, ref1q, %3
  HIGH_SAD8_SET         m5, ref2q, %3
  HIGH_SAD8_SET         m6, ref3q, %3
  HIGH_SAD8_SET         m7, ref4q, %3
%else
  HIGH_SAD8_ADD         m4, ref1q, %3
  HIGH_SAD8_ADD         m5, ref2q, %3
  HIGH_SAD8_ADD         m6, ref3q, %3
  HIGH_SAD8_ADD         m7, ref4q, %3
%endif

  ; 2nd 8 px (always accumulates)
  mova                  m0, [srcq +(%4)*2]
  HIGH_SAD8_ADD         m4, ref1q, %5
  HIGH_SAD8_ADD         m5, ref2q, %5
  HIGH_SAD8_ADD         m6, ref3q, %5
  HIGH_SAD8_ADD         m7, ref4q, %5

%if %6 == 1
  ; all loads are done, so the pointers can move on:
  ; stride * 2 rows * 2 bytes per sample
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro
193
; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; 16 samples at each of the two offsets, built from two HIGH_PROCESS_8x2x4
; steps: samples 0..15 at the first offset pair (%2, %3), then samples 0..15
; at the second offset pair (%4, %5). Only the second step may advance the
; pointers, and only it inherits %6; the first step inherits %1.
%macro HIGH_PROCESS_16x2x4 5-6 0
  HIGH_PROCESS_8x2x4    %1, %2, %3, (%2 + 8), (%3 + 8)
  HIGH_PROCESS_8x2x4     0, %4, %5, (%4 + 8), (%5 + 8), %6
%endmacro
199
; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; 32 samples at each of the two offsets, split into two HIGH_PROCESS_16x2x4
; steps whose second halves start 16 elements further on. The first step
; inherits %1 (initialise vs. accumulate); the second always accumulates and
; is the only one that receives advance_at_end (%6).
%macro HIGH_PROCESS_32x2x4 5-6 0
  HIGH_PROCESS_16x2x4   %1, %2, %3, (%2 + 16), (%3 + 16)
  HIGH_PROCESS_16x2x4    0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro
205
; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; 64 samples at each of the two offsets, split into two HIGH_PROCESS_32x2x4
; steps whose second halves start 32 elements further on. The first step
; inherits %1 (initialise vs. accumulate); the second always accumulates and
; is the only one that receives advance_at_end (%6).
%macro HIGH_PROCESS_64x2x4 5-6 0
  HIGH_PROCESS_32x2x4   %1, %2, %3, (%2 + 32), (%3 + 32)
  HIGH_PROCESS_32x2x4    0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro
211
; void vp9_highbd_sadNxMx4d_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref[4], int ref_stride,
;                                unsigned int res[4]);
; HIGH_SADNXN4D N, M emits the function for an N-wide, M-high block; N selects
; the HIGH_PROCESS_Nx2x4 row kernel and M the number of row pairs.
;
; Stores res[i] = sum of absolute differences between the source block and
; ref[i], for i = 0..3. The pointers passed in are shifted left by one before
; being dereferenced as 16-bit sample addresses.
;
; Register use: m1 = 0x0001 in every word, m4..m7 = per-reference dword
; accumulators, m0/m2/m3 = scratch.
;
; The word-of-ones constant is generated inside the vector unit, so no extra
; general-purpose register is reserved for it. x86-32 offers only 7 GPRs
; through x86inc, which is exactly what the non-UNIX64 path requests.
%macro HIGH_SADNXN4D 2
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif

  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided

  ; ref1q arrives holding the ref[] array; fetch the four block pointers,
  ; reading slot 0 last because it overwrites the array pointer itself
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

  ; convert byte pointers to short pointers
  shl                 srcq, 1
  shl                ref2q, 1
  shl                ref3q, 1
  shl                ref4q, 1
  shl                ref1q, 1

  ; m1 = 0x0001 in every word: all bits set, then each word shifted down to 1
  pcmpeqw               m1, m1
  psrlw                 m1, 15

  ; %2/2 row pairs: the first initialises the sums, the last does not advance
  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM

  ; fold the upper two dwords of each accumulator onto the lower two
  movhlps               m0, m4
  movhlps               m1, m5
  movhlps               m2, m6
  movhlps               m3, m7
  paddd                 m4, m0
  paddd                 m5, m1
  paddd                 m6, m2
  paddd                 m7, m3

  ; interleave ref1/ref2 and ref3/ref4, then fold the remaining pair
  punpckldq             m4, m5
  punpckldq             m6, m7
  movhlps               m0, m4
  movhlps               m1, m6
  paddd                 m4, m0
  paddd                 m6, m1

  ; m4 = [ sad(ref1), sad(ref2), sad(ref3), sad(ref4) ]
  punpcklqdq            m4, m6
  movifnidn             r4, r4mp        ; res pointer, if not already in r4
  movu                [r4], m4
  RET
%endmacro
269
270
; Instantiate the SSE2 4-reference SAD functions (width, height).
INIT_XMM sse2

; 64 wide
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32

; 32 wide
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16

; 16 wide
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16,  8

; 8 wide
HIGH_SADNXN4D  8, 16
HIGH_SADNXN4D  8,  8
HIGH_SADNXN4D  8,  4

; 4 wide
HIGH_SADNXN4D  4,  8
HIGH_SADNXN4D  4,  4
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c ('k') | source/libvpx/vp9/encoder/x86/vp9_highbd_sad_sse2.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698