Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(241)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm

Issue 554673004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14 ; unsigned int vp9_get_mb_ss_sse2(short *src_ptr)
15 ; Sums the squares of the 256 signed 16-bit values at src_ptr (8 passes x 64 bytes).
16 ; src_ptr is read with movdqa, so it must be 16-byte aligned.
17 ; The result is left in the low 32 bits of rax.
18 global sym(vp9_get_mb_ss_sse2) PRIVATE
19 sym(vp9_get_mb_ss_sse2):
20 push rbp
21 mov rbp, rsp
22 SHADOW_ARGS_TO_STACK 1 ; macro from x86_abi_support.asm; presumably makes arg(0) readable -- confirm there
23 GET_GOT rbx ; macro from x86_abi_support.asm; paired with RESTORE_GOT in the epilog
24 push rsi ; rsi and rdi are saved and restored but not otherwise touched here
25 push rdi
26 sub rsp, 16 ; reserved and released again in the epilog; never referenced in between
27 ; end prolog
28 ; rax = read cursor, rcx = passes remaining,
29 ; xmm4 = four 32-bit partial sums of squares
30 mov rax, arg(0) ;[src_ptr]
31 mov rcx, 8 ; 8 passes of 32 words each
32 pxor xmm4, xmm4 ; partial sums start at zero
33
34 .NEXTROW:
35 movdqa xmm0, [rax] ; load 32 words (64 bytes) into xmm0..xmm3
36 movdqa xmm1, [rax+16]
37 movdqa xmm2, [rax+32]
38 movdqa xmm3, [rax+48]
39 pmaddwd xmm0, xmm0 ; square every word and add neighbouring pairs into dwords
40 pmaddwd xmm1, xmm1
41 pmaddwd xmm2, xmm2
42 pmaddwd xmm3, xmm3
43
44 paddd xmm0, xmm1 ; fold the four registers into the running total
45 paddd xmm2, xmm3
46 paddd xmm4, xmm0
47 paddd xmm4, xmm2
48
49 add rax, 0x40 ; step to the next 64 bytes
50 dec rcx
51 jnz .NEXTROW ; branch on ZF alone: dec does not write CF, so ja here would also test the carry left by the add
52
53 movdqa xmm3,xmm4 ; horizontal add of the four dwords d0..d3 in xmm4
54 psrldq xmm4,8
55 paddd xmm4,xmm3 ; dword 0 = d0+d2, dword 1 = d1+d3
56 movdqa xmm3,xmm4
57 psrldq xmm4,4
58 paddd xmm4,xmm3 ; dword 0 = d0+d1+d2+d3
59 movq rax,xmm4 ; low 32 bits = result; the upper 32 bits hold a leftover partial sum
60
61
62 ; begin epilog
63 add rsp, 16
64 pop rdi
65 pop rsi
66 RESTORE_GOT
67 UNSHADOW_ARGS
68 pop rbp
69 ret
70
71
72 ; unsigned int vp9_get16x16var_sse2
73 ; (
74 ; unsigned char *src_ptr, arg(0): first block, 16 bytes read per row
75 ; int source_stride, arg(1): byte step between src rows
76 ; unsigned char *ref_ptr, arg(2): second block, 16 bytes read per row
77 ; int recon_stride, arg(3): byte step between ref rows
78 ; unsigned int *SSE, arg(4): receives the sum of (src - ref)^2 over 16 rows
79 ; int *Sum arg(5): receives the sum of (src - ref) over 16 rows
80 ; ) No instruction written here sets a return value; rax last receives the Sum pointer.
81 global sym(vp9_get16x16var_sse2) PRIVATE
82 sym(vp9_get16x16var_sse2):
83 push rbp
84 mov rbp, rsp
85 SHADOW_ARGS_TO_STACK 6
86 SAVE_XMM 7
87 push rbx
88 push rsi
89 push rdi
90 ; end prolog
91 ; rsi/rax = src cursor and stride, rdi/rdx = ref cursor and stride
92 mov rsi, arg(0) ;[src_ptr]
93 movsxd rax, DWORD PTR arg(1) ;[source_stride]
94
95 mov rdi, arg(2) ;[ref_ptr]
96 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
97
98 ; Prefetch rows 0-7 of each block before the main loop
99 lea rcx, [rax+rax*2] ; rcx = 3 * source_stride
100 prefetcht0 [rsi]
101 prefetcht0 [rsi+rax]
102 prefetcht0 [rsi+rax*2]
103 prefetcht0 [rsi+rcx]
104 lea rbx, [rsi+rax*4] ; rbx = address of src row 4
105 prefetcht0 [rbx]
106 prefetcht0 [rbx+rax]
107 prefetcht0 [rbx+rax*2]
108 prefetcht0 [rbx+rcx]
109
110 lea rcx, [rdx+rdx*2] ; rcx = 3 * recon_stride
111 prefetcht0 [rdi]
112 prefetcht0 [rdi+rdx]
113 prefetcht0 [rdi+rdx*2]
114 prefetcht0 [rdi+rcx]
115 lea rbx, [rdi+rdx*4] ; rbx = address of ref row 4
116 prefetcht0 [rbx]
117 prefetcht0 [rbx+rdx]
118 prefetcht0 [rbx+rdx*2]
119 prefetcht0 [rbx+rcx]
120
121 mov rcx, 16 ; rows left to process
122 pxor xmm0, xmm0 ; all-zero register used to widen bytes to words
123
124 pxor xmm6, xmm6 ; four 32-bit partial sums of squared differences
125 pxor xmm7, xmm7 ; eight 16-bit partial sums of differences
126
127 .next_row:
128 movdqu xmm1, XMMWORD PTR [rsi] ; 16 src bytes
129 movdqu xmm2, XMMWORD PTR [rdi] ; 16 ref bytes
130
131 prefetcht0 [rsi+rax*8] ; hint the addresses 8 strides past the current row
132 prefetcht0 [rdi+rdx*8]
133
134 movdqa xmm3, xmm1 ; widen src: xmm1 = bytes 0-7, xmm3 = bytes 8-15
135 punpcklbw xmm1, xmm0
136 punpckhbw xmm3, xmm0
137
138 movdqa xmm4, xmm2 ; widen ref: xmm2 = bytes 0-7, xmm4 = bytes 8-15
139 punpcklbw xmm2, xmm0
140 punpckhbw xmm4, xmm0
141
142 psubw xmm1, xmm2 ; xmm1, xmm3 = src - ref as signed words
143 psubw xmm3, xmm4
144
145 paddw xmm7, xmm1 ; accumulate the differences first ...
146 paddw xmm7, xmm3
147
148 pmaddwd xmm1, xmm1 ; ... then square them, added in adjacent pairs into dwords
149 pmaddwd xmm3, xmm3
150
151 paddd xmm6, xmm1 ; accumulate the squares
152 paddd xmm6, xmm3
153
154 add rsi, rax ; move both cursors to the next row
155 add rdi, rdx
156
157 sub rcx, 1
158 jnz .next_row
159
160 ; SSE: fold the four dword sums q0..q3 in xmm6 into dword 0 of xmm1
161 movdqa xmm1, xmm6 ; must be copied out before xmm6 is reused below
162 movdqa xmm2, xmm1
163
164 punpckldq xmm1, xmm0 ; [q0, 0, q1, 0]
165 punpckhdq xmm2, xmm0 ; [q2, 0, q3, 0]
166 paddd xmm1, xmm2 ; [q0+q2, 0, q1+q3, 0]
167
168 movdqa xmm2, xmm1
169 psrldq xmm1, 8
170 paddd xmm1, xmm2 ; dword 0 = q0+q1+q2+q3
171
172 ; Sum: widen the eight signed word sums in xmm7 to dwords
173 pxor xmm6, xmm6
174 pxor xmm5, xmm5
175
176 punpcklwd xmm6, xmm7 ; words 0-3 go to the top half of each dword
177 punpckhwd xmm5, xmm7 ; words 4-7 likewise
178
179 psrad xmm6, 16 ; arithmetic shift brings each value down with its sign
180 psrad xmm5, 16
181 paddd xmm6, xmm5 ; four dword sums of differences
182
183 ; Sum: fold those four dwords into dword 0 of xmm7
184 movdqa xmm7, xmm6
185
186 punpckldq xmm6, xmm0
187 punpckhdq xmm7, xmm0
188 paddd xmm6, xmm7
189
190 movdqa xmm7, xmm6
191 psrldq xmm6, 8
192 paddd xmm7, xmm6
193
194
195 ; Write the results: Sum is stored first, then SSE
196 mov rax, arg(5) ;[Sum]
197 mov rdi, arg(4) ;[SSE]
198
199 movd DWORD PTR [rax], xmm7
200 movd DWORD PTR [rdi], xmm1
201
202
203
204 ; begin epilog
205 pop rdi
206 pop rsi
207 pop rbx
208 RESTORE_XMM
209 UNSHADOW_ARGS
210 pop rbp
211 ret
212
213
214
215
216 ; unsigned int vp9_get8x8var_sse2
217 ; (
218 ; unsigned char *src_ptr, arg(0): first block, 8 bytes read per row
219 ; int source_stride, arg(1): byte step between src rows
220 ; unsigned char *ref_ptr, arg(2): second block, 8 bytes read per row
221 ; int recon_stride, arg(3): byte step between ref rows
222 ; unsigned int *SSE, arg(4): receives the sum of (src - ref)^2 over 8 rows
223 ; int *Sum arg(5): receives the sum of (src - ref) over 8 rows
224 ; ) No instruction written here sets a return value; rax last receives the Sum pointer.
225 global sym(vp9_get8x8var_sse2) PRIVATE
226 sym(vp9_get8x8var_sse2):
227 push rbp
228 mov rbp, rsp
229 SHADOW_ARGS_TO_STACK 6
230 SAVE_XMM 7
231 GET_GOT rbx
232 push rsi
233 push rdi
234 sub rsp, 16
235 ; end prolog
236 ; rsi/rax = src cursor and stride, rdi/rdx = ref cursor and stride
237 mov rsi, arg(0) ;[src_ptr]
238 movsxd rax, DWORD PTR arg(1) ;[source_stride]
239
240 mov rdi, arg(2) ;[ref_ptr]
241 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
242
243 pxor xmm0, xmm0 ; all-zero register used to widen bytes to words
244 pxor xmm7, xmm7 ; eight 16-bit partial sums of differences
245 ; Row 0: its squares seed xmm1, the sum-of-squares accumulator
246 movq xmm1, QWORD PTR [rsi]
247 punpcklbw xmm1, xmm0
248
249 movq xmm2, QWORD PTR [rdi]
250 punpcklbw xmm2, xmm0
251
252 psubsw xmm1, xmm2 ; src - ref as signed words
253 paddw xmm7, xmm1
254
255 pmaddwd xmm1, xmm1 ; squares, added in adjacent pairs into four dwords
256 ; Row 1
257 movq xmm2, QWORD PTR [rsi + rax]
258 punpcklbw xmm2, xmm0
259
260 movq xmm3, QWORD PTR [rdi + rdx]
261 punpcklbw xmm3, xmm0
262
263 psubsw xmm2, xmm3
264 paddw xmm7, xmm2
265
266 pmaddwd xmm2, xmm2
267 paddd xmm1, xmm2
268
269 ; Row 2
270 movq xmm2, QWORD PTR [rsi + rax * 2]
271 punpcklbw xmm2, xmm0
272
273 movq xmm3, QWORD PTR [rdi + rdx * 2]
274 punpcklbw xmm3, xmm0
275
276 psubsw xmm2, xmm3
277 paddw xmm7, xmm2
278
279 pmaddwd xmm2, xmm2
280 paddd xmm1, xmm2
281
282 ; Row 3, after stepping both cursors down two rows
283 lea rsi, [rsi + rax * 2]
284 lea rdi, [rdi + rdx * 2]
285 movq xmm2, QWORD PTR [rsi + rax]
286 punpcklbw xmm2, xmm0
287
288 movq xmm3, QWORD PTR [rdi + rdx]
289 punpcklbw xmm3, xmm0
290
291 psubsw xmm2, xmm3
292 paddw xmm7, xmm2
293
294 pmaddwd xmm2, xmm2
295 paddd xmm1, xmm2
296 ; Row 4
297 movq xmm2, QWORD PTR [rsi + rax * 2]
298 punpcklbw xmm2, xmm0
299
300 movq xmm3, QWORD PTR [rdi + rdx * 2]
301 punpcklbw xmm3, xmm0
302
303 psubsw xmm2, xmm3
304 paddw xmm7, xmm2
305
306 pmaddwd xmm2, xmm2
307 paddd xmm1, xmm2
308
309 ; Row 5, after stepping both cursors down two more rows
310 lea rsi, [rsi + rax * 2]
311 lea rdi, [rdi + rdx * 2]
312
313
314 movq xmm2, QWORD PTR [rsi + rax]
315 punpcklbw xmm2, xmm0
316
317 movq xmm3, QWORD PTR [rdi + rdx]
318 punpcklbw xmm3, xmm0
319
320 psubsw xmm2, xmm3
321 paddw xmm7, xmm2
322
323 pmaddwd xmm2, xmm2
324 paddd xmm1, xmm2
325 ; Row 6
326 movq xmm2, QWORD PTR [rsi + rax * 2]
327 punpcklbw xmm2, xmm0
328
329 movq xmm3, QWORD PTR [rdi + rdx * 2]
330 punpcklbw xmm3, xmm0
331
332 psubsw xmm2, xmm3
333 paddw xmm7, xmm2
334
335 pmaddwd xmm2, xmm2
336 paddd xmm1, xmm2
337
338 ; Row 7, after a final two-row step
339 lea rsi, [rsi + rax * 2]
340 lea rdi, [rdi + rdx * 2]
341
342 movq xmm2, QWORD PTR [rsi + rax]
343 punpcklbw xmm2, xmm0
344
345 movq xmm3, QWORD PTR [rdi + rdx]
346 punpcklbw xmm3, xmm0
347
348 psubsw xmm2, xmm3
349 paddw xmm7, xmm2
350
351 pmaddwd xmm2, xmm2
352 paddd xmm1, xmm2
353
354 ; Sum: fold the eight word sums w0..w7 in xmm7 into word 0 of xmm7
355 movdqa xmm6, xmm7
356 punpcklwd xmm6, xmm0 ; [w0, 0, w1, 0, w2, 0, w3, 0]
357 punpckhwd xmm7, xmm0 ; [w4, 0, w5, 0, w6, 0, w7, 0]
358 paddw xmm6, xmm7
359
360 movdqa xmm7, xmm6
361 punpckldq xmm6, xmm0
362 punpckhdq xmm7, xmm0
363 paddw xmm6, xmm7
364
365 movdqa xmm7, xmm6
366 psrldq xmm6, 8
367 paddw xmm7, xmm6 ; word 0 = 16-bit total of w0..w7
368
369 ; SSE: fold the four dword sums q0..q3 in xmm1 into dword 0 of xmm1
370 movdqa xmm2, xmm1
371 punpckldq xmm1, xmm0 ; [q0, 0, q1, 0]
372 punpckhdq xmm2, xmm0 ; [q2, 0, q3, 0]
373 paddd xmm1, xmm2
374
375 movdqa xmm2, xmm1
376 psrldq xmm1, 8
377 paddd xmm1, xmm2 ; dword 0 = q0+q1+q2+q3
378
379
380
381 ; Write the results: Sum is stored first, then SSE
382 mov rax, arg(5) ;[Sum]
383 mov rdi, arg(4) ;[SSE]
384
385 movq rdx, xmm7
386 movsx rcx, dx ; sign-extend the 16-bit total before storing it as an int
387
388 mov DWORD PTR [rax], ecx
389 movd DWORD PTR [rdi], xmm1
390
391 ; begin epilog
392 add rsp, 16
393 pop rdi
394 pop rsi
395 RESTORE_GOT
396 RESTORE_XMM
397 UNSHADOW_ARGS
398 pop rbp
399 ret
400
401
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm ('k') | source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698