Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(732)

Side by Side Diff: source/libvpx/vp8/common/x86/variance_impl_sse2.asm

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 ; 1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
11 11
12 %include "vpx_ports/x86_abi_support.asm" 12 %include "vpx_ports/x86_abi_support.asm"
13 13
14 %define xmm_filter_shift 7 14 %define xmm_filter_shift 7
15 15
16 ;unsigned int vp8_get_mb_ss_sse2 -- sum of squares of 256 signed 16-bit values
17 ;(
18 ; short *src_ptr ; read with movdqa, so it must be 16-byte aligned
19 ;) ; the total is left in the low 32 bits of rax
20 global sym(vp8_get_mb_ss_sse2) PRIVATE
21 sym(vp8_get_mb_ss_sse2):
22 push rbp
23 mov rbp, rsp
24 SHADOW_ARGS_TO_STACK 1 ; macro from x86_abi_support.asm; presumably 1 = argument count
25 GET_GOT rbx ; paired with RESTORE_GOT in the epilog
26 push rsi
27 push rdi
28 sub rsp, 16 ; reserves 16 bytes; never referenced in this function
29 ; end prolog
30
31
32 mov rax, arg(0) ;[src_ptr]
33 mov rcx, 8 ; 8 iterations x 64 bytes = 256 shorts
34 pxor xmm4, xmm4 ; xmm4 = four 32-bit partial sums, start at zero
35
36 .NEXTROW:
37 movdqa xmm0, [rax] ; load 4 x 8 shorts (64 bytes)
38 movdqa xmm1, [rax+16]
39 movdqa xmm2, [rax+32]
40 movdqa xmm3, [rax+48]
41 pmaddwd xmm0, xmm0 ; square every word, add adjacent pairs -> 4 dwords
42 pmaddwd xmm1, xmm1
43 pmaddwd xmm2, xmm2
44 pmaddwd xmm3, xmm3
45
46 paddd xmm0, xmm1 ; fold the four registers into the accumulator
47 paddd xmm2, xmm3
48 paddd xmm4, xmm0
49 paddd xmm4, xmm2
50
51 add rax, 0x40 ; advance 64 bytes
52 dec rcx
53 jnz .NEXTROW ; loop on ZF only: dec does not update CF, so ja would also depend on a stale carry
54
55 movdqa xmm3,xmm4 ; horizontal add: fold upper 64 bits onto lower
56 psrldq xmm4,8
57 paddd xmm4,xmm3
58 movdqa xmm3,xmm4 ; then fold dword 1 onto dword 0
59 psrldq xmm4,4
60 paddd xmm4,xmm3
61 movq rax,xmm4 ; low dword of xmm4 (the total) lands in the low 32 bits of rax
62
63
64 ; begin epilog
65 add rsp, 16
66 pop rdi
67 pop rsi
68 RESTORE_GOT
69 UNSHADOW_ARGS
70 pop rbp
71 ret
72
73
74 ;unsigned int vp8_get16x16var_sse2 -- 16x16 byte block: sum and sum of squares of (src - ref)
75 ;(
76 ; unsigned char * src_ptr,
77 ; int source_stride, ; bytes between consecutive src rows
78 ; unsigned char * ref_ptr,
79 ; int recon_stride, ; bytes between consecutive ref rows
80 ; unsigned int * SSE, ; out: sum of squared differences
81 ; int * Sum ; out: signed sum of differences
82 ;) ; NOTE(review): rax holds the Sum pointer at ret; no separate return value is set up -- confirm callers ignore it
83 global sym(vp8_get16x16var_sse2) PRIVATE
84 sym(vp8_get16x16var_sse2):
85 push rbp
86 mov rbp, rsp
87 SHADOW_ARGS_TO_STACK 6
88 SAVE_XMM 7 ; paired with RESTORE_XMM in the epilog; xmm0-xmm7 are used below
89 push rbx
90 push rsi
91 push rdi
92 ; end prolog
93
94 mov rsi, arg(0) ;[src_ptr]
95 mov rdi, arg(2) ;[ref_ptr]
96
97 movsxd rax, DWORD PTR arg(1) ;[source_stride]
98 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
99
100 ; Prefetch data
101 lea rcx, [rax+rax*2] ; rcx = 3 * source_stride
102 prefetcht0 [rsi] ; src rows 0..3
103 prefetcht0 [rsi+rax]
104 prefetcht0 [rsi+rax*2]
105 prefetcht0 [rsi+rcx]
106 lea rbx, [rsi+rax*4] ; rbx = src row 4
107 prefetcht0 [rbx] ; src rows 4..7
108 prefetcht0 [rbx+rax]
109 prefetcht0 [rbx+rax*2]
110 prefetcht0 [rbx+rcx]
111
112 lea rcx, [rdx+rdx*2] ; rcx = 3 * recon_stride
113 prefetcht0 [rdi] ; ref rows 0..3
114 prefetcht0 [rdi+rdx]
115 prefetcht0 [rdi+rdx*2]
116 prefetcht0 [rdi+rcx]
117 lea rbx, [rdi+rdx*4] ; rbx = ref row 4
118 prefetcht0 [rbx] ; ref rows 4..7
119 prefetcht0 [rbx+rdx]
120 prefetcht0 [rbx+rdx*2]
121 prefetcht0 [rbx+rcx]
122
123 pxor xmm0, xmm0 ; clear xmm0 for unpack
124 pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
125
126 pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
127 mov rcx, 16 ; row counter
128
129 .var16loop:
130 movdqu xmm1, XMMWORD PTR [rsi] ; 16 src pixels (unaligned load)
131 movdqu xmm2, XMMWORD PTR [rdi] ; 16 ref pixels (unaligned load)
132
133 prefetcht0 [rsi+rax*8] ; prefetch 8 rows ahead of the current row
134 prefetcht0 [rdi+rdx*8]
135
136 movdqa xmm3, xmm1 ; keep copies for the high 8 pixels
137 movdqa xmm4, xmm2
138
139
140 punpcklbw xmm1, xmm0 ; widen bytes to words against the zero register
141 punpckhbw xmm3, xmm0
142
143 punpcklbw xmm2, xmm0
144 punpckhbw xmm4, xmm0
145
146
147 psubw xmm1, xmm2 ; src - ref, low 8 pixels
148 psubw xmm3, xmm4 ; src - ref, high 8 pixels
149
150 paddw xmm7, xmm1 ; accumulate diffs per 16-bit lane
151 pmaddwd xmm1, xmm1 ; squared diffs, adjacent pairs summed to dwords
152
153 paddw xmm7, xmm3
154 pmaddwd xmm3, xmm3
155
156 paddd xmm6, xmm1 ; accumulate squared diffs per 32-bit lane
157 paddd xmm6, xmm3
158
159 add rsi, rax ; next src row
160 add rdi, rdx ; next ref row
161
162 sub rcx, 1
163 jnz .var16loop
164
165 ; reduction: xmm7 = eight 16-bit diff sums, xmm6 = four 32-bit sse sums
166 movdqa xmm1, xmm6 ; xmm1 = sse partial sums
167 pxor xmm6, xmm6
168
169 pxor xmm5, xmm5
170 punpcklwd xmm6, xmm7 ; place each diff word in the high half of a dword
171
172 punpckhwd xmm5, xmm7
173 psrad xmm5, 16 ; arithmetic shift -> sign-extended 32-bit diffs
174
175 psrad xmm6, 16
176 paddd xmm6, xmm5 ; xmm6 = four 32-bit diff sums
177
178 movdqa xmm2, xmm1
179 punpckldq xmm1, xmm0 ; spread dwords into qword slots (xmm0 is still zero)
180
181 punpckhdq xmm2, xmm0
182 movdqa xmm7, xmm6
183
184 paddd xmm1, xmm2
185 punpckldq xmm6, xmm0
186
187 punpckhdq xmm7, xmm0
188 paddd xmm6, xmm7
189
190 movdqa xmm2, xmm1
191 movdqa xmm7, xmm6
192
193 psrldq xmm1, 8 ; bring the upper qword down for the final add
194 psrldq xmm6, 8
195
196 paddd xmm7, xmm6 ; low dword of xmm7 = total diff sum
197 paddd xmm1, xmm2 ; low dword of xmm1 = total sse
198
199 mov rax, arg(5) ;[Sum]
200 mov rdi, arg(4) ;[SSE]
201
202 movd DWORD PTR [rax], xmm7 ; *Sum
203 movd DWORD PTR [rdi], xmm1 ; *SSE
204
205
206 ; begin epilog
207 pop rdi
208 pop rsi
209 pop rbx
210 RESTORE_XMM
211 UNSHADOW_ARGS
212 pop rbp
213 ret
214
215
216
217
218 ;unsigned int vp8_get8x8var_sse2 -- 8x8 byte block: sum and sum of squares of (src - ref)
219 ;(
220 ; unsigned char * src_ptr,
221 ; int source_stride, ; bytes between consecutive src rows
222 ; unsigned char * ref_ptr,
223 ; int recon_stride, ; bytes between consecutive ref rows
224 ; unsigned int * SSE, ; out: sum of squared differences
225 ; int * Sum ; out: signed sum of differences
226 ;) ; NOTE(review): rax holds the Sum pointer at ret; no separate return value is set up -- confirm callers ignore it
227 global sym(vp8_get8x8var_sse2) PRIVATE
228 sym(vp8_get8x8var_sse2):
229 push rbp
230 mov rbp, rsp
231 SHADOW_ARGS_TO_STACK 6
232 SAVE_XMM 7 ; paired with RESTORE_XMM in the epilog
233 GET_GOT rbx ; paired with RESTORE_GOT in the epilog
234 push rsi
235 push rdi
236 sub rsp, 16 ; reserves 16 bytes; never referenced in this function
237 ; end prolog
238
239 mov rsi, arg(0) ;[src_ptr]
240 mov rdi, arg(2) ;[ref_ptr]
241
242 movsxd rax, DWORD PTR arg(1) ;[source_stride]
243 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
244
245 pxor xmm0, xmm0 ; clear xmm0 for unpack
246 pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
247 ; row 0 (xmm1 doubles as the sse accumulator from here on)
248 movq xmm1, QWORD PTR [rsi] ; 8 src pixels
249 movq xmm2, QWORD PTR [rdi] ; 8 ref pixels
250
251 punpcklbw xmm1, xmm0 ; widen bytes to words
252 punpcklbw xmm2, xmm0
253
254 psubsw xmm1, xmm2 ; src - ref (signed saturating subtract)
255 paddw xmm7, xmm1 ; accumulate diffs per 16-bit lane
256
257 pmaddwd xmm1, xmm1 ; squared diffs, adjacent pairs summed to dwords
258 ; row 1
259 movq xmm2, QWORD PTR[rsi + rax]
260 movq xmm3, QWORD PTR[rdi + rdx]
261
262 punpcklbw xmm2, xmm0
263 punpcklbw xmm3, xmm0
264
265 psubsw xmm2, xmm3
266 paddw xmm7, xmm2
267
268 pmaddwd xmm2, xmm2
269 paddd xmm1, xmm2 ; accumulate squared diffs
270
271 ; row 2
272 movq xmm2, QWORD PTR[rsi + rax * 2]
273 movq xmm3, QWORD PTR[rdi + rdx * 2]
274
275 punpcklbw xmm2, xmm0
276 punpcklbw xmm3, xmm0
277
278 psubsw xmm2, xmm3
279 paddw xmm7, xmm2
280
281 pmaddwd xmm2, xmm2
282 paddd xmm1, xmm2
283
284 ; rows 3 and 4, after advancing both pointers by two rows
285 lea rsi, [rsi + rax * 2]
286 lea rdi, [rdi + rdx * 2]
287 movq xmm2, QWORD PTR[rsi + rax]
288 movq xmm3, QWORD PTR[rdi + rdx]
289
290 punpcklbw xmm2, xmm0
291 punpcklbw xmm3, xmm0
292
293 psubsw xmm2, xmm3
294 paddw xmm7, xmm2
295
296 pmaddwd xmm2, xmm2
297 paddd xmm1, xmm2
298
299 movq xmm2, QWORD PTR[rsi + rax *2]
300 movq xmm3, QWORD PTR[rdi + rdx *2]
301
302 punpcklbw xmm2, xmm0
303 punpcklbw xmm3, xmm0
304
305 psubsw xmm2, xmm3
306 paddw xmm7, xmm2
307
308 pmaddwd xmm2, xmm2
309 paddd xmm1, xmm2
310
311 ; rows 5 and 6, after advancing both pointers by two rows
312 lea rsi, [rsi + rax * 2]
313 lea rdi, [rdi + rdx * 2]
314
315
316 movq xmm2, QWORD PTR[rsi + rax]
317 movq xmm3, QWORD PTR[rdi + rdx]
318
319 punpcklbw xmm2, xmm0
320 punpcklbw xmm3, xmm0
321
322 psubsw xmm2, xmm3
323 paddw xmm7, xmm2
324
325 pmaddwd xmm2, xmm2
326 paddd xmm1, xmm2
327
328 movq xmm2, QWORD PTR[rsi + rax *2]
329 movq xmm3, QWORD PTR[rdi + rdx *2]
330
331 punpcklbw xmm2, xmm0
332 punpcklbw xmm3, xmm0
333
334 psubsw xmm2, xmm3
335 paddw xmm7, xmm2
336
337 pmaddwd xmm2, xmm2
338 paddd xmm1, xmm2
339
340 ; row 7, after advancing both pointers by two rows
341 lea rsi, [rsi + rax * 2]
342 lea rdi, [rdi + rdx * 2]
343
344 movq xmm2, QWORD PTR[rsi + rax]
345 movq xmm3, QWORD PTR[rdi + rdx]
346
347 punpcklbw xmm2, xmm0
348 punpcklbw xmm3, xmm0
349
350 psubsw xmm2, xmm3
351 paddw xmm7, xmm2
352
353 pmaddwd xmm2, xmm2
354 paddd xmm1, xmm2
355
356 ; reduction: xmm7 = eight 16-bit diff sums, xmm1 = four 32-bit sse sums
357 movdqa xmm6, xmm7
358 punpcklwd xmm6, xmm0 ; interleave diff words with zero (no sign extension here)
359
360 punpckhwd xmm7, xmm0
361 movdqa xmm2, xmm1
362
363 paddw xmm6, xmm7 ; 16-bit adds: the running sum stays in the low word of each dword
364 punpckldq xmm1, xmm0 ; spread sse dwords into qword slots
365
366 punpckhdq xmm2, xmm0
367 movdqa xmm7, xmm6
368
369 paddd xmm1, xmm2
370 punpckldq xmm6, xmm0
371
372 punpckhdq xmm7, xmm0
373 paddw xmm6, xmm7
374
375 movdqa xmm2, xmm1
376 movdqa xmm7, xmm6
377
378 psrldq xmm1, 8 ; bring the upper qword down for the final add
379 psrldq xmm6, 8
380
381 paddw xmm7, xmm6 ; low word of xmm7 = total diff sum modulo 2^16
382 paddd xmm1, xmm2 ; low dword of xmm1 = total sse
383
384 mov rax, arg(5) ;[Sum]
385 mov rdi, arg(4) ;[SSE]
386
387 movq rdx, xmm7
388 movsx rcx, dx ; sign-extend the 16-bit total to recover the signed sum
389
390 mov dword ptr [rax], ecx ; *Sum
391 movd DWORD PTR [rdi], xmm1 ; *SSE
392
393 ; begin epilog
394 add rsp, 16
395 pop rdi
396 pop rsi
397 RESTORE_GOT
398 RESTORE_XMM
399 UNSHADOW_ARGS
400 pop rbp
401 ret
402
403 ;void vp8_filter_block2d_bil_var_sse2 16 ;void vp8_filter_block2d_bil_var_sse2
404 ;( 17 ;(
405 ; unsigned char *ref_ptr, 18 ; unsigned char *ref_ptr,
406 ; int ref_pixels_per_line, 19 ; int ref_pixels_per_line,
407 ; unsigned char *src_ptr, 20 ; unsigned char *src_ptr,
408 ; int src_pixels_per_line, 21 ; int src_pixels_per_line,
409 ; unsigned int Height, 22 ; unsigned int Height,
410 ; int xoffset, 23 ; int xoffset,
411 ; int yoffset, 24 ; int yoffset,
412 ; int *sum, 25 ; int *sum,
(...skipping 937 matching lines...) Expand 10 before | Expand all | Expand 10 after
1350 align 16 963 align 16 ; 16-byte alignment for the table below
1351 vp8_bilinear_filters_sse2: 964 vp8_bilinear_filters_sse2: ; 8 rows of 16 words: first tap x8, then second tap x8; presumably indexed by xoffset/yoffset -- confirm against the filter code
1352 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 965 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 ; row 0: taps 128, 0
1353 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 966 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 ; row 1: taps 112, 16
1354 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 967 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 ; row 2: taps 96, 32
1355 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 968 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 ; row 3: taps 80, 48
1356 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 969 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 ; row 4: taps 64, 64
1357 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 970 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 ; row 5: taps 48, 80
1358 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 971 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 ; row 6: taps 32, 96
1359 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 972 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 ; row 7: taps 16, 112; every pair sums to 128 = 1 << xmm_filter_shift
OLDNEW
« no previous file with comments | « source/libvpx/vp8/common/x86/variance_impl_mmx.asm ('k') | source/libvpx/vp8/common/x86/variance_mmx.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698