Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(10)

Side by Side Diff: source/libvpx/vpx_dsp/x86/halfpix_variance_impl_sse2.asm

Issue 1322703002: Cherry pick vp8 halfpix variance fix (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx@m46-2490
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 %include "vpx_ports/x86_abi_support.asm"
12
13 ;void vpx_half_horiz_vert_variance16x_h_sse2(unsigned char *ref, -- 17 bytes are read from each of height+1 rows
14 ; int ref_stride, -- added to the ref pointer after every row
15 ; unsigned char *src, -- 16 bytes are read from each of height rows
16 ; int src_stride, -- added to the src pointer after every row
17 ; unsigned int height, -- row count; tested only after the loop body, so 0 is not handled
18 ; int *sum, -- out: 32-bit total of the (averaged ref - src) differences
19 ; unsigned int *sumsquared) -- out: 32-bit total of the squared differences
20 global sym(vpx_half_horiz_vert_variance16x_h_sse2) PRIVATE
21 sym(vpx_half_horiz_vert_variance16x_h_sse2): ; averages ref with pavgb horizontally (byte i, i+1) and vertically (row i, i+1), then accumulates sum and sum of squares against src
22 push rbp
23 mov rbp, rsp
24 SHADOW_ARGS_TO_STACK 7 ; macro defined outside this listing (presumably x86_abi_support.asm); 7 matches the argument count
25 SAVE_XMM 7 ; NOTE(review): presumably preserves xmm6-xmm7 where the ABI requires it -- confirm in x86_abi_support.asm
26 GET_GOT rbx ; NOTE(review): no instruction in this routine names rbx outside the macros -- confirm whether the GOT setup is needed
27 push rsi
28 push rdi
29 ; end prolog
30
31 pxor xmm6, xmm6 ; sum accumulator: eight 16-bit lanes (paddw). NOTE(review): confirm callers' height keeps the lanes in range
32 pxor xmm7, xmm7 ; sse accumulator: four 32-bit lanes (paddd)
33 mov rsi, arg(0) ;ref
34
35 mov rdi, arg(2) ;src
36 movsxd rcx, dword ptr arg(4) ;height (declared unsigned, loaded with sign extension)
37 movsxd rax, dword ptr arg(1) ;ref_stride
38 movsxd rdx, dword ptr arg(3) ;src_stride
39
40 pxor xmm0, xmm0 ; xmm0 = 0: zero source for the unpacks; it is never written again before the reduction below
41
42 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = ref row 0, bytes 0..15
43 movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = ref row 0, bytes 1..16
44 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
45
46 lea rsi, [rsi + rax] ; advance ref to the next row
47
48 vpx_half_horiz_vert_variance16x_h_1: ; per-row loop; on entry xmm5 = horizontal average of the previous ref row
49 movdqu xmm1, XMMWORD PTR [rsi] ; xmm1 = current ref row, bytes 0..15
50 movdqu xmm2, XMMWORD PTR [rsi+1] ; xmm2 = current ref row, bytes 1..16
51 pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
52
53 pavgb xmm5, xmm1 ; xmm5 = vertical average of the two horizontal averages
54
55 movdqa xmm4, xmm5 ; copy so the high 8 bytes can be widened separately
56 punpcklbw xmm5, xmm0 ; xmm5 = low 8 bytes zero-extended to words
57 punpckhbw xmm4, xmm0 ; xmm4 = high 8 bytes zero-extended to words
58
59 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
60 punpcklbw xmm3, xmm0 ; xmm3 = words of above
61 psubw xmm5, xmm3 ; xmm5 -= xmm3
62
63 movq xmm3, QWORD PTR [rdi+8] ; xmm3 = d8,d9,d10..d15
64 punpcklbw xmm3, xmm0 ; xmm3 = words of above
65 psubw xmm4, xmm3 ; xmm4 -= xmm3
66
67 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
68 paddw xmm6, xmm4
69 pmaddwd xmm5, xmm5 ; xmm5 = squared differences, adjacent pairs summed into dwords
70 pmaddwd xmm4, xmm4 ; same for the high 8 columns
71 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
72 paddd xmm7, xmm4
73
74 movdqa xmm5, xmm1 ; save xmm1 for use on the next row
75
76 lea rsi, [rsi + rax] ; next ref row
77 lea rdi, [rdi + rdx] ; next src row
78
79 sub rcx, 1 ; one row done
80 jnz vpx_half_horiz_vert_variance16x_h_1 ; loop until the row counter reaches zero
81
82 pxor xmm1, xmm1 ; zero: low-word filler for the punpckhwd below
83 pxor xmm5, xmm5 ; zero: filler for the dword unpacks below
84
85 punpcklwd xmm0, xmm6 ; xmm0 is still 0: low four sums land in the high word of each dword
86 punpckhwd xmm1, xmm6 ; high four sums, same layout
87 psrad xmm0, 16 ; arithmetic shift sign-extends each 16-bit sum to 32 bits
88 psrad xmm1, 16
89 paddd xmm0, xmm1 ; four 32-bit partial sums
90 movdqa xmm1, xmm0
91
92 movdqa xmm6, xmm7 ; xmm6 was consumed above; reuse it to reduce the sse lanes
93 punpckldq xmm6, xmm5 ; sse lanes 0,1 spread to dwords 0 and 2
94 punpckhdq xmm7, xmm5 ; sse lanes 2,3 spread to dwords 0 and 2
95 paddd xmm6, xmm7 ; dword 0 = lanes 0+2, dword 2 = lanes 1+3
96
97 punpckldq xmm0, xmm5 ; same spread for the partial sums
98 punpckhdq xmm1, xmm5
99 paddd xmm0, xmm1 ; dword 0 = lanes 0+2, dword 2 = lanes 1+3
100
101 movdqa xmm7, xmm6
102 movdqa xmm1, xmm0
103
104 psrldq xmm7, 8 ; bring dword 2 down to dword 0
105 psrldq xmm1, 8
106
107 paddd xmm6, xmm7 ; dword 0 of xmm6 = total sse
108 paddd xmm0, xmm1 ; dword 0 of xmm0 = total sum
109
110 mov rsi, arg(5) ;[Sum]
111 mov rdi, arg(6) ;[SSE]
112
113 movd [rsi], xmm0 ; *sum = low dword of xmm0
114 movd [rdi], xmm6 ; *sumsquared = low dword of xmm6
115
116 ; begin epilog
117 pop rdi
118 pop rsi
119 RESTORE_GOT
120 RESTORE_XMM
121 UNSHADOW_ARGS
122 pop rbp
123 ret
124
125
126 ;void vpx_half_vert_variance16x_h_sse2(unsigned char *ref, -- 16 bytes are read from each of height+1 rows
127 ; int ref_stride, -- added to the ref pointer after every row
128 ; unsigned char *src, -- 16 bytes are read from each of height rows
129 ; int src_stride, -- added to the src pointer after every row
130 ; unsigned int height, -- row count; tested only after the loop body, so 0 is not handled
131 ; int *sum, -- out: 32-bit total of the (averaged ref - src) differences
132 ; unsigned int *sumsquared) -- out: 32-bit total of the squared differences
133 global sym(vpx_half_vert_variance16x_h_sse2) PRIVATE
134 sym(vpx_half_vert_variance16x_h_sse2): ; averages each ref row with the row below it (pavgb), then accumulates sum and sum of squares against src
135 push rbp
136 mov rbp, rsp
137 SHADOW_ARGS_TO_STACK 7 ; macro defined outside this listing (presumably x86_abi_support.asm); 7 matches the argument count
138 SAVE_XMM 7 ; NOTE(review): presumably preserves xmm6-xmm7 where the ABI requires it -- confirm in x86_abi_support.asm
139 GET_GOT rbx ; NOTE(review): no instruction in this routine names rbx outside the macros -- confirm whether the GOT setup is needed
140 push rsi
141 push rdi
142 ; end prolog
143
144 pxor xmm6, xmm6 ; sum accumulator: eight 16-bit lanes (paddw). NOTE(review): confirm callers' height keeps the lanes in range
145 pxor xmm7, xmm7 ; sse accumulator: four 32-bit lanes (paddd)
146 mov rsi, arg(0) ;ref
147
148 mov rdi, arg(2) ;src
149 movsxd rcx, dword ptr arg(4) ;height (declared unsigned, loaded with sign extension)
150 movsxd rax, dword ptr arg(1) ;ref_stride
151 movsxd rdx, dword ptr arg(3) ;src_stride
152
153 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = ref row 0
154 lea rsi, [rsi + rax ] ; advance ref to the next row
155 pxor xmm0, xmm0 ; xmm0 = 0: zero source for the unpacks; it is never written again before the reduction below
156
157 vpx_half_vert_variance16x_h_1: ; per-row loop; on entry xmm5 = previous ref row
158 movdqu xmm3, XMMWORD PTR [rsi] ; xmm3 = current ref row
159
160 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
161 movdqa xmm4, xmm5 ; copy so the high 8 bytes can be widened separately
162 punpcklbw xmm5, xmm0 ; xmm5 = low 8 bytes zero-extended to words
163 punpckhbw xmm4, xmm0 ; xmm4 = high 8 bytes zero-extended to words
164
165 movq xmm2, QWORD PTR [rdi] ; xmm2 = d0,d1,d2..d7
166 punpcklbw xmm2, xmm0 ; xmm2 = words of above
167 psubw xmm5, xmm2 ; xmm5 -= xmm2
168 movq xmm2, QWORD PTR [rdi+8] ; xmm2 = d8,d9,d10..d15
169 punpcklbw xmm2, xmm0 ; xmm2 = words of above
170 psubw xmm4, xmm2 ; xmm4 -= xmm2
171
172 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
173 paddw xmm6, xmm4
174 pmaddwd xmm5, xmm5 ; xmm5 = squared differences, adjacent pairs summed into dwords
175 pmaddwd xmm4, xmm4 ; same for the high 8 columns
176 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
177 paddd xmm7, xmm4
178
179 movdqa xmm5, xmm3 ; keep the unaveraged current row for the next iteration
180
181 lea rsi, [rsi + rax] ; next ref row
182 lea rdi, [rdi + rdx] ; next src row
183
184 sub rcx, 1 ; one row done
185 jnz vpx_half_vert_variance16x_h_1 ; loop until the row counter reaches zero
186
187 pxor xmm1, xmm1 ; zero: low-word filler for the punpckhwd below
188 pxor xmm5, xmm5 ; zero: filler for the dword unpacks below
189
190 punpcklwd xmm0, xmm6 ; xmm0 is still 0: low four sums land in the high word of each dword
191 punpckhwd xmm1, xmm6 ; high four sums, same layout
192 psrad xmm0, 16 ; arithmetic shift sign-extends each 16-bit sum to 32 bits
193 psrad xmm1, 16
194 paddd xmm0, xmm1 ; four 32-bit partial sums
195 movdqa xmm1, xmm0
196
197 movdqa xmm6, xmm7 ; xmm6 was consumed above; reuse it to reduce the sse lanes
198 punpckldq xmm6, xmm5 ; sse lanes 0,1 spread to dwords 0 and 2
199 punpckhdq xmm7, xmm5 ; sse lanes 2,3 spread to dwords 0 and 2
200 paddd xmm6, xmm7 ; dword 0 = lanes 0+2, dword 2 = lanes 1+3
201
202 punpckldq xmm0, xmm5 ; same spread for the partial sums
203 punpckhdq xmm1, xmm5
204 paddd xmm0, xmm1 ; dword 0 = lanes 0+2, dword 2 = lanes 1+3
205
206 movdqa xmm7, xmm6
207 movdqa xmm1, xmm0
208
209 psrldq xmm7, 8 ; bring dword 2 down to dword 0
210 psrldq xmm1, 8
211
212 paddd xmm6, xmm7 ; dword 0 of xmm6 = total sse
213 paddd xmm0, xmm1 ; dword 0 of xmm0 = total sum
214
215 mov rsi, arg(5) ;[Sum]
216 mov rdi, arg(6) ;[SSE]
217
218 movd [rsi], xmm0 ; *sum = low dword of xmm0
219 movd [rdi], xmm6 ; *sumsquared = low dword of xmm6
220
221 ; begin epilog
222 pop rdi
223 pop rsi
224 RESTORE_GOT
225 RESTORE_XMM
226 UNSHADOW_ARGS
227 pop rbp
228 ret
229
230
231 ;void vpx_half_horiz_variance16x_h_sse2(unsigned char *ref, -- 17 bytes are read from each of height rows
232 ; int ref_stride, -- added to the ref pointer after every row
233 ; unsigned char *src, -- 16 bytes are read from each of height rows
234 ; int src_stride, -- added to the src pointer after every row
235 ; unsigned int height, -- row count; tested only after the loop body, so 0 is not handled
236 ; int *sum, -- out: 32-bit total of the (averaged ref - src) differences
237 ; unsigned int *sumsquared) -- out: 32-bit total of the squared differences
238 global sym(vpx_half_horiz_variance16x_h_sse2) PRIVATE
239 sym(vpx_half_horiz_variance16x_h_sse2): ; averages each ref byte with its right-hand neighbour (pavgb), then accumulates sum and sum of squares against src
240 push rbp
241 mov rbp, rsp
242 SHADOW_ARGS_TO_STACK 7 ; macro defined outside this listing (presumably x86_abi_support.asm); 7 matches the argument count
243 SAVE_XMM 7 ; NOTE(review): presumably preserves xmm6-xmm7 where the ABI requires it -- confirm in x86_abi_support.asm
244 GET_GOT rbx ; NOTE(review): no instruction in this routine names rbx outside the macros -- confirm whether the GOT setup is needed
245 push rsi
246 push rdi
247 ; end prolog
248
249 pxor xmm6, xmm6 ; sum accumulator: eight 16-bit lanes (paddw). NOTE(review): confirm callers' height keeps the lanes in range
250 pxor xmm7, xmm7 ; sse accumulator: four 32-bit lanes (paddd)
251 mov rsi, arg(0) ;ref
252
253 mov rdi, arg(2) ;src
254 movsxd rcx, dword ptr arg(4) ;height (declared unsigned, loaded with sign extension)
255 movsxd rax, dword ptr arg(1) ;ref_stride
256 movsxd rdx, dword ptr arg(3) ;src_stride
257
258 pxor xmm0, xmm0 ; xmm0 = 0: zero source for the unpacks; it is never written again before the reduction below
259
260 vpx_half_horiz_variance16x_h_1: ; per-row loop; each iteration is independent of the previous row
261 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
262 movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
263
264 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
265 movdqa xmm1, xmm5 ; copy so the high 8 bytes can be widened separately
266 punpcklbw xmm5, xmm0 ; xmm5 = low 8 bytes zero-extended to words
267 punpckhbw xmm1, xmm0 ; xmm1 = high 8 bytes zero-extended to words
268
269 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
270 punpcklbw xmm3, xmm0 ; xmm3 = words of above
271 movq xmm2, QWORD PTR [rdi+8] ; xmm2 = d8,d9,d10..d15
272 punpcklbw xmm2, xmm0 ; xmm2 = words of above
273
274 psubw xmm5, xmm3 ; xmm5 -= xmm3
275 psubw xmm1, xmm2 ; xmm1 -= xmm2
276 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
277 paddw xmm6, xmm1
278 pmaddwd xmm5, xmm5 ; xmm5 = squared differences, adjacent pairs summed into dwords
279 pmaddwd xmm1, xmm1 ; same for the high 8 columns
280 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
281 paddd xmm7, xmm1
282
283 lea rsi, [rsi + rax] ; next ref row
284 lea rdi, [rdi + rdx] ; next src row
285
286 sub rcx, 1 ; one row done
287 jnz vpx_half_horiz_variance16x_h_1 ; loop until the row counter reaches zero
288
289 pxor xmm1, xmm1 ; zero: low-word filler for the punpckhwd below
290 pxor xmm5, xmm5 ; zero: filler for the dword unpacks below
291
292 punpcklwd xmm0, xmm6 ; xmm0 is still 0: low four sums land in the high word of each dword
293 punpckhwd xmm1, xmm6 ; high four sums, same layout
294 psrad xmm0, 16 ; arithmetic shift sign-extends each 16-bit sum to 32 bits
295 psrad xmm1, 16
296 paddd xmm0, xmm1 ; four 32-bit partial sums
297 movdqa xmm1, xmm0
298
299 movdqa xmm6, xmm7 ; xmm6 was consumed above; reuse it to reduce the sse lanes
300 punpckldq xmm6, xmm5 ; sse lanes 0,1 spread to dwords 0 and 2
301 punpckhdq xmm7, xmm5 ; sse lanes 2,3 spread to dwords 0 and 2
302 paddd xmm6, xmm7 ; dword 0 = lanes 0+2, dword 2 = lanes 1+3
303
304 punpckldq xmm0, xmm5 ; same spread for the partial sums
305 punpckhdq xmm1, xmm5
306 paddd xmm0, xmm1 ; dword 0 = lanes 0+2, dword 2 = lanes 1+3
307
308 movdqa xmm7, xmm6
309 movdqa xmm1, xmm0
310
311 psrldq xmm7, 8 ; bring dword 2 down to dword 0
312 psrldq xmm1, 8
313
314 paddd xmm6, xmm7 ; dword 0 of xmm6 = total sse
315 paddd xmm0, xmm1 ; dword 0 of xmm0 = total sum
316
317 mov rsi, arg(5) ;[Sum]
318 mov rdi, arg(6) ;[SSE]
319
320 movd [rsi], xmm0 ; *sum = low dword of xmm0
321 movd [rdi], xmm6 ; *sumsquared = low dword of xmm6
322
323 ; begin epilog
324 pop rdi
325 pop rsi
326 RESTORE_GOT
327 RESTORE_XMM
328 UNSHADOW_ARGS
329 pop rbp
330 ret
331
332 SECTION_RODATA ; macro defined outside this listing; presumably switches to the read-only data section -- confirm in x86_abi_support.asm
333 ; short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64}; -- not referenced by the code shown in this listing
334 align 16
335 xmm_bi_rd:
336 times 8 dw 64 ; eight 16-bit words, each 64
337 align 16
338 vpx_bilinear_filters_sse2: ; 8 rows of 16 words: eight copies of one weight, then eight copies of a second; each pair sums to 128. Not referenced by the code shown in this listing
339 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
340 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
341 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
342 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
343 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
344 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
345 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
346 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
OLDNEW
« no previous file with comments | « source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl ('k') | source/libvpx/vpx_dsp/x86/halfpix_variance_sse2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698