Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(264)

Side by Side Diff: source/libvpx/vp8/encoder/x86/quantize_sse2.asm

Issue 7671004: Update libvpx snapshot to v0.9.7-p1 (Cayuga). (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: '' Created 9 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 ; 1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license and patent 4 ; Use of this source code is governed by a BSD-style license and patent
5 ; grant that can be found in the LICENSE file in the root of the source 5 ; grant that can be found in the LICENSE file in the root of the source
6 ; tree. All contributing project authors may be found in the AUTHORS 6 ; tree. All contributing project authors may be found in the AUTHORS
7 ; file in the root of the source tree. 7 ; file in the root of the source tree.
8 ; 8 ;
9 9
10 10
11 %include "vpx_ports/x86_abi_support.asm" 11 %include "vpx_ports/x86_abi_support.asm"
12 %include "asm_enc_offsets.asm" 12 %include "asm_enc_offsets.asm"
13 13
14 14
15 ; void vp8_regular_quantize_b_sse2 | arg 15 ; void vp8_regular_quantize_b_sse2 | arg
16 ; (BLOCK *b, | 0 16 ; (BLOCK *b, | 0
17 ; BLOCKD *d) | 1 17 ; BLOCKD *d) | 1
18 18
19 global sym(vp8_regular_quantize_b_sse2) 19 global sym(vp8_regular_quantize_b_sse2)
20 sym(vp8_regular_quantize_b_sse2): 20 sym(vp8_regular_quantize_b_sse2):
21 push rbp 21 push rbp
22 mov rbp, rsp 22 mov rbp, rsp
23 SAVE_XMM 23 SAVE_XMM 7
24 GET_GOT rbx 24 GET_GOT rbx
25 push rsi
26 25
27 %if ABI_IS_32BIT 26 %if ABI_IS_32BIT
28 push rdi 27 push rdi
28 push rsi
29 %else 29 %else
30 %ifidn __OUTPUT_FORMAT__,x64 30 %ifidn __OUTPUT_FORMAT__,x64
31 push rdi 31 push rdi
32 push rsi
32 %endif 33 %endif
33 %endif 34 %endif
34 35
35 ALIGN_STACK 16, rax 36 ALIGN_STACK 16, rax
36 %define BLOCKD_d 0 ; 8 37 %define zrun_zbin_boost 0 ; 8
37 %define zrun_zbin_boost 8 ; 8 38 %define abs_minus_zbin 8 ; 32
38 %define abs_minus_zbin 16 ; 32 39 %define temp_qcoeff 40 ; 32
39 %define temp_qcoeff 48 ; 32 40 %define qcoeff 72 ; 32
40 %define qcoeff 80 ; 32 41 %define stack_size 104
41 %define stack_size 112
42 sub rsp, stack_size 42 sub rsp, stack_size
43 ; end prolog 43 ; end prolog
44 44
45 %if ABI_IS_32BIT 45 %if ABI_IS_32BIT
46 mov rdi, arg(0) 46 mov rdi, arg(0) ; BLOCK *b
47 mov rsi, arg(1) ; BLOCKD *d
47 %else 48 %else
48 %ifidn __OUTPUT_FORMAT__,x64 49 %ifidn __OUTPUT_FORMAT__,x64
49 mov rdi, rcx ; BLOCK *b 50 mov rdi, rcx ; BLOCK *b
50 mov [rsp + BLOCKD_d], rdx 51 mov rsi, rdx ; BLOCKD *d
51 %else 52 %else
52 ;mov rdi, rdi ; BLOCK *b 53 ;mov rdi, rdi ; BLOCK *b
53 mov [rsp + BLOCKD_d], rsi 54 ;mov rsi, rsi ; BLOCKD *d
54 %endif 55 %endif
55 %endif 56 %endif
56 57
57 mov rdx, [rdi + vp8_block_coeff] ; coeff_ptr 58 mov rdx, [rdi + vp8_block_coeff] ; coeff_ptr
58 mov rcx, [rdi + vp8_block_zbin] ; zbin_ptr 59 mov rcx, [rdi + vp8_block_zbin] ; zbin_ptr
59 movd xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value 60 movd xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
60 61
61 ; z 62 ; z
62 movdqa xmm0, [rdx] 63 movdqa xmm0, [rdx]
63 movdqa xmm4, [rdx + 16] 64 movdqa xmm4, [rdx + 16]
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
118 paddw xmm5, xmm7 119 paddw xmm5, xmm7
119 120
120 movdqa [rsp + temp_qcoeff], xmm1 121 movdqa [rsp + temp_qcoeff], xmm1
121 movdqa [rsp + temp_qcoeff + 16], xmm5 122 movdqa [rsp + temp_qcoeff + 16], xmm5
122 123
123 pxor xmm6, xmm6 124 pxor xmm6, xmm6
124 ; zero qcoeff 125 ; zero qcoeff
125 movdqa [rsp + qcoeff], xmm6 126 movdqa [rsp + qcoeff], xmm6
126 movdqa [rsp + qcoeff + 16], xmm6 127 movdqa [rsp + qcoeff + 16], xmm6
127 128
128 mov rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr 129 mov rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
129 mov rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr 130 mov rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
130 mov [rsp + zrun_zbin_boost], rsi 131 mov [rsp + zrun_zbin_boost], rdx
131 132
132 %macro ZIGZAG_LOOP 1 133 %macro ZIGZAG_LOOP 1
133 movsx edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc
134
135 ; x 134 ; x
136 movsx ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2] 135 movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
137 136
138 ; if (x >= zbin) 137 ; if (x >= zbin)
139 sub cx, WORD PTR[rsi] ; x - zbin 138 sub cx, WORD PTR[rdx] ; x - zbin
140 lea rsi, [rsi + 2] ; zbin_boost_ptr++ 139 lea rdx, [rdx + 2] ; zbin_boost_ptr++
141 jl rq_zigzag_loop_%1 ; x < zbin 140 jl rq_zigzag_loop_%1 ; x < zbin
142 141
143 movsx edi, WORD PTR[rsp + temp_qcoeff + rdx *2] 142 movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
144 143
145 ; downshift by quant_shift[rdx] 144 ; downshift by quant_shift[rc]
146 movsx ecx, WORD PTR[rax + rdx*2] ; quant_shift_ptr[rc] 145 movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]
147 sar edi, cl ; also sets Z bit 146 sar edi, cl ; also sets Z bit
148 je rq_zigzag_loop_%1 ; !y 147 je rq_zigzag_loop_%1 ; !y
149         mov WORD PTR[rsp + qcoeff + rdx*2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] 148         mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
150 mov rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost 149 mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
151 rq_zigzag_loop_%1: 150 rq_zigzag_loop_%1:
152 %endmacro 151 %endmacro
153 ZIGZAG_LOOP 0 152 ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
154 ZIGZAG_LOOP 1 153 ZIGZAG_LOOP 0
155 ZIGZAG_LOOP 2 154 ZIGZAG_LOOP 1
156 ZIGZAG_LOOP 3 155 ZIGZAG_LOOP 4
157 ZIGZAG_LOOP 4 156 ZIGZAG_LOOP 8
158 ZIGZAG_LOOP 5 157 ZIGZAG_LOOP 5
159 ZIGZAG_LOOP 6 158 ZIGZAG_LOOP 2
160 ZIGZAG_LOOP 7 159 ZIGZAG_LOOP 3
161 ZIGZAG_LOOP 8 160 ZIGZAG_LOOP 6
162 ZIGZAG_LOOP 9 161 ZIGZAG_LOOP 9
163 ZIGZAG_LOOP 10
164 ZIGZAG_LOOP 11
165 ZIGZAG_LOOP 12 162 ZIGZAG_LOOP 12
166 ZIGZAG_LOOP 13 163 ZIGZAG_LOOP 13
164 ZIGZAG_LOOP 10
165 ZIGZAG_LOOP 7
166 ZIGZAG_LOOP 11
167 ZIGZAG_LOOP 14 167 ZIGZAG_LOOP 14
168 ZIGZAG_LOOP 15 168 ZIGZAG_LOOP 15
169 169
170 movdqa xmm2, [rsp + qcoeff] 170 movdqa xmm2, [rsp + qcoeff]
171 movdqa xmm3, [rsp + qcoeff + 16] 171 movdqa xmm3, [rsp + qcoeff + 16]
172 172
173 %if ABI_IS_32BIT 173 mov rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr
174 mov rdi, arg(1) 174 mov rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
175 %else
176 mov rdi, [rsp + BLOCKD_d]
177 %endif
178
179 mov rcx, [rdi + vp8_blockd_dequant] ; dequant_ptr
180 mov rsi, [rdi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
181 175
182 ; y ^ sz 176 ; y ^ sz
183 pxor xmm2, xmm0 177 pxor xmm2, xmm0
184 pxor xmm3, xmm4 178 pxor xmm3, xmm4
185 ; x = (y ^ sz) - sz 179 ; x = (y ^ sz) - sz
186 psubw xmm2, xmm0 180 psubw xmm2, xmm0
187 psubw xmm3, xmm4 181 psubw xmm3, xmm4
188 182
189 ; dequant 183 ; dequant
190 movdqa xmm0, [rcx] 184 movdqa xmm0, [rcx]
191 movdqa xmm1, [rcx + 16] 185 movdqa xmm1, [rcx + 16]
192 186
193 mov rcx, [rdi + vp8_blockd_qcoeff] ; qcoeff_ptr 187 mov rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr
194 188
195 pmullw xmm0, xmm2 189 pmullw xmm0, xmm2
196 pmullw xmm1, xmm3 190 pmullw xmm1, xmm3
197 191
198 movdqa [rcx], xmm2 ; store qcoeff 192 movdqa [rcx], xmm2 ; store qcoeff
199 movdqa [rcx + 16], xmm3 193 movdqa [rcx + 16], xmm3
200 movdqa [rsi], xmm0 ; store dqcoeff 194 movdqa [rdi], xmm0 ; store dqcoeff
201 movdqa [rsi + 16], xmm1 195 movdqa [rdi + 16], xmm1
202 196
203 ; select the last value (in zig_zag order) for EOB 197 ; select the last value (in zig_zag order) for EOB
204 pcmpeqw xmm2, xmm6 198 pcmpeqw xmm2, xmm6
205 pcmpeqw xmm3, xmm6 199 pcmpeqw xmm3, xmm6
206 ; ! 200 ; !
207 pcmpeqw xmm6, xmm6 201 pcmpeqw xmm6, xmm6
208 pxor xmm2, xmm6 202 pxor xmm2, xmm6
209 pxor xmm3, xmm6 203 pxor xmm3, xmm6
210 ; mask inv_zig_zag 204 ; mask inv_zig_zag
211 pand xmm2, [GLOBAL(inv_zig_zag)] 205 pand xmm2, [GLOBAL(inv_zig_zag)]
212 pand xmm3, [GLOBAL(inv_zig_zag + 16)] 206 pand xmm3, [GLOBAL(inv_zig_zag + 16)]
213 ; select the max value 207 ; select the max value
214 pmaxsw xmm2, xmm3 208 pmaxsw xmm2, xmm3
215 pshufd xmm3, xmm2, 00001110b 209 pshufd xmm3, xmm2, 00001110b
216 pmaxsw xmm2, xmm3 210 pmaxsw xmm2, xmm3
217 pshuflw xmm3, xmm2, 00001110b 211 pshuflw xmm3, xmm2, 00001110b
218 pmaxsw xmm2, xmm3 212 pmaxsw xmm2, xmm3
219 pshuflw xmm3, xmm2, 00000001b 213 pshuflw xmm3, xmm2, 00000001b
220 pmaxsw xmm2, xmm3 214 pmaxsw xmm2, xmm3
221 movd eax, xmm2 215 movd eax, xmm2
222 and eax, 0xff 216 and eax, 0xff
223 mov [rdi + vp8_blockd_eob], eax 217 mov [rsi + vp8_blockd_eob], eax
224 218
225 ; begin epilog 219 ; begin epilog
226 add rsp, stack_size 220 add rsp, stack_size
227 pop rsp 221 pop rsp
228 %if ABI_IS_32BIT 222 %if ABI_IS_32BIT
223 pop rsi
229 pop rdi 224 pop rdi
230 %else 225 %else
231 %ifidn __OUTPUT_FORMAT__,x64 226 %ifidn __OUTPUT_FORMAT__,x64
227 pop rsi
232 pop rdi 228 pop rdi
233 %endif 229 %endif
234 %endif 230 %endif
235 pop rsi
236 RESTORE_GOT 231 RESTORE_GOT
237 RESTORE_XMM 232 RESTORE_XMM
238 pop rbp 233 pop rbp
239 ret 234 ret
240 235
241 ; int vp8_fast_quantize_b_impl_sse2 | arg 236 ; void vp8_fast_quantize_b_sse2 | arg
242 ; (short *coeff_ptr, | 0 237 ; (BLOCK *b, | 0
243 ; short *qcoeff_ptr, | 1 238 ; BLOCKD *d) | 1
244 ; short *dequant_ptr, | 2
245 ; short *inv_scan_order, | 3
246 ; short *round_ptr, | 4
247 ; short *quant_ptr, | 5
248 ; short *dqcoeff_ptr) | 6
249 239
250 global sym(vp8_fast_quantize_b_impl_sse2) 240 global sym(vp8_fast_quantize_b_sse2)
251 sym(vp8_fast_quantize_b_impl_sse2): 241 sym(vp8_fast_quantize_b_sse2):
252 push rbp 242 push rbp
253 mov rbp, rsp 243 mov rbp, rsp
254 SHADOW_ARGS_TO_STACK 7 244 GET_GOT rbx
245
246 %if ABI_IS_32BIT
247 push rdi
255 push rsi 248 push rsi
249 %else
250 %ifidn __OUTPUT_FORMAT__,x64
256 push rdi 251 push rdi
252 push rsi
253 %else
254 ; these registers are used for passing arguments
255 %endif
256 %endif
257
257 ; end prolog 258 ; end prolog
258 259
259 mov rdx, arg(0) ;coeff_ptr 260 %if ABI_IS_32BIT
260 mov rcx, arg(2) ;dequant_ptr 261 mov rdi, arg(0) ; BLOCK *b
261 mov rdi, arg(4) ;round_ptr 262 mov rsi, arg(1) ; BLOCKD *d
262 mov rsi, arg(5) ;quant_ptr 263 %else
264 %ifidn __OUTPUT_FORMAT__,x64
265 mov rdi, rcx ; BLOCK *b
266 mov rsi, rdx ; BLOCKD *d
267 %else
268 ;mov rdi, rdi ; BLOCK *b
269 ;mov rsi, rsi ; BLOCKD *d
270 %endif
271 %endif
263 272
264 movdqa xmm0, XMMWORD PTR[rdx] 273 mov rax, [rdi + vp8_block_coeff]
265 movdqa xmm4, XMMWORD PTR[rdx + 16] 274 mov rcx, [rdi + vp8_block_round]
275 mov rdx, [rdi + vp8_block_quant_fast]
266 276
267 movdqa xmm2, XMMWORD PTR[rdi] ;round lo 277 ; z = coeff
268 movdqa xmm3, XMMWORD PTR[rdi + 16] ;round hi 278 movdqa xmm0, [rax]
279 movdqa xmm4, [rax + 16]
269 280
281 ; dup z so we can save sz
270 movdqa xmm1, xmm0 282 movdqa xmm1, xmm0
271 movdqa xmm5, xmm4 283 movdqa xmm5, xmm4
272 284
273 psraw xmm0, 15 ;sign of z (aka sz) 285 ; sz = z >> 15
274 psraw xmm4, 15 ;sign of z (aka sz) 286 psraw xmm0, 15
287 psraw xmm4, 15
275 288
276 pxor xmm1, xmm0 289 ; x = abs(z) = (z ^ sz) - sz
277 pxor xmm5, xmm4
278 psubw xmm1, xmm0 ;x = abs(z)
279 psubw xmm5, xmm4 ;x = abs(z)
280
281 paddw xmm1, xmm2
282 paddw xmm5, xmm3
283
284 pmulhw xmm1, XMMWORD PTR[rsi]
285 pmulhw xmm5, XMMWORD PTR[rsi + 16]
286
287 mov rdi, arg(1) ;qcoeff_ptr
288 mov rsi, arg(6) ;dqcoeff_ptr
289
290 movdqa xmm2, XMMWORD PTR[rcx]
291 movdqa xmm3, XMMWORD PTR[rcx + 16]
292
293 pxor xmm1, xmm0 290 pxor xmm1, xmm0
294 pxor xmm5, xmm4 291 pxor xmm5, xmm4
295 psubw xmm1, xmm0 292 psubw xmm1, xmm0
296 psubw xmm5, xmm4 293 psubw xmm5, xmm4
297 294
298 movdqa XMMWORD PTR[rdi], xmm1 295 ; x += round
299 movdqa XMMWORD PTR[rdi + 16], xmm5 296 paddw xmm1, [rcx]
297 paddw xmm5, [rcx + 16]
300 298
301 pmullw xmm2, xmm1 299 mov rax, [rsi + vp8_blockd_qcoeff]
302 pmullw xmm3, xmm5 300 mov rcx, [rsi + vp8_blockd_dequant]
301 mov rdi, [rsi + vp8_blockd_dqcoeff]
303 302
304 mov rdi, arg(3) ;inv_scan_order 303 ; y = x * quant >> 16
304 pmulhw xmm1, [rdx]
305 pmulhw xmm5, [rdx + 16]
305 306
306 ; Start with 16 307 ; x = (y ^ sz) - sz
308 pxor xmm1, xmm0
309 pxor xmm5, xmm4
310 psubw xmm1, xmm0
311 psubw xmm5, xmm4
312
313 ; qcoeff = x
314 movdqa [rax], xmm1
315 movdqa [rax + 16], xmm5
316
317 ; x * dequant
318 movdqa xmm2, xmm1
319 movdqa xmm3, xmm5
320 pmullw xmm2, [rcx]
321 pmullw xmm3, [rcx + 16]
322
323 ; dqcoeff = x * dequant
324 movdqa [rdi], xmm2
325 movdqa [rdi + 16], xmm3
326
307 pxor xmm4, xmm4 ;clear all bits 327 pxor xmm4, xmm4 ;clear all bits
308 pcmpeqw xmm1, xmm4 328 pcmpeqw xmm1, xmm4
309 pcmpeqw xmm5, xmm4 329 pcmpeqw xmm5, xmm4
310 330
311 pcmpeqw xmm4, xmm4 ;set all bits 331 pcmpeqw xmm4, xmm4 ;set all bits
312 pxor xmm1, xmm4 332 pxor xmm1, xmm4
313 pxor xmm5, xmm4 333 pxor xmm5, xmm4
314 334
315 pand xmm1, XMMWORD PTR[rdi] 335 pand xmm1, [GLOBAL(inv_zig_zag)]
316 pand xmm5, XMMWORD PTR[rdi+16] 336 pand xmm5, [GLOBAL(inv_zig_zag + 16)]
317 337
318 pmaxsw xmm1, xmm5 338 pmaxsw xmm1, xmm5
319 339
320 ; now down to 8 340 ; now down to 8
321 pshufd xmm5, xmm1, 00001110b 341 pshufd xmm5, xmm1, 00001110b
322 342
323 pmaxsw xmm1, xmm5 343 pmaxsw xmm1, xmm5
324 344
325 ; only 4 left 345 ; only 4 left
326 pshuflw xmm5, xmm1, 00001110b 346 pshuflw xmm5, xmm1, 00001110b
327 347
328 pmaxsw xmm1, xmm5 348 pmaxsw xmm1, xmm5
329 349
330 ; okay, just 2! 350 ; okay, just 2!
331 pshuflw xmm5, xmm1, 00000001b 351 pshuflw xmm5, xmm1, 00000001b
332 352
333 pmaxsw xmm1, xmm5 353 pmaxsw xmm1, xmm5
334 354
335 movd rax, xmm1 355 movd eax, xmm1
336 and rax, 0xff 356 and eax, 0xff
337 357 mov [rsi + vp8_blockd_eob], eax
338 movdqa XMMWORD PTR[rsi], xmm2 ;store dqcoeff
339 movdqa XMMWORD PTR[rsi + 16], xmm3 ;store dqcoeff
340 358
341 ; begin epilog 359 ; begin epilog
360 %if ABI_IS_32BIT
361 pop rsi
342 pop rdi 362 pop rdi
363 %else
364 %ifidn __OUTPUT_FORMAT__,x64
343 pop rsi 365 pop rsi
344 UNSHADOW_ARGS 366 pop rdi
367 %endif
368 %endif
369
370 RESTORE_GOT
345 pop rbp 371 pop rbp
346 ret 372 ret
347 373
348 SECTION_RODATA 374 SECTION_RODATA
349 align 16 375 align 16
350 zig_zag:
351 dw 0x0000, 0x0001, 0x0004, 0x0008
352 dw 0x0005, 0x0002, 0x0003, 0x0006
353 dw 0x0009, 0x000c, 0x000d, 0x000a
354 dw 0x0007, 0x000b, 0x000e, 0x000f
355 inv_zig_zag: 376 inv_zig_zag:
356 dw 0x0001, 0x0002, 0x0006, 0x0007 377 dw 0x0001, 0x0002, 0x0006, 0x0007
357 dw 0x0003, 0x0005, 0x0008, 0x000d 378 dw 0x0003, 0x0005, 0x0008, 0x000d
358 dw 0x0004, 0x0009, 0x000c, 0x000e 379 dw 0x0004, 0x0009, 0x000c, 0x000e
359 dw 0x000a, 0x000b, 0x000f, 0x0010 380 dw 0x000a, 0x000b, 0x000f, 0x0010
OLDNEW
« no previous file with comments | « source/libvpx/vp8/encoder/x86/mcomp_x86.h ('k') | source/libvpx/vp8/encoder/x86/quantize_sse4.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698