Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(52)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm

Issue 554673004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « source/libvpx/vp9/common/vp9_rtcd_defs.pl ('k') | source/libvpx/vp9/decoder/vp9_decodeframe.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
14 %define VP9_FILTER_WEIGHT 128
15 %define VP9_FILTER_SHIFT 7
16 ; Two-pass 5-tap smoothing filter, 4 pixels per step. For every row: a vertical ("down") pass reads src_ptr and writes dst_ptr, then a horizontal ("across") pass filters that dst row in place. A pixel keeps its source value when it differs from any of its four neighbours by more than flimit; otherwise it gets the weighted sum (Blur taps) plus RD, shifted right by VP9_FILTER_SHIFT.
17 ;void vp9_post_proc_down_and_across_mmx
18 ;(
19 ; unsigned char *src_ptr,            arg(0): first source row (rows two above and two below are also read)
20 ; unsigned char *dst_ptr,            arg(1): first destination row (bytes just before and after each row are also read)
21 ; int src_pixels_per_line,           arg(2): source pitch in bytes
22 ; int dst_pixels_per_line,           arg(3): destination pitch in bytes
23 ; int rows,                          arg(4): number of rows to process
24 ; int cols,                          arg(5): number of columns; processed in groups of 4
25 ; int flimit                         arg(6): difference threshold; only its low 16 bits are used
26 ;)
27 global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
28 sym(vp9_post_proc_down_and_across_mmx):
29 push rbp
30 mov rbp, rsp
31 SHADOW_ARGS_TO_STACK 7
32 GET_GOT rbx
33 push rsi
34 push rdi
35 ; end prolog
36
37 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
38 ; move the global rd onto the stack, since we don't have enough registers
39 ; to do PIC addressing
40 movq mm0, [GLOBAL(rd)]
41 sub rsp, 8
42 movq [rsp], mm0
43 %define RD [rsp]
44 %else
45 %define RD [GLOBAL(rd)]
46 %endif
47 ; NOTE(review): in the 32-bit PIC path RD is [rsp] as of this point, but rsp is moved again by the push rbx below and by the push rax before the across pass -- confirm RD still addresses the saved rounding constant where it is used.
48 push rbx
49 lea rbx, [GLOBAL(Blur)] ; rbx = base of the tap table for the rest of the function
50 movd mm2, dword ptr arg(6) ;flimit
51 punpcklwd mm2, mm2 ; replicate the low word of flimit ...
52 punpckldq mm2, mm2 ; ... into all four words of mm2
53
54 mov rsi, arg(0) ;src_ptr
55 mov rdi, arg(1) ;dst_ptr
56
57 movsxd rcx, DWORD PTR arg(4) ;rows
58 movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; rax = source pitch
59 pxor mm0, mm0 ; mm0 = 00000000
60
61 .nextrow:
62 ; ---- down pass: vertical filter over rows r-2..r+2, src -> dst ----
63 xor rdx, rdx ; clear out rdx for use as loop counter
64 .nextcol:
65
66 pxor mm7, mm7 ; mm7 = 00000000
67 movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
68 movq mm3, [rsi] ; mm3 = r0 p0..p7
69 punpcklbw mm3, mm0 ; mm3 = p0..p3
70 movq mm1, mm3 ; mm1 = p0..p3
71 pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
72
73 movq mm6, [rbx + 48] ; mm6 = kernel 3 taps
74 movq mm5, [rsi + rax] ; mm5 = r1 p0..p7
75 punpcklbw mm5, mm0 ; mm5 = r1 p0..p3
76 pmullw mm6, mm5 ; mm6 = r1 p0..p3 * kernel 3 taps
77 paddusw mm3, mm6 ; mm3 += mm6
78
79 ; thresholding
80 movq mm7, mm1 ; mm7 = r0 p0..p3
81 psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3
82 psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3
83 paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
84 pcmpgtw mm7, mm2 ; mm7 = all ones in each word where the difference exceeds flimit
85
86 movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers
87 movq mm5, [rsi + 2*rax] ; mm5 = r2 p0..p7
88 punpcklbw mm5, mm0 ; mm5 = r2 p0..p3
89 pmullw mm6, mm5 ; mm6 = r2 p0..p3 * kernel 4 taps
90 paddusw mm3, mm6 ; mm3 += mm6
91
92 ; thresholding
93 movq mm6, mm1 ; mm6 = r0 p0..p3
94 psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3
95 psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r0 p0..p3
96 paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
97 pcmpgtw mm6, mm2
98 por mm7, mm6 ; accumulate thresholds
99
100
101 neg rax ; rax = -pitch, to address the rows above
102 movq mm6, [rbx ] ; kernel 0 taps
103 movq mm5, [rsi+2*rax] ; mm5 = r-2 p0..p7
104 punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3
105 pmullw mm6, mm5 ; mm6 = r-2 p0..p3 * kernel 0 taps
106 paddusw mm3, mm6 ; mm3 += mm6
107
108 ; thresholding
109 movq mm6, mm1 ; mm6 = r0 p0..p3
110 psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3
111 psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3
112 paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
113 pcmpgtw mm6, mm2
114 por mm7, mm6 ; accumulate thresholds
115
116 movq mm6, [rbx + 16] ; kernel 1 taps
117 movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7
118 punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3
119 pmullw mm6, mm4 ; mm6 = r-1 p0..p3 * kernel 1 taps
120 paddusw mm3, mm6 ; mm3 += mm6
121
122 ; thresholding
123 movq mm6, mm1 ; mm6 = r0 p0..p3
124 psubusw mm6, mm4 ; mm6 = p0..p3 - r-1 p0..p3
125 psubusw mm4, mm1 ; mm4 = r-1 p0..p3 - p0..p3
126 paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
127 pcmpgtw mm6, mm2
128 por mm7, mm6 ; accumulate thresholds
129
130 ; NOTE(review): with RD defined as [rsp] (32-bit PIC path) the push rbx above has already moved rsp, so this operand looks like it reads the saved rbx rather than the rounding constant -- confirm.
131 paddusw mm3, RD ; mm3 += round value
132 psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
133
134 pand mm1, mm7 ; mm1 = source pixels where some difference exceeded flimit
135 pandn mm7, mm3 ; mm7 = blurred result where no difference exceeded flimit
136 paddusw mm1, mm7 ; combination
137
138 packuswb mm1, mm0 ; pack to bytes
139
140 movd [rdi], mm1 ; store the 4 result bytes to the destination row
141 neg rax ; pitch is positive
142
143
144 add rsi, 4
145 add rdi, 4
146 add rdx, 4
147
148 cmp edx, dword ptr arg(5) ;cols
149 jl .nextcol
150 ; done with the all cols, start the across filtering in place
151 sub rsi, rdx ; rewind both pointers to the start of the row
152 sub rdi, rdx
153
154 ; ---- across pass: horizontal filter over p-2..p+2, in place on the dst row; rax is saved because it is reused to hold the delayed output ----
155 push rax
156 xor rdx, rdx
157 mov rax, [rdi-4]; seed eax with the 4 bytes before the row so the first delayed store rewrites them unchanged (an 8-byte load when rax is 64 bits wide; only eax is stored)
158
159 .acrossnextcol:
160 pxor mm7, mm7 ; mm7 = 00000000
161 movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
162 movq mm4, [rdi+rdx] ; mm4 = p0..p7
163 movq mm3, mm4 ; mm3 = p0..p7
164 punpcklbw mm3, mm0 ; mm3 = p0..p3
165 movq mm1, mm3 ; mm1 = p0..p3
166 pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
167
168 movq mm6, [rbx + 48]
169 psrlq mm4, 8 ; mm4 = p1..p7
170 movq mm5, mm4 ; mm5 = p1..p7
171 punpcklbw mm5, mm0 ; mm5 = p1..p4
172 pmullw mm6, mm5 ; mm6 = p1..p4 * kernel 3 taps
173 paddusw mm3, mm6 ; mm3 += mm6
174
175 ; thresholding
176 movq mm7, mm1 ; mm7 = p0..p3
177 psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4
178 psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
179 paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4)
180 pcmpgtw mm7, mm2
181
182 movq mm6, [rbx + 64 ]
183 psrlq mm4, 8 ; mm4 = p2..p7
184 movq mm5, mm4 ; mm5 = p2..p7
185 punpcklbw mm5, mm0 ; mm5 = p2..p5
186 pmullw mm6, mm5 ; mm6 = p2..p5 * kernel 4 taps
187 paddusw mm3, mm6 ; mm3 += mm6
188
189 ; thresholding
190 movq mm6, mm1 ; mm6 = p0..p3
191 psubusw mm6, mm5 ; mm6 = p0..p3 - p2..p5
192 psubusw mm5, mm1 ; mm5 = p2..p5 - p0..p3
193 paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p2..p5)
194 pcmpgtw mm6, mm2
195 por mm7, mm6 ; accumulate thresholds
196
197
198 movq mm6, [rbx ]
199 movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5
200 movq mm5, mm4 ; mm5 = p-2..p5
201 punpcklbw mm5, mm0 ; mm5 = p-2..p1
202 pmullw mm6, mm5 ; mm6 = p-2..p1 * kernel 0 taps
203 paddusw mm3, mm6 ; mm3 += mm6
204
205 ; thresholding
206 movq mm6, mm1 ; mm6 = p0..p3
207 psubusw mm6, mm5 ; mm6 = p0..p3 - p-2..p1
208 psubusw mm5, mm1 ; mm5 = p-2..p1 - p0..p3
209 paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p-2..p1)
210 pcmpgtw mm6, mm2
211 por mm7, mm6 ; accumulate thresholds
212
213 movq mm6, [rbx + 16]
214 psrlq mm4, 8 ; mm4 = p-1..p5
215 punpcklbw mm4, mm0 ; mm4 = p-1..p2
216 pmullw mm6, mm4 ; mm6 = p-1..p2 * kernel 1 taps
217 paddusw mm3, mm6 ; mm3 += mm6
218
219 ; thresholding
220 movq mm6, mm1 ; mm6 = p0..p3
221 psubusw mm6, mm4 ; mm6 = p0..p3 - p-1..p2
222 psubusw mm4, mm1 ; mm4 = p-1..p2 - p0..p3
223 paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p-1..p2)
224 pcmpgtw mm6, mm2
225 por mm7, mm6 ; accumulate thresholds
226 ; NOTE(review): with RD defined as [rsp] (32-bit PIC path) both rbx and rax have been pushed by now, so this operand looks like it reads the saved rax -- confirm.
227 paddusw mm3, RD ; mm3 += round value
228 psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
229
230 pand mm1, mm7 ; mm1 = source pixels where some difference exceeded flimit
231 pandn mm7, mm3 ; mm7 = blurred result where no difference exceeded flimit
232 paddusw mm1, mm7 ; combination
233
234 packuswb mm1, mm0 ; pack to bytes
235 mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes (delayed until after this group's loads, which read p-2 and p-1 from that location)
236 movd eax, mm1 ; hold this group's result until the next iteration
237
238 add rdx, 4
239 cmp edx, dword ptr arg(5) ;cols
240 jl .acrossnextcol;
241
242 mov DWORD PTR [rdi+rdx-4], eax ; flush the last group's result
243 pop rax ; rax = source pitch again
244
245 ; done with this row
246 add rsi,rax ; next line
247 movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; rax = destination pitch
248 add rdi,rax ; next destination
249 movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; rax = source pitch again for the next row
250
251 dec rcx ; decrement count
252 jnz .nextrow ; next row
253 pop rbx
254 ; NOTE(review): no instruction visible here releases the 8 bytes reserved by "sub rsp, 8" in the 32-bit PIC path before the pops below -- confirm against the macro definitions.
255 ; begin epilog
256 pop rdi
257 pop rsi
258 RESTORE_GOT
259 UNSHADOW_ARGS
260 pop rbp
261 ret
262 %undef RD
263
264 ; Vertical smoothing, 4 columns per outer pass. A running sum and sum of squares over a 15-row window centred on the current row are kept per column; where 15*sumsq - sum*sum is below flimit the pixel becomes (pixel + sum + vp9_rv value) >> 4, otherwise it is left unchanged. Results are written back 8 rows late through a 16-entry ring at [rsp] so the window keeps reading unfiltered pixels.
265 ;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
266 ; int pitch, int rows, int cols,int flimit)
267 extern sym(vp9_rv)
268 global sym(vp9_mbpost_proc_down_mmx) PRIVATE
269 sym(vp9_mbpost_proc_down_mmx):
270 push rbp
271 mov rbp, rsp
272 SHADOW_ARGS_TO_STACK 5
273 GET_GOT rbx
274 push rsi
275 push rdi
276 ; end prolog
277
278 ALIGN_STACK 16, rax
279 sub rsp, 136
280 ; 136 bytes of locals: 128 for d plus 8 for flimit2
281 ; unsigned char d[16][8] at [rsp] (the code below indexes it as 16 entries of 4 bytes: [rsp+rcx*4])
282 ; create flimit2 at [rsp+128]
283 mov eax, dword ptr arg(4) ;flimit
284 mov [rsp+128], eax
285 mov [rsp+128+4], eax
286 %define flimit2 [rsp+128]
287 ; flimit2 = flimit duplicated into two dwords, used as a psubd operand
288 %if ABI_IS_32BIT=0
289 lea r8, [GLOBAL(sym(vp9_rv))]
290 %endif
291
292 ;rows +=8;
293 add dword ptr arg(2), 8
294 ; the 8 extra iterations flush the rows still held in the delay ring
295 ;for(c=0; c<cols; c+=4)
296 .loop_col:
297 mov rsi, arg(0) ;s
298 pxor mm0, mm0 ;
299
300 movsxd rax, dword ptr arg(1) ;pitch ;
301 neg rax ; rax = -pitch
302
303 lea rsi, [rsi + rax*8]; ; rsi = s - 8*pitch
304 neg rax
305
306
307 pxor mm5, mm5 ; mm5 = running sum, one word per column
308 pxor mm6, mm6 ; mm6 = running sum of squares, columns 0-1 (dwords)
309
310 pxor mm7, mm7 ; mm7 = running sum of squares, columns 2-3 (dwords)
311 mov rdi, rsi
312
313 mov rcx, 15 ; prime the window with the 15 rows starting at s - 8*pitch
314
315 .loop_initvar:
316 movd mm1, DWORD PTR [rdi];
317 punpcklbw mm1, mm0 ;
318
319 paddw mm5, mm1 ;
320 pmullw mm1, mm1 ;
321
322 movq mm2, mm1 ;
323 punpcklwd mm1, mm0 ;
324
325 punpckhwd mm2, mm0 ;
326 paddd mm6, mm1 ;
327
328 paddd mm7, mm2 ;
329 lea rdi, [rdi+rax] ;
330
331 dec rcx
332 jne .loop_initvar
333 ;save the var and sum
334 xor rdx, rdx
335 .loop_row:
336 movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
337 movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
338
339 punpcklbw mm1, mm0
340 punpcklbw mm2, mm0
341
342 paddw mm5, mm2 ; slide the window: add the entering row ...
343 psubw mm5, mm1 ; ... and drop the leaving row
344
345 pmullw mm2, mm2
346 movq mm4, mm2
347
348 punpcklwd mm2, mm0
349 punpckhwd mm4, mm0
350
351 paddd mm6, mm2
352 paddd mm7, mm4
353
354 pmullw mm1, mm1
355 movq mm2, mm1
356
357 punpcklwd mm1, mm0
358 psubd mm6, mm1
359
360 punpckhwd mm2, mm0
361 psubd mm7, mm2
362
363
364 movq mm3, mm6
365 pslld mm3, 4
366
367 psubd mm3, mm6 ; mm3 = 15 * sumsq (columns 0-1)
368 movq mm1, mm5
369
370 movq mm4, mm5
371 pmullw mm1, mm1 ; low 16 bits of sum*sum
372
373 pmulhw mm4, mm4 ; high 16 bits of sum*sum
374 movq mm2, mm1
375
376 punpcklwd mm1, mm4 ; mm1 = sum*sum as dwords (columns 0-1)
377 punpckhwd mm2, mm4 ; mm2 = sum*sum as dwords (columns 2-3)
378
379 movq mm4, mm7
380 pslld mm4, 4
381
382 psubd mm4, mm7 ; mm4 = 15 * sumsq (columns 2-3)
383
384 psubd mm3, mm1
385 psubd mm4, mm2
386
387 psubd mm3, flimit2
388 psubd mm4, flimit2
389
390 psrad mm3, 31 ; all ones where 15*sumsq - sum*sum - flimit is negative
391 psrad mm4, 31
392
393 packssdw mm3, mm4
394 packsswb mm3, mm0 ; mm3 = per-pixel byte mask in the low 4 bytes
395
396 movd mm1, DWORD PTR [rsi+rax*8] ; current row: rsi is 8 rows above it
397
398 movq mm2, mm1 ; mm2 = unfiltered pixels
399 punpcklbw mm1, mm0
400
401 paddw mm1, mm5 ; pixel + window sum
402 mov rcx, rdx
403
404 and rcx, 127 ; index into vp9_rv = row counter mod 128
405 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
406 push rax
407 lea rax, [GLOBAL(sym(vp9_rv))]
408 movq mm4, [rax + rcx*2] ;vp9_rv[rcx*2]
409 pop rax
410 %elif ABI_IS_32BIT=0
411 movq mm4, [r8 + rcx*2] ;vp9_rv[rcx*2]
412 %else
413 movq mm4, [sym(vp9_rv) + rcx*2]
414 %endif
415 paddw mm1, mm4
416 ;paddw xmm1, eight8s
417 psraw mm1, 4
418
419 packuswb mm1, mm0
420 pand mm1, mm3 ; filtered value where the mask is set
421
422 pandn mm3, mm2 ; original pixel where the mask is clear
423 por mm1, mm3
424
425 and rcx, 15
426 movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
427
428 mov rcx, rdx
429 sub rcx, 8
430 ; NOTE(review): for the first 8 iterations of a column pass this ring slot has not been written in this pass, and its contents are stored to the rows starting at s - 8*pitch -- confirm callers tolerate that.
431 and rcx, 15
432 movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
433
434 movd [rsi], mm1 ; write back the result computed 8 rows earlier
435 lea rsi, [rsi+rax]
436
437 lea rdi, [rdi+rax]
438 add rdx, 1
439
440 cmp edx, dword arg(2) ;rows
441 jl .loop_row
442
443 ; NOTE(review): the add below is dword-sized on a pointer argument slot; in a build where that slot is 64 bits wide a carry out of the low half would be lost -- confirm against the arg() macro.
444 add dword arg(0), 4 ; s += 4
445 sub dword arg(3), 4 ; cols -= 4
446 cmp dword arg(3), 0
447 jg .loop_col
448
449 add rsp, 136
450 pop rsp ; presumably restores the rsp value saved by ALIGN_STACK -- confirm against the macro
451
452 ; begin epilog
453 pop rdi
454 pop rsi
455 RESTORE_GOT
456 UNSHADOW_ARGS
457 pop rbp
458 ret
459 %undef flimit2
460
461 ; Adds noise to a plane in place, one row at a time and 8 bytes per step: each row picks a start offset (LIBVPX_RAND result & 0xff) into the noise buffer, clamps the pixels with the three clamp vectors, then adds the noise bytes with wrapping byte adds.
462 ;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
463 ; unsigned char blackclamp[16],
464 ; unsigned char whiteclamp[16],
465 ; unsigned char bothclamp[16],
466 ; unsigned int width, unsigned int height, int pitch)
467 global sym(vp9_plane_add_noise_mmx) PRIVATE
468 sym(vp9_plane_add_noise_mmx):
469 push rbp
470 mov rbp, rsp
471 SHADOW_ARGS_TO_STACK 8
472 GET_GOT rbx
473 push rsi
474 push rdi
475 ; end prolog
476
477 .addnoise_loop:
478 call sym(LIBVPX_RAND) WRT_PLT
479 mov rcx, arg(1) ;noise
480 and rax, 0xff ; keep the low 8 bits of the random value as the row's offset into noise
481 add rcx, rax
482
483 ; we rely on the fact that the clamping vectors are stored contiguously
484 ; in black/white/both order. Note that we have to reload this here because
485 ; rdx could be trashed by rand()
486 mov rdx, arg(2) ; blackclamp
487 ; whiteclamp and bothclamp are read as [rdx+16] and [rdx+32]; arg(3) and arg(4) are never loaded
488
489 mov rdi, rcx ; rdi = noise + offset
490 movsxd rcx, dword arg(5) ;[Width]
491 mov rsi, arg(0) ;Pos
492 xor rax,rax ; rax = byte offset within the row
493
494 .addnoise_nextset:
495 movq mm1,[rsi+rax] ; get the source
496
497 psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
498 paddusb mm1, [rdx+32] ;bothclamp
499 psubusb mm1, [rdx+16] ;whiteclamp
500
501 movq mm2,[rdi+rax] ; get the noise for this line
502 paddb mm1,mm2 ; add it in
503 movq [rsi+rax],mm1 ; store the result
504
505 add rax,8 ; move to the next 8 bytes of the row
506 ; the loop advances in whole 8-byte steps, so a width that is not a multiple of 8 is rounded up
507 cmp rax, rcx
508 jl .addnoise_nextset
509
510 movsxd rax, dword arg(7) ; Pitch
511 add arg(0), rax ; Start += Pitch
512 sub dword arg(6), 1 ; Height -= 1
513 jg .addnoise_loop
514
515 ; begin epilog
516 pop rdi
517 pop rsi
518 RESTORE_GOT
519 UNSHADOW_ARGS
520 pop rbp
521 ret
522
523 ; Constants for vp9_post_proc_down_and_across_mmx. Blur holds five taps, one per 16 bytes; the code loads 4 words from each with movq at Blur+0, +16, +32, +48 and +64.
524 SECTION_RODATA
525 align 16
526 Blur:
527 times 16 dw 16 ; taps 0 and 1 (Blur+0, Blur+16): weight 16
528 times 8 dw 64 ; tap 2 (Blur+32), applied to the centre pixel: weight 64
529 times 16 dw 16 ; taps 3 and 4 (Blur+48, Blur+64): weight 16
530 times 8 dw 0 ; 16 zero bytes; no load in this file addresses them
531 ; the five weights sum to 128, matching VP9_FILTER_WEIGHT
532 rd:
533 times 4 dw 0x40 ; rounding term (64) added before the shift by VP9_FILTER_SHIFT
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/vp9_rtcd_defs.pl ('k') | source/libvpx/vp9/decoder/vp9_decodeframe.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698