source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm - Issue 341293003: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 ;	1 ;

2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 ;	3 ;

4 ; Use of this source code is governed by a BSD-style license	4 ; Use of this source code is governed by a BSD-style license

5 ; that can be found in the LICENSE file in the root of the source	5 ; that can be found in the LICENSE file in the root of the source

6 ; tree. An additional intellectual property rights grant can be found	6 ; tree. An additional intellectual property rights grant can be found

7 ; in the file PATENTS. All contributing project authors may	7 ; in the file PATENTS. All contributing project authors may

8 ; be found in the AUTHORS file in the root of the source tree.	8 ; be found in the AUTHORS file in the root of the source tree.

9 ;	9 ;

10	10

11	11

12 %include "vpx_ports/x86_abi_support.asm"	12 %include "vpx_ports/x86_abi_support.asm"

13	13

14 ; void vp9_temporal_filter_apply_sse2 | arg	14 ; void vp9_temporal_filter_apply_sse2 | arg

15 ; (unsigned char *frame1, | 0	15 ; (unsigned char *frame1, | 0

16 ; unsigned int stride, | 1	16 ; unsigned int stride, | 1

17 ; unsigned char *frame2, | 2	17 ; unsigned char *frame2, | 2

18 ; unsigned int block_size, | 3	18 ; unsigned int block_width, | 3

19 ; int strength, | 4	19 ; unsigned int block_height, | 4

20 ; int filter_weight, | 5	20 ; int strength, | 5

21 ; unsigned int *accumulator, | 6	21 ; int filter_weight, | 6

22 ; unsigned short *count) | 7	22 ; unsigned int *accumulator, | 7

	23 ; unsigned short *count) | 8

23 global sym(vp9_temporal_filter_apply_sse2) PRIVATE	24 global sym(vp9_temporal_filter_apply_sse2) PRIVATE

24 sym(vp9_temporal_filter_apply_sse2):	25 sym(vp9_temporal_filter_apply_sse2):

25	26

26 push rbp	27 push rbp

27 mov rbp, rsp	28 mov rbp, rsp

28 SHADOW_ARGS_TO_STACK 8	29 SHADOW_ARGS_TO_STACK 9

29 SAVE_XMM 7	30 SAVE_XMM 7

30 GET_GOT rbx	31 GET_GOT rbx

31 push rsi	32 push rsi

32 push rdi	33 push rdi

33 ALIGN_STACK 16, rax	34 ALIGN_STACK 16, rax

34 %define block_size 0	35 %define block_width 0

35 %define strength 16	36 %define block_height 16

36 %define filter_weight 32	37 %define strength 32

37 %define rounding_bit 48	38 %define filter_weight 48

38 %define rbp_backup 64	39 %define rounding_bit 64

39 %define stack_size 80	40 %define rbp_backup 80

	41 %define stack_size 96

40 sub rsp, stack_size	42 sub rsp, stack_size

41 mov [rsp + rbp_backup], rbp	43 mov [rsp + rbp_backup], rbp

42 ; end prolog	44 ; end prolog

43	45

44 mov rdx, arg(3)	46 mov edx, arg(3)

45 mov [rsp + block_size], rdx	47 mov [rsp + block_width], rdx

46 movd xmm6, arg(4)	48 mov edx, arg(4)

	49 mov [rsp + block_height], rdx

	50 movd xmm6, arg(5)

47 movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read	51 movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

48	52

49 ; calculate the rounding bit outside the loop	53 ; calculate the rounding bit outside the loop

50 ; 0x8000 >> (16 - strength)	54 ; 0x8000 >> (16 - strength)

51 mov rdx, 16	55 mov rdx, 16

52 sub rdx, arg(4) ; 16 - strength	56 sub rdx, arg(5) ; 16 - strength

53 movq xmm4, rdx ; can't use rdx w/ shift	57 movq xmm4, rdx ; can't use rdx w/ shift

54 movdqa xmm5, [GLOBAL(_const_top_bit)]	58 movdqa xmm5, [GLOBAL(_const_top_bit)]

55 psrlw xmm5, xmm4	59 psrlw xmm5, xmm4

56 movdqa [rsp + rounding_bit], xmm5	60 movdqa [rsp + rounding_bit], xmm5

57	61

58 mov rsi, arg(0) ; src/frame1	62 mov rsi, arg(0) ; src/frame1

59 mov rdx, arg(2) ; predictor frame	63 mov rdx, arg(2) ; predictor frame

60 mov rdi, arg(6) ; accumulator	64 mov rdi, arg(7) ; accumulator

61 mov rax, arg(7) ; count	65 mov rax, arg(8) ; count

62	66

63 ; dup the filter weight and store for later	67 ; dup the filter weight and store for later

64 movd xmm0, arg(5) ; filter_weight	68 movd xmm0, arg(6) ; filter_weight

65 pshuflw xmm0, xmm0, 0	69 pshuflw xmm0, xmm0, 0

66 punpcklwd xmm0, xmm0	70 punpcklwd xmm0, xmm0

67 movdqa [rsp + filter_weight], xmm0	71 movdqa [rsp + filter_weight], xmm0

68	72

69 mov rbp, arg(1) ; stride	73 mov rbp, arg(1) ; stride

70 pxor xmm7, xmm7 ; zero for extraction	74 pxor xmm7, xmm7 ; zero for extraction

71	75

72 lea rcx, [rdx + 16*16*1]	76 mov rcx, [rsp + block_width]

73 cmp dword ptr [rsp + block_size], 8	77 imul rcx, [rsp + block_height]

	78 add rcx, rdx

	79 cmp dword ptr [rsp + block_width], 8

74 jne .temporal_filter_apply_load_16	80 jne .temporal_filter_apply_load_16

75 lea rcx, [rdx + 8*8*1]

76	81

77 .temporal_filter_apply_load_8:	82 .temporal_filter_apply_load_8:

78 movq xmm0, [rsi] ; first row	83 movq xmm0, [rsi] ; first row

79 lea rsi, [rsi + rbp] ; += stride	84 lea rsi, [rsi + rbp] ; += stride

80 punpcklbw xmm0, xmm7 ; src[ 0- 7]	85 punpcklbw xmm0, xmm7 ; src[ 0- 7]

81 movq xmm1, [rsi] ; second row	86 movq xmm1, [rsi] ; second row

82 lea rsi, [rsi + rbp] ; += stride	87 lea rsi, [rsi + rbp] ; += stride

83 punpcklbw xmm1, xmm7 ; src[ 8-15]	88 punpcklbw xmm1, xmm7 ; src[ 8-15]

84 jmp .temporal_filter_apply_load_finished	89 jmp .temporal_filter_apply_load_finished

85	90

(...skipping 85 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
171 ; write back	176 ; write back

172 movdqa [rdi], xmm4	177 movdqa [rdi], xmm4

173 movdqa [rdi+16], xmm5	178 movdqa [rdi+16], xmm5

174 movdqa [rdi+32], xmm6	179 movdqa [rdi+32], xmm6

175 movdqa [rdi+48], xmm7	180 movdqa [rdi+48], xmm7

176 lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))	181 lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))

177	182

178 cmp rdx, rcx	183 cmp rdx, rcx

179 je .temporal_filter_apply_epilog	184 je .temporal_filter_apply_epilog

180 pxor xmm7, xmm7 ; zero for extraction	185 pxor xmm7, xmm7 ; zero for extraction

181 cmp dword ptr [rsp + block_size], 16	186 cmp dword ptr [rsp + block_width], 16

182 je .temporal_filter_apply_load_16	187 je .temporal_filter_apply_load_16

183 jmp .temporal_filter_apply_load_8	188 jmp .temporal_filter_apply_load_8

184	189

185 .temporal_filter_apply_epilog:	190 .temporal_filter_apply_epilog:

186 ; begin epilog	191 ; begin epilog

187 mov rbp, [rsp + rbp_backup]	192 mov rbp, [rsp + rbp_backup]

188 add rsp, stack_size	193 add rsp, stack_size

189 pop rsp	194 pop rsp

190 pop rdi	195 pop rdi

191 pop rsi	196 pop rsi

192 RESTORE_GOT	197 RESTORE_GOT

193 RESTORE_XMM	198 RESTORE_XMM

194 UNSHADOW_ARGS	199 UNSHADOW_ARGS

195 pop rbp	200 pop rbp

196 ret	201 ret

197	202

198 SECTION_RODATA	203 SECTION_RODATA

199 align 16	204 align 16

200 _const_3w:	205 _const_3w:

201 times 8 dw 3	206 times 8 dw 3

202 align 16	207 align 16

203 _const_top_bit:	208 _const_top_bit:

204 times 8 dw 1<<15	209 times 8 dw 1<<15

205 align 16	210 align 16

206 _const_16w	211 _const_16w

207 times 8 dw 16	212 times 8 dw 16

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm ('k') | source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c » ('j') | no next file with comments »