Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(105)

Side by Side Diff: source/libvpx/vp8/common/x86/subpixel_ssse3.asm

Issue 3417017: Update libvpx sources to v0.9.2-35-ga8a38bc. ... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 10 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 ; 1 ;
2 ; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
11 11
12 %include "vpx_ports/x86_abi_support.asm" 12 %include "vpx_ports/x86_abi_support.asm"
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
63 63
64 mov rsi, arg(0) ;src_ptr 64 mov rsi, arg(0) ;src_ptr
65 movsxd rax, dword ptr arg(1) ;src_pixels_per_line 65 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
66 movsxd rcx, dword ptr arg(4) ;output_height 66 movsxd rcx, dword ptr arg(4) ;output_height
67 67
68 movsxd rdx, dword ptr arg(3) ;output_pitch 68 movsxd rdx, dword ptr arg(3) ;output_pitch
69 69
70 sub rdi, rdx 70 sub rdi, rdx
71 ;xmm3 free 71 ;xmm3 free
72 filter_block1d8_h6_rowloop_ssse3: 72 filter_block1d8_h6_rowloop_ssse3:
73 movdqu xmm0, XMMWORD PTR [rsi - 2] 73 movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
74 74
75 movdqa xmm1, xmm0 75 movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
76 pshufb xmm0, [shuf1b GLOBAL]
77 76
78 movdqa xmm2, xmm1 77 punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
79 pshufb xmm1, [shuf2b GLOBAL]
80 pmaddubsw xmm0, xmm4
81 pmaddubsw xmm1, xmm5
82 78
83 pshufb xmm2, [shuf3b GLOBAL] 79 movdqa xmm1, xmm0
84 add rdi, rdx 80 pmaddubsw xmm0, xmm4
85 pmaddubsw xmm2, xmm6 81
82 movdqa xmm2, xmm1
83 pshufb xmm1, [shuf2bfrom1 GLOBAL]
84
85 pshufb xmm2, [shuf3bfrom1 GLOBAL]
86 pmaddubsw xmm1, xmm5
87
88 lea rdi, [rdi + rdx]
89 pmaddubsw xmm2, xmm6
86 90
87 lea rsi, [rsi + rax] 91 lea rsi, [rsi + rax]
88 dec rcx 92 dec rcx
89 paddsw xmm0, xmm1 93
90 paddsw xmm0, xmm7 94 paddsw xmm0, xmm1
91 paddsw xmm0, xmm2 95 paddsw xmm2, xmm7
92 psraw xmm0, 7 96
93 packuswb xmm0, xmm0 97 paddsw xmm0, xmm2
98
99 psraw xmm0, 7
100
101 packuswb xmm0, xmm0
94 102
95 movq MMWORD Ptr [rdi], xmm0 103 movq MMWORD Ptr [rdi], xmm0
96 jnz filter_block1d8_h6_rowloop_ssse3 104 jnz filter_block1d8_h6_rowloop_ssse3
97 105
98 ; begin epilog 106 ; begin epilog
99 pop rdi 107 pop rdi
100 pop rsi 108 pop rsi
101 RESTORE_GOT 109 RESTORE_GOT
102 UNSHADOW_ARGS 110 UNSHADOW_ARGS
103 pop rbp 111 pop rbp
104 ret 112 ret
105 113
106 vp8_filter_block1d8_h4_ssse3: 114 vp8_filter_block1d8_h4_ssse3:
107 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 115 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
108 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 116 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
109 117
110 movdqa xmm3, XMMWORD PTR [shuf2b GLOBAL] 118 movdqa xmm3, XMMWORD PTR [shuf2bfrom1 GLOBAL]
111 movdqa xmm4, XMMWORD PTR [shuf3b GLOBAL] 119 movdqa xmm4, XMMWORD PTR [shuf3bfrom1 GLOBAL]
112 120
113 mov rsi, arg(0) ;src_ptr 121 mov rsi, arg(0) ;src_ptr
114 122
115 movsxd rax, dword ptr arg(1) ;src_pixels_per_line 123 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
116 movsxd rcx, dword ptr arg(4) ;output_height 124 movsxd rcx, dword ptr arg(4) ;output_height
117 125
118 movsxd rdx, dword ptr arg(3) ;output_pitch 126 movsxd rdx, dword ptr arg(3) ;output_pitch
119 127
120 sub rdi, rdx 128 sub rdi, rdx
121 ;xmm3 free 129
122 filter_block1d8_h4_rowloop_ssse3: 130 filter_block1d8_h4_rowloop_ssse3:
123 movdqu xmm0, XMMWORD PTR [rsi - 2] 131 movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
124 132
125 movdqa xmm2, xmm0 133 movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
126 pshufb xmm0, xmm3 ;[shuf2b GLOBAL]
127 pshufb xmm2, xmm4 ;[shuf3b GLOBAL]
128 134
129 pmaddubsw xmm0, xmm5 135 punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
130 add rdi, rdx 136
131 pmaddubsw xmm2, xmm6 137 movdqa xmm2, xmm0
138 pshufb xmm0, xmm3
139
140 pshufb xmm2, xmm4
141 pmaddubsw xmm0, xmm5
142
143 lea rdi, [rdi + rdx]
144 pmaddubsw xmm2, xmm6
132 145
133 lea rsi, [rsi + rax] 146 lea rsi, [rsi + rax]
134 dec rcx 147 dec rcx
135 paddsw xmm0, xmm7 148
136 paddsw xmm0, xmm2 149 paddsw xmm0, xmm7
137 psraw xmm0, 7 150
138 packuswb xmm0, xmm0 151 paddsw xmm0, xmm2
152
153 psraw xmm0, 7
154
155 packuswb xmm0, xmm0
139 156
140 movq MMWORD Ptr [rdi], xmm0 157 movq MMWORD Ptr [rdi], xmm0
141 158
142 jnz filter_block1d8_h4_rowloop_ssse3 159 jnz filter_block1d8_h4_rowloop_ssse3
143 160
144 ; begin epilog 161 ; begin epilog
145 pop rdi 162 pop rdi
146 pop rsi 163 pop rsi
147 RESTORE_GOT 164 RESTORE_GOT
148 UNSHADOW_ARGS 165 UNSHADOW_ARGS
(...skipping 12 matching lines...) Expand all
161 sym(vp8_filter_block1d16_h6_ssse3): 178 sym(vp8_filter_block1d16_h6_ssse3):
162 push rbp 179 push rbp
163 mov rbp, rsp 180 mov rbp, rsp
164 SHADOW_ARGS_TO_STACK 6 181 SHADOW_ARGS_TO_STACK 6
165 SAVE_XMM 182 SAVE_XMM
166 GET_GOT rbx 183 GET_GOT rbx
167 push rsi 184 push rsi
168 push rdi 185 push rdi
169 ; end prolog 186 ; end prolog
170 187
171 movsxd rdx, DWORD PTR arg(5) ;table index 188 movsxd rdx, DWORD PTR arg(5) ;table index
172 xor rsi, rsi 189 xor rsi, rsi
173 shl rdx, 4 ; 190 shl rdx, 4 ;
174 191
175 lea rax, [k0_k5 GLOBAL] 192 lea rax, [k0_k5 GLOBAL]
176 add rax, rdx 193 add rax, rdx
177 194
178 mov rdi, arg(2) ;output_ptr 195 mov rdi, arg(2) ;output_ptr
179 movdqa xmm7, [rd GLOBAL]
180 196
181 ;; 197 ;;
182 ;; cmp esi, DWORD PTR [rax] 198 ;; cmp esi, DWORD PTR [rax]
183 ;; je vp8_filter_block1d16_h4_ssse3 199 ;; je vp8_filter_block1d16_h4_ssse3
184 200
185 mov rsi, arg(0) ;src_ptr 201 mov rsi, arg(0) ;src_ptr
186 202
187 movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 203 movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
188 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 204 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
189 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 205 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
190 206
191 movsxd rax, dword ptr arg(1) ;src_pixels_per_line 207 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
192 movsxd rcx, dword ptr arg(4) ;output_height 208 movsxd rcx, dword ptr arg(4) ;output_height
193 movsxd rdx, dword ptr arg(3) ;output_pitch 209 movsxd rdx, dword ptr arg(3) ;output_pitch
194 210
195 filter_block1d16_h6_rowloop_ssse3: 211 filter_block1d16_h6_rowloop_ssse3:
196 movdqu xmm0, XMMWORD PTR [rsi - 2] 212 movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
197 213
198 movdqa xmm1, xmm0 214 movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
199 pshufb xmm0, [shuf1b GLOBAL]
200 movdqa xmm2, xmm1
201 pmaddubsw xmm0, xmm4
202 pshufb xmm1, [shuf2b GLOBAL]
203 pshufb xmm2, [shuf3b GLOBAL]
204 pmaddubsw xmm1, xmm5
205 215
206 movdqu xmm3, XMMWORD PTR [rsi + 6] 216 punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
207 217
208 pmaddubsw xmm2, xmm6 218 movdqa xmm1, xmm0
209 paddsw xmm0, xmm1 219 pmaddubsw xmm0, xmm4
210 movdqa xmm1, xmm3
211 pshufb xmm3, [shuf1b GLOBAL]
212 paddsw xmm0, xmm7
213 pmaddubsw xmm3, xmm4
214 paddsw xmm0, xmm2
215 movdqa xmm2, xmm1
216 pshufb xmm1, [shuf2b GLOBAL]
217 pshufb xmm2, [shuf3b GLOBAL]
218 pmaddubsw xmm1, xmm5
219 pmaddubsw xmm2, xmm6
220 220
221 psraw xmm0, 7 221 movdqa xmm2, xmm1
222 packuswb xmm0, xmm0 222 pshufb xmm1, [shuf2bfrom1 GLOBAL]
223
224 pshufb xmm2, [shuf3bfrom1 GLOBAL]
225 movq xmm3, MMWORD PTR [rsi + 6]
226
227 pmaddubsw xmm1, xmm5
228 movq xmm7, MMWORD PTR [rsi + 11]
229
230 pmaddubsw xmm2, xmm6
231 punpcklbw xmm3, xmm7
232
233 paddsw xmm0, xmm1
234 movdqa xmm1, xmm3
235
236 pmaddubsw xmm3, xmm4
237 paddsw xmm0, xmm2
238
239 movdqa xmm2, xmm1
240 paddsw xmm0, [rd GLOBAL]
241
242 pshufb xmm1, [shuf2bfrom1 GLOBAL]
243 pshufb xmm2, [shuf3bfrom1 GLOBAL]
244
245 psraw xmm0, 7
246 pmaddubsw xmm1, xmm5
247
248 pmaddubsw xmm2, xmm6
249 packuswb xmm0, xmm0
250
223 lea rsi, [rsi + rax] 251 lea rsi, [rsi + rax]
224 paddsw xmm3, xmm1 252 paddsw xmm3, xmm1
225 paddsw xmm3, xmm7
226 paddsw xmm3, xmm2
227 psraw xmm3, 7
228 packuswb xmm3, xmm3
229 253
230 punpcklqdq xmm0, xmm3 254 paddsw xmm3, xmm2
255
256 paddsw xmm3, [rd GLOBAL]
257
258 psraw xmm3, 7
259
260 packuswb xmm3, xmm3
261
262 punpcklqdq xmm0, xmm3
231 263
232 movdqa XMMWORD Ptr [rdi], xmm0 264 movdqa XMMWORD Ptr [rdi], xmm0
233 265
234 add rdi, rdx 266 lea rdi, [rdi + rdx]
235 dec rcx 267 dec rcx
236 jnz filter_block1d16_h6_rowloop_ssse3 268 jnz filter_block1d16_h6_rowloop_ssse3
237 269
238
239 ; begin epilog 270 ; begin epilog
240 pop rdi 271 pop rdi
241 pop rsi 272 pop rsi
242 RESTORE_GOT 273 RESTORE_GOT
243 UNSHADOW_ARGS 274 UNSHADOW_ARGS
244 pop rbp 275 pop rbp
245 ret 276 ret
246 277
247 vp8_filter_block1d16_h4_ssse3: 278 vp8_filter_block1d16_h4_ssse3:
248 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 279 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
(...skipping 12 matching lines...) Expand all
261 pshufb xmm2, [shuf3b GLOBAL] 292 pshufb xmm2, [shuf3b GLOBAL]
262 pmaddubsw xmm1, xmm5 293 pmaddubsw xmm1, xmm5
263 294
264 movdqu xmm3, XMMWORD PTR [rsi + 6] 295 movdqu xmm3, XMMWORD PTR [rsi + 6]
265 296
266 pmaddubsw xmm2, xmm6 297 pmaddubsw xmm2, xmm6
267 movdqa xmm0, xmm3 298 movdqa xmm0, xmm3
268 pshufb xmm3, [shuf3b GLOBAL] 299 pshufb xmm3, [shuf3b GLOBAL]
269 pshufb xmm0, [shuf2b GLOBAL] 300 pshufb xmm0, [shuf2b GLOBAL]
270 301
271 paddsw xmm1, xmm7 302 paddsw xmm1, [rd GLOBAL]
272 paddsw xmm1, xmm2 303 paddsw xmm1, xmm2
273 304
274 pmaddubsw xmm0, xmm5 305 pmaddubsw xmm0, xmm5
275 pmaddubsw xmm3, xmm6 306 pmaddubsw xmm3, xmm6
276 307
277 psraw xmm1, 7 308 psraw xmm1, 7
278 packuswb xmm1, xmm1 309 packuswb xmm1, xmm1
279 lea rsi, [rsi + rax] 310 lea rsi, [rsi + rax]
280 paddsw xmm3, xmm0 311 paddsw xmm3, xmm0
281 paddsw xmm3, xmm7 312 paddsw xmm3, [rd GLOBAL]
282 psraw xmm3, 7 313 psraw xmm3, 7
283 packuswb xmm3, xmm3 314 packuswb xmm3, xmm3
284 315
285 punpcklqdq xmm1, xmm3 316 punpcklqdq xmm1, xmm3
286 317
287 movdqa XMMWORD Ptr [rdi], xmm1 318 movdqa XMMWORD Ptr [rdi], xmm1
288 319
289 add rdi, rdx 320 add rdi, rdx
290 dec rcx 321 dec rcx
291 jnz filter_block1d16_h4_rowloop_ssse3 322 jnz filter_block1d16_h4_rowloop_ssse3
(...skipping 588 matching lines...) Expand 10 before | Expand all | Expand 10 after
880 jnz vp8_filter_block1d4_v4_ssse3_loop 911 jnz vp8_filter_block1d4_v4_ssse3_loop
881 912
882 ; begin epilog 913 ; begin epilog
883 pop rdi 914 pop rdi
884 pop rsi 915 pop rsi
885 RESTORE_GOT 916 RESTORE_GOT
886 UNSHADOW_ARGS 917 UNSHADOW_ARGS
887 pop rbp 918 pop rbp
888 ret 919 ret
889 920
921 ;void vp8_bilinear_predict16x16_ssse3
922 ;(
923 ; unsigned char *src_ptr,
924 ; int src_pixels_per_line,
925 ; int xoffset,
926 ; int yoffset,
927 ; unsigned char *dst_ptr,
928 ; int dst_pitch
929 ;)
930 global sym(vp8_bilinear_predict16x16_ssse3)
931 sym(vp8_bilinear_predict16x16_ssse3):
932 push rbp
933 mov rbp, rsp
934 SHADOW_ARGS_TO_STACK 6
935 SAVE_XMM
936 GET_GOT rbx
937 push rsi
938 push rdi
939 ; end prolog
940
941 lea rcx, [vp8_bilinear_filters_ssse3 GLOBAL]
942 movsxd rax, dword ptr arg(2) ; xoffset
943
944 cmp rax, 0 ; skip first_pass filter if xoffset=0
945 je b16x16_sp_only
946
947 shl rax, 4
948 lea rax, [rax + rcx] ; HFilter
949
950 mov rdi, arg(4) ; dst_ptr
951 mov rsi, arg(0) ; src_ptr
952 movsxd rdx, dword ptr arg(5) ; dst_pitch
953
954 movdqa xmm1, [rax]
955
956 movsxd rax, dword ptr arg(3) ; yoffset
957
958 cmp rax, 0 ; skip second_pass filter if yoffset=0
959 je b16x16_fp_only
960
961 shl rax, 4
962 lea rax, [rax + rcx] ; VFilter
963
964 lea rcx, [rdi+rdx*8]
965 lea rcx, [rcx+rdx*8]
966 movsxd rdx, dword ptr arg(1) ; src_pixels_per_line
967
968 movdqa xmm2, [rax]
969
970 %if ABI_IS_32BIT=0
971 movsxd r8, dword ptr arg(5) ; dst_pitch
972 %endif
973 movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
974 movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
975
976 punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
977 movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
978
979 movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
980
981 lea rsi, [rsi + rdx] ; next line
982
983 pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
984
985 punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
986 pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
987
988 paddw xmm3, [rd GLOBAL] ; xmm3 += round value
989 psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
990
991 paddw xmm4, [rd GLOBAL] ; xmm4 += round value
992 psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
993
994 movdqa xmm7, xmm3
995 packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
996
997 .next_row:
998 movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
999 movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
1000
1001 punpcklbw xmm6, xmm5
1002 movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
1003
1004 movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
1005 lea rsi, [rsi + rdx] ; next line
1006
1007 pmaddubsw xmm6, xmm1
1008
1009 punpcklbw xmm4, xmm5
1010 pmaddubsw xmm4, xmm1
1011
1012 paddw xmm6, [rd GLOBAL] ; xmm6 += round value
1013 psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
1014
1015 paddw xmm4, [rd GLOBAL] ; xmm4 += round value
1016 psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
1017
1018 packuswb xmm6, xmm4
1019 movdqa xmm5, xmm7
1020
1021 punpcklbw xmm5, xmm6
1022 pmaddubsw xmm5, xmm2
1023
1024 punpckhbw xmm7, xmm6
1025 pmaddubsw xmm7, xmm2
1026
1027 paddw xmm5, [rd GLOBAL] ; xmm5 += round value
1028 psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128
1029
1030 paddw xmm7, [rd GLOBAL] ; xmm7 += round value
1031 psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
1032
1033 packuswb xmm5, xmm7
1034 movdqa xmm7, xmm6
1035
1036 movdqa [rdi], xmm5 ; store the results in the destination
1037 %if ABI_IS_32BIT
1038 add rdi, DWORD PTR arg(5) ; dst_pitch
1039 %else
1040 add rdi, r8
1041 %endif
1042
1043 cmp rdi, rcx
1044 jne .next_row
1045
1046 jmp done
1047
1048 b16x16_sp_only:
1049 movsxd rax, dword ptr arg(3) ; yoffset
1050 shl rax, 4
1051 lea rax, [rax + rcx] ; VFilter
1052
1053 mov rdi, arg(4) ; dst_ptr
1054 mov rsi, arg(0) ; src_ptr
1055 movsxd rdx, dword ptr arg(5) ; dst_pitch
1056
1057 movdqa xmm1, [rax] ; VFilter
1058
1059 lea rcx, [rdi+rdx*8]
1060 lea rcx, [rcx+rdx*8]
1061 movsxd rax, dword ptr arg(1) ; src_pixels_per_line
1062
1063 ; get the first horizontal line done
1064 movq xmm4, [rsi] ; load row 0
1065 movq xmm2, [rsi + 8] ; load row 0
1066
1067 lea rsi, [rsi + rax] ; next line
1068 .next_row:
1069 movq xmm3, [rsi] ; load row + 1
1070 movq xmm5, [rsi + 8] ; load row + 1
1071
1072 punpcklbw xmm4, xmm3
1073 punpcklbw xmm2, xmm5
1074
1075 pmaddubsw xmm4, xmm1
1076 movq xmm7, [rsi + rax] ; load row + 2
1077
1078 pmaddubsw xmm2, xmm1
1079 movq xmm6, [rsi + rax + 8] ; load row + 2
1080
1081 punpcklbw xmm3, xmm7
1082 punpcklbw xmm5, xmm6
1083
1084 pmaddubsw xmm3, xmm1
1085 paddw xmm4, [rd GLOBAL]
1086
1087 pmaddubsw xmm5, xmm1
1088 paddw xmm2, [rd GLOBAL]
1089
1090 psraw xmm4, VP8_FILTER_SHIFT
1091 psraw xmm2, VP8_FILTER_SHIFT
1092
1093 packuswb xmm4, xmm2
1094 paddw xmm3, [rd GLOBAL]
1095
1096 movdqa [rdi], xmm4 ; store row 0
1097 paddw xmm5, [rd GLOBAL]
1098
1099 psraw xmm3, VP8_FILTER_SHIFT
1100 psraw xmm5, VP8_FILTER_SHIFT
1101
1102 packuswb xmm3, xmm5
1103 movdqa xmm4, xmm7
1104
1105 movdqa [rdi + rdx],xmm3 ; store row 1
1106 lea rsi, [rsi + 2*rax]
1107
1108 movdqa xmm2, xmm6
1109 lea rdi, [rdi + 2*rdx]
1110
1111 cmp rdi, rcx
1112 jne .next_row
1113
1114 jmp done
1115
1116 b16x16_fp_only:
1117 lea rcx, [rdi+rdx*8]
1118 lea rcx, [rcx+rdx*8]
1119 movsxd rax, dword ptr arg(1) ; src_pixels_per_line
1120
1121 .next_row:
1122 movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
1123 movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
1124
1125 punpcklbw xmm2, xmm4
1126 movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
1127
1128 pmaddubsw xmm2, xmm1
1129 movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
1130
1131 lea rsi, [rsi + rax] ; next line
1132 punpcklbw xmm3, xmm4
1133
1134 pmaddubsw xmm3, xmm1
1135 movq xmm5, [rsi]
1136
1137 paddw xmm2, [rd GLOBAL]
1138 movq xmm7, [rsi+1]
1139
1140 movq xmm6, [rsi+8]
1141 psraw xmm2, VP8_FILTER_SHIFT
1142
1143 punpcklbw xmm5, xmm7
1144 movq xmm7, [rsi+9]
1145
1146 paddw xmm3, [rd GLOBAL]
1147 pmaddubsw xmm5, xmm1
1148
1149 psraw xmm3, VP8_FILTER_SHIFT
1150 punpcklbw xmm6, xmm7
1151
1152 packuswb xmm2, xmm3
1153 pmaddubsw xmm6, xmm1
1154
1155 movdqa [rdi], xmm2 ; store the results in the destination
1156 paddw xmm5, [rd GLOBAL]
1157
1158 lea rdi, [rdi + rdx] ; dst_pitch
1159 psraw xmm5, VP8_FILTER_SHIFT
1160
1161 paddw xmm6, [rd GLOBAL]
1162 psraw xmm6, VP8_FILTER_SHIFT
1163
1164 packuswb xmm5, xmm6
1165 lea rsi, [rsi + rax] ; next line
1166
1167 movdqa [rdi], xmm5 ; store the results in the destination
1168 lea rdi, [rdi + rdx] ; dst_pitch
1169
1170 cmp rdi, rcx
1171
1172 jne .next_row
1173
1174 done:
1175 ; begin epilog
1176 pop rdi
1177 pop rsi
1178 RESTORE_GOT
1179 RESTORE_XMM
1180 UNSHADOW_ARGS
1181 pop rbp
1182 ret
1183
1184 ;void vp8_bilinear_predict8x8_ssse3
1185 ;(
1186 ; unsigned char *src_ptr,
1187 ; int src_pixels_per_line,
1188 ; int xoffset,
1189 ; int yoffset,
1190 ; unsigned char *dst_ptr,
1191 ; int dst_pitch
1192 ;)
1193 global sym(vp8_bilinear_predict8x8_ssse3)
1194 sym(vp8_bilinear_predict8x8_ssse3):
1195 push rbp
1196 mov rbp, rsp
1197 SHADOW_ARGS_TO_STACK 6
1198 SAVE_XMM
1199 GET_GOT rbx
1200 push rsi
1201 push rdi
1202 ; end prolog
1203
1204 ALIGN_STACK 16, rax
1205 sub rsp, 144 ; reserve 144 bytes
1206
1207 lea rcx, [vp8_bilinear_filters_ssse3 GLOBAL]
1208
1209 mov rsi, arg(0) ;src_ptr
1210 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
1211
1212 ;Read 9-line unaligned data in and put them on stack. This gives a big
1213 ;performance boost.
1214 movdqu xmm0, [rsi]
1215 lea rax, [rdx + rdx*2]
1216 movdqu xmm1, [rsi+rdx]
1217 movdqu xmm2, [rsi+rdx*2]
1218 add rsi, rax
1219 movdqu xmm3, [rsi]
1220 movdqu xmm4, [rsi+rdx]
1221 movdqu xmm5, [rsi+rdx*2]
1222 add rsi, rax
1223 movdqu xmm6, [rsi]
1224 movdqu xmm7, [rsi+rdx]
1225
1226 movdqa XMMWORD PTR [rsp], xmm0
1227
1228 movdqu xmm0, [rsi+rdx*2]
1229
1230 movdqa XMMWORD PTR [rsp+16], xmm1
1231 movdqa XMMWORD PTR [rsp+32], xmm2
1232 movdqa XMMWORD PTR [rsp+48], xmm3
1233 movdqa XMMWORD PTR [rsp+64], xmm4
1234 movdqa XMMWORD PTR [rsp+80], xmm5
1235 movdqa XMMWORD PTR [rsp+96], xmm6
1236 movdqa XMMWORD PTR [rsp+112], xmm7
1237 movdqa XMMWORD PTR [rsp+128], xmm0
1238
1239 movsxd rax, dword ptr arg(2) ; xoffset
1240 cmp rax, 0 ; skip first_pass filter if xoffset=0
1241 je b8x8_sp_only
1242
1243 shl rax, 4
1244 add rax, rcx ; HFilter
1245
1246 mov rdi, arg(4) ; dst_ptr
1247 movsxd rdx, dword ptr arg(5) ; dst_pitch
1248
1249 movdqa xmm0, [rax]
1250
1251 movsxd rax, dword ptr arg(3) ; yoffset
1252 cmp rax, 0 ; skip second_pass filter if yoffset=0
1253 je b8x8_fp_only
1254
1255 shl rax, 4
1256 lea rax, [rax + rcx] ; VFilter
1257
1258 lea rcx, [rdi+rdx*8]
1259
1260 movdqa xmm1, [rax]
1261
1262 ; get the first horizontal line done
1263 movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
1264 movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
1265
1266 psrldq xmm5, 1
1267 lea rsp, [rsp + 16] ; next line
1268
1269 punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
1270 pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
1271
1272 paddw xmm3, [rd GLOBAL] ; xmm3 += round value
1273 psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
1274
1275 movdqa xmm7, xmm3
1276 packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
1277
1278 .next_row:
1279 movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
1280 lea rsp, [rsp + 16] ; next line
1281
1282 movdqa xmm5, xmm6
1283
1284 psrldq xmm5, 1
1285
1286 punpcklbw xmm6, xmm5
1287 pmaddubsw xmm6, xmm0
1288
1289 paddw xmm6, [rd GLOBAL] ; xmm6 += round value
1290 psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
1291
1292 packuswb xmm6, xmm6
1293
1294 punpcklbw xmm7, xmm6
1295 pmaddubsw xmm7, xmm1
1296
1297 paddw xmm7, [rd GLOBAL] ; xmm7 += round value
1298 psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
1299
1300 packuswb xmm7, xmm7
1301
1302 movq [rdi], xmm7 ; store the results in the destination
1303 lea rdi, [rdi + rdx]
1304
1305 movdqa xmm7, xmm6
1306
1307 cmp rdi, rcx
1308 jne .next_row
1309
1310 jmp done8x8
1311
1312 b8x8_sp_only:
1313 movsxd rax, dword ptr arg(3) ; yoffset
1314 shl rax, 4
1315 lea rax, [rax + rcx] ; VFilter
1316
1317 mov rdi, arg(4) ;dst_ptr
1318 movsxd rdx, dword ptr arg(5) ; dst_pitch
1319
1320 movdqa xmm0, [rax] ; VFilter
1321
1322 movq xmm1, XMMWORD PTR [rsp]
1323 movq xmm2, XMMWORD PTR [rsp+16]
1324
1325 movq xmm3, XMMWORD PTR [rsp+32]
1326 punpcklbw xmm1, xmm2
1327
1328 movq xmm4, XMMWORD PTR [rsp+48]
1329 punpcklbw xmm2, xmm3
1330
1331 movq xmm5, XMMWORD PTR [rsp+64]
1332 punpcklbw xmm3, xmm4
1333
1334 movq xmm6, XMMWORD PTR [rsp+80]
1335 punpcklbw xmm4, xmm5
1336
1337 movq xmm7, XMMWORD PTR [rsp+96]
1338 punpcklbw xmm5, xmm6
1339
1340 pmaddubsw xmm1, xmm0
1341 pmaddubsw xmm2, xmm0
1342
1343 pmaddubsw xmm3, xmm0
1344 pmaddubsw xmm4, xmm0
1345
1346 pmaddubsw xmm5, xmm0
1347 punpcklbw xmm6, xmm7
1348
1349 pmaddubsw xmm6, xmm0
1350 paddw xmm1, [rd GLOBAL]
1351
1352 paddw xmm2, [rd GLOBAL]
1353 psraw xmm1, VP8_FILTER_SHIFT
1354
1355 paddw xmm3, [rd GLOBAL]
1356 psraw xmm2, VP8_FILTER_SHIFT
1357
1358 paddw xmm4, [rd GLOBAL]
1359 psraw xmm3, VP8_FILTER_SHIFT
1360
1361 paddw xmm5, [rd GLOBAL]
1362 psraw xmm4, VP8_FILTER_SHIFT
1363
1364 paddw xmm6, [rd GLOBAL]
1365 psraw xmm5, VP8_FILTER_SHIFT
1366
1367 psraw xmm6, VP8_FILTER_SHIFT
1368 packuswb xmm1, xmm1
1369
1370 packuswb xmm2, xmm2
1371 movq [rdi], xmm1
1372
1373 packuswb xmm3, xmm3
1374 movq [rdi+rdx], xmm2
1375
1376 packuswb xmm4, xmm4
1377 movq xmm1, XMMWORD PTR [rsp+112]
1378
1379 lea rdi, [rdi + 2*rdx]
1380 movq xmm2, XMMWORD PTR [rsp+128]
1381
1382 packuswb xmm5, xmm5
1383 movq [rdi], xmm3
1384
1385 packuswb xmm6, xmm6
1386 movq [rdi+rdx], xmm4
1387
1388 lea rdi, [rdi + 2*rdx]
1389 punpcklbw xmm7, xmm1
1390
1391 movq [rdi], xmm5
1392 pmaddubsw xmm7, xmm0
1393
1394 movq [rdi+rdx], xmm6
1395 punpcklbw xmm1, xmm2
1396
1397 pmaddubsw xmm1, xmm0
1398 paddw xmm7, [rd GLOBAL]
1399
1400 psraw xmm7, VP8_FILTER_SHIFT
1401 paddw xmm1, [rd GLOBAL]
1402
1403 psraw xmm1, VP8_FILTER_SHIFT
1404 packuswb xmm7, xmm7
1405
1406 packuswb xmm1, xmm1
1407 lea rdi, [rdi + 2*rdx]
1408
1409 movq [rdi], xmm7
1410
1411 movq [rdi+rdx], xmm1
1412 lea rsp, [rsp + 144]
1413
1414 jmp done8x8
1415
1416 b8x8_fp_only:
1417 lea rcx, [rdi+rdx*8]
1418
1419 .next_row:
1420 movdqa xmm1, XMMWORD PTR [rsp]
1421 movdqa xmm3, XMMWORD PTR [rsp+16]
1422
1423 movdqa xmm2, xmm1
1424 movdqa xmm5, XMMWORD PTR [rsp+32]
1425
1426 psrldq xmm2, 1
1427 movdqa xmm7, XMMWORD PTR [rsp+48]
1428
1429 movdqa xmm4, xmm3
1430 psrldq xmm4, 1
1431
1432 movdqa xmm6, xmm5
1433 psrldq xmm6, 1
1434
1435 punpcklbw xmm1, xmm2
1436 pmaddubsw xmm1, xmm0
1437
1438 punpcklbw xmm3, xmm4
1439 pmaddubsw xmm3, xmm0
1440
1441 punpcklbw xmm5, xmm6
1442 pmaddubsw xmm5, xmm0
1443
1444 movdqa xmm2, xmm7
1445 psrldq xmm2, 1
1446
1447 punpcklbw xmm7, xmm2
1448 pmaddubsw xmm7, xmm0
1449
1450 paddw xmm1, [rd GLOBAL]
1451 psraw xmm1, VP8_FILTER_SHIFT
1452
1453 paddw xmm3, [rd GLOBAL]
1454 psraw xmm3, VP8_FILTER_SHIFT
1455
1456 paddw xmm5, [rd GLOBAL]
1457 psraw xmm5, VP8_FILTER_SHIFT
1458
1459 paddw xmm7, [rd GLOBAL]
1460 psraw xmm7, VP8_FILTER_SHIFT
1461
1462 packuswb xmm1, xmm1
1463 packuswb xmm3, xmm3
1464
1465 packuswb xmm5, xmm5
1466 movq [rdi], xmm1
1467
1468 packuswb xmm7, xmm7
1469 movq [rdi+rdx], xmm3
1470
1471 lea rdi, [rdi + 2*rdx]
1472 movq [rdi], xmm5
1473
1474 lea rsp, [rsp + 4*16]
1475 movq [rdi+rdx], xmm7
1476
1477 lea rdi, [rdi + 2*rdx]
1478 cmp rdi, rcx
1479
1480 jne .next_row
1481
1482 lea rsp, [rsp + 16]
1483
1484 done8x8:
1485 ;add rsp, 144
1486 pop rsp
1487 ; begin epilog
1488 pop rdi
1489 pop rsi
1490 RESTORE_GOT
1491 RESTORE_XMM
1492 UNSHADOW_ARGS
1493 pop rbp
1494 ret
1495
890 SECTION_RODATA 1496 SECTION_RODATA
891 align 16 1497 align 16
892 shuf1b: 1498 shuf1b:
893 db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 1499 db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
894 shuf2b: 1500 shuf2b:
895 db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 1501 db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
896 shuf3b: 1502 shuf3b:
897 db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 1503 db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
898 1504
899 align 16 1505 align 16
1506 shuf2bfrom1:
1507 db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
1508 align 16
1509 shuf3bfrom1:
1510 db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
1511
1512 align 16
900 rd: 1513 rd:
901 times 8 dw 0x40 1514 times 8 dw 0x40
902 1515
903 align 16 1516 align 16
904 k0_k5: 1517 k0_k5:
905 times 8 db 0, 0 ;placeholder 1518 times 8 db 0, 0 ;placeholder
906 times 8 db 0, 0 1519 times 8 db 0, 0
907 times 8 db 2, 1 1520 times 8 db 2, 1
908 times 8 db 0, 0 1521 times 8 db 0, 0
909 times 8 db 3, 3 1522 times 8 db 3, 3
(...skipping 11 matching lines...) Expand all
921 times 8 db -1, 123 1534 times 8 db -1, 123
922 k2_k4: 1535 k2_k4:
923 times 8 db 128, 0 ;placeholder 1536 times 8 db 128, 0 ;placeholder
924 times 8 db 123, -1 1537 times 8 db 123, -1
925 times 8 db 108, -8 1538 times 8 db 108, -8
926 times 8 db 93, -6 1539 times 8 db 93, -6
927 times 8 db 77, -16 1540 times 8 db 77, -16
928 times 8 db 50, -9 1541 times 8 db 50, -9
929 times 8 db 36, -11 1542 times 8 db 36, -11
930 times 8 db 12, -6 1543 times 8 db 12, -6
1544 align 16
1545 vp8_bilinear_filters_ssse3:
1546 times 8 db 128, 0
1547 times 8 db 112, 16
1548 times 8 db 96, 32
1549 times 8 db 80, 48
1550 times 8 db 64, 64
1551 times 8 db 48, 80
1552 times 8 db 32, 96
1553 times 8 db 16, 112
931 1554
OLDNEW
« no previous file with comments | « source/libvpx/vp8/common/x86/subpixel_sse2.asm ('k') | source/libvpx/vp8/common/x86/subpixel_x86.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698