Chromium Code Reviews

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm

Issue 1124333011: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: only update to last night's LKGR Created 5 years, 7 months ago
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_4x2x4 5-6 0
  movd                  m0, [srcq +%2]
%if %1 == 1
  movd                  m6, [ref1q+%3]
  movd                  m4, [ref2q+%3]
  movd                  m7, [ref3q+%3]
  movd                  m5, [ref4q+%3]
  punpckldq             m0, [srcq +%4]
  punpckldq             m6, [ref1q+%5]
  punpckldq             m4, [ref2q+%5]
  punpckldq             m7, [ref3q+%5]
  punpckldq             m5, [ref4q+%5]
  psadbw                m6, m0
  psadbw                m4, m0
  psadbw                m7, m0
  psadbw                m5, m0
  punpckldq             m6, m4
  punpckldq             m7, m5
%else
  movd                  m1, [ref1q+%3]
  movd                  m2, [ref2q+%3]
  movd                  m3, [ref3q+%3]
  movd                  m4, [ref4q+%3]
  punpckldq             m0, [srcq +%4]
  punpckldq             m1, [ref1q+%5]
  punpckldq             m2, [ref2q+%5]
  punpckldq             m3, [ref3q+%5]
  punpckldq             m4, [ref4q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  psadbw                m4, m0
  punpckldq             m1, m2
  punpckldq             m3, m4
  paddd                 m6, m1
  paddd                 m7, m3
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro
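
; How PROCESS_4x2x4 works: each reference's 4-byte row is merged with the
; next row via punpckldq, so one psadbw covers a 4x2 slice per reference.
; On the first call (%1 == 1) the four results are packed pairwise into m6
; (ref1/ref2) and m7 (ref3/ref4); later calls build the same pairs in
; m1/m3 and accumulate them with paddd.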

; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_8x2x4 5-6 0
  movh                  m0, [srcq +%2]
%if %1 == 1
  movh                  m4, [ref1q+%3]
  movh                  m5, [ref2q+%3]
  movh                  m6, [ref3q+%3]
  movh                  m7, [ref4q+%3]
  movhps                m0, [srcq +%4]
  movhps                m4, [ref1q+%5]
  movhps                m5, [ref2q+%5]
  movhps                m6, [ref3q+%5]
  movhps                m7, [ref4q+%5]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movh                  m1, [ref1q+%3]
  movh                  m2, [ref2q+%3]
  movh                  m3, [ref3q+%3]
  movhps                m0, [srcq +%4]
  movhps                m1, [ref1q+%5]
  movhps                m2, [ref2q+%5]
  movhps                m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movh                  m1, [ref4q+%3]
  movhps                m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro
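
; PROCESS_8x2x4 uses the same two-rows-per-register trick with movh/movhps:
; two 8-byte rows fill a full XMM register, and each reference keeps its own
; accumulator in m4..m7. In the accumulate path the ref4 load is interleaved
; with the paddd chain, presumably to hide load latency.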

; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_16x2x4 5-6 0
  ; 1st 16 px
  mova                  m0, [srcq +%2]
%if %1 == 1
  movu                  m4, [ref1q+%3]
  movu                  m5, [ref2q+%3]
  movu                  m6, [ref3q+%3]
  movu                  m7, [ref4q+%3]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movu                  m1, [ref1q+%3]
  movu                  m2, [ref2q+%3]
  movu                  m3, [ref3q+%3]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%3]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif

  ; 2nd 16 px
  mova                  m0, [srcq +%4]
  movu                  m1, [ref1q+%5]
  movu                  m2, [ref2q+%5]
  movu                  m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
  psadbw                m1, m0
  paddd                 m7, m1
%endmacro
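
; PROCESS_16x2x4 handles a full 16-byte row per psadbw. The src loads use
; mova while the ref loads use movu: the source block is assumed aligned,
; but the candidate reference positions generally are not. The pointer
; advance is hoisted between the last ref4 load and its psadbw, again
; presumably as instruction scheduling.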

; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_32x2x4 5-6 0
  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
  PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6
%endmacro

; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_64x2x4 5-6 0
  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
  PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro
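
; The 32- and 64-wide versions repurpose the 16-wide kernel's two "row"
; slots as adjacent 16-byte column halves: the first call walks the pieces
; of one row, the second walks the next row and, when requested, advances
; the pointers.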

; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         unsigned int res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
;             8x16, 8x8, 8x4, 4x8 or 4x4
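;
; A rough C model of what each generated function computes (a reader's
; sketch, not part of the build; the name sad_wxh_x4d_ref and the w/h
; parameters are ours):
;
;   #include <stdint.h>
;   #include <stdlib.h>  /* abs() */
;
;   static void sad_wxh_x4d_ref(const uint8_t *src, int src_stride,
;                               const uint8_t *const ref[4], int ref_stride,
;                               unsigned int res[4], int w, int h) {
;     for (int r = 0; r < 4; ++r) {
;       unsigned int sad = 0;
;       for (int y = 0; y < h; ++y)
;         for (int x = 0; x < w; ++x)
;           sad += abs(src[y * src_stride + x] -
;                      ref[r][y * ref_stride + x]);
;       res[r] = sad;
;     }
;   }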
%macro SADNXN4D 2
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

%if mmsize == 16
  pslldq                m5, 4
  pslldq                m7, 4
  por                   m4, m5
  por                   m6, m7
  mova                  m5, m4
  mova                  m7, m6
  punpcklqdq            m4, m6
  punpckhqdq            m5, m7
  movifnidn             r4, r4mp
  paddd                 m4, m5
  movu                [r4], m4
  RET
%else
  movifnidn             r4, r4mp
  movq              [r4+0], m6
  movq              [r4+8], m7
  RET
%endif
%endmacro
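
; On the XMM paths, psadbw leaves each per-ref total split across the two
; qword halves of its accumulator. The tail of SADNXN4D interleaves
; ref1/ref2 into m4 and ref3/ref4 into m6 with pslldq/por, gathers the low
; and high halves of all four refs with punpcklqdq/punpckhqdq, and a single
; paddd yields the four 32-bit sums stored to res. The MMX path needs no
; reduction: the pairs were already packed by punpckldq in PROCESS_4x2x4,
; so two movq stores write res[0..3] directly.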

INIT_XMM sse2
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
SADNXN4D 32, 32
SADNXN4D 32, 16
SADNXN4D 16, 32
SADNXN4D 16, 16
SADNXN4D 16, 8
SADNXN4D 8, 16
SADNXN4D 8, 8
SADNXN4D 8, 4

INIT_MMX sse
SADNXN4D 4, 8
SADNXN4D 4, 4
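
; The 4-wide variants are built with INIT_MMX under an "sse" suffix: a 4x2
; slice fits in an 8-byte MMX register, and psadbw on MMX registers was
; introduced with SSE, which presumably explains the sse (rather than sse2)
; name.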