Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/libyuv/source/scale_posix.cc

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC x86 and x64.
19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
20
21 // Offsets for source bytes 0 to 9
22 static uvec8 kShuf0 =
23 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
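// In these pshufb control vectors an index of 128 has bit 7 set, which makes
// pshufb write zero to that destination byte; only indexes 0-15 select data.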
24
25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
26 static uvec8 kShuf1 =
27 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
28
29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
30 static uvec8 kShuf2 =
31 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
32
33 // Offsets for source bytes 0 to 10
34 static uvec8 kShuf01 =
35 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
36
37 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
38 static uvec8 kShuf11 =
39 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
40
42 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
42 static uvec8 kShuf21 =
43 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
44
45 // Coefficients for source bytes 0 to 10
46 static uvec8 kMadd01 =
47 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
48
49 // Coefficients for source bytes 10 to 21
50 static uvec8 kMadd11 =
51 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
52
53 // Coefficients for source bytes 21 to 31
54 static uvec8 kMadd21 =
55 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
56
57 // Rounding constant for the 3/4 box filters (added before the >> 2).
58 static vec16 kRound34 =
59 { 2, 2, 2, 2, 2, 2, 2, 2 };
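// The 3/4 filters below compute each output byte as a weighted average
// (w0 * p0 + w1 * p1 + 2) >> 2: pmaddubsw applies the weight pairs from the
// kMadd tables, and kRound34 supplies the +2 rounding term before the shift.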
60
61 static uvec8 kShuf38a =
62 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
63
64 static uvec8 kShuf38b =
65 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
66
67 // Arrange words 0,3,6 into 0,1,2
68 static uvec8 kShufAc =
69 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
70
71 // Arrange words 0,3,6 into 3,4,5
72 static uvec8 kShufAc3 =
73 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
74
75 // Scaling values for boxes of 3x3 and 2x3
76 static uvec16 kScaleAc33 =
77 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
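// pmulhuw returns the high 16 bits of an unsigned 16x16 multiply, so
// multiplying a box sum by 65536 / n and keeping the high half divides by n:
// e.g. a 3x3 sum of 9 pixels times 7281 (65536 / 9) >> 16 is roughly sum / 9.
// The same trick is used with kScaleAb2 below.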
78
79 // Arrange first value for pixels 0,1,2,3,4,5
80 static uvec8 kShufAb0 =
81 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
82
83 // Arrange second value for pixels 0,1,2,3,4,5
84 static uvec8 kShufAb1 =
85 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
86
87 // Arrange third value for pixels 0,1,2,3,4,5
88 static uvec8 kShufAb2 =
89 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
90
91 // Scaling values for boxes of 3x2 and 2x2
92 static uvec16 kScaleAb2 =
93 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
94
95 // The GCC versions of the row functions are verbatim conversions from
96 // Visual C, generated by disassembling the Visual C object file:
97 // objdump -D yuvscaler.obj > yuvscaler.txt
98
99 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
100 uint8* dst_ptr, int dst_width) {
101 asm volatile (
102 LABELALIGN
103 "1: \n"
104 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
105 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
106 "lea " MEMLEA(0x20,0) ",%0 \n"
107 "psrlw $0x8,%%xmm0 \n"
108 "psrlw $0x8,%%xmm1 \n"
109 "packuswb %%xmm1,%%xmm0 \n"
110 "movdqu %%xmm0," MEMACCESS(1) " \n"
111 "lea " MEMLEA(0x10,1) ",%1 \n"
112 "sub $0x10,%2 \n"
113 "jg 1b \n"
114 : "+r"(src_ptr), // %0
115 "+r"(dst_ptr), // %1
116 "+r"(dst_width) // %2
117 :: "memory", "cc", "xmm0", "xmm1"
118 );
119 }
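
// A scalar sketch of the loop above (hypothetical helper, not part of this
// file): psrlw $0x8 keeps the high (odd-indexed) byte of each 16-bit pair, so
// the kernel point-samples every second source pixel.
static void ScaleRowDown2_C_sketch(const uint8* src_ptr, uint8* dst_ptr,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];  // odd byte of each source pair
  }
}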
120
121 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
122 uint8* dst_ptr, int dst_width) {
123 asm volatile (
124 "pcmpeqb %%xmm5,%%xmm5 \n"
125 "psrlw $0x8,%%xmm5 \n"
126
127 LABELALIGN
128 "1: \n"
129 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
130 "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
131 "lea " MEMLEA(0x20,0) ",%0 \n"
132 "movdqa %%xmm0,%%xmm2 \n"
133 "psrlw $0x8,%%xmm0 \n"
134 "movdqa %%xmm1,%%xmm3 \n"
135 "psrlw $0x8,%%xmm1 \n"
136 "pand %%xmm5,%%xmm2 \n"
137 "pand %%xmm5,%%xmm3 \n"
138 "pavgw %%xmm2,%%xmm0 \n"
139 "pavgw %%xmm3,%%xmm1 \n"
140 "packuswb %%xmm1,%%xmm0 \n"
141 "movdqu %%xmm0," MEMACCESS(1) " \n"
142 "lea " MEMLEA(0x10,1) ",%1 \n"
143 "sub $0x10,%2 \n"
144 "jg 1b \n"
145 : "+r"(src_ptr), // %0
146 "+r"(dst_ptr), // %1
147 "+r"(dst_width) // %2
148 :: "memory", "cc", "xmm0", "xmm1", "xmm5"
149 );
150 }
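
// Here the even bytes (pand) and odd bytes (psrlw) of each pair are averaged
// with rounding: dst_ptr[x] = (src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1.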
151
152 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
153 uint8* dst_ptr, int dst_width) {
154 asm volatile (
155 "pcmpeqb %%xmm5,%%xmm5 \n"
156 "psrlw $0x8,%%xmm5 \n"
157
158 LABELALIGN
159 "1: \n"
160 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
161 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
162 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
163 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
164 "lea " MEMLEA(0x20,0) ",%0 \n"
165 "pavgb %%xmm2,%%xmm0 \n"
166 "pavgb %%xmm3,%%xmm1 \n"
167 "movdqa %%xmm0,%%xmm2 \n"
168 "psrlw $0x8,%%xmm0 \n"
169 "movdqa %%xmm1,%%xmm3 \n"
170 "psrlw $0x8,%%xmm1 \n"
171 "pand %%xmm5,%%xmm2 \n"
172 "pand %%xmm5,%%xmm3 \n"
173 "pavgw %%xmm2,%%xmm0 \n"
174 "pavgw %%xmm3,%%xmm1 \n"
175 "packuswb %%xmm1,%%xmm0 \n"
176 "movdqu %%xmm0," MEMACCESS(1) " \n"
177 "lea " MEMLEA(0x10,1) ",%1 \n"
178 "sub $0x10,%2 \n"
179 "jg 1b \n"
180 : "+r"(src_ptr), // %0
181 "+r"(dst_ptr), // %1
182 "+r"(dst_width) // %2
183 : "r"((intptr_t)(src_stride)) // %3
184 : "memory", "cc", NACL_R14
185 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
186 );
187 }
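
// pavgb first averages the two rows, then the mask/shift/pavgw sequence
// averages horizontal pairs, approximating (a + b + c + d + 2) >> 2 per 2x2
// block; the cascaded rounding can exceed the exact average by at most 1.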
188
189 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
190 uint8* dst_ptr, int dst_width) {
191 asm volatile (
192 "pcmpeqb %%xmm5,%%xmm5 \n"
193 "psrld $0x18,%%xmm5 \n"
194 "pslld $0x10,%%xmm5 \n"
195
196 LABELALIGN
197 "1: \n"
198 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
199 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
200 "lea " MEMLEA(0x20,0) ",%0 \n"
201 "pand %%xmm5,%%xmm0 \n"
202 "pand %%xmm5,%%xmm1 \n"
203 "packuswb %%xmm1,%%xmm0 \n"
204 "psrlw $0x8,%%xmm0 \n"
205 "packuswb %%xmm0,%%xmm0 \n"
206 "movq %%xmm0," MEMACCESS(1) " \n"
207 "lea " MEMLEA(0x8,1) ",%1 \n"
208 "sub $0x8,%2 \n"
209 "jg 1b \n"
210 : "+r"(src_ptr), // %0
211 "+r"(dst_ptr), // %1
212 "+r"(dst_width) // %2
213 :: "memory", "cc", "xmm0", "xmm1", "xmm5"
214 );
215 }
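
// The mask built in xmm5 is 0x00FF0000 per dword, keeping byte 2 of every
// group of 4, so the kernel point-samples dst_ptr[x] = src_ptr[4 * x + 2].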
216
217 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
218 uint8* dst_ptr, int dst_width) {
219 intptr_t stridex3 = 0;
220 asm volatile (
221 "pcmpeqb %%xmm7,%%xmm7 \n"
222 "psrlw $0x8,%%xmm7 \n"
223 "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
224
225 LABELALIGN
226 "1: \n"
227 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
228 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
229 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
230 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
231 "pavgb %%xmm2,%%xmm0 \n"
232 "pavgb %%xmm3,%%xmm1 \n"
233 MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
234 MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
235 MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4
236 MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5
237 "lea " MEMLEA(0x20,0) ",%0 \n"
238 "pavgb %%xmm4,%%xmm2 \n"
239 "pavgb %%xmm2,%%xmm0 \n"
240 "pavgb %%xmm5,%%xmm3 \n"
241 "pavgb %%xmm3,%%xmm1 \n"
242 "movdqa %%xmm0,%%xmm2 \n"
243 "psrlw $0x8,%%xmm0 \n"
244 "movdqa %%xmm1,%%xmm3 \n"
245 "psrlw $0x8,%%xmm1 \n"
246 "pand %%xmm7,%%xmm2 \n"
247 "pand %%xmm7,%%xmm3 \n"
248 "pavgw %%xmm2,%%xmm0 \n"
249 "pavgw %%xmm3,%%xmm1 \n"
250 "packuswb %%xmm1,%%xmm0 \n"
251 "movdqa %%xmm0,%%xmm2 \n"
252 "psrlw $0x8,%%xmm0 \n"
253 "pand %%xmm7,%%xmm2 \n"
254 "pavgw %%xmm2,%%xmm0 \n"
255 "packuswb %%xmm0,%%xmm0 \n"
256 "movq %%xmm0," MEMACCESS(1) " \n"
257 "lea " MEMLEA(0x8,1) ",%1 \n"
258 "sub $0x8,%2 \n"
259 "jg 1b \n"
260 : "+r"(src_ptr), // %0
261 "+r"(dst_ptr), // %1
262 "+r"(dst_width), // %2
263 "+r"(stridex3) // %3
264 : "r"((intptr_t)(src_stride)) // %4
265 : "memory", "cc", NACL_R14
266 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
267 );
268 }
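
// Three rounds of pavgb reduce the four source rows pairwise, then the same
// mask/shift/pavgw trick as ScaleRowDown2Box, applied twice, reduces each
// group of four columns, approximating a 4x4 box average.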
269
270 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
271 uint8* dst_ptr, int dst_width) {
272 asm volatile (
273 "movdqa %0,%%xmm3 \n"
274 "movdqa %1,%%xmm4 \n"
275 "movdqa %2,%%xmm5 \n"
276 :
277 : "m"(kShuf0), // %0
278 "m"(kShuf1), // %1
279 "m"(kShuf2) // %2
280 );
281 asm volatile (
282 LABELALIGN
283 "1: \n"
284 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
285 "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
286 "lea " MEMLEA(0x20,0) ",%0 \n"
287 "movdqa %%xmm2,%%xmm1 \n"
288 "palignr $0x8,%%xmm0,%%xmm1 \n"
289 "pshufb %%xmm3,%%xmm0 \n"
290 "pshufb %%xmm4,%%xmm1 \n"
291 "pshufb %%xmm5,%%xmm2 \n"
292 "movq %%xmm0," MEMACCESS(1) " \n"
293 "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
294 "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
295 "lea " MEMLEA(0x18,1) ",%1 \n"
296 "sub $0x18,%2 \n"
297 "jg 1b \n"
298 : "+r"(src_ptr), // %0
299 "+r"(dst_ptr), // %1
300 "+r"(dst_width) // %2
301 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
302 );
303 }
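
// A scalar sketch of the shuffles above (hypothetical helper): the three
// control vectors drop byte 2 of every group of 4, keeping source pixels
// 0, 1 and 3.
static void ScaleRowDown34_C_sketch(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}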
304
305 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
306 ptrdiff_t src_stride,
307 uint8* dst_ptr, int dst_width) {
308 asm volatile (
309 "movdqa %0,%%xmm2 \n" // kShuf01
310 "movdqa %1,%%xmm3 \n" // kShuf11
311 "movdqa %2,%%xmm4 \n" // kShuf21
312 :
313 : "m"(kShuf01), // %0
314 "m"(kShuf11), // %1
315 "m"(kShuf21) // %2
316 );
317 asm volatile (
318 "movdqa %0,%%xmm5 \n" // kMadd01
319 "movdqa %1,%%xmm0 \n" // kMadd11
320 "movdqa %2,%%xmm1 \n" // kRound34
321 :
322 : "m"(kMadd01), // %0
323 "m"(kMadd11), // %1
324 "m"(kRound34) // %2
325 );
326 asm volatile (
327 LABELALIGN
328 "1: \n"
329 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
330 MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7
331 "pavgb %%xmm7,%%xmm6 \n"
332 "pshufb %%xmm2,%%xmm6 \n"
333 "pmaddubsw %%xmm5,%%xmm6 \n"
334 "paddsw %%xmm1,%%xmm6 \n"
335 "psrlw $0x2,%%xmm6 \n"
336 "packuswb %%xmm6,%%xmm6 \n"
337 "movq %%xmm6," MEMACCESS(1) " \n"
338 "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
339 MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
340 "pavgb %%xmm7,%%xmm6 \n"
341 "pshufb %%xmm3,%%xmm6 \n"
342 "pmaddubsw %%xmm0,%%xmm6 \n"
343 "paddsw %%xmm1,%%xmm6 \n"
344 "psrlw $0x2,%%xmm6 \n"
345 "packuswb %%xmm6,%%xmm6 \n"
346 "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
347 "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
348 MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7
349 "lea " MEMLEA(0x20,0) ",%0 \n"
350 "pavgb %%xmm7,%%xmm6 \n"
351 "pshufb %%xmm4,%%xmm6 \n"
352 "pmaddubsw %4,%%xmm6 \n"
353 "paddsw %%xmm1,%%xmm6 \n"
354 "psrlw $0x2,%%xmm6 \n"
355 "packuswb %%xmm6,%%xmm6 \n"
356 "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
357 "lea " MEMLEA(0x18,1) ",%1 \n"
358 "sub $0x18,%2 \n"
359 "jg 1b \n"
360 : "+r"(src_ptr), // %0
361 "+r"(dst_ptr), // %1
362 "+r"(dst_width) // %2
363 : "r"((intptr_t)(src_stride)), // %3
364 "m"(kMadd21) // %4
365 : "memory", "cc", NACL_R14
366 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
367 );
368 }
369
370 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
371 ptrdiff_t src_stride,
372 uint8* dst_ptr, int dst_width) {
373 asm volatile (
374 "movdqa %0,%%xmm2 \n" // kShuf01
375 "movdqa %1,%%xmm3 \n" // kShuf11
376 "movdqa %2,%%xmm4 \n" // kShuf21
377 :
378 : "m"(kShuf01), // %0
379 "m"(kShuf11), // %1
380 "m"(kShuf21) // %2
381 );
382 asm volatile (
383 "movdqa %0,%%xmm5 \n" // kMadd01
384 "movdqa %1,%%xmm0 \n" // kMadd11
385 "movdqa %2,%%xmm1 \n" // kRound34
386 :
387 : "m"(kMadd01), // %0
388 "m"(kMadd11), // %1
389 "m"(kRound34) // %2
390 );
391
392 asm volatile (
393 LABELALIGN
394 "1: \n"
395 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
396 MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7
397 "pavgb %%xmm6,%%xmm7 \n"
398 "pavgb %%xmm7,%%xmm6 \n"
399 "pshufb %%xmm2,%%xmm6 \n"
400 "pmaddubsw %%xmm5,%%xmm6 \n"
401 "paddsw %%xmm1,%%xmm6 \n"
402 "psrlw $0x2,%%xmm6 \n"
403 "packuswb %%xmm6,%%xmm6 \n"
404 "movq %%xmm6," MEMACCESS(1) " \n"
405 "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
406 MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
407 "pavgb %%xmm6,%%xmm7 \n"
408 "pavgb %%xmm7,%%xmm6 \n"
409 "pshufb %%xmm3,%%xmm6 \n"
410 "pmaddubsw %%xmm0,%%xmm6 \n"
411 "paddsw %%xmm1,%%xmm6 \n"
412 "psrlw $0x2,%%xmm6 \n"
413 "packuswb %%xmm6,%%xmm6 \n"
414 "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
415 "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
416 MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7
417 "lea " MEMLEA(0x20,0) ",%0 \n"
418 "pavgb %%xmm6,%%xmm7 \n"
419 "pavgb %%xmm7,%%xmm6 \n"
420 "pshufb %%xmm4,%%xmm6 \n"
421 "pmaddubsw %4,%%xmm6 \n"
422 "paddsw %%xmm1,%%xmm6 \n"
423 "psrlw $0x2,%%xmm6 \n"
424 "packuswb %%xmm6,%%xmm6 \n"
425 "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
426 "lea " MEMLEA(0x18,1) ",%1 \n"
427 "sub $0x18,%2 \n"
428 "jg 1b \n"
429 : "+r"(src_ptr), // %0
430 "+r"(dst_ptr), // %1
431 "+r"(dst_width) // %2
432 : "r"((intptr_t)(src_stride)), // %3
433 "m"(kMadd21) // %4
434 : "memory", "cc", NACL_R14
435 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
436 );
437 }
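
// Of the two 3/4 box kernels above, ScaleRowDown34_1_Box blends the two
// source rows equally (a single pavgb), while ScaleRowDown34_0_Box applies
// pavgb twice to weight them 3:1, serving the other vertical phase of the
// 3/4 filter.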
438
439 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
440 uint8* dst_ptr, int dst_width) {
441 asm volatile (
442 "movdqa %3,%%xmm4 \n"
443 "movdqa %4,%%xmm5 \n"
444
445 LABELALIGN
446 "1: \n"
447 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
448 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
449 "lea " MEMLEA(0x20,0) ",%0 \n"
450 "pshufb %%xmm4,%%xmm0 \n"
451 "pshufb %%xmm5,%%xmm1 \n"
452 "paddusb %%xmm1,%%xmm0 \n"
453 "movq %%xmm0," MEMACCESS(1) " \n"
454 "movhlps %%xmm0,%%xmm1 \n"
455 "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
456 "lea " MEMLEA(0xc,1) ",%1 \n"
457 "sub $0xc,%2 \n"
458 "jg 1b \n"
459 : "+r"(src_ptr), // %0
460 "+r"(dst_ptr), // %1
461 "+r"(dst_width) // %2
462 : "m"(kShuf38a), // %3
463 "m"(kShuf38b) // %4
464 : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
465 );
466 }
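
// kShuf38a/kShuf38b keep bytes 0, 3, 6, 8, 11 and 14 of each 16-byte half,
// so every 32 source pixels yield 12 output pixels: a 3/8 point sample.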
467
468 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
469 ptrdiff_t src_stride,
470 uint8* dst_ptr, int dst_width) {
471 asm volatile (
472 "movdqa %0,%%xmm2 \n"
473 "movdqa %1,%%xmm3 \n"
474 "movdqa %2,%%xmm4 \n"
475 "movdqa %3,%%xmm5 \n"
476 :
477 : "m"(kShufAb0), // %0
478 "m"(kShufAb1), // %1
479 "m"(kShufAb2), // %2
480 "m"(kScaleAb2) // %3
481 );
482 asm volatile (
483 LABELALIGN
484 "1: \n"
485 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
486 MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1
487 "lea " MEMLEA(0x10,0) ",%0 \n"
488 "pavgb %%xmm1,%%xmm0 \n"
489 "movdqa %%xmm0,%%xmm1 \n"
490 "pshufb %%xmm2,%%xmm1 \n"
491 "movdqa %%xmm0,%%xmm6 \n"
492 "pshufb %%xmm3,%%xmm6 \n"
493 "paddusw %%xmm6,%%xmm1 \n"
494 "pshufb %%xmm4,%%xmm0 \n"
495 "paddusw %%xmm0,%%xmm1 \n"
496 "pmulhuw %%xmm5,%%xmm1 \n"
497 "packuswb %%xmm1,%%xmm1 \n"
498 "movd %%xmm1," MEMACCESS(1) " \n"
499 "psrlq $0x10,%%xmm1 \n"
500 "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
501 "lea " MEMLEA(0x6,1) ",%1 \n"
502 "sub $0x6,%2 \n"
503 "jg 1b \n"
504 : "+r"(src_ptr), // %0
505 "+r"(dst_ptr), // %1
506 "+r"(dst_width) // %2
507 : "r"((intptr_t)(src_stride)) // %3
508 : "memory", "cc", NACL_R14
509 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
510 );
511 }
512
513 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
514 ptrdiff_t src_stride,
515 uint8* dst_ptr, int dst_width) {
516 asm volatile (
517 "movdqa %0,%%xmm2 \n"
518 "movdqa %1,%%xmm3 \n"
519 "movdqa %2,%%xmm4 \n"
520 "pxor %%xmm5,%%xmm5 \n"
521 :
522 : "m"(kShufAc), // %0
523 "m"(kShufAc3), // %1
524 "m"(kScaleAc33) // %2
525 );
526 asm volatile (
527 LABELALIGN
528 "1: \n"
529 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
530 MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6
531 "movhlps %%xmm0,%%xmm1 \n"
532 "movhlps %%xmm6,%%xmm7 \n"
533 "punpcklbw %%xmm5,%%xmm0 \n"
534 "punpcklbw %%xmm5,%%xmm1 \n"
535 "punpcklbw %%xmm5,%%xmm6 \n"
536 "punpcklbw %%xmm5,%%xmm7 \n"
537 "paddusw %%xmm6,%%xmm0 \n"
538 "paddusw %%xmm7,%%xmm1 \n"
539 MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6
540 "lea " MEMLEA(0x10,0) ",%0 \n"
541 "movhlps %%xmm6,%%xmm7 \n"
542 "punpcklbw %%xmm5,%%xmm6 \n"
543 "punpcklbw %%xmm5,%%xmm7 \n"
544 "paddusw %%xmm6,%%xmm0 \n"
545 "paddusw %%xmm7,%%xmm1 \n"
546 "movdqa %%xmm0,%%xmm6 \n"
547 "psrldq $0x2,%%xmm0 \n"
548 "paddusw %%xmm0,%%xmm6 \n"
549 "psrldq $0x2,%%xmm0 \n"
550 "paddusw %%xmm0,%%xmm6 \n"
551 "pshufb %%xmm2,%%xmm6 \n"
552 "movdqa %%xmm1,%%xmm7 \n"
553 "psrldq $0x2,%%xmm1 \n"
554 "paddusw %%xmm1,%%xmm7 \n"
555 "psrldq $0x2,%%xmm1 \n"
556 "paddusw %%xmm1,%%xmm7 \n"
557 "pshufb %%xmm3,%%xmm7 \n"
558 "paddusw %%xmm7,%%xmm6 \n"
559 "pmulhuw %%xmm4,%%xmm6 \n"
560 "packuswb %%xmm6,%%xmm6 \n"
561 "movd %%xmm6," MEMACCESS(1) " \n"
562 "psrlq $0x10,%%xmm6 \n"
563 "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
564 "lea " MEMLEA(0x6,1) ",%1 \n"
565 "sub $0x6,%2 \n"
566 "jg 1b \n"
567 : "+r"(src_ptr), // %0
568 "+r"(dst_ptr), // %1
569 "+r"(dst_width) // %2
570 : "r"((intptr_t)(src_stride)) // %3
571 : "memory", "cc", NACL_R14
572 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
573 );
574 }
575
576 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
577 uint16* dst_ptr, int src_width, int src_height) {
578 int tmp_height = 0;
579 intptr_t tmp_src = 0;
580 asm volatile (
581 "pxor %%xmm4,%%xmm4 \n"
582 "sub $0x1,%5 \n"
583
584 LABELALIGN
585 "1: \n"
586 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
587 "mov %0,%3 \n"
588 "add %6,%0 \n"
589 "movdqa %%xmm0,%%xmm1 \n"
590 "punpcklbw %%xmm4,%%xmm0 \n"
591 "punpckhbw %%xmm4,%%xmm1 \n"
592 "mov %5,%2 \n"
593 "test %2,%2 \n"
594 "je 3f \n"
595
596 LABELALIGN
597 "2: \n"
598 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
599 "add %6,%0 \n"
600 "movdqa %%xmm2,%%xmm3 \n"
601 "punpcklbw %%xmm4,%%xmm2 \n"
602 "punpckhbw %%xmm4,%%xmm3 \n"
603 "paddusw %%xmm2,%%xmm0 \n"
604 "paddusw %%xmm3,%%xmm1 \n"
605 "sub $0x1,%2 \n"
606 "jg 2b \n"
607
608 LABELALIGN
609 "3: \n"
610 "movdqu %%xmm0," MEMACCESS(1) " \n"
611 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
612 "lea " MEMLEA(0x10,3) ",%0 \n"
613 "lea " MEMLEA(0x20,1) ",%1 \n"
614 "sub $0x10,%4 \n"
615 "jg 1b \n"
616 : "+r"(src_ptr), // %0
617 "+r"(dst_ptr), // %1
618 "+r"(tmp_height), // %2
619 "+r"(tmp_src), // %3
620 "+r"(src_width), // %4
621 "+rm"(src_height) // %5
622 : "rm"((intptr_t)(src_stride)) // %6
623 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
624 );
625 }
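
// A scalar sketch (hypothetical helper): accumulate src_height rows of bytes
// into 16-bit sums. The SSE2 loop uses paddusw, so sums saturate at 65535
// rather than wrapping.
static void ScaleAddRows_C_sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    unsigned int sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum += src_ptr[y * src_stride + x];
    }
    dst_ptr[x] = (uint16)(sum > 65535u ? 65535u : sum);  // saturate
  }
}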
626
627 // Bilinear column filtering. SSSE3 version.
628 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
629 int dst_width, int x, int dx) {
630 intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
631 asm volatile (
632 "movd %6,%%xmm2 \n"
633 "movd %7,%%xmm3 \n"
634 "movl $0x04040000,%k2 \n"
635 "movd %k2,%%xmm5 \n"
636 "pcmpeqb %%xmm6,%%xmm6 \n"
637 "psrlw $0x9,%%xmm6 \n"
638 "pextrw $0x1,%%xmm2,%k3 \n"
639 "subl $0x2,%5 \n"
640 "jl 29f \n"
641 "movdqa %%xmm2,%%xmm0 \n"
642 "paddd %%xmm3,%%xmm0 \n"
643 "punpckldq %%xmm0,%%xmm2 \n"
644 "punpckldq %%xmm3,%%xmm3 \n"
645 "paddd %%xmm3,%%xmm3 \n"
646 "pextrw $0x3,%%xmm2,%k4 \n"
647
648 LABELALIGN
649 "2: \n"
650 "movdqa %%xmm2,%%xmm1 \n"
651 "paddd %%xmm3,%%xmm2 \n"
652 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
653 "movd %k2,%%xmm0 \n"
654 "psrlw $0x9,%%xmm1 \n"
655 MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
656 "movd %k2,%%xmm4 \n"
657 "pshufb %%xmm5,%%xmm1 \n"
658 "punpcklwd %%xmm4,%%xmm0 \n"
659 "pxor %%xmm6,%%xmm1 \n"
660 "pmaddubsw %%xmm1,%%xmm0 \n"
661 "pextrw $0x1,%%xmm2,%k3 \n"
662 "pextrw $0x3,%%xmm2,%k4 \n"
663 "psrlw $0x7,%%xmm0 \n"
664 "packuswb %%xmm0,%%xmm0 \n"
665 "movd %%xmm0,%k2 \n"
666 "mov %w2," MEMACCESS(0) " \n"
667 "lea " MEMLEA(0x2,0) ",%0 \n"
668 "sub $0x2,%5 \n"
669 "jge 2b \n"
670
671 LABELALIGN
672 "29: \n"
673 "addl $0x1,%5 \n"
674 "jl 99f \n"
675 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
676 "movd %k2,%%xmm0 \n"
677 "psrlw $0x9,%%xmm2 \n"
678 "pshufb %%xmm5,%%xmm2 \n"
679 "pxor %%xmm6,%%xmm2 \n"
680 "pmaddubsw %%xmm2,%%xmm0 \n"
681 "psrlw $0x7,%%xmm0 \n"
682 "packuswb %%xmm0,%%xmm0 \n"
683 "movd %%xmm0,%k2 \n"
684 "mov %b2," MEMACCESS(0) " \n"
685 "99: \n"
686 : "+r"(dst_ptr), // %0
687 "+r"(src_ptr), // %1
688 "+a"(temp_pixel), // %2
689 "+r"(x0), // %3
690 "+r"(x1), // %4
691 "+rm"(dst_width) // %5
692 : "rm"(x), // %6
693 "rm"(dx) // %7
694 : "memory", "cc", NACL_R14
695 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
696 );
697 }
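
// A scalar sketch of the bilinear blend (hypothetical helper). x and dx are
// 16.16 fixed point; the SSSE3 loop keeps only the top 7 fraction bits, so
// its output can differ from this full-precision version by small rounding.
static void ScaleFilterCols_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;  // integer source position
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)(a + (((b - a) * (x & 0xffff)) >> 16));
    x += dx;
  }
}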
698
699 // Reads 16 pixels, duplicates them and writes 32 pixels.
700 // Uses unaligned loads and stores (movdqu), so no alignment is required.
701 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
702 int dst_width, int x, int dx) {
703 asm volatile (
704 LABELALIGN
705 "1: \n"
706 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
707 "lea " MEMLEA(0x10,1) ",%1 \n"
708 "movdqa %%xmm0,%%xmm1 \n"
709 "punpcklbw %%xmm0,%%xmm0 \n"
710 "punpckhbw %%xmm1,%%xmm1 \n"
711 "movdqu %%xmm0," MEMACCESS(0) " \n"
712 "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
713 "lea " MEMLEA(0x20,0) ",%0 \n"
714 "sub $0x20,%2 \n"
715 "jg 1b \n"
716
717 : "+r"(dst_ptr), // %0
718 "+r"(src_ptr), // %1
719 "+r"(dst_width) // %2
720 :: "memory", "cc", "xmm0", "xmm1"
721 );
722 }
723
724 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
725 ptrdiff_t src_stride,
726 uint8* dst_argb, int dst_width) {
727 asm volatile (
728 LABELALIGN
729 "1: \n"
730 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
731 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
732 "lea " MEMLEA(0x20,0) ",%0 \n"
733 "shufps $0xdd,%%xmm1,%%xmm0 \n"
734 "movdqu %%xmm0," MEMACCESS(1) " \n"
735 "lea " MEMLEA(0x10,1) ",%1 \n"
736 "sub $0x4,%2 \n"
737 "jg 1b \n"
738 : "+r"(src_argb), // %0
739 "+r"(dst_argb), // %1
740 "+r"(dst_width) // %2
741 :: "memory", "cc", "xmm0", "xmm1"
742 );
743 }
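
// shufps $0xdd selects dwords 1 and 3 from each pair of registers, so this
// kernel point-samples the odd ARGB pixels: dst[i] = src[2 * i + 1].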
744
745 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
746 ptrdiff_t src_stride,
747 uint8* dst_argb, int dst_width) {
748 asm volatile (
749 LABELALIGN
750 "1: \n"
751 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
752 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
753 "lea " MEMLEA(0x20,0) ",%0 \n"
754 "movdqa %%xmm0,%%xmm2 \n"
755 "shufps $0x88,%%xmm1,%%xmm0 \n"
756 "shufps $0xdd,%%xmm1,%%xmm2 \n"
757 "pavgb %%xmm2,%%xmm0 \n"
758 "movdqu %%xmm0," MEMACCESS(1) " \n"
759 "lea " MEMLEA(0x10,1) ",%1 \n"
760 "sub $0x4,%2 \n"
761 "jg 1b \n"
762 : "+r"(src_argb), // %0
763 "+r"(dst_argb), // %1
764 "+r"(dst_width) // %2
765 :: "memory", "cc", "xmm0", "xmm1"
766 );
767 }
768
769 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
770 ptrdiff_t src_stride,
771 uint8* dst_argb, int dst_width) {
772 asm volatile (
773 LABELALIGN
774 "1: \n"
775 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
776 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
777 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
778 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
779 "lea " MEMLEA(0x20,0) ",%0 \n"
780 "pavgb %%xmm2,%%xmm0 \n"
781 "pavgb %%xmm3,%%xmm1 \n"
782 "movdqa %%xmm0,%%xmm2 \n"
783 "shufps $0x88,%%xmm1,%%xmm0 \n"
784 "shufps $0xdd,%%xmm1,%%xmm2 \n"
785 "pavgb %%xmm2,%%xmm0 \n"
786 "movdqu %%xmm0," MEMACCESS(1) " \n"
787 "lea " MEMLEA(0x10,1) ",%1 \n"
788 "sub $0x4,%2 \n"
789 "jg 1b \n"
790 : "+r"(src_argb), // %0
791 "+r"(dst_argb), // %1
792 "+r"(dst_width) // %2
793 : "r"((intptr_t)(src_stride)) // %3
794 : "memory", "cc", NACL_R14
795 "xmm0", "xmm1", "xmm2", "xmm3"
796 );
797 }
798
799 // Reads 4 ARGB pixels at a time, stepping src_stepx pixels between reads.
800 // Uses an unaligned store (movdqu), so no alignment is required.
801 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
802 int src_stepx,
803 uint8* dst_argb, int dst_width) {
804 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
805 intptr_t src_stepx_x12 = 0;
806 asm volatile (
807 "lea " MEMLEA3(0x00,1,4) ",%1 \n"
808 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
809 LABELALIGN
810 "1: \n"
811 "movd " MEMACCESS(0) ",%%xmm0 \n"
812 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
813 "punpckldq %%xmm1,%%xmm0 \n"
814 MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
815 MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
816 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
817 "punpckldq %%xmm3,%%xmm2 \n"
818 "punpcklqdq %%xmm2,%%xmm0 \n"
819 "movdqu %%xmm0," MEMACCESS(2) " \n"
820 "lea " MEMLEA(0x10,2) ",%2 \n"
821 "sub $0x4,%3 \n"
822 "jg 1b \n"
823 : "+r"(src_argb), // %0
824 "+r"(src_stepx_x4), // %1
825 "+r"(dst_argb), // %2
826 "+r"(dst_width), // %3
827 "+r"(src_stepx_x12) // %4
828 :: "memory", "cc", NACL_R14
829 "xmm0", "xmm1", "xmm2", "xmm3"
830 );
831 }
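
// Samples one ARGB pixel every src_stepx source pixels; the two lea
// instructions precompute the byte offsets 4 * src_stepx and 12 * src_stepx
// used by the indexed loads.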
832
833 // Blends four 2x2 ARGB blocks to 4x1.
834 // Uses an unaligned store (movdqu), so no alignment is required.
835 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
836 ptrdiff_t src_stride, int src_stepx,
837 uint8* dst_argb, int dst_width) {
838 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
839 intptr_t src_stepx_x12 = 0;
840 intptr_t row1 = (intptr_t)(src_stride);
841 asm volatile (
842 "lea " MEMLEA3(0x00,1,4) ",%1 \n"
843 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
844 "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
845
846 LABELALIGN
847 "1: \n"
848 "movq " MEMACCESS(0) ",%%xmm0 \n"
849 MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
850 MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
851 MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
852 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
853 "movq " MEMACCESS(5) ",%%xmm2 \n"
854 MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
855 MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
856 MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
857 "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
858 "pavgb %%xmm2,%%xmm0 \n"
859 "pavgb %%xmm3,%%xmm1 \n"
860 "movdqa %%xmm0,%%xmm2 \n"
861 "shufps $0x88,%%xmm1,%%xmm0 \n"
862 "shufps $0xdd,%%xmm1,%%xmm2 \n"
863 "pavgb %%xmm2,%%xmm0 \n"
864 "movdqu %%xmm0," MEMACCESS(2) " \n"
865 "lea " MEMLEA(0x10,2) ",%2 \n"
866 "sub $0x4,%3 \n"
867 "jg 1b \n"
868 : "+r"(src_argb), // %0
869 "+r"(src_stepx_x4), // %1
870 "+r"(dst_argb), // %2
871 "+rm"(dst_width), // %3
872 "+r"(src_stepx_x12), // %4
873 "+r"(row1) // %5
874 :: "memory", "cc", NACL_R14
875 "xmm0", "xmm1", "xmm2", "xmm3"
876 );
877 }
878
879 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
880 int dst_width, int x, int dx) {
881 intptr_t x0 = 0, x1 = 0;
882 asm volatile (
883 "movd %5,%%xmm2 \n"
884 "movd %6,%%xmm3 \n"
885 "pshufd $0x0,%%xmm2,%%xmm2 \n"
886 "pshufd $0x11,%%xmm3,%%xmm0 \n"
887 "paddd %%xmm0,%%xmm2 \n"
888 "paddd %%xmm3,%%xmm3 \n"
889 "pshufd $0x5,%%xmm3,%%xmm0 \n"
890 "paddd %%xmm0,%%xmm2 \n"
891 "paddd %%xmm3,%%xmm3 \n"
892 "pshufd $0x0,%%xmm3,%%xmm3 \n"
893 "pextrw $0x1,%%xmm2,%k0 \n"
894 "pextrw $0x3,%%xmm2,%k1 \n"
895 "cmp $0x0,%4 \n"
896 "jl 99f \n"
897 "sub $0x4,%4 \n"
898 "jl 49f \n"
899
900 LABELALIGN
901 "40: \n"
902 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
903 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
904 "pextrw $0x5,%%xmm2,%k0 \n"
905 "pextrw $0x7,%%xmm2,%k1 \n"
906 "paddd %%xmm3,%%xmm2 \n"
907 "punpckldq %%xmm1,%%xmm0 \n"
908 MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
909 MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
910 "pextrw $0x1,%%xmm2,%k0 \n"
911 "pextrw $0x3,%%xmm2,%k1 \n"
912 "punpckldq %%xmm4,%%xmm1 \n"
913 "punpcklqdq %%xmm1,%%xmm0 \n"
914 "movdqu %%xmm0," MEMACCESS(2) " \n"
915 "lea " MEMLEA(0x10,2) ",%2 \n"
916 "sub $0x4,%4 \n"
917 "jge 40b \n"
918
919 "49: \n"
920 "test $0x2,%4 \n"
921 "je 29f \n"
922 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
923 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
924 "pextrw $0x5,%%xmm2,%k0 \n"
925 "punpckldq %%xmm1,%%xmm0 \n"
926 "movq %%xmm0," MEMACCESS(2) " \n"
927 "lea " MEMLEA(0x8,2) ",%2 \n"
928 "29: \n"
929 "test $0x1,%4 \n"
930 "je 99f \n"
931 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
932 "movd %%xmm0," MEMACCESS(2) " \n"
933 "99: \n"
934 : "+a"(x0), // %0
935 "+d"(x1), // %1
936 "+r"(dst_argb), // %2
937 "+r"(src_argb), // %3
938 "+r"(dst_width) // %4
939 : "rm"(x), // %5
940 "rm"(dx) // %6
941 : "memory", "cc", NACL_R14
942 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
943 );
944 }
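
// A scalar sketch (hypothetical helper): point-sample whole ARGB pixels with
// a 16.16 fixed-point source position.
static void ScaleARGBCols_C_sketch(uint8* dst_argb, const uint8* src_argb,
                                   int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}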
945
946 // Reads 4 ARGB pixels, duplicates them and writes 8 pixels.
947 // Uses unaligned loads and stores (movdqu), so no alignment is required.
948 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
949 int dst_width, int x, int dx) {
950 asm volatile (
951 LABELALIGN
952 "1: \n"
953 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
954 "lea " MEMLEA(0x10,1) ",%1 \n"
955 "movdqa %%xmm0,%%xmm1 \n"
956 "punpckldq %%xmm0,%%xmm0 \n"
957 "punpckhdq %%xmm1,%%xmm1 \n"
958 "movdqu %%xmm0," MEMACCESS(0) " \n"
959 "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
960 "lea " MEMLEA(0x20,0) ",%0 \n"
961 "sub $0x8,%2 \n"
962 "jg 1b \n"
963
964 : "+r"(dst_argb), // %0
965 "+r"(src_argb), // %1
966 "+r"(dst_width) // %2
967 :: "memory", "cc", NACL_R14
968 "xmm0", "xmm1"
969 );
970 }
971
972 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
973 static uvec8 kShuffleColARGB = {
974 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
975 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
976 };
977
978 // Shuffle table for duplicating 2 fractions into 8 bytes each
979 static uvec8 kShuffleFractions = {
980 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
981 };
982
983 // Bilinear ARGB column filtering; combines 2x1 -> 1x1. SSSE3 version.
984 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
985 int dst_width, int x, int dx) {
986 intptr_t x0 = 0, x1 = 0;
987 asm volatile (
988 "movdqa %0,%%xmm4 \n"
989 "movdqa %1,%%xmm5 \n"
990 :
991 : "m"(kShuffleColARGB), // %0
992 "m"(kShuffleFractions) // %1
993 );
994
995 asm volatile (
996 "movd %5,%%xmm2 \n"
997 "movd %6,%%xmm3 \n"
998 "pcmpeqb %%xmm6,%%xmm6 \n"
999 "psrlw $0x9,%%xmm6 \n"
1000 "pextrw $0x1,%%xmm2,%k3 \n"
1001 "sub $0x2,%2 \n"
1002 "jl 29f \n"
1003 "movdqa %%xmm2,%%xmm0 \n"
1004 "paddd %%xmm3,%%xmm0 \n"
1005 "punpckldq %%xmm0,%%xmm2 \n"
1006 "punpckldq %%xmm3,%%xmm3 \n"
1007 "paddd %%xmm3,%%xmm3 \n"
1008 "pextrw $0x3,%%xmm2,%k4 \n"
1009
1010 LABELALIGN
1011 "2: \n"
1012 "movdqa %%xmm2,%%xmm1 \n"
1013 "paddd %%xmm3,%%xmm2 \n"
1014 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
1015 "psrlw $0x9,%%xmm1 \n"
1016 MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
1017 "pshufb %%xmm5,%%xmm1 \n"
1018 "pshufb %%xmm4,%%xmm0 \n"
1019 "pxor %%xmm6,%%xmm1 \n"
1020 "pmaddubsw %%xmm1,%%xmm0 \n"
1021 "psrlw $0x7,%%xmm0 \n"
1022 "pextrw $0x1,%%xmm2,%k3 \n"
1023 "pextrw $0x3,%%xmm2,%k4 \n"
1024 "packuswb %%xmm0,%%xmm0 \n"
1025 "movq %%xmm0," MEMACCESS(0) " \n"
1026 "lea " MEMLEA(0x8,0) ",%0 \n"
1027 "sub $0x2,%2 \n"
1028 "jge 2b \n"
1029
1030 LABELALIGN
1031 "29: \n"
1032 "add $0x1,%2 \n"
1033 "jl 99f \n"
1034 "psrlw $0x9,%%xmm2 \n"
1035 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
1036 "pshufb %%xmm5,%%xmm2 \n"
1037 "pshufb %%xmm4,%%xmm0 \n"
1038 "pxor %%xmm6,%%xmm2 \n"
1039 "pmaddubsw %%xmm2,%%xmm0 \n"
1040 "psrlw $0x7,%%xmm0 \n"
1041 "packuswb %%xmm0,%%xmm0 \n"
1042 "movd %%xmm0," MEMACCESS(0) " \n"
1043
1044 LABELALIGN
1045 "99: \n"
1046 : "+r"(dst_argb), // %0
1047 "+r"(src_argb), // %1
1048 "+rm"(dst_width), // %2
1049 "+r"(x0), // %3
1050 "+r"(x1) // %4
1051 : "rm"(x), // %5
1052 "rm"(dx) // %6
1053 : "memory", "cc", NACL_R14
1054 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1055 );
1056 }
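
// This is the ARGB analogue of ScaleFilterCols_SSSE3 above: source pixels are
// fetched in pairs, kShuffleColARGB interleaves their channels, and a single
// pmaddubsw blends all four channels with the same 7-bit fraction.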
1057
1058 // Divide num by div and return as 16.16 fixed point result.
1059 int FixedDiv_X86(int num, int div) {
1060 asm volatile (
1061 "cdq \n"
1062 "shld $0x10,%%eax,%%edx \n"
1063 "shl $0x10,%%eax \n"
1064 "idiv %1 \n"
1065 "mov %0, %%eax \n"
1066 : "+a"(num) // %0
1067 : "c"(div) // %1
1068 : "memory", "cc", "edx"
1069 );
1070 return num;
1071 }
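
// Equivalent C (a sketch): (int)(((int64)(num) << 16) / div). For example,
// FixedDiv_X86(1, 3) returns 65536 / 3 = 21845, i.e. 1/3 in 16.16 form.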
1072
1073 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
1074 int FixedDiv1_X86(int num, int div) {
1075 asm volatile (
1076 "cdq \n"
1077 "shld $0x10,%%eax,%%edx \n"
1078 "shl $0x10,%%eax \n"
1079 "sub $0x10001,%%eax \n"
1080 "sbb $0x0,%%edx \n"
1081 "sub $0x1,%1 \n"
1082 "idiv %1 \n"
1083 "mov %0, %%eax \n"
1084 : "+a"(num) // %0
1085 : "c"(div) // %1
1086 : "memory", "cc", "edx"
1087 );
1088 return num;
1089 }
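
// Equivalent C (a sketch): (int)((((int64)(num) << 16) - 0x00010001) /
// (div - 1)), i.e. (num - 1) / (div - 1) in 16.16 form, nudged down one ulp.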
1090
1091 #endif // defined(__x86_64__) || defined(__i386__)
1092
1093 #ifdef __cplusplus
1094 } // extern "C"
1095 } // namespace libyuv
1096 #endif