Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/libyuv/source/row_posix.cc

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 6 months ago
1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "third_party/libyuv/include/libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC x86 and x64.
19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
20
21 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
22
23 // Constants for ARGB
24 static vec8 kARGBToY = {
25 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
26 };
27
28 // JPEG full range.
29 static vec8 kARGBToYJ = {
30 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
31 };
32 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
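// For reference: ARGB is stored B,G,R,A in memory, so the weights above read
// in that order. They are BT.601 luma coefficients in 7-bit fixed point. The
// SSSE3 rows below compute Y = ((13*B + 65*G + 33*R) >> 7) + 16 for studio
// swing (kARGBToY plus the kAddY16 bias), and
// Y = (15*B + 75*G + 38*R + 64) >> 7 for JPEG full range (kARGBToYJ plus
// kAddYJ64). A scalar sketch of the studio-swing path; the helper name is
// illustrative, not part of the upstream API:
static inline uint8 ScalarARGBToY(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}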
33
34 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
35
36 static vec8 kARGBToU = {
37 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
38 };
39
40 static vec8 kARGBToUJ = {
41 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
42 };
43
44 static vec8 kARGBToV = {
45 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
46 };
47
48 static vec8 kARGBToVJ = {
49 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
50 };
51
52 // Constants for BGRA
53 static vec8 kBGRAToY = {
54 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
55 };
56
57 static vec8 kBGRAToU = {
58 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
59 };
60
61 static vec8 kBGRAToV = {
62 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
63 };
64
65 // Constants for ABGR
66 static vec8 kABGRToY = {
67 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
68 };
69
70 static vec8 kABGRToU = {
71 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
72 };
73
74 static vec8 kABGRToV = {
75 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
76 };
77
78 // Constants for RGBA.
79 static vec8 kRGBAToY = {
80 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
81 };
82
83 static vec8 kRGBAToU = {
84 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
85 };
86
87 static vec8 kRGBAToV = {
88 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
89 };
90
91 static uvec8 kAddY16 = {
92 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
93 };
94
95 static vec16 kAddYJ64 = {
96 64, 64, 64, 64, 64, 64, 64, 64
97 };
98
99 static uvec8 kAddUV128 = {
100 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
101 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
102 };
103
104 static uvec16 kAddUVJ128 = {
105 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
106 };
107 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
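// For reference, the U/V weights above are BT.601 chroma coefficients in
// 8-bit fixed point (e.g. 112/256 = 0.4375), applied as
//   U = ((112*B - 74*G - 38*R) >> 8) + 128
//   V = ((-18*B - 94*G + 112*R) >> 8) + 128
// kAddUV128 supplies the +128 recentering after packing. In the JPEG path,
// the 0x8080 words of kAddUVJ128 fold the same +128 plus a +0.5 rounding
// term into a paddw applied before the arithmetic shift.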
108
109 #ifdef HAS_RGB24TOARGBROW_SSSE3
110
111 // Shuffle table for converting RGB24 to ARGB.
112 static uvec8 kShuffleMaskRGB24ToARGB = {
113 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
114 };
115
116 // Shuffle table for converting RAW to ARGB.
117 static uvec8 kShuffleMaskRAWToARGB = {
118 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
119 };
120
121 // Shuffle table for converting ARGB to RGB24.
122 static uvec8 kShuffleMaskARGBToRGB24 = {
123 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
124 };
125
126 // Shuffle table for converting ARGB to RAW.
127 static uvec8 kShuffleMaskARGBToRAW = {
128 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
129 };
130
131 // Shuffle table for converting ARGB to RGB24 for I422ToRGB24. First 8 + next 4.
132 static uvec8 kShuffleMaskARGBToRGB24_0 = {
133 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
134 };
135
136 // Shuffle table for converting ARGB to RAW for I422ToRAW. First 8 + next 4.
137 static uvec8 kShuffleMaskARGBToRAW_0 = {
138 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
139 };
140 #endif // HAS_RGB24TOARGBROW_SSSE3
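// pshufb semantics for the tables above, for reference: output byte i takes
// src[mask[i] & 15], or 0 when mask[i] has its high bit set (the 128u
// entries). A scalar sketch; this helper is hypothetical and unused here:
static inline void ScalarPshufb(const uint8 src[16], const uint8 mask[16],
                                uint8 dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}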
141
142 #if defined(TESTING) && defined(__x86_64__)
143 void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
144 asm volatile (
145 ".p2align 5 \n"
146 "mov %%eax,%%eax \n"
147 "mov %%ebx,%%ebx \n"
148 "mov %%ecx,%%ecx \n"
149 "mov %%edx,%%edx \n"
150 "mov %%esi,%%esi \n"
151 "mov %%edi,%%edi \n"
152 "mov %%ebp,%%ebp \n"
153 "mov %%esp,%%esp \n"
154 ".p2align 5 \n"
155 "mov %%r8d,%%r8d \n"
156 "mov %%r9d,%%r9d \n"
157 "mov %%r10d,%%r10d \n"
158 "mov %%r11d,%%r11d \n"
159 "mov %%r12d,%%r12d \n"
160 "mov %%r13d,%%r13d \n"
161 "mov %%r14d,%%r14d \n"
162 "mov %%r15d,%%r15d \n"
163 ".p2align 5 \n"
164 "lea (%%rax),%%eax \n"
165 "lea (%%rbx),%%ebx \n"
166 "lea (%%rcx),%%ecx \n"
167 "lea (%%rdx),%%edx \n"
168 "lea (%%rsi),%%esi \n"
169 "lea (%%rdi),%%edi \n"
170 "lea (%%rbp),%%ebp \n"
171 "lea (%%rsp),%%esp \n"
172 ".p2align 5 \n"
173 "lea (%%r8),%%r8d \n"
174 "lea (%%r9),%%r9d \n"
175 "lea (%%r10),%%r10d \n"
176 "lea (%%r11),%%r11d \n"
177 "lea (%%r12),%%r12d \n"
178 "lea (%%r13),%%r13d \n"
179 "lea (%%r14),%%r14d \n"
180 "lea (%%r15),%%r15d \n"
181
182 ".p2align 5 \n"
183 "lea 0x10(%%rax),%%eax \n"
184 "lea 0x10(%%rbx),%%ebx \n"
185 "lea 0x10(%%rcx),%%ecx \n"
186 "lea 0x10(%%rdx),%%edx \n"
187 "lea 0x10(%%rsi),%%esi \n"
188 "lea 0x10(%%rdi),%%edi \n"
189 "lea 0x10(%%rbp),%%ebp \n"
190 "lea 0x10(%%rsp),%%esp \n"
191 ".p2align 5 \n"
192 "lea 0x10(%%r8),%%r8d \n"
193 "lea 0x10(%%r9),%%r9d \n"
194 "lea 0x10(%%r10),%%r10d \n"
195 "lea 0x10(%%r11),%%r11d \n"
196 "lea 0x10(%%r12),%%r12d \n"
197 "lea 0x10(%%r13),%%r13d \n"
198 "lea 0x10(%%r14),%%r14d \n"
199 "lea 0x10(%%r15),%%r15d \n"
200
201 ".p2align 5 \n"
202 "add 0x10,%%eax \n"
203 "add 0x10,%%ebx \n"
204 "add 0x10,%%ecx \n"
205 "add 0x10,%%edx \n"
206 "add 0x10,%%esi \n"
207 "add 0x10,%%edi \n"
208 "add 0x10,%%ebp \n"
209 "add 0x10,%%esp \n"
210 ".p2align 5 \n"
211 "add 0x10,%%r8d \n"
212 "add 0x10,%%r9d \n"
213 "add 0x10,%%r10d \n"
214 "add 0x10,%%r11d \n"
215 "add 0x10,%%r12d \n"
216 "add 0x10,%%r13d \n"
217 "add 0x10,%%r14d \n"
218 "add 0x10,%%r15d \n"
219
220 ".p2align 2 \n"
221 "1: \n"
222 "movq " MEMACCESS(0) ",%%xmm0 \n"
223 "lea " MEMLEA(0x8,0) ",%0 \n"
224 "movdqa %%xmm0," MEMACCESS(1) " \n"
225 "lea " MEMLEA(0x20,1) ",%1 \n"
226 "sub $0x8,%2 \n"
227 "jg 1b \n"
228 : "+r"(src_y), // %0
229 "+r"(dst_argb), // %1
230 "+r"(pix) // %2
231 :
232 : "memory", "cc"
233 #if defined(__SSE2__)
234 , "xmm0", "xmm1", "xmm5"
235 #endif
236 );
237 }
238 #endif // defined(TESTING) && defined(__x86_64__)
239
240 #ifdef HAS_I400TOARGBROW_SSE2
241 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
242 asm volatile (
243 "pcmpeqb %%xmm5,%%xmm5 \n"
244 "pslld $0x18,%%xmm5 \n"
245 LABELALIGN
246 "1: \n"
247 "movq " MEMACCESS(0) ",%%xmm0 \n"
248 "lea " MEMLEA(0x8,0) ",%0 \n"
249 "punpcklbw %%xmm0,%%xmm0 \n"
250 "movdqa %%xmm0,%%xmm1 \n"
251 "punpcklwd %%xmm0,%%xmm0 \n"
252 "punpckhwd %%xmm1,%%xmm1 \n"
253 "por %%xmm5,%%xmm0 \n"
254 "por %%xmm5,%%xmm1 \n"
255 "movdqa %%xmm0," MEMACCESS(1) " \n"
256 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
257 "lea " MEMLEA(0x20,1) ",%1 \n"
258 "sub $0x8,%2 \n"
259 "jg 1b \n"
260 : "+r"(src_y), // %0
261 "+r"(dst_argb), // %1
262 "+r"(pix) // %2
263 :
264 : "memory", "cc"
265 #if defined(__SSE2__)
266 , "xmm0", "xmm1", "xmm5"
267 #endif
268 );
269 }
270
271 void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
272 int pix) {
273 asm volatile (
274 "pcmpeqb %%xmm5,%%xmm5 \n"
275 "pslld $0x18,%%xmm5 \n"
276 LABELALIGN
277 "1: \n"
278 "movq " MEMACCESS(0) ",%%xmm0 \n"
279 "lea " MEMLEA(0x8,0) ",%0 \n"
280 "punpcklbw %%xmm0,%%xmm0 \n"
281 "movdqa %%xmm0,%%xmm1 \n"
282 "punpcklwd %%xmm0,%%xmm0 \n"
283 "punpckhwd %%xmm1,%%xmm1 \n"
284 "por %%xmm5,%%xmm0 \n"
285 "por %%xmm5,%%xmm1 \n"
286 "movdqu %%xmm0," MEMACCESS(1) " \n"
287 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
288 "lea " MEMLEA(0x20,1) ",%1 \n"
289 "sub $0x8,%2 \n"
290 "jg 1b \n"
291 : "+r"(src_y), // %0
292 "+r"(dst_argb), // %1
293 "+r"(pix) // %2
294 :
295 : "memory", "cc"
296 #if defined(__SSE2__)
297 , "xmm0", "xmm1", "xmm5"
298 #endif
299 );
300 }
301 #endif // HAS_I400TOARGBROW_SSE2
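// What the unpack/por sequence above produces, one pixel at a time: each
// luma byte is replicated into B, G and R, and the 0xff000000 mask built in
// xmm5 supplies alpha. A scalar sketch (hypothetical helper):
static void I400ToARGBRow_C_Sketch(const uint8* src_y, uint8* dst_argb,
                                   int pix) {
  for (int i = 0; i < pix; ++i) {
    const uint8 y = src_y[i];
    dst_argb[4 * i + 0] = y;     // B
    dst_argb[4 * i + 1] = y;     // G
    dst_argb[4 * i + 2] = y;     // R
    dst_argb[4 * i + 3] = 0xff;  // A
  }
}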
302
303 #ifdef HAS_RGB24TOARGBROW_SSSE3
304 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
305 asm volatile (
306 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
307 "pslld $0x18,%%xmm5 \n"
308 "movdqa %3,%%xmm4 \n"
309 LABELALIGN
310 "1: \n"
311 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
312 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
313 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
314 "lea " MEMLEA(0x30,0) ",%0 \n"
315 "movdqa %%xmm3,%%xmm2 \n"
316 "palignr $0x8,%%xmm1,%%xmm2 \n"
317 "pshufb %%xmm4,%%xmm2 \n"
318 "por %%xmm5,%%xmm2 \n"
319 "palignr $0xc,%%xmm0,%%xmm1 \n"
320 "pshufb %%xmm4,%%xmm0 \n"
321 "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
322 "por %%xmm5,%%xmm0 \n"
323 "pshufb %%xmm4,%%xmm1 \n"
324 "movdqa %%xmm0," MEMACCESS(1) " \n"
325 "por %%xmm5,%%xmm1 \n"
326 "palignr $0x4,%%xmm3,%%xmm3 \n"
327 "pshufb %%xmm4,%%xmm3 \n"
328 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
329 "por %%xmm5,%%xmm3 \n"
330 "sub $0x10,%2 \n"
331 "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
332 "lea " MEMLEA(0x40,1) ",%1 \n"
333 "jg 1b \n"
334 : "+r"(src_rgb24), // %0
335 "+r"(dst_argb), // %1
336 "+r"(pix) // %2
337 : "m"(kShuffleMaskRGB24ToARGB) // %3
338 : "memory", "cc"
339 #if defined(__SSE2__)
340 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
341 #endif
342 );
343 }
344
345 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
346 asm volatile (
347 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
348 "pslld $0x18,%%xmm5 \n"
349 "movdqa %3,%%xmm4 \n"
350 LABELALIGN
351 "1: \n"
352 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
353 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
354 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
355 "lea " MEMLEA(0x30,0) ",%0 \n"
356 "movdqa %%xmm3,%%xmm2 \n"
357 "palignr $0x8,%%xmm1,%%xmm2 \n"
358 "pshufb %%xmm4,%%xmm2 \n"
359 "por %%xmm5,%%xmm2 \n"
360 "palignr $0xc,%%xmm0,%%xmm1 \n"
361 "pshufb %%xmm4,%%xmm0 \n"
362 "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
363 "por %%xmm5,%%xmm0 \n"
364 "pshufb %%xmm4,%%xmm1 \n"
365 "movdqa %%xmm0," MEMACCESS(1) " \n"
366 "por %%xmm5,%%xmm1 \n"
367 "palignr $0x4,%%xmm3,%%xmm3 \n"
368 "pshufb %%xmm4,%%xmm3 \n"
369 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
370 "por %%xmm5,%%xmm3 \n"
371 "sub $0x10,%2 \n"
372 "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
373 "lea " MEMLEA(0x40,1) ",%1 \n"
374 "jg 1b \n"
375 : "+r"(src_raw), // %0
376 "+r"(dst_argb), // %1
377 "+r"(pix) // %2
378 : "m"(kShuffleMaskRAWToARGB) // %3
379 : "memory", "cc"
380 #if defined(__SSE2__)
381 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
382 #endif
383 );
384 }
385
386 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
387 asm volatile (
388 "mov $0x1080108,%%eax \n"
389 "movd %%eax,%%xmm5 \n"
390 "pshufd $0x0,%%xmm5,%%xmm5 \n"
391 "mov $0x20802080,%%eax \n"
392 "movd %%eax,%%xmm6 \n"
393 "pshufd $0x0,%%xmm6,%%xmm6 \n"
394 "pcmpeqb %%xmm3,%%xmm3 \n"
395 "psllw $0xb,%%xmm3 \n"
396 "pcmpeqb %%xmm4,%%xmm4 \n"
397 "psllw $0xa,%%xmm4 \n"
398 "psrlw $0x5,%%xmm4 \n"
399 "pcmpeqb %%xmm7,%%xmm7 \n"
400 "psllw $0x8,%%xmm7 \n"
401 "sub %0,%1 \n"
402 "sub %0,%1 \n"
403 LABELALIGN
404 "1: \n"
405 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
406 "movdqa %%xmm0,%%xmm1 \n"
407 "movdqa %%xmm0,%%xmm2 \n"
408 "pand %%xmm3,%%xmm1 \n"
409 "psllw $0xb,%%xmm2 \n"
410 "pmulhuw %%xmm5,%%xmm1 \n"
411 "pmulhuw %%xmm5,%%xmm2 \n"
412 "psllw $0x8,%%xmm1 \n"
413 "por %%xmm2,%%xmm1 \n"
414 "pand %%xmm4,%%xmm0 \n"
415 "pmulhuw %%xmm6,%%xmm0 \n"
416 "por %%xmm7,%%xmm0 \n"
417 "movdqa %%xmm1,%%xmm2 \n"
418 "punpcklbw %%xmm0,%%xmm1 \n"
419 "punpckhbw %%xmm0,%%xmm2 \n"
420 BUNDLEALIGN
421 MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
422 MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
423 "lea " MEMLEA(0x10,0) ",%0 \n"
424 "sub $0x8,%2 \n"
425 "jg 1b \n"
426 : "+r"(src), // %0
427 "+r"(dst), // %1
428 "+r"(pix) // %2
429 :
430 : "memory", "cc", "eax"
431 #if defined(__native_client__) && defined(__x86_64__)
432 , "r14"
433 #endif
434 #if defined(__SSE2__)
435 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
436 #endif
437 );
438 }
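// The magic multipliers above implement bit-replicating channel expansion
// via pmulhuw: 0x0108 maps a 5-bit field v to (v << 3) | (v >> 2), and
// 0x2080 maps the 6-bit green field g to (g << 2) | (g >> 4). A scalar
// sketch of one RGB565 pixel (hypothetical helper):
static inline void ScalarRGB565ToBGR(uint16 p, uint8* b, uint8* g, uint8* r) {
  const uint8 b5 = (uint8)(p & 0x1f);
  const uint8 g6 = (uint8)((p >> 5) & 0x3f);
  const uint8 r5 = (uint8)(p >> 11);
  *b = (uint8)((b5 << 3) | (b5 >> 2));
  *g = (uint8)((g6 << 2) | (g6 >> 4));
  *r = (uint8)((r5 << 3) | (r5 >> 2));
}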
439
440 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
441 asm volatile (
442 "mov $0x1080108,%%eax \n"
443 "movd %%eax,%%xmm5 \n"
444 "pshufd $0x0,%%xmm5,%%xmm5 \n"
445 "mov $0x42004200,%%eax \n"
446 "movd %%eax,%%xmm6 \n"
447 "pshufd $0x0,%%xmm6,%%xmm6 \n"
448 "pcmpeqb %%xmm3,%%xmm3 \n"
449 "psllw $0xb,%%xmm3 \n"
450 "movdqa %%xmm3,%%xmm4 \n"
451 "psrlw $0x6,%%xmm4 \n"
452 "pcmpeqb %%xmm7,%%xmm7 \n"
453 "psllw $0x8,%%xmm7 \n"
454 "sub %0,%1 \n"
455 "sub %0,%1 \n"
456 LABELALIGN
457 "1: \n"
458 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
459 "movdqa %%xmm0,%%xmm1 \n"
460 "movdqa %%xmm0,%%xmm2 \n"
461 "psllw $0x1,%%xmm1 \n"
462 "psllw $0xb,%%xmm2 \n"
463 "pand %%xmm3,%%xmm1 \n"
464 "pmulhuw %%xmm5,%%xmm2 \n"
465 "pmulhuw %%xmm5,%%xmm1 \n"
466 "psllw $0x8,%%xmm1 \n"
467 "por %%xmm2,%%xmm1 \n"
468 "movdqa %%xmm0,%%xmm2 \n"
469 "pand %%xmm4,%%xmm0 \n"
470 "psraw $0x8,%%xmm2 \n"
471 "pmulhuw %%xmm6,%%xmm0 \n"
472 "pand %%xmm7,%%xmm2 \n"
473 "por %%xmm2,%%xmm0 \n"
474 "movdqa %%xmm1,%%xmm2 \n"
475 "punpcklbw %%xmm0,%%xmm1 \n"
476 "punpckhbw %%xmm0,%%xmm2 \n"
477 BUNDLEALIGN
478 MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
479 MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
480 "lea " MEMLEA(0x10,0) ",%0 \n"
481 "sub $0x8,%2 \n"
482 "jg 1b \n"
483 : "+r"(src), // %0
484 "+r"(dst), // %1
485 "+r"(pix) // %2
486 :
487 : "memory", "cc", "eax"
488 #if defined(__native_client__) && defined(__x86_64__)
489 , "r14"
490 #endif
491 #if defined(__SSE2__)
492 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
493 #endif
494 );
495 }
496
497 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
498 asm volatile (
499 "mov $0xf0f0f0f,%%eax \n"
500 "movd %%eax,%%xmm4 \n"
501 "pshufd $0x0,%%xmm4,%%xmm4 \n"
502 "movdqa %%xmm4,%%xmm5 \n"
503 "pslld $0x4,%%xmm5 \n"
504 "sub %0,%1 \n"
505 "sub %0,%1 \n"
506 LABELALIGN
507 "1: \n"
508 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
509 "movdqa %%xmm0,%%xmm2 \n"
510 "pand %%xmm4,%%xmm0 \n"
511 "pand %%xmm5,%%xmm2 \n"
512 "movdqa %%xmm0,%%xmm1 \n"
513 "movdqa %%xmm2,%%xmm3 \n"
514 "psllw $0x4,%%xmm1 \n"
515 "psrlw $0x4,%%xmm3 \n"
516 "por %%xmm1,%%xmm0 \n"
517 "por %%xmm3,%%xmm2 \n"
518 "movdqa %%xmm0,%%xmm1 \n"
519 "punpcklbw %%xmm2,%%xmm0 \n"
520 "punpckhbw %%xmm2,%%xmm1 \n"
521 BUNDLEALIGN
522 MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2)
523 MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2)
524 "lea " MEMLEA(0x10,0) ",%0 \n"
525 "sub $0x8,%2 \n"
526 "jg 1b \n"
527 : "+r"(src), // %0
528 "+r"(dst), // %1
529 "+r"(pix) // %2
530 :
531 : "memory", "cc", "eax"
532 #if defined(__native_client__) && defined(__x86_64__)
533 , "r14"
534 #endif
535 #if defined(__SSE2__)
536 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
537 #endif
538 );
539 }
540
541 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
542 asm volatile (
543 "movdqa %3,%%xmm6 \n"
544 LABELALIGN
545 "1: \n"
546 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
547 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
548 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
549 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
550 "lea " MEMLEA(0x40,0) ",%0 \n"
551 "pshufb %%xmm6,%%xmm0 \n"
552 "pshufb %%xmm6,%%xmm1 \n"
553 "pshufb %%xmm6,%%xmm2 \n"
554 "pshufb %%xmm6,%%xmm3 \n"
555 "movdqa %%xmm1,%%xmm4 \n"
556 "psrldq $0x4,%%xmm1 \n"
557 "pslldq $0xc,%%xmm4 \n"
558 "movdqa %%xmm2,%%xmm5 \n"
559 "por %%xmm4,%%xmm0 \n"
560 "pslldq $0x8,%%xmm5 \n"
561 "movdqu %%xmm0," MEMACCESS(1) " \n"
562 "por %%xmm5,%%xmm1 \n"
563 "psrldq $0x8,%%xmm2 \n"
564 "pslldq $0x4,%%xmm3 \n"
565 "por %%xmm3,%%xmm2 \n"
566 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
567 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
568 "lea " MEMLEA(0x30,1) ",%1 \n"
569 "sub $0x10,%2 \n"
570 "jg 1b \n"
571 : "+r"(src), // %0
572 "+r"(dst), // %1
573 "+r"(pix) // %2
574 : "m"(kShuffleMaskARGBToRGB24) // %3
575 : "memory", "cc"
576 #if defined(__SSE2__)
577 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
578 #endif
579 );
580 }
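// Store math for the packer above, for reference: after pshufb each of the
// four registers holds 12 valid RGB bytes (16 pixels * 3 bytes = 48 bytes
// total), and the pslldq/psrldq/por pairs splice those 4 x 12 bytes into
// three contiguous 16-byte movdqu stores per iteration.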
581
582 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
583 asm volatile (
584 "movdqa %3,%%xmm6 \n"
585 LABELALIGN
586 "1: \n"
587 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
588 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
589 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
590 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
591 "lea " MEMLEA(0x40,0) ",%0 \n"
592 "pshufb %%xmm6,%%xmm0 \n"
593 "pshufb %%xmm6,%%xmm1 \n"
594 "pshufb %%xmm6,%%xmm2 \n"
595 "pshufb %%xmm6,%%xmm3 \n"
596 "movdqa %%xmm1,%%xmm4 \n"
597 "psrldq $0x4,%%xmm1 \n"
598 "pslldq $0xc,%%xmm4 \n"
599 "movdqa %%xmm2,%%xmm5 \n"
600 "por %%xmm4,%%xmm0 \n"
601 "pslldq $0x8,%%xmm5 \n"
602 "movdqu %%xmm0," MEMACCESS(1) " \n"
603 "por %%xmm5,%%xmm1 \n"
604 "psrldq $0x8,%%xmm2 \n"
605 "pslldq $0x4,%%xmm3 \n"
606 "por %%xmm3,%%xmm2 \n"
607 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
608 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
609 "lea " MEMLEA(0x30,1) ",%1 \n"
610 "sub $0x10,%2 \n"
611 "jg 1b \n"
612 : "+r"(src), // %0
613 "+r"(dst), // %1
614 "+r"(pix) // %2
615 : "m"(kShuffleMaskARGBToRAW) // %3
616 : "memory", "cc"
617 #if defined(__SSE2__)
618 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
619 #endif
620 );
621 }
622
623 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
624 asm volatile (
625 "pcmpeqb %%xmm3,%%xmm3 \n"
626 "psrld $0x1b,%%xmm3 \n"
627 "pcmpeqb %%xmm4,%%xmm4 \n"
628 "psrld $0x1a,%%xmm4 \n"
629 "pslld $0x5,%%xmm4 \n"
630 "pcmpeqb %%xmm5,%%xmm5 \n"
631 "pslld $0xb,%%xmm5 \n"
632 LABELALIGN
633 "1: \n"
634 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
635 "movdqa %%xmm0,%%xmm1 \n"
636 "movdqa %%xmm0,%%xmm2 \n"
637 "pslld $0x8,%%xmm0 \n"
638 "psrld $0x3,%%xmm1 \n"
639 "psrld $0x5,%%xmm2 \n"
640 "psrad $0x10,%%xmm0 \n"
641 "pand %%xmm3,%%xmm1 \n"
642 "pand %%xmm4,%%xmm2 \n"
643 "pand %%xmm5,%%xmm0 \n"
644 "por %%xmm2,%%xmm1 \n"
645 "por %%xmm1,%%xmm0 \n"
646 "packssdw %%xmm0,%%xmm0 \n"
647 "lea " MEMLEA(0x10,0) ",%0 \n"
648 "movq %%xmm0," MEMACCESS(1) " \n"
649 "lea " MEMLEA(0x8,1) ",%1 \n"
650 "sub $0x4,%2 \n"
651 "jg 1b \n"
652 : "+r"(src), // %0
653 "+r"(dst), // %1
654 "+r"(pix) // %2
655 :
656 : "memory", "cc"
657 #if defined(__SSE2__)
658 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
659 #endif
660 );
661 }
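// The shift/mask network above packs by truncation. A scalar equivalent of
// one ARGB -> RGB565 pixel (hypothetical helper):
static inline uint16 ScalarARGBToRGB565(uint8 b, uint8 g, uint8 r) {
  return (uint16)(((uint16)(r >> 3) << 11) | ((uint16)(g >> 2) << 5) |
                  (uint16)(b >> 3));
}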
662
663 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
664 asm volatile (
665 "pcmpeqb %%xmm4,%%xmm4 \n"
666 "psrld $0x1b,%%xmm4 \n"
667 "movdqa %%xmm4,%%xmm5 \n"
668 "pslld $0x5,%%xmm5 \n"
669 "movdqa %%xmm4,%%xmm6 \n"
670 "pslld $0xa,%%xmm6 \n"
671 "pcmpeqb %%xmm7,%%xmm7 \n"
672 "pslld $0xf,%%xmm7 \n"
673 LABELALIGN
674 "1: \n"
675 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
676 "movdqa %%xmm0,%%xmm1 \n"
677 "movdqa %%xmm0,%%xmm2 \n"
678 "movdqa %%xmm0,%%xmm3 \n"
679 "psrad $0x10,%%xmm0 \n"
680 "psrld $0x3,%%xmm1 \n"
681 "psrld $0x6,%%xmm2 \n"
682 "psrld $0x9,%%xmm3 \n"
683 "pand %%xmm7,%%xmm0 \n"
684 "pand %%xmm4,%%xmm1 \n"
685 "pand %%xmm5,%%xmm2 \n"
686 "pand %%xmm6,%%xmm3 \n"
687 "por %%xmm1,%%xmm0 \n"
688 "por %%xmm3,%%xmm2 \n"
689 "por %%xmm2,%%xmm0 \n"
690 "packssdw %%xmm0,%%xmm0 \n"
691 "lea " MEMLEA(0x10,0) ",%0 \n"
692 "movq %%xmm0," MEMACCESS(1) " \n"
693 "lea " MEMACCESS2(0x8,1) ",%1 \n"
694 "sub $0x4,%2 \n"
695 "jg 1b \n"
696 : "+r"(src), // %0
697 "+r"(dst), // %1
698 "+r"(pix) // %2
699 :
700 : "memory", "cc"
701 #if defined(__SSE2__)
702 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
703 #endif
704 );
705 }
706
707 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
708 asm volatile (
709 "pcmpeqb %%xmm4,%%xmm4 \n"
710 "psllw $0xc,%%xmm4 \n"
711 "movdqa %%xmm4,%%xmm3 \n"
712 "psrlw $0x8,%%xmm3 \n"
713 LABELALIGN
714 "1: \n"
715 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
716 "movdqa %%xmm0,%%xmm1 \n"
717 "pand %%xmm3,%%xmm0 \n"
718 "pand %%xmm4,%%xmm1 \n"
719 "psrlq $0x4,%%xmm0 \n"
720 "psrlq $0x8,%%xmm1 \n"
721 "por %%xmm1,%%xmm0 \n"
722 "packuswb %%xmm0,%%xmm0 \n"
723 "lea " MEMLEA(0x10,0) ",%0 \n"
724 "movq %%xmm0," MEMACCESS(1) " \n"
725 "lea " MEMLEA(0x8,1) ",%1 \n"
726 "sub $0x4,%2 \n"
727 "jg 1b \n"
728 : "+r"(src), // %0
729 "+r"(dst), // %1
730 "+r"(pix) // %2
731 :
732 : "memory", "cc"
733 #if defined(__SSE2__)
734 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
735 #endif
736 );
737 }
738 #endif // HAS_RGB24TOARGBROW_SSSE3
739
740 #ifdef HAS_ARGBTOYROW_SSSE3
741 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
742 asm volatile (
743 "movdqa %4,%%xmm5 \n"
744 "movdqa %3,%%xmm4 \n"
745 LABELALIGN
746 "1: \n"
747 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
748 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
749 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
750 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
751 "pmaddubsw %%xmm4,%%xmm0 \n"
752 "pmaddubsw %%xmm4,%%xmm1 \n"
753 "pmaddubsw %%xmm4,%%xmm2 \n"
754 "pmaddubsw %%xmm4,%%xmm3 \n"
755 "lea " MEMLEA(0x40,0) ",%0 \n"
756 "phaddw %%xmm1,%%xmm0 \n"
757 "phaddw %%xmm3,%%xmm2 \n"
758 "psrlw $0x7,%%xmm0 \n"
759 "psrlw $0x7,%%xmm2 \n"
760 "packuswb %%xmm2,%%xmm0 \n"
761 "paddb %%xmm5,%%xmm0 \n"
762 "sub $0x10,%2 \n"
763 "movdqa %%xmm0," MEMACCESS(1) " \n"
764 "lea " MEMLEA(0x10,1) ",%1 \n"
765 "jg 1b \n"
766 : "+r"(src_argb), // %0
767 "+r"(dst_y), // %1
768 "+r"(pix) // %2
769 : "m"(kARGBToY), // %3
770 "m"(kAddY16) // %4
771 : "memory", "cc"
772 #if defined(__SSE2__)
773 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
774 #endif
775 );
776 }
777
778 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
779 asm volatile (
780 "movdqa %4,%%xmm5 \n"
781 "movdqa %3,%%xmm4 \n"
782 LABELALIGN
783 "1: \n"
784 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
785 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
786 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
787 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
788 "pmaddubsw %%xmm4,%%xmm0 \n"
789 "pmaddubsw %%xmm4,%%xmm1 \n"
790 "pmaddubsw %%xmm4,%%xmm2 \n"
791 "pmaddubsw %%xmm4,%%xmm3 \n"
792 "lea " MEMLEA(0x40,0) ",%0 \n"
793 "phaddw %%xmm1,%%xmm0 \n"
794 "phaddw %%xmm3,%%xmm2 \n"
795 "psrlw $0x7,%%xmm0 \n"
796 "psrlw $0x7,%%xmm2 \n"
797 "packuswb %%xmm2,%%xmm0 \n"
798 "paddb %%xmm5,%%xmm0 \n"
799 "sub $0x10,%2 \n"
800 "movdqu %%xmm0," MEMACCESS(1) " \n"
801 "lea " MEMLEA(0x10,1) ",%1 \n"
802 "jg 1b \n"
803 : "+r"(src_argb), // %0
804 "+r"(dst_y), // %1
805 "+r"(pix) // %2
806 : "m"(kARGBToY), // %3
807 "m"(kAddY16) // %4
808 : "memory", "cc"
809 #if defined(__SSE2__)
810 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
811 #endif
812 );
813 }
814 #endif // HAS_ARGBTOYROW_SSSE3
815
816 #ifdef HAS_ARGBTOYJROW_SSSE3
817 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
818 asm volatile (
819 "movdqa %3,%%xmm4 \n"
820 "movdqa %4,%%xmm5 \n"
821 LABELALIGN
822 "1: \n"
823 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
824 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
825 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
826 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
827 "pmaddubsw %%xmm4,%%xmm0 \n"
828 "pmaddubsw %%xmm4,%%xmm1 \n"
829 "pmaddubsw %%xmm4,%%xmm2 \n"
830 "pmaddubsw %%xmm4,%%xmm3 \n"
831 "lea " MEMLEA(0x40,0) ",%0 \n"
832 "phaddw %%xmm1,%%xmm0 \n"
833 "phaddw %%xmm3,%%xmm2 \n"
834 "paddw %%xmm5,%%xmm0 \n"
835 "paddw %%xmm5,%%xmm2 \n"
836 "psrlw $0x7,%%xmm0 \n"
837 "psrlw $0x7,%%xmm2 \n"
838 "packuswb %%xmm2,%%xmm0 \n"
839 "sub $0x10,%2 \n"
840 "movdqa %%xmm0," MEMACCESS(1) " \n"
841 "lea " MEMLEA(0x10,1) ",%1 \n"
842 "jg 1b \n"
843 : "+r"(src_argb), // %0
844 "+r"(dst_y), // %1
845 "+r"(pix) // %2
846 : "m"(kARGBToYJ), // %3
847 "m"(kAddYJ64) // %4
848 : "memory", "cc"
849 #if defined(__SSE2__)
850 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
851 #endif
852 );
853 }
854
855 void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
856 asm volatile (
857 "movdqa %3,%%xmm4 \n"
858 "movdqa %4,%%xmm5 \n"
859 LABELALIGN
860 "1: \n"
861 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
862 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
863 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
864 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
865 "pmaddubsw %%xmm4,%%xmm0 \n"
866 "pmaddubsw %%xmm4,%%xmm1 \n"
867 "pmaddubsw %%xmm4,%%xmm2 \n"
868 "pmaddubsw %%xmm4,%%xmm3 \n"
869 "lea " MEMLEA(0x40,0) ",%0 \n"
870 "phaddw %%xmm1,%%xmm0 \n"
871 "phaddw %%xmm3,%%xmm2 \n"
872 "paddw %%xmm5,%%xmm0 \n"
873 "paddw %%xmm5,%%xmm2 \n"
874 "psrlw $0x7,%%xmm0 \n"
875 "psrlw $0x7,%%xmm2 \n"
876 "packuswb %%xmm2,%%xmm0 \n"
877 "sub $0x10,%2 \n"
878 "movdqu %%xmm0," MEMACCESS(1) " \n"
879 "lea " MEMLEA(0x10,1) ",%1 \n"
880 "jg 1b \n"
881 : "+r"(src_argb), // %0
882 "+r"(dst_y), // %1
883 "+r"(pix) // %2
884 : "m"(kARGBToYJ), // %3
885 "m"(kAddYJ64) // %4
886 : "memory", "cc"
887 #if defined(__SSE2__)
888 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
889 #endif
890 );
891 }
892 #endif // HAS_ARGBTOYJROW_SSSE3
893
894 #ifdef HAS_ARGBTOUVROW_SSSE3
895 // TODO(fbarchard): Pass the xmm constants into a single block of assembly.
896 // -fPIC on GCC 4.2 for OS X runs out of GPRs: an "m" operand effectively
897 // ties up 3 registers (ebx, ebp and eax), so "m" only works when 3 ordinary
898 // registers are free, or 4 if the stack frame is disabled. Splitting the
899 // work into 2 assembly blocks is a workaround, and is considered unsafe.
900 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
901 uint8* dst_u, uint8* dst_v, int width) {
902 asm volatile (
903 "movdqa %0,%%xmm4 \n"
904 "movdqa %1,%%xmm3 \n"
905 "movdqa %2,%%xmm5 \n"
906 :
907 : "m"(kARGBToU), // %0
908 "m"(kARGBToV), // %1
909 "m"(kAddUV128) // %2
910 );
911 asm volatile (
912 "sub %1,%2 \n"
913 LABELALIGN
914 "1: \n"
915 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
916 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
917 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
918 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
919 BUNDLEALIGN
920 MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
921 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
922 MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
923 MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
924 "lea " MEMLEA(0x40,0) ",%0 \n"
925 "movdqa %%xmm0,%%xmm7 \n"
926 "shufps $0x88,%%xmm1,%%xmm0 \n"
927 "shufps $0xdd,%%xmm1,%%xmm7 \n"
928 "pavgb %%xmm7,%%xmm0 \n"
929 "movdqa %%xmm2,%%xmm7 \n"
930 "shufps $0x88,%%xmm6,%%xmm2 \n"
931 "shufps $0xdd,%%xmm6,%%xmm7 \n"
932 "pavgb %%xmm7,%%xmm2 \n"
933 "movdqa %%xmm0,%%xmm1 \n"
934 "movdqa %%xmm2,%%xmm6 \n"
935 "pmaddubsw %%xmm4,%%xmm0 \n"
936 "pmaddubsw %%xmm4,%%xmm2 \n"
937 "pmaddubsw %%xmm3,%%xmm1 \n"
938 "pmaddubsw %%xmm3,%%xmm6 \n"
939 "phaddw %%xmm2,%%xmm0 \n"
940 "phaddw %%xmm6,%%xmm1 \n"
941 "psraw $0x8,%%xmm0 \n"
942 "psraw $0x8,%%xmm1 \n"
943 "packsswb %%xmm1,%%xmm0 \n"
944 "paddb %%xmm5,%%xmm0 \n"
945 "sub $0x10,%3 \n"
946 "movlps %%xmm0," MEMACCESS(1) " \n"
947 BUNDLEALIGN
948 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
949 "lea " MEMLEA(0x8,1) ",%1 \n"
950 "jg 1b \n"
951 : "+r"(src_argb0), // %0
952 "+r"(dst_u), // %1
953 "+r"(dst_v), // %2
954 "+rm"(width) // %3
955 : "r"((intptr_t)(src_stride_argb)) // %4
956 : "memory", "cc"
957 #if defined(__native_client__) && defined(__x86_64__)
958 , "r14"
959 #endif
960 #if defined(__SSE2__)
961 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
962 #endif
963 );
964 }
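// The shufps/pavgb ladder above averages each 2x2 block of pixels before the
// pmaddubsw weighting. A scalar sketch of one output pair; pavgb rounds at
// each stage and packsswb saturates, so this form is only approximately
// bit-exact (hypothetical helper):
static void ARGBToUVRow_C_Sketch(const uint8* src_argb, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    const uint8* p = src_argb + x * 4;
    const uint8* q = p + src_stride_argb;
    const int b = (p[0] + p[4] + q[0] + q[4] + 2) >> 2;
    const int g = (p[1] + p[5] + q[1] + q[5] + 2) >> 2;
    const int r = (p[2] + p[6] + q[2] + q[6] + 2) >> 2;
    dst_u[x / 2] = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    dst_v[x / 2] = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
  }
}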
965
966 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
967 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
968 uint8* dst_u, uint8* dst_v, int width) {
969 asm volatile (
970 "movdqa %0,%%xmm4 \n"
971 "movdqa %1,%%xmm3 \n"
972 "movdqa %2,%%xmm5 \n"
973 :
974 : "m"(kARGBToUJ), // %0
975 "m"(kARGBToVJ), // %1
976 "m"(kAddUVJ128) // %2
977 );
978 asm volatile (
979 "sub %1,%2 \n"
980 LABELALIGN
981 "1: \n"
982 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
983 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
984 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
985 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
986 BUNDLEALIGN
987 MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
988 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
989 MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
990 MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
991 "lea " MEMLEA(0x40,0) ",%0 \n"
992 "movdqa %%xmm0,%%xmm7 \n"
993 "shufps $0x88,%%xmm1,%%xmm0 \n"
994 "shufps $0xdd,%%xmm1,%%xmm7 \n"
995 "pavgb %%xmm7,%%xmm0 \n"
996 "movdqa %%xmm2,%%xmm7 \n"
997 "shufps $0x88,%%xmm6,%%xmm2 \n"
998 "shufps $0xdd,%%xmm6,%%xmm7 \n"
999 "pavgb %%xmm7,%%xmm2 \n"
1000 "movdqa %%xmm0,%%xmm1 \n"
1001 "movdqa %%xmm2,%%xmm6 \n"
1002 "pmaddubsw %%xmm4,%%xmm0 \n"
1003 "pmaddubsw %%xmm4,%%xmm2 \n"
1004 "pmaddubsw %%xmm3,%%xmm1 \n"
1005 "pmaddubsw %%xmm3,%%xmm6 \n"
1006 "phaddw %%xmm2,%%xmm0 \n"
1007 "phaddw %%xmm6,%%xmm1 \n"
1008 "paddw %%xmm5,%%xmm0 \n"
1009 "paddw %%xmm5,%%xmm1 \n"
1010 "psraw $0x8,%%xmm0 \n"
1011 "psraw $0x8,%%xmm1 \n"
1012 "packsswb %%xmm1,%%xmm0 \n"
1013 "sub $0x10,%3 \n"
1014 "movlps %%xmm0," MEMACCESS(1) " \n"
1015 BUNDLEALIGN
1016 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1017 "lea " MEMLEA(0x8,1) ",%1 \n"
1018 "jg 1b \n"
1019 : "+r"(src_argb0), // %0
1020 "+r"(dst_u), // %1
1021 "+r"(dst_v), // %2
1022 "+rm"(width) // %3
1023 : "r"((intptr_t)(src_stride_argb)) // %4
1024 : "memory", "cc"
1025 #if defined(__native_client__) && defined(__x86_64__)
1026 , "r14"
1027 #endif
1028 #if defined(__SSE2__)
1029 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1030 #endif
1031 );
1032 }
1033
1034 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1035 uint8* dst_u, uint8* dst_v, int width) {
1036 asm volatile (
1037 "movdqa %0,%%xmm4 \n"
1038 "movdqa %1,%%xmm3 \n"
1039 "movdqa %2,%%xmm5 \n"
1040 :
1041 : "m"(kARGBToU), // %0
1042 "m"(kARGBToV), // %1
1043 "m"(kAddUV128) // %2
1044 );
1045 asm volatile (
1046 "sub %1,%2 \n"
1047 LABELALIGN
1048 "1: \n"
1049 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1050 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1051 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1052 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1053 BUNDLEALIGN
1054 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1055 "pavgb %%xmm7,%%xmm0 \n"
1056 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1057 "pavgb %%xmm7,%%xmm1 \n"
1058 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1059 "pavgb %%xmm7,%%xmm2 \n"
1060 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1061 "pavgb %%xmm7,%%xmm6 \n"
1062 "lea " MEMLEA(0x40,0) ",%0 \n"
1063 "movdqa %%xmm0,%%xmm7 \n"
1064 "shufps $0x88,%%xmm1,%%xmm0 \n"
1065 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1066 "pavgb %%xmm7,%%xmm0 \n"
1067 "movdqa %%xmm2,%%xmm7 \n"
1068 "shufps $0x88,%%xmm6,%%xmm2 \n"
1069 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1070 "pavgb %%xmm7,%%xmm2 \n"
1071 "movdqa %%xmm0,%%xmm1 \n"
1072 "movdqa %%xmm2,%%xmm6 \n"
1073 "pmaddubsw %%xmm4,%%xmm0 \n"
1074 "pmaddubsw %%xmm4,%%xmm2 \n"
1075 "pmaddubsw %%xmm3,%%xmm1 \n"
1076 "pmaddubsw %%xmm3,%%xmm6 \n"
1077 "phaddw %%xmm2,%%xmm0 \n"
1078 "phaddw %%xmm6,%%xmm1 \n"
1079 "psraw $0x8,%%xmm0 \n"
1080 "psraw $0x8,%%xmm1 \n"
1081 "packsswb %%xmm1,%%xmm0 \n"
1082 "paddb %%xmm5,%%xmm0 \n"
1083 "sub $0x10,%3 \n"
1084 "movlps %%xmm0," MEMACCESS(1) " \n"
1085 BUNDLEALIGN
1086 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1087 "lea " MEMLEA(0x8,1) ",%1 \n"
1088 "jg 1b \n"
1089 : "+r"(src_argb0), // %0
1090 "+r"(dst_u), // %1
1091 "+r"(dst_v), // %2
1092 "+rm"(width) // %3
1093 : "r"((intptr_t)(src_stride_argb)) // %4
1094 : "memory", "cc"
1095 #if defined(__native_client__) && defined(__x86_64__)
1096 , "r14"
1097 #endif
1098 #if defined(__SSE2__)
1099 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1100 #endif
1101 );
1102 }
1103
1104 void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1105 uint8* dst_u, uint8* dst_v, int width) {
1106 asm volatile (
1107 "movdqa %0,%%xmm4 \n"
1108 "movdqa %1,%%xmm3 \n"
1109 "movdqa %2,%%xmm5 \n"
1110 :
1111 : "m"(kARGBToUJ), // %0
1112 "m"(kARGBToVJ), // %1
1113 "m"(kAddUVJ128) // %2
1114 );
1115 asm volatile (
1116 "sub %1,%2 \n"
1117 LABELALIGN
1118 "1: \n"
1119 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1120 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1121 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1122 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1123 BUNDLEALIGN
1124 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1125 "pavgb %%xmm7,%%xmm0 \n"
1126 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1127 "pavgb %%xmm7,%%xmm1 \n"
1128 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1129 "pavgb %%xmm7,%%xmm2 \n"
1130 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1131 "pavgb %%xmm7,%%xmm6 \n"
1132 "lea " MEMLEA(0x40,0) ",%0 \n"
1133 "movdqa %%xmm0,%%xmm7 \n"
1134 "shufps $0x88,%%xmm1,%%xmm0 \n"
1135 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1136 "pavgb %%xmm7,%%xmm0 \n"
1137 "movdqa %%xmm2,%%xmm7 \n"
1138 "shufps $0x88,%%xmm6,%%xmm2 \n"
1139 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1140 "pavgb %%xmm7,%%xmm2 \n"
1141 "movdqa %%xmm0,%%xmm1 \n"
1142 "movdqa %%xmm2,%%xmm6 \n"
1143 "pmaddubsw %%xmm4,%%xmm0 \n"
1144 "pmaddubsw %%xmm4,%%xmm2 \n"
1145 "pmaddubsw %%xmm3,%%xmm1 \n"
1146 "pmaddubsw %%xmm3,%%xmm6 \n"
1147 "phaddw %%xmm2,%%xmm0 \n"
1148 "phaddw %%xmm6,%%xmm1 \n"
1149 "paddw %%xmm5,%%xmm0 \n"
1150 "paddw %%xmm5,%%xmm1 \n"
1151 "psraw $0x8,%%xmm0 \n"
1152 "psraw $0x8,%%xmm1 \n"
1153 "packsswb %%xmm1,%%xmm0 \n"
1154 "sub $0x10,%3 \n"
1155 "movlps %%xmm0," MEMACCESS(1) " \n"
1156 BUNDLEALIGN
1157 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1158 "lea " MEMLEA(0x8,1) ",%1 \n"
1159 "jg 1b \n"
1160 : "+r"(src_argb0), // %0
1161 "+r"(dst_u), // %1
1162 "+r"(dst_v), // %2
1163 "+rm"(width) // %3
1164 : "r"((intptr_t)(src_stride_argb))
1165 : "memory", "cc"
1166 #if defined(__native_client__) && defined(__x86_64__)
1167 , "r14"
1168 #endif
1169 #if defined(__SSE2__)
1170 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1171 #endif
1172 );
1173 }
1174
1175 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1176 int width) {
1177 asm volatile (
1178 "movdqa %0,%%xmm4 \n"
1179 "movdqa %1,%%xmm3 \n"
1180 "movdqa %2,%%xmm5 \n"
1181 :
1182 : "m"(kARGBToU), // %0
1183 "m"(kARGBToV), // %1
1184 "m"(kAddUV128) // %2
1185 );
1186 asm volatile (
1187 "sub %1,%2 \n"
1188 LABELALIGN
1189 "1: \n"
1190 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
1191 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1192 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1193 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1194 "pmaddubsw %%xmm4,%%xmm0 \n"
1195 "pmaddubsw %%xmm4,%%xmm1 \n"
1196 "pmaddubsw %%xmm4,%%xmm2 \n"
1197 "pmaddubsw %%xmm4,%%xmm6 \n"
1198 "phaddw %%xmm1,%%xmm0 \n"
1199 "phaddw %%xmm6,%%xmm2 \n"
1200 "psraw $0x8,%%xmm0 \n"
1201 "psraw $0x8,%%xmm2 \n"
1202 "packsswb %%xmm2,%%xmm0 \n"
1203 "paddb %%xmm5,%%xmm0 \n"
1204 "sub $0x10,%3 \n"
1205 "movdqa %%xmm0," MEMACCESS(1) " \n"
1206 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
1207 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1208 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1209 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1210 "pmaddubsw %%xmm3,%%xmm0 \n"
1211 "pmaddubsw %%xmm3,%%xmm1 \n"
1212 "pmaddubsw %%xmm3,%%xmm2 \n"
1213 "pmaddubsw %%xmm3,%%xmm6 \n"
1214 "phaddw %%xmm1,%%xmm0 \n"
1215 "phaddw %%xmm6,%%xmm2 \n"
1216 "psraw $0x8,%%xmm0 \n"
1217 "psraw $0x8,%%xmm2 \n"
1218 "packsswb %%xmm2,%%xmm0 \n"
1219 "paddb %%xmm5,%%xmm0 \n"
1220 "lea " MEMLEA(0x40,0) ",%0 \n"
1221 BUNDLEALIGN
1222 MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1)
1223 "lea " MEMLEA(0x10,1) ",%1 \n"
1224 "jg 1b \n"
1225 : "+r"(src_argb), // %0
1226 "+r"(dst_u), // %1
1227 "+r"(dst_v), // %2
1228 "+rm"(width) // %3
1229 :
1230 : "memory", "cc"
1231 #if defined(__native_client__) && defined(__x86_64__)
1232 , "r14"
1233 #endif
1234 #if defined(__SSE2__)
1235 , "xmm0", "xmm1", "xmm2", "xmm6"
1236 #endif
1237 );
1238 }
1239
1240 void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
1241 uint8* dst_v, int width) {
1242 asm volatile (
1243 "movdqa %0,%%xmm4 \n"
1244 "movdqa %1,%%xmm3 \n"
1245 "movdqa %2,%%xmm5 \n"
1246 :
1247 : "m"(kARGBToU), // %0
1248 "m"(kARGBToV), // %1
1249 "m"(kAddUV128) // %2
1250 );
1251 asm volatile (
1252 "sub %1,%2 \n"
1253 LABELALIGN
1254 "1: \n"
1255 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1256 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1257 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1258 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1259 "pmaddubsw %%xmm4,%%xmm0 \n"
1260 "pmaddubsw %%xmm4,%%xmm1 \n"
1261 "pmaddubsw %%xmm4,%%xmm2 \n"
1262 "pmaddubsw %%xmm4,%%xmm6 \n"
1263 "phaddw %%xmm1,%%xmm0 \n"
1264 "phaddw %%xmm6,%%xmm2 \n"
1265 "psraw $0x8,%%xmm0 \n"
1266 "psraw $0x8,%%xmm2 \n"
1267 "packsswb %%xmm2,%%xmm0 \n"
1268 "paddb %%xmm5,%%xmm0 \n"
1269 "sub $0x10,%3 \n"
1270 "movdqu %%xmm0," MEMACCESS(1) " \n"
1271 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1272 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1273 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1274 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1275 "pmaddubsw %%xmm3,%%xmm0 \n"
1276 "pmaddubsw %%xmm3,%%xmm1 \n"
1277 "pmaddubsw %%xmm3,%%xmm2 \n"
1278 "pmaddubsw %%xmm3,%%xmm6 \n"
1279 "phaddw %%xmm1,%%xmm0 \n"
1280 "phaddw %%xmm6,%%xmm2 \n"
1281 "psraw $0x8,%%xmm0 \n"
1282 "psraw $0x8,%%xmm2 \n"
1283 "packsswb %%xmm2,%%xmm0 \n"
1284 "paddb %%xmm5,%%xmm0 \n"
1285 "lea " MEMLEA(0x40,0) ",%0 \n"
1286 BUNDLEALIGN
1287 MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
1288 "lea " MEMLEA(0x10,1) ",%1 \n"
1289 "jg 1b \n"
1290 : "+r"(src_argb), // %0
1291 "+r"(dst_u), // %1
1292 "+r"(dst_v), // %2
1293 "+rm"(width) // %3
1294 :
1295 : "memory", "cc"
1296 #if defined(__native_client__) && defined(__x86_64__)
1297 , "r14"
1298 #endif
1299 #if defined(__SSE2__)
1300 , "xmm0", "xmm1", "xmm2", "xmm6"
1301 #endif
1302 );
1303 }
1304
1305 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1306 uint8* dst_u, uint8* dst_v, int width) {
1307 asm volatile (
1308 "movdqa %0,%%xmm4 \n"
1309 "movdqa %1,%%xmm3 \n"
1310 "movdqa %2,%%xmm5 \n"
1311 :
1312 : "m"(kARGBToU), // %0
1313 "m"(kARGBToV), // %1
1314 "m"(kAddUV128) // %2
1315 );
1316 asm volatile (
1317 "sub %1,%2 \n"
1318 LABELALIGN
1319 "1: \n"
1320 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
1321 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1322 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1323 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1324 "lea " MEMLEA(0x40,0) ",%0 \n"
1325 "movdqa %%xmm0,%%xmm7 \n"
1326 "shufps $0x88,%%xmm1,%%xmm0 \n"
1327 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1328 "pavgb %%xmm7,%%xmm0 \n"
1329 "movdqa %%xmm2,%%xmm7 \n"
1330 "shufps $0x88,%%xmm6,%%xmm2 \n"
1331 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1332 "pavgb %%xmm7,%%xmm2 \n"
1333 "movdqa %%xmm0,%%xmm1 \n"
1334 "movdqa %%xmm2,%%xmm6 \n"
1335 "pmaddubsw %%xmm4,%%xmm0 \n"
1336 "pmaddubsw %%xmm4,%%xmm2 \n"
1337 "pmaddubsw %%xmm3,%%xmm1 \n"
1338 "pmaddubsw %%xmm3,%%xmm6 \n"
1339 "phaddw %%xmm2,%%xmm0 \n"
1340 "phaddw %%xmm6,%%xmm1 \n"
1341 "psraw $0x8,%%xmm0 \n"
1342 "psraw $0x8,%%xmm1 \n"
1343 "packsswb %%xmm1,%%xmm0 \n"
1344 "paddb %%xmm5,%%xmm0 \n"
1345 "sub $0x10,%3 \n"
1346 "movlps %%xmm0," MEMACCESS(1) " \n"
1347 BUNDLEALIGN
1348 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1349 "lea " MEMLEA(0x8,1) ",%1 \n"
1350 "jg 1b \n"
1351 : "+r"(src_argb0), // %0
1352 "+r"(dst_u), // %1
1353 "+r"(dst_v), // %2
1354 "+rm"(width) // %3
1355 :
1356 : "memory", "cc"
1357 #if defined(__native_client__) && defined(__x86_64__)
1358 , "r14"
1359 #endif
1360 #if defined(__SSE2__)
1361 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1362 #endif
1363 );
1364 }
1365
1366 void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
1367 uint8* dst_u, uint8* dst_v, int width) {
1368 asm volatile (
1369 "movdqa %0,%%xmm4 \n"
1370 "movdqa %1,%%xmm3 \n"
1371 "movdqa %2,%%xmm5 \n"
1372 :
1373 : "m"(kARGBToU), // %0
1374 "m"(kARGBToV), // %1
1375 "m"(kAddUV128) // %2
1376 );
1377 asm volatile (
1378 "sub %1,%2 \n"
1379 LABELALIGN
1380 "1: \n"
1381 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1382 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1383 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1384 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1385 "lea " MEMLEA(0x40,0) ",%0 \n"
1386 "movdqa %%xmm0,%%xmm7 \n"
1387 "shufps $0x88,%%xmm1,%%xmm0 \n"
1388 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1389 "pavgb %%xmm7,%%xmm0 \n"
1390 "movdqa %%xmm2,%%xmm7 \n"
1391 "shufps $0x88,%%xmm6,%%xmm2 \n"
1392 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1393 "pavgb %%xmm7,%%xmm2 \n"
1394 "movdqa %%xmm0,%%xmm1 \n"
1395 "movdqa %%xmm2,%%xmm6 \n"
1396 "pmaddubsw %%xmm4,%%xmm0 \n"
1397 "pmaddubsw %%xmm4,%%xmm2 \n"
1398 "pmaddubsw %%xmm3,%%xmm1 \n"
1399 "pmaddubsw %%xmm3,%%xmm6 \n"
1400 "phaddw %%xmm2,%%xmm0 \n"
1401 "phaddw %%xmm6,%%xmm1 \n"
1402 "psraw $0x8,%%xmm0 \n"
1403 "psraw $0x8,%%xmm1 \n"
1404 "packsswb %%xmm1,%%xmm0 \n"
1405 "paddb %%xmm5,%%xmm0 \n"
1406 "sub $0x10,%3 \n"
1407 "movlps %%xmm0," MEMACCESS(1) " \n"
1408 BUNDLEALIGN
1409 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1410 "lea " MEMLEA(0x8,1) ",%1 \n"
1411 "jg 1b \n"
1412 : "+r"(src_argb0), // %0
1413 "+r"(dst_u), // %1
1414 "+r"(dst_v), // %2
1415 "+rm"(width) // %3
1416 :
1417 : "memory", "cc"
1418 #if defined(__native_client__) && defined(__x86_64__)
1419 , "r14"
1420 #endif
1421 #if defined(__SSE2__)
1422 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1423 #endif
1424 );
1425 }
1426
1427 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
1428 asm volatile (
1429 "movdqa %4,%%xmm5 \n"
1430 "movdqa %3,%%xmm4 \n"
1431 LABELALIGN
1432 "1: \n"
1433 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
1434 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1435 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1436 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1437 "pmaddubsw %%xmm4,%%xmm0 \n"
1438 "pmaddubsw %%xmm4,%%xmm1 \n"
1439 "pmaddubsw %%xmm4,%%xmm2 \n"
1440 "pmaddubsw %%xmm4,%%xmm3 \n"
1441 "lea " MEMLEA(0x40,0) ",%0 \n"
1442 "phaddw %%xmm1,%%xmm0 \n"
1443 "phaddw %%xmm3,%%xmm2 \n"
1444 "psrlw $0x7,%%xmm0 \n"
1445 "psrlw $0x7,%%xmm2 \n"
1446 "packuswb %%xmm2,%%xmm0 \n"
1447 "paddb %%xmm5,%%xmm0 \n"
1448 "sub $0x10,%2 \n"
1449 "movdqa %%xmm0," MEMACCESS(1) " \n"
1450 "lea " MEMLEA(0x10,1) ",%1 \n"
1451 "jg 1b \n"
1452 : "+r"(src_bgra), // %0
1453 "+r"(dst_y), // %1
1454 "+r"(pix) // %2
1455 : "m"(kBGRAToY), // %3
1456 "m"(kAddY16) // %4
1457 : "memory", "cc"
1458 #if defined(__SSE2__)
1459 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1460 #endif
1461 );
1462 }
1463
1464 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
1465 asm volatile (
1466 "movdqa %4,%%xmm5 \n"
1467 "movdqa %3,%%xmm4 \n"
1468 LABELALIGN
1469 "1: \n"
1470 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1471 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1472 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1473 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1474 "pmaddubsw %%xmm4,%%xmm0 \n"
1475 "pmaddubsw %%xmm4,%%xmm1 \n"
1476 "pmaddubsw %%xmm4,%%xmm2 \n"
1477 "pmaddubsw %%xmm4,%%xmm3 \n"
1478 "lea " MEMLEA(0x40,0) ",%0 \n"
1479 "phaddw %%xmm1,%%xmm0 \n"
1480 "phaddw %%xmm3,%%xmm2 \n"
1481 "psrlw $0x7,%%xmm0 \n"
1482 "psrlw $0x7,%%xmm2 \n"
1483 "packuswb %%xmm2,%%xmm0 \n"
1484 "paddb %%xmm5,%%xmm0 \n"
1485 "sub $0x10,%2 \n"
1486 "movdqu %%xmm0," MEMACCESS(1) " \n"
1487 "lea " MEMLEA(0x10,1) ",%1 \n"
1488 "jg 1b \n"
1489 : "+r"(src_bgra), // %0
1490 "+r"(dst_y), // %1
1491 "+r"(pix) // %2
1492 : "m"(kBGRAToY), // %3
1493 "m"(kAddY16) // %4
1494 : "memory", "cc"
1495 #if defined(__SSE2__)
1496 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1497 #endif
1498 );
1499 }
1500
1501 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1502 uint8* dst_u, uint8* dst_v, int width) {
1503 asm volatile (
1504 "movdqa %0,%%xmm4 \n"
1505 "movdqa %1,%%xmm3 \n"
1506 "movdqa %2,%%xmm5 \n"
1507 :
1508 : "m"(kBGRAToU), // %0
1509 "m"(kBGRAToV), // %1
1510 "m"(kAddUV128) // %2
1511 );
1512 asm volatile (
1513 "sub %1,%2 \n"
1514 LABELALIGN
1515 "1: \n"
1516 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
1517 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1518 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1519 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1520 BUNDLEALIGN
1521 MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
1522 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
1523 MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
1524 MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
1525 "lea " MEMLEA(0x40,0) ",%0 \n"
1526 "movdqa %%xmm0,%%xmm7 \n"
1527 "shufps $0x88,%%xmm1,%%xmm0 \n"
1528 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1529 "pavgb %%xmm7,%%xmm0 \n"
1530 "movdqa %%xmm2,%%xmm7 \n"
1531 "shufps $0x88,%%xmm6,%%xmm2 \n"
1532 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1533 "pavgb %%xmm7,%%xmm2 \n"
1534 "movdqa %%xmm0,%%xmm1 \n"
1535 "movdqa %%xmm2,%%xmm6 \n"
1536 "pmaddubsw %%xmm4,%%xmm0 \n"
1537 "pmaddubsw %%xmm4,%%xmm2 \n"
1538 "pmaddubsw %%xmm3,%%xmm1 \n"
1539 "pmaddubsw %%xmm3,%%xmm6 \n"
1540 "phaddw %%xmm2,%%xmm0 \n"
1541 "phaddw %%xmm6,%%xmm1 \n"
1542 "psraw $0x8,%%xmm0 \n"
1543 "psraw $0x8,%%xmm1 \n"
1544 "packsswb %%xmm1,%%xmm0 \n"
1545 "paddb %%xmm5,%%xmm0 \n"
1546 "sub $0x10,%3 \n"
1547 "movlps %%xmm0," MEMACCESS(1) " \n"
1548 BUNDLEALIGN
1549 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1550 "lea " MEMLEA(0x8,1) ",%1 \n"
1551 "jg 1b \n"
1552 : "+r"(src_bgra0), // %0
1553 "+r"(dst_u), // %1
1554 "+r"(dst_v), // %2
1555 "+rm"(width) // %3
1556 : "r"((intptr_t)(src_stride_bgra)) // %4
1557 : "memory", "cc"
1558 #if defined(__native_client__) && defined(__x86_64__)
1559 , "r14"
1560 #endif
1561 #if defined(__SSE2__)
1562 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1563 #endif
1564 );
1565 }
1566
1567 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1568 uint8* dst_u, uint8* dst_v, int width) {
1569 asm volatile (
1570 "movdqa %0,%%xmm4 \n"
1571 "movdqa %1,%%xmm3 \n"
1572 "movdqa %2,%%xmm5 \n"
1573 :
1574 : "m"(kBGRAToU), // %0
1575 "m"(kBGRAToV), // %1
1576 "m"(kAddUV128) // %2
1577 );
1578 asm volatile (
1579 "sub %1,%2 \n"
1580 LABELALIGN
1581 "1: \n"
1582 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1583 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1584 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1585 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1586 BUNDLEALIGN
1587 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1588 "pavgb %%xmm7,%%xmm0 \n"
1589 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1590 "pavgb %%xmm7,%%xmm1 \n"
1591 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1592 "pavgb %%xmm7,%%xmm2 \n"
1593 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1594 "pavgb %%xmm7,%%xmm6 \n"
1595 "lea " MEMLEA(0x40,0) ",%0 \n"
1596 "movdqa %%xmm0,%%xmm7 \n"
1597 "shufps $0x88,%%xmm1,%%xmm0 \n"
1598 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1599 "pavgb %%xmm7,%%xmm0 \n"
1600 "movdqa %%xmm2,%%xmm7 \n"
1601 "shufps $0x88,%%xmm6,%%xmm2 \n"
1602 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1603 "pavgb %%xmm7,%%xmm2 \n"
1604 "movdqa %%xmm0,%%xmm1 \n"
1605 "movdqa %%xmm2,%%xmm6 \n"
1606 "pmaddubsw %%xmm4,%%xmm0 \n"
1607 "pmaddubsw %%xmm4,%%xmm2 \n"
1608 "pmaddubsw %%xmm3,%%xmm1 \n"
1609 "pmaddubsw %%xmm3,%%xmm6 \n"
1610 "phaddw %%xmm2,%%xmm0 \n"
1611 "phaddw %%xmm6,%%xmm1 \n"
1612 "psraw $0x8,%%xmm0 \n"
1613 "psraw $0x8,%%xmm1 \n"
1614 "packsswb %%xmm1,%%xmm0 \n"
1615 "paddb %%xmm5,%%xmm0 \n"
1616 "sub $0x10,%3 \n"
1617 "movlps %%xmm0," MEMACCESS(1) " \n"
1618 BUNDLEALIGN
1619 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1620 "lea " MEMLEA(0x8,1) ",%1 \n"
1621 "jg 1b \n"
1622 : "+r"(src_bgra0), // %0
1623 "+r"(dst_u), // %1
1624 "+r"(dst_v), // %2
1625 "+rm"(width) // %3
1626 : "r"((intptr_t)(src_stride_bgra)) // %4
1627 : "memory", "cc"
1628 #if defined(__native_client__) && defined(__x86_64__)
1629 , "r14"
1630 #endif
1631 #if defined(__SSE2__)
1632 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1633 #endif
1634 );
1635 }
1636
1637 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
1638 asm volatile (
1639 "movdqa %4,%%xmm5 \n"
1640 "movdqa %3,%%xmm4 \n"
1641 LABELALIGN
1642 "1: \n"
1643 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
1644 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1645 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1646 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1647 "pmaddubsw %%xmm4,%%xmm0 \n"
1648 "pmaddubsw %%xmm4,%%xmm1 \n"
1649 "pmaddubsw %%xmm4,%%xmm2 \n"
1650 "pmaddubsw %%xmm4,%%xmm3 \n"
1651 "lea " MEMLEA(0x40,0) ",%0 \n"
1652 "phaddw %%xmm1,%%xmm0 \n"
1653 "phaddw %%xmm3,%%xmm2 \n"
1654 "psrlw $0x7,%%xmm0 \n"
1655 "psrlw $0x7,%%xmm2 \n"
1656 "packuswb %%xmm2,%%xmm0 \n"
1657 "paddb %%xmm5,%%xmm0 \n"
1658 "sub $0x10,%2 \n"
1659 "movdqa %%xmm0," MEMACCESS(1) " \n"
1660 "lea " MEMLEA(0x10,1) ",%1 \n"
1661 "jg 1b \n"
1662 : "+r"(src_abgr), // %0
1663 "+r"(dst_y), // %1
1664 "+r"(pix) // %2
1665 : "m"(kABGRToY), // %3
1666 "m"(kAddY16) // %4
1667 : "memory", "cc"
1668 #if defined(__SSE2__)
1669 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1670 #endif
1671 );
1672 }
1673
1674 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
1675 asm volatile (
1676 "movdqa %4,%%xmm5 \n"
1677 "movdqa %3,%%xmm4 \n"
1678 LABELALIGN
1679 "1: \n"
1680 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1681 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1682 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1683 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1684 "pmaddubsw %%xmm4,%%xmm0 \n"
1685 "pmaddubsw %%xmm4,%%xmm1 \n"
1686 "pmaddubsw %%xmm4,%%xmm2 \n"
1687 "pmaddubsw %%xmm4,%%xmm3 \n"
1688 "lea " MEMLEA(0x40,0) ",%0 \n"
1689 "phaddw %%xmm1,%%xmm0 \n"
1690 "phaddw %%xmm3,%%xmm2 \n"
1691 "psrlw $0x7,%%xmm0 \n"
1692 "psrlw $0x7,%%xmm2 \n"
1693 "packuswb %%xmm2,%%xmm0 \n"
1694 "paddb %%xmm5,%%xmm0 \n"
1695 "sub $0x10,%2 \n"
1696 "movdqu %%xmm0," MEMACCESS(1) " \n"
1697 "lea " MEMLEA(0x10,1) ",%1 \n"
1698 "jg 1b \n"
1699 : "+r"(src_abgr), // %0
1700 "+r"(dst_y), // %1
1701 "+r"(pix) // %2
1702 : "m"(kABGRToY), // %3
1703 "m"(kAddY16) // %4
1704 : "memory", "cc"
1705 #if defined(__SSE2__)
1706 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1707 #endif
1708 );
1709 }
1710
1711 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1712 asm volatile (
1713 "movdqa %4,%%xmm5 \n"
1714 "movdqa %3,%%xmm4 \n"
1715 LABELALIGN
1716 "1: \n"
1717 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
1718 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1719 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1720 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1721 "pmaddubsw %%xmm4,%%xmm0 \n"
1722 "pmaddubsw %%xmm4,%%xmm1 \n"
1723 "pmaddubsw %%xmm4,%%xmm2 \n"
1724 "pmaddubsw %%xmm4,%%xmm3 \n"
1725 "lea " MEMLEA(0x40,0) ",%0 \n"
1726 "phaddw %%xmm1,%%xmm0 \n"
1727 "phaddw %%xmm3,%%xmm2 \n"
1728 "psrlw $0x7,%%xmm0 \n"
1729 "psrlw $0x7,%%xmm2 \n"
1730 "packuswb %%xmm2,%%xmm0 \n"
1731 "paddb %%xmm5,%%xmm0 \n"
1732 "sub $0x10,%2 \n"
1733 "movdqa %%xmm0," MEMACCESS(1) " \n"
1734 "lea " MEMLEA(0x10,1) ",%1 \n"
1735 "jg 1b \n"
1736 : "+r"(src_rgba), // %0
1737 "+r"(dst_y), // %1
1738 "+r"(pix) // %2
1739 : "m"(kRGBAToY), // %3
1740 "m"(kAddY16) // %4
1741 : "memory", "cc"
1742 #if defined(__SSE2__)
1743 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1744 #endif
1745 );
1746 }
1747
1748 void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1749 asm volatile (
1750 "movdqa %4,%%xmm5 \n"
1751 "movdqa %3,%%xmm4 \n"
1752 LABELALIGN
1753 "1: \n"
1754 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1755 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1756 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1757 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1758 "pmaddubsw %%xmm4,%%xmm0 \n"
1759 "pmaddubsw %%xmm4,%%xmm1 \n"
1760 "pmaddubsw %%xmm4,%%xmm2 \n"
1761 "pmaddubsw %%xmm4,%%xmm3 \n"
1762 "lea " MEMLEA(0x40,0) ",%0 \n"
1763 "phaddw %%xmm1,%%xmm0 \n"
1764 "phaddw %%xmm3,%%xmm2 \n"
1765 "psrlw $0x7,%%xmm0 \n"
1766 "psrlw $0x7,%%xmm2 \n"
1767 "packuswb %%xmm2,%%xmm0 \n"
1768 "paddb %%xmm5,%%xmm0 \n"
1769 "sub $0x10,%2 \n"
1770 "movdqu %%xmm0," MEMACCESS(1) " \n"
1771 "lea " MEMLEA(0x10,1) ",%1 \n"
1772 "jg 1b \n"
1773 : "+r"(src_rgba), // %0
1774 "+r"(dst_y), // %1
1775 "+r"(pix) // %2
1776 : "m"(kRGBAToY), // %3
1777 "m"(kAddY16) // %4
1778 : "memory", "cc"
1779 #if defined(__SSE2__)
1780 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1781 #endif
1782 );
1783 }
1784
1785 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1786 uint8* dst_u, uint8* dst_v, int width) {
1787 asm volatile (
1788 "movdqa %0,%%xmm4 \n"
1789 "movdqa %1,%%xmm3 \n"
1790 "movdqa %2,%%xmm5 \n"
1791 :
1792 : "m"(kABGRToU), // %0
1793 "m"(kABGRToV), // %1
1794 "m"(kAddUV128) // %2
1795 );
1796 asm volatile (
1797 "sub %1,%2 \n"
1798 LABELALIGN
1799 "1: \n"
1800 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
1801 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1802 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1803 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1804 BUNDLEALIGN
1805 MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
1806 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
1807 MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
1808 MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
1809 "lea " MEMLEA(0x40,0) ",%0 \n"
1810 "movdqa %%xmm0,%%xmm7 \n"
1811 "shufps $0x88,%%xmm1,%%xmm0 \n"
1812 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1813 "pavgb %%xmm7,%%xmm0 \n"
1814 "movdqa %%xmm2,%%xmm7 \n"
1815 "shufps $0x88,%%xmm6,%%xmm2 \n"
1816 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1817 "pavgb %%xmm7,%%xmm2 \n"
1818 "movdqa %%xmm0,%%xmm1 \n"
1819 "movdqa %%xmm2,%%xmm6 \n"
1820 "pmaddubsw %%xmm4,%%xmm0 \n"
1821 "pmaddubsw %%xmm4,%%xmm2 \n"
1822 "pmaddubsw %%xmm3,%%xmm1 \n"
1823 "pmaddubsw %%xmm3,%%xmm6 \n"
1824 "phaddw %%xmm2,%%xmm0 \n"
1825 "phaddw %%xmm6,%%xmm1 \n"
1826 "psraw $0x8,%%xmm0 \n"
1827 "psraw $0x8,%%xmm1 \n"
1828 "packsswb %%xmm1,%%xmm0 \n"
1829 "paddb %%xmm5,%%xmm0 \n"
1830 "sub $0x10,%3 \n"
1831 "movlps %%xmm0," MEMACCESS(1) " \n"
1832 BUNDLEALIGN
1833 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1834 "lea " MEMLEA(0x8,1) ",%1 \n"
1835 "jg 1b \n"
1836 : "+r"(src_abgr0), // %0
1837 "+r"(dst_u), // %1
1838 "+r"(dst_v), // %2
1839 "+rm"(width) // %3
1840 : "r"((intptr_t)(src_stride_abgr)) // %4
1841 : "memory", "cc"
1842 #if defined(__native_client__) && defined(__x86_64__)
1843 , "r14"
1844 #endif
1845 #if defined(__SSE2__)
1846 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1847 #endif
1848 );
1849 }
1850
1851 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1852 uint8* dst_u, uint8* dst_v, int width) {
1853 asm volatile (
1854 "movdqa %0,%%xmm4 \n"
1855 "movdqa %1,%%xmm3 \n"
1856 "movdqa %2,%%xmm5 \n"
1857 :
1858 : "m"(kABGRToU), // %0
1859 "m"(kABGRToV), // %1
1860 "m"(kAddUV128) // %2
1861 );
1862 asm volatile (
1863 "sub %1,%2 \n"
1864 LABELALIGN
1865 "1: \n"
1866 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1867 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1868 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1869 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1870 BUNDLEALIGN
1871 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1872 "pavgb %%xmm7,%%xmm0 \n"
1873 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1874 "pavgb %%xmm7,%%xmm1 \n"
1875 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1876 "pavgb %%xmm7,%%xmm2 \n"
1877 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1878 "pavgb %%xmm7,%%xmm6 \n"
1879 "lea " MEMLEA(0x40,0) ",%0 \n"
1880 "movdqa %%xmm0,%%xmm7 \n"
1881 "shufps $0x88,%%xmm1,%%xmm0 \n"
1882 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1883 "pavgb %%xmm7,%%xmm0 \n"
1884 "movdqa %%xmm2,%%xmm7 \n"
1885 "shufps $0x88,%%xmm6,%%xmm2 \n"
1886 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1887 "pavgb %%xmm7,%%xmm2 \n"
1888 "movdqa %%xmm0,%%xmm1 \n"
1889 "movdqa %%xmm2,%%xmm6 \n"
1890 "pmaddubsw %%xmm4,%%xmm0 \n"
1891 "pmaddubsw %%xmm4,%%xmm2 \n"
1892 "pmaddubsw %%xmm3,%%xmm1 \n"
1893 "pmaddubsw %%xmm3,%%xmm6 \n"
1894 "phaddw %%xmm2,%%xmm0 \n"
1895 "phaddw %%xmm6,%%xmm1 \n"
1896 "psraw $0x8,%%xmm0 \n"
1897 "psraw $0x8,%%xmm1 \n"
1898 "packsswb %%xmm1,%%xmm0 \n"
1899 "paddb %%xmm5,%%xmm0 \n"
1900 "sub $0x10,%3 \n"
1901 "movlps %%xmm0," MEMACCESS(1) " \n"
1902 BUNDLEALIGN
1903 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1904 "lea " MEMLEA(0x8,1) ",%1 \n"
1905 "jg 1b \n"
1906 : "+r"(src_abgr0), // %0
1907 "+r"(dst_u), // %1
1908 "+r"(dst_v), // %2
1909 "+rm"(width) // %3
1910 : "r"((intptr_t)(src_stride_abgr)) // %4
1911 : "memory", "cc"
1912 #if defined(__native_client__) && defined(__x86_64__)
1913 , "r14"
1914 #endif
1915 #if defined(__SSE2__)
1916 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1917 #endif
1918 );
1919 }
1920
1921 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1922 uint8* dst_u, uint8* dst_v, int width) {
1923 asm volatile (
1924 "movdqa %0,%%xmm4 \n"
1925 "movdqa %1,%%xmm3 \n"
1926 "movdqa %2,%%xmm5 \n"
1927 :
1928 : "m"(kRGBAToU), // %0
1929 "m"(kRGBAToV), // %1
1930 "m"(kAddUV128) // %2
1931 );
1932 asm volatile (
1933 "sub %1,%2 \n"
1934 LABELALIGN
1935 "1: \n"
1936 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
1937 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1938 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1939 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1940 BUNDLEALIGN
1941 MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
1942 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
1943 MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
1944 MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
1945 "lea " MEMLEA(0x40,0) ",%0 \n"
1946 "movdqa %%xmm0,%%xmm7 \n"
1947 "shufps $0x88,%%xmm1,%%xmm0 \n"
1948 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1949 "pavgb %%xmm7,%%xmm0 \n"
1950 "movdqa %%xmm2,%%xmm7 \n"
1951 "shufps $0x88,%%xmm6,%%xmm2 \n"
1952 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1953 "pavgb %%xmm7,%%xmm2 \n"
1954 "movdqa %%xmm0,%%xmm1 \n"
1955 "movdqa %%xmm2,%%xmm6 \n"
1956 "pmaddubsw %%xmm4,%%xmm0 \n"
1957 "pmaddubsw %%xmm4,%%xmm2 \n"
1958 "pmaddubsw %%xmm3,%%xmm1 \n"
1959 "pmaddubsw %%xmm3,%%xmm6 \n"
1960 "phaddw %%xmm2,%%xmm0 \n"
1961 "phaddw %%xmm6,%%xmm1 \n"
1962 "psraw $0x8,%%xmm0 \n"
1963 "psraw $0x8,%%xmm1 \n"
1964 "packsswb %%xmm1,%%xmm0 \n"
1965 "paddb %%xmm5,%%xmm0 \n"
1966 "sub $0x10,%3 \n"
1967 "movlps %%xmm0," MEMACCESS(1) " \n"
1968 BUNDLEALIGN
1969 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1970 "lea " MEMLEA(0x8,1) ",%1 \n"
1971 "jg 1b \n"
1972 : "+r"(src_rgba0), // %0
1973 "+r"(dst_u), // %1
1974 "+r"(dst_v), // %2
1975 "+rm"(width) // %3
1976 : "r"((intptr_t)(src_stride_rgba))
1977 : "memory", "cc"
1978 #if defined(__native_client__) && defined(__x86_64__)
1979 , "r14"
1980 #endif
1981 #if defined(__SSE2__)
1982 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1983 #endif
1984 );
1985 }
1986
1987 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1988 uint8* dst_u, uint8* dst_v, int width) {
1989 asm volatile (
1990 "movdqa %0,%%xmm4 \n"
1991 "movdqa %1,%%xmm3 \n"
1992 "movdqa %2,%%xmm5 \n"
1993 :
1994 : "m"(kRGBAToU), // %0
1995 "m"(kRGBAToV), // %1
1996 "m"(kAddUV128) // %2
1997 );
1998 asm volatile (
1999 "sub %1,%2 \n"
2000 LABELALIGN
2001 "1: \n"
2002 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2003 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2004 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
2005 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
2006 BUNDLEALIGN
2007 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
2008 "pavgb %%xmm7,%%xmm0 \n"
2009 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
2010 "pavgb %%xmm7,%%xmm1 \n"
2011 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
2012 "pavgb %%xmm7,%%xmm2 \n"
2013 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
2014 "pavgb %%xmm7,%%xmm6 \n"
2015 "lea " MEMLEA(0x40,0) ",%0 \n"
2016 "movdqa %%xmm0,%%xmm7 \n"
2017 "shufps $0x88,%%xmm1,%%xmm0 \n"
2018 "shufps $0xdd,%%xmm1,%%xmm7 \n"
2019 "pavgb %%xmm7,%%xmm0 \n"
2020 "movdqa %%xmm2,%%xmm7 \n"
2021 "shufps $0x88,%%xmm6,%%xmm2 \n"
2022 "shufps $0xdd,%%xmm6,%%xmm7 \n"
2023 "pavgb %%xmm7,%%xmm2 \n"
2024 "movdqa %%xmm0,%%xmm1 \n"
2025 "movdqa %%xmm2,%%xmm6 \n"
2026 "pmaddubsw %%xmm4,%%xmm0 \n"
2027 "pmaddubsw %%xmm4,%%xmm2 \n"
2028 "pmaddubsw %%xmm3,%%xmm1 \n"
2029 "pmaddubsw %%xmm3,%%xmm6 \n"
2030 "phaddw %%xmm2,%%xmm0 \n"
2031 "phaddw %%xmm6,%%xmm1 \n"
2032 "psraw $0x8,%%xmm0 \n"
2033 "psraw $0x8,%%xmm1 \n"
2034 "packsswb %%xmm1,%%xmm0 \n"
2035 "paddb %%xmm5,%%xmm0 \n"
2036 "sub $0x10,%3 \n"
2037 "movlps %%xmm0," MEMACCESS(1) " \n"
2038 BUNDLEALIGN
2039 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
2040 "lea " MEMLEA(0x8,1) ",%1 \n"
2041 "jg 1b \n"
2042 : "+r"(src_rgba0), // %0
2043 "+r"(dst_u), // %1
2044 "+r"(dst_v), // %2
2045 "+rm"(width) // %3
2046 : "r"((intptr_t)(src_stride_rgba)) // %4
2047 : "memory", "cc"
2048 #if defined(__native_client__) && defined(__x86_64__)
2049 , "r14"
2050 #endif
2051 #if defined(__SSE2__)
2052 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
2053 #endif
2054 );
2055 }
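
// A scalar sketch of the per-pixel UV math used by the *ToUVRow functions in
// this block (reference only; this helper is not part of this file). c0..c2
// are the three color bytes of a 2x2-averaged pixel in memory order; ku and
// kv are the first three entries of the matching kXXXToU/kXXXToV table (the
// fourth, alpha, coefficient is zero). Saturation is omitted for brevity.
static void ToUVPixelReference(int c0, int c1, int c2,
                               const int8* ku, const int8* kv,
                               uint8* dst_u, uint8* dst_v) {
  *dst_u = (uint8)(((ku[0] * c0 + ku[1] * c1 + ku[2] * c2) >> 8) + 128);
  *dst_v = (uint8)(((kv[0] * c0 + kv[1] * c1 + kv[2] * c2) >> 8) + 128);
}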
2056 #endif // HAS_ARGBTOUVROW_SSSE3
2057
2058 #ifdef HAS_I422TOARGBROW_SSSE3
2059 #define UB 127 /* 2.018 * 64 = ~129, saturated to the int8 max of 127 */
2060 #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
2061 #define UR 0
2062
2063 #define VB 0
2064 #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
2065 #define VR 102 /* (int8)(1.596 * 64 + 0.5) */
2066
2067 // Bias
2068 #define BB (UB * 128 + VB * 128)
2069 #define BG (UG * 128 + VG * 128)
2070 #define BR (UR * 128 + VR * 128)
2071
2072 #define YG 74 /* (int8)(1.164 * 64 + 0.5) */
2073
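// A minimal scalar sketch of the fixed-point conversion performed by the
// YUVTORGB macro below (reference only; these helpers are not part of this
// file). The kUVBias* constants fold the -128 chroma offset; packuswb
// provides the 0..255 clamp.
static inline uint8 ClampReference(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static inline void YuvPixelReference(uint8 y, uint8 u, uint8 v,
                                     uint8* b, uint8* g, uint8* r) {
  int y1 = ((int)y - 16) * YG;  // kYSub16 then kYToRgb
  *b = ClampReference((UB * (u - 128) + VB * (v - 128) + y1) >> 6);
  *g = ClampReference((UG * (u - 128) + VG * (v - 128) + y1) >> 6);
  *r = ClampReference((UR * (u - 128) + VR * (v - 128) + y1) >> 6);
}
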
2074 struct {
2075 vec8 kUVToB; // 0
2076 vec8 kUVToG; // 16
2077 vec8 kUVToR; // 32
2078 vec16 kUVBiasB; // 48
2079 vec16 kUVBiasG; // 64
2080 vec16 kUVBiasR; // 80
2081 vec16 kYSub16; // 96
2082 vec16 kYToRgb; // 112
2083 vec8 kVUToB; // 128
2084 vec8 kVUToG; // 144
2085 vec8 kVUToR; // 160
2086 } static SIMD_ALIGNED(kYuvConstants) = {
2087 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
2088 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
2089 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
2090 { BB, BB, BB, BB, BB, BB, BB, BB },
2091 { BG, BG, BG, BG, BG, BG, BG, BG },
2092 { BR, BR, BR, BR, BR, BR, BR, BR },
2093 { 16, 16, 16, 16, 16, 16, 16, 16 },
2094 { YG, YG, YG, YG, YG, YG, YG, YG },
2095 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
2096 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
2097 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
2098 };
2099
2100
2101 // Read 8 UV from 444
2102 #define READYUV444 \
2103 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
2104 BUNDLEALIGN \
2105 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
2106 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
2107 "punpcklbw %%xmm1,%%xmm0 \n"
2108
2109 // Read 4 UV from 422, upsample to 8 UV
2110 #define READYUV422 \
2111 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
2112 BUNDLEALIGN \
2113 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
2114 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
2115 "punpcklbw %%xmm1,%%xmm0 \n" \
2116 "punpcklwd %%xmm0,%%xmm0 \n"
2117
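// Scalar sketch of the chroma upsample in READYUV422 (reference only; this
// helper is not part of this file): each U,V pair is replicated to two
// adjacent pixels. READYUV411 below replicates each pair 4x, and READNV12
// reads already interleaved U,V pairs.
static void UpsampleUV422Reference(const uint8* u, const uint8* v,
                                   uint8* uv8) {  // out: 8 interleaved pairs
  for (int i = 0; i < 4; ++i) {
    uv8[4 * i + 0] = u[i];
    uv8[4 * i + 1] = v[i];
    uv8[4 * i + 2] = u[i];  // same chroma for the second pixel of the pair
    uv8[4 * i + 3] = v[i];
  }
}
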
2118 // Read 2 UV from 411, upsample to 8 UV
2119 #define READYUV411 \
2120 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
2121 BUNDLEALIGN \
2122 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
2123 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
2124 "punpcklbw %%xmm1,%%xmm0 \n" \
2125 "punpcklwd %%xmm0,%%xmm0 \n" \
2126 "punpckldq %%xmm0,%%xmm0 \n"
2127
2128 // Read 4 UV from NV12, upsample to 8 UV
2129 #define READNV12 \
2130 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
2131 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
2132 "punpcklwd %%xmm0,%%xmm0 \n"
2133
2134 // Convert 8 pixels: 8 UV and 8 Y
2135 #define YUVTORGB \
2136 "movdqa %%xmm0,%%xmm1 \n" \
2137 "movdqa %%xmm0,%%xmm2 \n" \
2138 "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \
2139 "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \
2140 "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \
2141 "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
2142 "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
2143 "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
2144 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
2145 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
2146 "punpcklbw %%xmm4,%%xmm3 \n" \
2147 "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
2148 "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
2149 "paddsw %%xmm3,%%xmm0 \n" \
2150 "paddsw %%xmm3,%%xmm1 \n" \
2151 "paddsw %%xmm3,%%xmm2 \n" \
2152 "psraw $0x6,%%xmm0 \n" \
2153 "psraw $0x6,%%xmm1 \n" \
2154 "psraw $0x6,%%xmm2 \n" \
2155 "packuswb %%xmm0,%%xmm0 \n" \
2156 "packuswb %%xmm1,%%xmm1 \n" \
2157 "packuswb %%xmm2,%%xmm2 \n"
2158
2159 // Convert 8 pixels: 8 VU and 8 Y
2160 #define YVUTORGB \
2161 "movdqa %%xmm0,%%xmm1 \n" \
2162 "movdqa %%xmm0,%%xmm2 \n" \
2163 "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \
2164 "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \
2165 "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \
2166 "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
2167 "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
2168 "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
2169 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
2170 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
2171 "punpcklbw %%xmm4,%%xmm3 \n" \
2172 "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
2173 "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
2174 "paddsw %%xmm3,%%xmm0 \n" \
2175 "paddsw %%xmm3,%%xmm1 \n" \
2176 "paddsw %%xmm3,%%xmm2 \n" \
2177 "psraw $0x6,%%xmm0 \n" \
2178 "psraw $0x6,%%xmm1 \n" \
2179 "psraw $0x6,%%xmm2 \n" \
2180 "packuswb %%xmm0,%%xmm0 \n" \
2181 "packuswb %%xmm1,%%xmm1 \n" \
2182 "packuswb %%xmm2,%%xmm2 \n"
2183
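// Scalar sketch of the interleave step each function below performs after
// YUVTORGB (reference only; this helper is not part of this file). Stored
// byte order for ARGB on little endian is B,G,R,A, which is what the
// punpcklbw/punpcklwd sequence produces.
static void WeaveARGBReference(const uint8* b, const uint8* g, const uint8* r,
                               uint8* dst_argb, int n) {
  for (int i = 0; i < n; ++i) {
    dst_argb[4 * i + 0] = b[i];
    dst_argb[4 * i + 1] = g[i];
    dst_argb[4 * i + 2] = r[i];
    dst_argb[4 * i + 3] = 255;  // opaque alpha from the pcmpeqb %xmm5 mask
  }
}
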
2184 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
2185 const uint8* u_buf,
2186 const uint8* v_buf,
2187 uint8* dst_argb,
2188 int width) {
2189 asm volatile (
2190 "sub %[u_buf],%[v_buf] \n"
2191 "pcmpeqb %%xmm5,%%xmm5 \n"
2192 "pxor %%xmm4,%%xmm4 \n"
2193 LABELALIGN
2194 "1: \n"
2195 READYUV444
2196 YUVTORGB
2197 "punpcklbw %%xmm1,%%xmm0 \n"
2198 "punpcklbw %%xmm5,%%xmm2 \n"
2199 "movdqa %%xmm0,%%xmm1 \n"
2200 "punpcklwd %%xmm2,%%xmm0 \n"
2201 "punpckhwd %%xmm2,%%xmm1 \n"
2202 "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n"
2203 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
2204 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
2205 "sub $0x8,%[width] \n"
2206 "jg 1b \n"
2207 : [y_buf]"+r"(y_buf), // %[y_buf]
2208 [u_buf]"+r"(u_buf), // %[u_buf]
2209 [v_buf]"+r"(v_buf), // %[v_buf]
2210 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2211 [width]"+rm"(width) // %[width]
2212 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2213 : "memory", "cc"
2214 #if defined(__native_client__) && defined(__x86_64__)
2215 , "r14"
2216 #endif
2217 #if defined(__SSE2__)
2218 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2219 #endif
2220 );
2221 }
2222
2223 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
2224 const uint8* u_buf,
2225 const uint8* v_buf,
2226 uint8* dst_rgb24,
2227 int width) {
2228 // With -fPIC, 32-bit gcc 4.2 on OS X runs out of GPR regs, so the
2228 // shuffle constants are preloaded in a separate asm block.
2229 #if defined(__i386__)
2230 asm volatile (
2231 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
2232 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
2233 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
2234 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
2235 #endif
2236
2237 asm volatile (
2238 #if !defined(__i386__)
2239 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
2240 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
2241 #endif
2242 "sub %[u_buf],%[v_buf] \n"
2243 "pxor %%xmm4,%%xmm4 \n"
2244 LABELALIGN
2245 "1: \n"
2246 READYUV422
2247 YUVTORGB
2248 "punpcklbw %%xmm1,%%xmm0 \n"
2249 "punpcklbw %%xmm2,%%xmm2 \n"
2250 "movdqa %%xmm0,%%xmm1 \n"
2251 "punpcklwd %%xmm2,%%xmm0 \n"
2252 "punpckhwd %%xmm2,%%xmm1 \n"
2253 "pshufb %%xmm5,%%xmm0 \n"
2254 "pshufb %%xmm6,%%xmm1 \n"
2255 "palignr $0xc,%%xmm0,%%xmm1 \n"
2256 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
2257 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
2258 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
2259 "sub $0x8,%[width] \n"
2260 "jg 1b \n"
2261 : [y_buf]"+r"(y_buf), // %[y_buf]
2262 [u_buf]"+r"(u_buf), // %[u_buf]
2263 [v_buf]"+r"(v_buf), // %[v_buf]
2264 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
2265 [width]"+rm"(width) // %[width]
2266 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
2267 #if !defined(__i386__)
2268 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
2269 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
2270 #endif
2271 : "memory", "cc"
2272 #if defined(__native_client__) && defined(__x86_64__)
2273 , "r14"
2274 #endif
2275 #if defined(__SSE2__)
2276 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2277 #endif
2278 );
2279 }
2280
2281 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
2282 const uint8* u_buf,
2283 const uint8* v_buf,
2284 uint8* dst_raw,
2285 int width) {
2286 // With -fPIC, 32-bit gcc 4.2 on OS X runs out of GPR regs, so the
2286 // shuffle constants are preloaded in a separate asm block.
2287 #if defined(__i386__)
2288 asm volatile (
2289 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
2290 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
2291 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
2292 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
2293 #endif
2294
2295 asm volatile (
2296 #if !defined(__i386__)
2297 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
2298 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
2299 #endif
2300 "sub %[u_buf],%[v_buf] \n"
2301 "pxor %%xmm4,%%xmm4 \n"
2302 LABELALIGN
2303 "1: \n"
2304 READYUV422
2305 YUVTORGB
2306 "punpcklbw %%xmm1,%%xmm0 \n"
2307 "punpcklbw %%xmm2,%%xmm2 \n"
2308 "movdqa %%xmm0,%%xmm1 \n"
2309 "punpcklwd %%xmm2,%%xmm0 \n"
2310 "punpckhwd %%xmm2,%%xmm1 \n"
2311 "pshufb %%xmm5,%%xmm0 \n"
2312 "pshufb %%xmm6,%%xmm1 \n"
2313 "palignr $0xc,%%xmm0,%%xmm1 \n"
2314 "movq %%xmm0," MEMACCESS([dst_raw]) " \n"
2315 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
2316 "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
2317 "sub $0x8,%[width] \n"
2318 "jg 1b \n"
2319 : [y_buf]"+r"(y_buf), // %[y_buf]
2320 [u_buf]"+r"(u_buf), // %[u_buf]
2321 [v_buf]"+r"(v_buf), // %[v_buf]
2322 [dst_raw]"+r"(dst_raw), // %[dst_raw]
2323 [width]"+rm"(width) // %[width]
2324 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
2325 #if !defined(__i386__)
2326 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
2327 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
2328 #endif
2329 : "memory", "cc"
2330 #if defined(__native_client__) && defined(__x86_64__)
2331 , "r14"
2332 #endif
2333 #if defined(__SSE2__)
2334 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2335 #endif
2336 );
2337 }
2338
2339 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
2340 const uint8* u_buf,
2341 const uint8* v_buf,
2342 uint8* dst_argb,
2343 int width) {
2344 asm volatile (
2345 "sub %[u_buf],%[v_buf] \n"
2346 "pcmpeqb %%xmm5,%%xmm5 \n"
2347 "pxor %%xmm4,%%xmm4 \n"
2348 LABELALIGN
2349 "1: \n"
2350 READYUV422
2351 YUVTORGB
2352 "punpcklbw %%xmm1,%%xmm0 \n"
2353 "punpcklbw %%xmm5,%%xmm2 \n"
2354 "movdqa %%xmm0,%%xmm1 \n"
2355 "punpcklwd %%xmm2,%%xmm0 \n"
2356 "punpckhwd %%xmm2,%%xmm1 \n"
2357 "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
2358 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
2359 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
2360 "sub $0x8,%[width] \n"
2361 "jg 1b \n"
2362 : [y_buf]"+r"(y_buf), // %[y_buf]
2363 [u_buf]"+r"(u_buf), // %[u_buf]
2364 [v_buf]"+r"(v_buf), // %[v_buf]
2365 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2366 [width]"+rm"(width) // %[width]
2367 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2368 : "memory", "cc"
2369 #if defined(__native_client__) && defined(__x86_64__)
2370 , "r14"
2371 #endif
2372 #if defined(__SSE2__)
2373 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2374 #endif
2375 );
2376 }
2377
2378 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
2379 const uint8* u_buf,
2380 const uint8* v_buf,
2381 uint8* dst_argb,
2382 int width) {
2383 asm volatile (
2384 "sub %[u_buf],%[v_buf] \n"
2385 "pcmpeqb %%xmm5,%%xmm5 \n"
2386 "pxor %%xmm4,%%xmm4 \n"
2387 LABELALIGN
2388 "1: \n"
2389 READYUV411
2390 YUVTORGB
2391 "punpcklbw %%xmm1,%%xmm0 \n"
2392 "punpcklbw %%xmm5,%%xmm2 \n"
2393 "movdqa %%xmm0,%%xmm1 \n"
2394 "punpcklwd %%xmm2,%%xmm0 \n"
2395 "punpckhwd %%xmm2,%%xmm1 \n"
2396 "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
2397 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
2398 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
2399 "sub $0x8,%[width] \n"
2400 "jg 1b \n"
2401 : [y_buf]"+r"(y_buf), // %[y_buf]
2402 [u_buf]"+r"(u_buf), // %[u_buf]
2403 [v_buf]"+r"(v_buf), // %[v_buf]
2404 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2405 [width]"+rm"(width) // %[width]
2406 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2407 : "memory", "cc"
2408 #if defined(__native_client__) && defined(__x86_64__)
2409 , "r14"
2410 #endif
2411 #if defined(__SSE2__)
2412 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2413 #endif
2414 );
2415 }
2416
2417 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
2418 const uint8* uv_buf,
2419 uint8* dst_argb,
2420 int width) {
2421 asm volatile (
2422 "pcmpeqb %%xmm5,%%xmm5 \n"
2423 "pxor %%xmm4,%%xmm4 \n"
2424 LABELALIGN
2425 "1: \n"
2426 READNV12
2427 YUVTORGB
2428 "punpcklbw %%xmm1,%%xmm0 \n"
2429 "punpcklbw %%xmm5,%%xmm2 \n"
2430 "movdqa %%xmm0,%%xmm1 \n"
2431 "punpcklwd %%xmm2,%%xmm0 \n"
2432 "punpckhwd %%xmm2,%%xmm1 \n"
2433 "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
2434 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
2435 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
2436 "sub $0x8,%[width] \n"
2437 "jg 1b \n"
2438 : [y_buf]"+r"(y_buf), // %[y_buf]
2439 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2440 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2441 [width]"+rm"(width) // %[width]
2442 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2443 : "memory", "cc"
2444 // Does not use r14.
2445 #if defined(__SSE2__)
2446 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2447 #endif
2448 );
2449 }
2450
2451 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
2452 const uint8* uv_buf,
2453 uint8* dst_argb,
2454 int width) {
2455 asm volatile (
2456 "pcmpeqb %%xmm5,%%xmm5 \n"
2457 "pxor %%xmm4,%%xmm4 \n"
2458 LABELALIGN
2459 "1: \n"
2460 READNV12
2461 YVUTORGB
2462 "punpcklbw %%xmm1,%%xmm0 \n"
2463 "punpcklbw %%xmm5,%%xmm2 \n"
2464 "movdqa %%xmm0,%%xmm1 \n"
2465 "punpcklwd %%xmm2,%%xmm0 \n"
2466 "punpckhwd %%xmm2,%%xmm1 \n"
2467 "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
2468 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
2469 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
2470 "sub $0x8,%[width] \n"
2471 "jg 1b \n"
2472 : [y_buf]"+r"(y_buf), // %[y_buf]
2473 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2474 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2475 [width]"+rm"(width) // %[width]
2476 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2477 : "memory", "cc"
2478 // Does not use r14.
2479 #if defined(__SSE2__)
2480 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2481 #endif
2482 );
2483 }
2484
2485 void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2486 const uint8* u_buf,
2487 const uint8* v_buf,
2488 uint8* dst_argb,
2489 int width) {
2490 asm volatile (
2491 "sub %[u_buf],%[v_buf] \n"
2492 "pcmpeqb %%xmm5,%%xmm5 \n"
2493 "pxor %%xmm4,%%xmm4 \n"
2494 LABELALIGN
2495 "1: \n"
2496 READYUV444
2497 YUVTORGB
2498 "punpcklbw %%xmm1,%%xmm0 \n"
2499 "punpcklbw %%xmm5,%%xmm2 \n"
2500 "movdqa %%xmm0,%%xmm1 \n"
2501 "punpcklwd %%xmm2,%%xmm0 \n"
2502 "punpckhwd %%xmm2,%%xmm1 \n"
2503 "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
2504 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
2505 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
2506 "sub $0x8,%[width] \n"
2507 "jg 1b \n"
2508 : [y_buf]"+r"(y_buf), // %[y_buf]
2509 [u_buf]"+r"(u_buf), // %[u_buf]
2510 [v_buf]"+r"(v_buf), // %[v_buf]
2511 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2512 [width]"+rm"(width) // %[width]
2513 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2514 : "memory", "cc"
2515 #if defined(__native_client__) && defined(__x86_64__)
2516 , "r14"
2517 #endif
2518 #if defined(__SSE2__)
2519 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2520 #endif
2521 );
2522 }
2523
2524 void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2525 const uint8* u_buf,
2526 const uint8* v_buf,
2527 uint8* dst_argb,
2528 int width) {
2529 asm volatile (
2530 "sub %[u_buf],%[v_buf] \n"
2531 "pcmpeqb %%xmm5,%%xmm5 \n"
2532 "pxor %%xmm4,%%xmm4 \n"
2533 LABELALIGN
2534 "1: \n"
2535 READYUV422
2536 YUVTORGB
2537 "punpcklbw %%xmm1,%%xmm0 \n"
2538 "punpcklbw %%xmm5,%%xmm2 \n"
2539 "movdqa %%xmm0,%%xmm1 \n"
2540 "punpcklwd %%xmm2,%%xmm0 \n"
2541 "punpckhwd %%xmm2,%%xmm1 \n"
2542 "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
2543 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
2544 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
2545 "sub $0x8,%[width] \n"
2546 "jg 1b \n"
2547 : [y_buf]"+r"(y_buf), // %[y_buf]
2548 [u_buf]"+r"(u_buf), // %[u_buf]
2549 [v_buf]"+r"(v_buf), // %[v_buf]
2550 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2551 [width]"+rm"(width) // %[width]
2552 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2553 : "memory", "cc"
2554 #if defined(__native_client__) && defined(__x86_64__)
2555 , "r14"
2556 #endif
2557 #if defined(__SSE2__)
2558 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2559 #endif
2560 );
2561 }
2562
2563 void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2564 const uint8* u_buf,
2565 const uint8* v_buf,
2566 uint8* dst_argb,
2567 int width) {
2568 asm volatile (
2569 "sub %[u_buf],%[v_buf] \n"
2570 "pcmpeqb %%xmm5,%%xmm5 \n"
2571 "pxor %%xmm4,%%xmm4 \n"
2572 LABELALIGN
2573 "1: \n"
2574 READYUV411
2575 YUVTORGB
2576 "punpcklbw %%xmm1,%%xmm0 \n"
2577 "punpcklbw %%xmm5,%%xmm2 \n"
2578 "movdqa %%xmm0,%%xmm1 \n"
2579 "punpcklwd %%xmm2,%%xmm0 \n"
2580 "punpckhwd %%xmm2,%%xmm1 \n"
2581 "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
2582 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
2583 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
2584 "sub $0x8,%[width] \n"
2585 "jg 1b \n"
2586 : [y_buf]"+r"(y_buf), // %[y_buf]
2587 [u_buf]"+r"(u_buf), // %[u_buf]
2588 [v_buf]"+r"(v_buf), // %[v_buf]
2589 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2590 [width]"+rm"(width) // %[width]
2591 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2592 : "memory", "cc"
2593 #if defined(__native_client__) && defined(__x86_64__)
2594 , "r14"
2595 #endif
2596 #if defined(__SSE2__)
2597 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2598 #endif
2599 );
2600 }
2601
2602 void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2603 const uint8* uv_buf,
2604 uint8* dst_argb,
2605 int width) {
2606 asm volatile (
2607 "pcmpeqb %%xmm5,%%xmm5 \n"
2608 "pxor %%xmm4,%%xmm4 \n"
2609 LABELALIGN
2610 "1: \n"
2611 READNV12
2612 YUVTORGB
2613 "punpcklbw %%xmm1,%%xmm0 \n"
2614 "punpcklbw %%xmm5,%%xmm2 \n"
2615 "movdqa %%xmm0,%%xmm1 \n"
2616 "punpcklwd %%xmm2,%%xmm0 \n"
2617 "punpckhwd %%xmm2,%%xmm1 \n"
2618 "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
2619 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
2620 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
2621 "sub $0x8,%[width] \n"
2622 "jg 1b \n"
2623 : [y_buf]"+r"(y_buf), // %[y_buf]
2624 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2625 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2626 [width]"+rm"(width) // %[width]
2627 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2628 : "memory", "cc"
2629 // Does not use r14.
2630 #if defined(__SSE2__)
2631 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2632 #endif
2633 );
2634 }
2635
2636 void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2637 const uint8* uv_buf,
2638 uint8* dst_argb,
2639 int width) {
2640 asm volatile (
2641 "pcmpeqb %%xmm5,%%xmm5 \n"
2642 "pxor %%xmm4,%%xmm4 \n"
2643 LABELALIGN
2644 "1: \n"
2645 READNV12
2646 YVUTORGB
2647 "punpcklbw %%xmm1,%%xmm0 \n"
2648 "punpcklbw %%xmm5,%%xmm2 \n"
2649 "movdqa %%xmm0,%%xmm1 \n"
2650 "punpcklwd %%xmm2,%%xmm0 \n"
2651 "punpckhwd %%xmm2,%%xmm1 \n"
2652 "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
2653 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
2654 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
2655 "sub $0x8,%[width] \n"
2656 "jg 1b \n"
2657 : [y_buf]"+r"(y_buf), // %[y_buf]
2658 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2659 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2660 [width]"+rm"(width) // %[width]
2661 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2662 : "memory", "cc"
2663 // Does not use r14.
2664 #if defined(__SSE2__)
2665 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2666 #endif
2667 );
2668 }
2669
2670 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
2671 const uint8* u_buf,
2672 const uint8* v_buf,
2673 uint8* dst_bgra,
2674 int width) {
2675 asm volatile (
2676 "sub %[u_buf],%[v_buf] \n"
2677 "pcmpeqb %%xmm5,%%xmm5 \n"
2678 "pxor %%xmm4,%%xmm4 \n"
2679 LABELALIGN
2680 "1: \n"
2681 READYUV422
2682 YUVTORGB
2683 "pcmpeqb %%xmm5,%%xmm5 \n"
2684 "punpcklbw %%xmm0,%%xmm1 \n"
2685 "punpcklbw %%xmm2,%%xmm5 \n"
2686 "movdqa %%xmm5,%%xmm0 \n"
2687 "punpcklwd %%xmm1,%%xmm5 \n"
2688 "punpckhwd %%xmm1,%%xmm0 \n"
2689 "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n"
2690 "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
2691 "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
2692 "sub $0x8,%[width] \n"
2693 "jg 1b \n"
2694 : [y_buf]"+r"(y_buf), // %[y_buf]
2695 [u_buf]"+r"(u_buf), // %[u_buf]
2696 [v_buf]"+r"(v_buf), // %[v_buf]
2697 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
2698 [width]"+rm"(width) // %[width]
2699 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2700 : "memory", "cc"
2701 #if defined(__native_client__) && defined(__x86_64__)
2702 , "r14"
2703 #endif
2704 #if defined(__SSE2__)
2705 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2706 #endif
2707 );
2708 }
2709
2710 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
2711 const uint8* u_buf,
2712 const uint8* v_buf,
2713 uint8* dst_abgr,
2714 int width) {
2715 asm volatile (
2716 "sub %[u_buf],%[v_buf] \n"
2717 "pcmpeqb %%xmm5,%%xmm5 \n"
2718 "pxor %%xmm4,%%xmm4 \n"
2719 LABELALIGN
2720 "1: \n"
2721 READYUV422
2722 YUVTORGB
2723 "punpcklbw %%xmm1,%%xmm2 \n"
2724 "punpcklbw %%xmm5,%%xmm0 \n"
2725 "movdqa %%xmm2,%%xmm1 \n"
2726 "punpcklwd %%xmm0,%%xmm2 \n"
2727 "punpckhwd %%xmm0,%%xmm1 \n"
2728 "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n"
2729 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
2730 "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
2731 "sub $0x8,%[width] \n"
2732 "jg 1b \n"
2733 : [y_buf]"+r"(y_buf), // %[y_buf]
2734 [u_buf]"+r"(u_buf), // %[u_buf]
2735 [v_buf]"+r"(v_buf), // %[v_buf]
2736 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
2737 [width]"+rm"(width) // %[width]
2738 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2739 : "memory", "cc"
2740 #if defined(__native_client__) && defined(__x86_64__)
2741 , "r14"
2742 #endif
2743 #if defined(__SSE2__)
2744 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2745 #endif
2746 );
2747 }
2748
2749 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
2750 const uint8* u_buf,
2751 const uint8* v_buf,
2752 uint8* dst_rgba,
2753 int width) {
2754 asm volatile (
2755 "sub %[u_buf],%[v_buf] \n"
2756 "pcmpeqb %%xmm5,%%xmm5 \n"
2757 "pxor %%xmm4,%%xmm4 \n"
2758 LABELALIGN
2759 "1: \n"
2760 READYUV422
2761 YUVTORGB
2762 "pcmpeqb %%xmm5,%%xmm5 \n"
2763 "punpcklbw %%xmm2,%%xmm1 \n"
2764 "punpcklbw %%xmm0,%%xmm5 \n"
2765 "movdqa %%xmm5,%%xmm0 \n"
2766 "punpcklwd %%xmm1,%%xmm5 \n"
2767 "punpckhwd %%xmm1,%%xmm0 \n"
2768 "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n"
2769 "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
2770 "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
2771 "sub $0x8,%[width] \n"
2772 "jg 1b \n"
2773 : [y_buf]"+r"(y_buf), // %[y_buf]
2774 [u_buf]"+r"(u_buf), // %[u_buf]
2775 [v_buf]"+r"(v_buf), // %[v_buf]
2776 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
2777 [width]"+rm"(width) // %[width]
2778 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2779 : "memory", "cc"
2780 #if defined(__native_client__) && defined(__x86_64__)
2781 , "r14"
2782 #endif
2783 #if defined(__SSE2__)
2784 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2785 #endif
2786 );
2787 }
2788
2789 void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
2790 const uint8* u_buf,
2791 const uint8* v_buf,
2792 uint8* dst_bgra,
2793 int width) {
2794 asm volatile (
2795 "sub %[u_buf],%[v_buf] \n"
2796 "pcmpeqb %%xmm5,%%xmm5 \n"
2797 "pxor %%xmm4,%%xmm4 \n"
2798 LABELALIGN
2799 "1: \n"
2800 READYUV422
2801 YUVTORGB
2802 "pcmpeqb %%xmm5,%%xmm5 \n"
2803 "punpcklbw %%xmm0,%%xmm1 \n"
2804 "punpcklbw %%xmm2,%%xmm5 \n"
2805 "movdqa %%xmm5,%%xmm0 \n"
2806 "punpcklwd %%xmm1,%%xmm5 \n"
2807 "punpckhwd %%xmm1,%%xmm0 \n"
2808 "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n"
2809 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
2810 "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
2811 "sub $0x8,%[width] \n"
2812 "jg 1b \n"
2813 : [y_buf]"+r"(y_buf), // %[y_buf]
2814 [u_buf]"+r"(u_buf), // %[u_buf]
2815 [v_buf]"+r"(v_buf), // %[v_buf]
2816 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
2817 [width]"+rm"(width) // %[width]
2818 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2819 : "memory", "cc"
2820 #if defined(__native_client__) && defined(__x86_64__)
2821 , "r14"
2822 #endif
2823 #if defined(__SSE2__)
2824 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2825 #endif
2826 );
2827 }
2828
2829 void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
2830 const uint8* u_buf,
2831 const uint8* v_buf,
2832 uint8* dst_abgr,
2833 int width) {
2834 asm volatile (
2835 "sub %[u_buf],%[v_buf] \n"
2836 "pcmpeqb %%xmm5,%%xmm5 \n"
2837 "pxor %%xmm4,%%xmm4 \n"
2838 LABELALIGN
2839 "1: \n"
2840 READYUV422
2841 YUVTORGB
2842 "punpcklbw %%xmm1,%%xmm2 \n"
2843 "punpcklbw %%xmm5,%%xmm0 \n"
2844 "movdqa %%xmm2,%%xmm1 \n"
2845 "punpcklwd %%xmm0,%%xmm2 \n"
2846 "punpckhwd %%xmm0,%%xmm1 \n"
2847 "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n"
2848 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
2849 "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
2850 "sub $0x8,%[width] \n"
2851 "jg 1b \n"
2852 : [y_buf]"+r"(y_buf), // %[y_buf]
2853 [u_buf]"+r"(u_buf), // %[u_buf]
2854 [v_buf]"+r"(v_buf), // %[v_buf]
2855 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
2856 [width]"+rm"(width) // %[width]
2857 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2858 : "memory", "cc"
2859 #if defined(__native_client__) && defined(__x86_64__)
2860 , "r14"
2861 #endif
2862 #if defined(__SSE2__)
2863 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2864 #endif
2865 );
2866 }
2867
2868 void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2869 const uint8* u_buf,
2870 const uint8* v_buf,
2871 uint8* dst_rgba,
2872 int width) {
2873 asm volatile (
2874 "sub %[u_buf],%[v_buf] \n"
2875 "pcmpeqb %%xmm5,%%xmm5 \n"
2876 "pxor %%xmm4,%%xmm4 \n"
2877 LABELALIGN
2878 "1: \n"
2879 READYUV422
2880 YUVTORGB
2881 "pcmpeqb %%xmm5,%%xmm5 \n"
2882 "punpcklbw %%xmm2,%%xmm1 \n"
2883 "punpcklbw %%xmm0,%%xmm5 \n"
2884 "movdqa %%xmm5,%%xmm0 \n"
2885 "punpcklwd %%xmm1,%%xmm5 \n"
2886 "punpckhwd %%xmm1,%%xmm0 \n"
2887 "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n"
2888 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
2889 "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
2890 "sub $0x8,%[width] \n"
2891 "jg 1b \n"
2892 : [y_buf]"+r"(y_buf), // %[y_buf]
2893 [u_buf]"+r"(u_buf), // %[u_buf]
2894 [v_buf]"+r"(v_buf), // %[v_buf]
2895 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
2896 [width]"+rm"(width) // %[width]
2897 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2898 : "memory", "cc"
2899 #if defined(__native_client__) && defined(__x86_64__)
2900 , "r14"
2901 #endif
2902 #if defined(__SSE2__)
2903 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2904 #endif
2905 );
2906 }
2907
2908 #endif // HAS_I422TOARGBROW_SSSE3
2909
2910 #ifdef HAS_YTOARGBROW_SSE2
2911 void YToARGBRow_SSE2(const uint8* y_buf,
2912 uint8* dst_argb,
2913 int width) {
2914 asm volatile (
2915 "pxor %%xmm5,%%xmm5 \n"
2916 "pcmpeqb %%xmm4,%%xmm4 \n"
2917 "pslld $0x18,%%xmm4 \n"
2918 "mov $0x00100010,%%eax \n"
2919 "movd %%eax,%%xmm3 \n"
2920 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2921 "mov $0x004a004a,%%eax \n"
2922 "movd %%eax,%%xmm2 \n"
2923 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2924 LABELALIGN
2925 "1: \n"
2926 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2927 "movq " MEMACCESS(0) ",%%xmm0 \n"
2928 "lea " MEMLEA(0x8,0) ",%0 \n"
2929 "punpcklbw %%xmm5,%%xmm0 \n"
2930 "psubusw %%xmm3,%%xmm0 \n"
2931 "pmullw %%xmm2,%%xmm0 \n"
2932 "psrlw $6, %%xmm0 \n"
2933 "packuswb %%xmm0,%%xmm0 \n"
2934
2935 // Step 2: Weave into ARGB
2936 "punpcklbw %%xmm0,%%xmm0 \n"
2937 "movdqa %%xmm0,%%xmm1 \n"
2938 "punpcklwd %%xmm0,%%xmm0 \n"
2939 "punpckhwd %%xmm1,%%xmm1 \n"
2940 "por %%xmm4,%%xmm0 \n"
2941 "por %%xmm4,%%xmm1 \n"
2942 "movdqa %%xmm0," MEMACCESS(1) " \n"
2943 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
2944 "lea " MEMLEA(0x20,1) ",%1 \n"
2945
2946 "sub $0x8,%2 \n"
2947 "jg 1b \n"
2948 : "+r"(y_buf), // %0
2949 "+r"(dst_argb), // %1
2950 "+rm"(width) // %2
2951 :
2952 : "memory", "cc", "eax"
2953 #if defined(__SSE2__)
2954 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2955 #endif
2956 );
2957 }
2958 #endif // HAS_YTOARGBROW_SSE2
2959
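// Scalar sketch of YToARGBRow (reference only; this helper is not part of
// this file): gray = clamp((y - 16) * 74 >> 6), replicated to B, G and R.
static void YToARGBReference(const uint8* y_buf, uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int g = ((y_buf[i] < 16 ? 0 : y_buf[i] - 16) * 0x4a) >> 6;  // psubusw, pmullw
    if (g > 255) g = 255;  // packuswb saturation
    dst_argb[4 * i + 0] = (uint8)g;
    dst_argb[4 * i + 1] = (uint8)g;
    dst_argb[4 * i + 2] = (uint8)g;
    dst_argb[4 * i + 3] = 255;  // alpha mask built in xmm4
  }
}
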
2960 #ifdef HAS_MIRRORROW_SSSE3
2961 // Shuffle table for reversing the bytes.
2962 static uvec8 kShuffleMirror = {
2963 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2964 };
2965
2966 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2967 intptr_t temp_width = (intptr_t)(width);
2968 asm volatile (
2969 "movdqa %3,%%xmm5 \n"
2970 "lea " MEMLEA(-0x10,0) ",%0 \n"
2971 LABELALIGN
2972 "1: \n"
2973 MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
2974 "pshufb %%xmm5,%%xmm0 \n"
2975 "sub $0x10,%2 \n"
2976 "movdqa %%xmm0," MEMACCESS(1) " \n"
2977 "lea " MEMLEA(0x10,1) ",%1 \n"
2978 "jg 1b \n"
2979 : "+r"(src), // %0
2980 "+r"(dst), // %1
2981 "+r"(temp_width) // %2
2982 : "m"(kShuffleMirror) // %3
2983 : "memory", "cc"
2984 #if defined(__native_client__) && defined(__x86_64__)
2985 , "r14"
2986 #endif
2987 #if defined(__SSE2__)
2988 , "xmm0", "xmm5"
2989 #endif
2990 );
2991 }
2992 #endif // HAS_MIRRORROW_SSSE3
2993
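// Scalar sketch of what both mirror rows compute (reference only; this
// helper is not part of this file). The SSSE3 version reverses 16 bytes per
// pshufb; the SSE2 version below builds the reversal from shifts and word
// shuffles.
static void MirrorRowReference(const uint8* src, uint8* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}
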
2994 #ifdef HAS_MIRRORROW_SSE2
2995 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2996 intptr_t temp_width = (intptr_t)(width);
2997 asm volatile (
2998 "lea " MEMLEA(-0x10,0) ",%0 \n"
2999 LABELALIGN
3000 "1: \n"
3001 MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0
3002 "movdqa %%xmm0,%%xmm1 \n"
3003 "psllw $0x8,%%xmm0 \n"
3004 "psrlw $0x8,%%xmm1 \n"
3005 "por %%xmm1,%%xmm0 \n"
3006 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
3007 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
3008 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
3009 "sub $0x10,%2 \n"
3010 "movdqu %%xmm0," MEMACCESS(1) " \n"
3011 "lea " MEMLEA(0x10,1)",%1 \n"
3012 "jg 1b \n"
3013 : "+r"(src), // %0
3014 "+r"(dst), // %1
3015 "+r"(temp_width) // %2
3016 :
3017 : "memory", "cc"
3018 #if defined(__native_client__) && defined(__x86_64__)
3019 , "r14"
3020 #endif
3021 #if defined(__SSE2__)
3022 , "xmm0", "xmm1"
3023 #endif
3024 );
3025 }
3026 #endif // HAS_MIRRORROW_SSE2
3027
3028 #ifdef HAS_MIRRORROW_UV_SSSE3
3029 // Shuffle table for reversing the UV pixels and splitting U and V into
3029 // separate halves.
3030 static uvec8 kShuffleMirrorUV = {
3031 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
3032 };
3033 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
3034 int width) {
3035 intptr_t temp_width = (intptr_t)(width);
3036 asm volatile (
3037 "movdqa %4,%%xmm1 \n"
3038 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
3039 "sub %1,%2 \n"
3040 LABELALIGN
3041 "1: \n"
3042 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
3043 "lea " MEMLEA(-0x10,0) ",%0 \n"
3044 "pshufb %%xmm1,%%xmm0 \n"
3045 "sub $8,%3 \n"
3046 "movlpd %%xmm0," MEMACCESS(1) " \n"
3047 BUNDLEALIGN
3048 MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
3049 "lea " MEMLEA(0x8,1) ",%1 \n"
3050 "jg 1b \n"
3051 : "+r"(src), // %0
3052 "+r"(dst_u), // %1
3053 "+r"(dst_v), // %2
3054 "+r"(temp_width) // %3
3055 : "m"(kShuffleMirrorUV) // %4
3056 : "memory", "cc"
3057 #if defined(__native_client__) && defined(__x86_64__)
3058 , "r14"
3059 #endif
3060 #if defined(__SSE2__)
3061 , "xmm0", "xmm1"
3062 #endif
3063 );
3064 }
3065 #endif // HAS_MIRRORROW_UV_SSSE3
3066
3067 #ifdef HAS_ARGBMIRRORROW_SSSE3
3068 // Shuffle table for reversing ARGB pixels: the dword (pixel) order is
3068 // reversed; byte order within each pixel is kept.
3069 static uvec8 kARGBShuffleMirror = {
3070 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
3071 };
3072
3073 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3074 intptr_t temp_width = (intptr_t)(width);
3075 asm volatile (
3076 "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
3077 "movdqa %3,%%xmm5 \n"
3078 LABELALIGN
3079 "1: \n"
3080 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
3081 "pshufb %%xmm5,%%xmm0 \n"
3082 "lea " MEMLEA(-0x10,0) ",%0 \n"
3083 "sub $0x4,%2 \n"
3084 "movdqa %%xmm0," MEMACCESS(1) " \n"
3085 "lea " MEMLEA(0x10,1) ",%1 \n"
3086 "jg 1b \n"
3087 : "+r"(src), // %0
3088 "+r"(dst), // %1
3089 "+r"(temp_width) // %2
3090 : "m"(kARGBShuffleMirror) // %3
3091 : "memory", "cc"
3092 #if defined(__SSE2__)
3093 , "xmm0", "xmm5"
3094 #endif
3095 );
3096 }
3097 #endif // HAS_ARGBMIRRORROW_SSSE3
3098
3099 #ifdef HAS_SPLITUVROW_SSE2
3100 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3101 asm volatile (
3102 "pcmpeqb %%xmm5,%%xmm5 \n"
3103 "psrlw $0x8,%%xmm5 \n"
3104 "sub %1,%2 \n"
3105 LABELALIGN
3106 "1: \n"
3107 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
3108 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3109 "lea " MEMLEA(0x20,0) ",%0 \n"
3110 "movdqa %%xmm0,%%xmm2 \n"
3111 "movdqa %%xmm1,%%xmm3 \n"
3112 "pand %%xmm5,%%xmm0 \n"
3113 "pand %%xmm5,%%xmm1 \n"
3114 "packuswb %%xmm1,%%xmm0 \n"
3115 "psrlw $0x8,%%xmm2 \n"
3116 "psrlw $0x8,%%xmm3 \n"
3117 "packuswb %%xmm3,%%xmm2 \n"
3118 "movdqa %%xmm0," MEMACCESS(1) " \n"
3119 MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2)
3120 "lea " MEMLEA(0x10,1) ",%1 \n"
3121 "sub $0x10,%3 \n"
3122 "jg 1b \n"
3123 : "+r"(src_uv), // %0
3124 "+r"(dst_u), // %1
3125 "+r"(dst_v), // %2
3126 "+r"(pix) // %3
3127 :
3128 : "memory", "cc"
3129 #if defined(__native_client__) && defined(__x86_64__)
3130 , "r14"
3131 #endif
3132 #if defined(__SSE2__)
3133 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3134 #endif
3135 );
3136 }
3137
3138 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
3139 int pix) {
3140 asm volatile (
3141 "pcmpeqb %%xmm5,%%xmm5 \n"
3142 "psrlw $0x8,%%xmm5 \n"
3143 "sub %1,%2 \n"
3144 LABELALIGN
3145 "1: \n"
3146 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3147 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3148 "lea " MEMLEA(0x20,0) ",%0 \n"
3149 "movdqa %%xmm0,%%xmm2 \n"
3150 "movdqa %%xmm1,%%xmm3 \n"
3151 "pand %%xmm5,%%xmm0 \n"
3152 "pand %%xmm5,%%xmm1 \n"
3153 "packuswb %%xmm1,%%xmm0 \n"
3154 "psrlw $0x8,%%xmm2 \n"
3155 "psrlw $0x8,%%xmm3 \n"
3156 "packuswb %%xmm3,%%xmm2 \n"
3157 "movdqu %%xmm0," MEMACCESS(1) " \n"
3158 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
3159 "lea " MEMLEA(0x10,1) ",%1 \n"
3160 "sub $0x10,%3 \n"
3161 "jg 1b \n"
3162 : "+r"(src_uv), // %0
3163 "+r"(dst_u), // %1
3164 "+r"(dst_v), // %2
3165 "+r"(pix) // %3
3166 :
3167 : "memory", "cc"
3168 #if defined(__native_client__) && defined(__x86_64__)
3169 , "r14"
3170 #endif
3171 #if defined(__SSE2__)
3172 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3173 #endif
3174 );
3175 }
3176 #endif // HAS_SPLITUVROW_SSE2
3177
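// Scalar sketch of SplitUV (reference only; this helper is not part of this
// file): even bytes go to U, odd bytes to V. MergeUVRow below is the exact
// inverse.
static void SplitUVReference(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                             int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[2 * i + 0];
    dst_v[i] = src_uv[2 * i + 1];
  }
}
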
3178 #ifdef HAS_MERGEUVROW_SSE2
3179 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3180 int width) {
3181 asm volatile (
3182 "sub %0,%1 \n"
3183 LABELALIGN
3184 "1: \n"
3185 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
3186 MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
3187 "lea " MEMLEA(0x10,0) ",%0 \n"
3188 "movdqa %%xmm0,%%xmm2 \n"
3189 "punpcklbw %%xmm1,%%xmm0 \n"
3190 "punpckhbw %%xmm1,%%xmm2 \n"
3191 "movdqa %%xmm0," MEMACCESS(2) " \n"
3192 "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
3193 "lea " MEMLEA(0x20,2) ",%2 \n"
3194 "sub $0x10,%3 \n"
3195 "jg 1b \n"
3196 : "+r"(src_u), // %0
3197 "+r"(src_v), // %1
3198 "+r"(dst_uv), // %2
3199 "+r"(width) // %3
3200 :
3201 : "memory", "cc"
3202 #if defined(__native_client__) && defined(__x86_64__)
3203 , "r14"
3204 #endif
3205 #if defined(__SSE2__)
3206 , "xmm0", "xmm1", "xmm2"
3207 #endif
3208 );
3209 }
3210
3211 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
3212 uint8* dst_uv, int width) {
3213 asm volatile (
3214 "sub %0,%1 \n"
3215 LABELALIGN
3216 "1: \n"
3217 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3218 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
3219 "lea " MEMLEA(0x10,0) ",%0 \n"
3220 "movdqa %%xmm0,%%xmm2 \n"
3221 "punpcklbw %%xmm1,%%xmm0 \n"
3222 "punpckhbw %%xmm1,%%xmm2 \n"
3223 "movdqu %%xmm0," MEMACCESS(2) " \n"
3224 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
3225 "lea " MEMLEA(0x20,2) ",%2 \n"
3226 "sub $0x10,%3 \n"
3227 "jg 1b \n"
3228 : "+r"(src_u), // %0
3229 "+r"(src_v), // %1
3230 "+r"(dst_uv), // %2
3231 "+r"(width) // %3
3232 :
3233 : "memory", "cc"
3234 #if defined(__native_client__) && defined(__x86_64__)
3235 , "r14"
3236 #endif
3237 #if defined(__SSE2__)
3238 , "xmm0", "xmm1", "xmm2"
3239 #endif
3240 );
3241 }
3242 #endif // HAS_MERGEUVROW_SSE2
3243
3244 #ifdef HAS_COPYROW_SSE2
3245 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3246 asm volatile (
3247 LABELALIGN
3248 "1: \n"
3249 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
3250 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3251 "lea " MEMLEA(0x20,0) ",%0 \n"
3252 "movdqa %%xmm0," MEMACCESS(1) " \n"
3253 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
3254 "lea " MEMLEA(0x20,1) ",%1 \n"
3255 "sub $0x20,%2 \n"
3256 "jg 1b \n"
3257 : "+r"(src), // %0
3258 "+r"(dst), // %1
3259 "+r"(count) // %2
3260 :
3261 : "memory", "cc"
3262 #if defined(__SSE2__)
3263 , "xmm0", "xmm1"
3264 #endif
3265 );
3266 }
3267 #endif // HAS_COPYROW_SSE2
3268
3269 #ifdef HAS_COPYROW_X86
3270 void CopyRow_X86(const uint8* src, uint8* dst, int width) {
3271 size_t width_tmp = (size_t)(width);
3272 asm volatile (
3273 "shr $0x2,%2 \n"
3274 "rep movsl " MEMMOVESTRING(0,1) " \n"
3275 : "+S"(src), // %0
3276 "+D"(dst), // %1
3277 "+c"(width_tmp) // %2
3278 :
3279 : "memory", "cc"
3280 );
3281 }
3282 #endif // HAS_COPYROW_X86
3283
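// Scalar sketch (reference only; this helper is not part of this file): both
// copy rows behave like memcpy. CopyRow_X86 above moves 4 bytes per rep
// iteration, dropping the low two bits of width; CopyRow_ERMS below moves
// one byte per iteration and handles any count.
static void CopyRowReference(const uint8* src, uint8* dst, int count) {
  for (int i = 0; i < count; ++i) {
    dst[i] = src[i];
  }
}
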
3284 #ifdef HAS_COPYROW_ERMS
3285 // Handles unaligned pointers and any width (multiple of 1 byte).
3286 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
3287 size_t width_tmp = (size_t)(width);
3288 asm volatile (
3289 "rep movsb " MEMMOVESTRING(0,1) " \n"
3290 : "+S"(src), // %0
3291 "+D"(dst), // %1
3292 "+c"(width_tmp) // %2
3293 :
3294 : "memory", "cc"
3295 );
3296 }
3297 #endif // HAS_COPYROW_ERMS
3298
3299 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3300 // width in pixels
3301 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3302 asm volatile (
3303 "pcmpeqb %%xmm0,%%xmm0 \n"
3304 "pslld $0x18,%%xmm0 \n"
3305 "pcmpeqb %%xmm1,%%xmm1 \n"
3306 "psrld $0x8,%%xmm1 \n"
3307 LABELALIGN
3308 "1: \n"
3309 "movdqa " MEMACCESS(0) ",%%xmm2 \n"
3310 "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
3311 "lea " MEMLEA(0x20,0) ",%0 \n"
3312 "movdqa " MEMACCESS(1) ",%%xmm4 \n"
3313 "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
3314 "pand %%xmm0,%%xmm2 \n"
3315 "pand %%xmm0,%%xmm3 \n"
3316 "pand %%xmm1,%%xmm4 \n"
3317 "pand %%xmm1,%%xmm5 \n"
3318 "por %%xmm4,%%xmm2 \n"
3319 "por %%xmm5,%%xmm3 \n"
3320 "movdqa %%xmm2," MEMACCESS(1) " \n"
3321 "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
3322 "lea " MEMLEA(0x20,1) ",%1 \n"
3323 "sub $0x8,%2 \n"
3324 "jg 1b \n"
3325 : "+r"(src), // %0
3326 "+r"(dst), // %1
3327 "+r"(width) // %2
3328 :
3329 : "memory", "cc"
3330 #if defined(__SSE2__)
3331 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3332 #endif
3333 );
3334 }
3335 #endif // HAS_ARGBCOPYALPHAROW_SSE2
3336
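// Scalar sketch of the alpha copy (reference only; this helper is not part
// of this file): only byte 3 of each pixel changes. ARGBCopyYToAlphaRow
// below uses the same masking pattern but sources alpha from a Y plane.
static void ARGBCopyAlphaReference(const uint8* src, uint8* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[4 * i + 3] = src[4 * i + 3];  // alpha is the high byte of each pixel
  }
}
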
3337 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3338 // width in pixels
3339 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3340 asm volatile (
3341 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
3342 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
3343 LABELALIGN
3344 "1: \n"
3345 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
3346 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
3347 "lea " MEMLEA(0x40,0) ",%0 \n"
3348 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
3349 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
3350 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
3351 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
3352 "lea " MEMLEA(0x40,1) ",%1 \n"
3353 "sub $0x10,%2 \n"
3354 "jg 1b \n"
3355 "vzeroupper \n"
3356 : "+r"(src), // %0
3357 "+r"(dst), // %1
3358 "+r"(width) // %2
3359 :
3360 : "memory", "cc"
3361 #if defined(__SSE2__)
3362 , "xmm0", "xmm1", "xmm2"
3363 #endif
3364 );
3365 }
3366 #endif // HAS_ARGBCOPYALPHAROW_AVX2
3367
3368 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3369 // width in pixels
3370 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3371 asm volatile (
3372 "pcmpeqb %%xmm0,%%xmm0 \n"
3373 "pslld $0x18,%%xmm0 \n"
3374 "pcmpeqb %%xmm1,%%xmm1 \n"
3375 "psrld $0x8,%%xmm1 \n"
3376 LABELALIGN
3377 "1: \n"
3378 "movq " MEMACCESS(0) ",%%xmm2 \n"
3379 "lea " MEMLEA(0x8,0) ",%0 \n"
3380 "punpcklbw %%xmm2,%%xmm2 \n"
3381 "punpckhwd %%xmm2,%%xmm3 \n"
3382 "punpcklwd %%xmm2,%%xmm2 \n"
3383 "movdqa " MEMACCESS(1) ",%%xmm4 \n"
3384 "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
3385 "pand %%xmm0,%%xmm2 \n"
3386 "pand %%xmm0,%%xmm3 \n"
3387 "pand %%xmm1,%%xmm4 \n"
3388 "pand %%xmm1,%%xmm5 \n"
3389 "por %%xmm4,%%xmm2 \n"
3390 "por %%xmm5,%%xmm3 \n"
3391 "movdqa %%xmm2," MEMACCESS(1) " \n"
3392 "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
3393 "lea " MEMLEA(0x20,1) ",%1 \n"
3394 "sub $0x8,%2 \n"
3395 "jg 1b \n"
3396 : "+r"(src), // %0
3397 "+r"(dst), // %1
3398 "+r"(width) // %2
3399 :
3400 : "memory", "cc"
3401 #if defined(__SSE2__)
3402 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3403 #endif
3404 );
3405 }
3406 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3407
3408 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3409 // width in pixels
3410 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3411 asm volatile (
3412 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
3413 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
3414 LABELALIGN
3415 "1: \n"
3416 "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
3417 "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
3418 "lea " MEMLEA(0x10,0) ",%0 \n"
3419 "vpslld $0x18,%%ymm1,%%ymm1 \n"
3420 "vpslld $0x18,%%ymm2,%%ymm2 \n"
3421 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
3422 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
3423 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
3424 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
3425 "lea " MEMLEA(0x40,1) ",%1 \n"
3426 "sub $0x10,%2 \n"
3427 "jg 1b \n"
3428 "vzeroupper \n"
3429 : "+r"(src), // %0
3430 "+r"(dst), // %1
3431 "+r"(width) // %2
3432 :
3433 : "memory", "cc"
3434 #if defined(__SSE2__)
3435 , "xmm0", "xmm1", "xmm2"
3436 #endif
3437 );
3438 }
3439 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3440
3441 #ifdef HAS_SETROW_X86
3442 void SetRow_X86(uint8* dst, uint32 v32, int width) {
3443 size_t width_tmp = (size_t)(width);
3444 asm volatile (
3445 "shr $0x2,%1 \n"
3446 "rep stosl " MEMSTORESTRING(eax,0) " \n"
3447 : "+D"(dst), // %0
3448 "+c"(width_tmp) // %1
3449 : "a"(v32) // %2
3450 : "memory", "cc");
3451 }
3452
3453 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
3454 int dst_stride, int height) {
3455 for (int y = 0; y < height; ++y) {
3456 size_t width_tmp = (size_t)(width);
3457 uint32* d = (uint32*)(dst);
3458 asm volatile (
3459 "rep stosl " MEMSTORESTRING(eax,0) " \n"
3460 : "+D"(d), // %0
3461 "+c"(width_tmp) // %1
3462 : "a"(v32) // %2
3463 : "memory", "cc");
3464 dst += dst_stride;
3465 }
3466 }
3467 #endif // HAS_SETROW_X86
3468
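// Scalar sketch of SetRow_X86 above (reference only; this helper is not part
// of this file): store the 32-bit pattern v32 width/4 times, a dword-wide
// memset.
static void SetRowReference(uint8* dst, uint32 v32, int width) {
  uint32* d = (uint32*)(dst);
  for (int i = 0; i < width / 4; ++i) {
    d[i] = v32;
  }
}
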
3469 #ifdef HAS_YUY2TOYROW_SSE2
3470 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
3471 asm volatile (
3472 "pcmpeqb %%xmm5,%%xmm5 \n"
3473 "psrlw $0x8,%%xmm5 \n"
3474 LABELALIGN
3475 "1: \n"
3476 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
3477 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3478 "lea " MEMLEA(0x20,0) ",%0 \n"
3479 "pand %%xmm5,%%xmm0 \n"
3480 "pand %%xmm5,%%xmm1 \n"
3481 "packuswb %%xmm1,%%xmm0 \n"
3482 "movdqa %%xmm0," MEMACCESS(1) " \n"
3483 "lea " MEMLEA(0x10,1) ",%1 \n"
3484 "sub $0x10,%2 \n"
3485 "jg 1b \n"
3486 : "+r"(src_yuy2), // %0
3487 "+r"(dst_y), // %1
3488 "+r"(pix) // %2
3489 :
3490 : "memory", "cc"
3491 #if defined(__SSE2__)
3492 , "xmm0", "xmm1", "xmm5"
3493 #endif
3494 );
3495 }
3496
3497 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
3498 uint8* dst_u, uint8* dst_v, int pix) {
3499 asm volatile (
3500 "pcmpeqb %%xmm5,%%xmm5 \n"
3501 "psrlw $0x8,%%xmm5 \n"
3502 "sub %1,%2 \n"
3503 LABELALIGN
3504 "1: \n"
3505 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
3506 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3507 BUNDLEALIGN
3508 MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
3509 MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
3510 "lea " MEMLEA(0x20,0) ",%0 \n"
3511 "pavgb %%xmm2,%%xmm0 \n"
3512 "pavgb %%xmm3,%%xmm1 \n"
3513 "psrlw $0x8,%%xmm0 \n"
3514 "psrlw $0x8,%%xmm1 \n"
3515 "packuswb %%xmm1,%%xmm0 \n"
3516 "movdqa %%xmm0,%%xmm1 \n"
3517 "pand %%xmm5,%%xmm0 \n"
3518 "packuswb %%xmm0,%%xmm0 \n"
3519 "psrlw $0x8,%%xmm1 \n"
3520 "packuswb %%xmm1,%%xmm1 \n"
3521 "movq %%xmm0," MEMACCESS(1) " \n"
3522 BUNDLEALIGN
3523 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3524 "lea " MEMLEA(0x8,1) ",%1 \n"
3525 "sub $0x10,%3 \n"
3526 "jg 1b \n"
3527 : "+r"(src_yuy2), // %0
3528 "+r"(dst_u), // %1
3529 "+r"(dst_v), // %2
3530 "+r"(pix) // %3
3531 : "r"((intptr_t)(stride_yuy2)) // %4
3532 : "memory", "cc"
3533 #if defined(__native_client__) && defined(__x86_64__)
3534 , "r14"
3535 #endif
3536 #if defined(__SSE2__)
3537 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3538 #endif
3539 );
3540 }
3541
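// Scalar sketch of YUY2ToUVRow (reference only; this helper is not part of
// this file). YUY2 stores Y0,U,Y1,V per 2 pixels; chroma is averaged with
// the next row using the same rounded average as pavgb. UYVY, handled
// further below, swaps the luma and chroma byte positions.
static void YUY2ToUVReference(const uint8* src_yuy2, int stride_yuy2,
                              uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* next_row = src_yuy2 + stride_yuy2;
  for (int i = 0; i < pix; i += 2) {
    dst_u[i / 2] = (uint8)((src_yuy2[1] + next_row[1] + 1) >> 1);
    dst_v[i / 2] = (uint8)((src_yuy2[3] + next_row[3] + 1) >> 1);
    src_yuy2 += 4;
    next_row += 4;
  }
}
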
3542 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3543 uint8* dst_u, uint8* dst_v, int pix) {
3544 asm volatile (
3545 "pcmpeqb %%xmm5,%%xmm5 \n"
3546 "psrlw $0x8,%%xmm5 \n"
3547 "sub %1,%2 \n"
3548 LABELALIGN
3549 "1: \n"
3550 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
3551 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3552 "lea " MEMLEA(0x20,0) ",%0 \n"
3553 "psrlw $0x8,%%xmm0 \n"
3554 "psrlw $0x8,%%xmm1 \n"
3555 "packuswb %%xmm1,%%xmm0 \n"
3556 "movdqa %%xmm0,%%xmm1 \n"
3557 "pand %%xmm5,%%xmm0 \n"
3558 "packuswb %%xmm0,%%xmm0 \n"
3559 "psrlw $0x8,%%xmm1 \n"
3560 "packuswb %%xmm1,%%xmm1 \n"
3561 "movq %%xmm0," MEMACCESS(1) " \n"
3562 BUNDLEALIGN
3563 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3564 "lea " MEMLEA(0x8,1) ",%1 \n"
3565 "sub $0x10,%3 \n"
3566 "jg 1b \n"
3567 : "+r"(src_yuy2), // %0
3568 "+r"(dst_u), // %1
3569 "+r"(dst_v), // %2
3570 "+r"(pix) // %3
3571 :
3572 : "memory", "cc"
3573 #if defined(__native_client__) && defined(__x86_64__)
3574 , "r14"
3575 #endif
3576 #if defined(__SSE2__)
3577 , "xmm0", "xmm1", "xmm5"
3578 #endif
3579 );
3580 }
3581
3582 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
3583 uint8* dst_y, int pix) {
3584 asm volatile (
3585 "pcmpeqb %%xmm5,%%xmm5 \n"
3586 "psrlw $0x8,%%xmm5 \n"
3587 LABELALIGN
3588 "1: \n"
3589 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3590 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3591 "lea " MEMLEA(0x20,0) ",%0 \n"
3592 "pand %%xmm5,%%xmm0 \n"
3593 "pand %%xmm5,%%xmm1 \n"
3594 "packuswb %%xmm1,%%xmm0 \n"
3595 "sub $0x10,%2 \n"
3596 "movdqu %%xmm0," MEMACCESS(1) " \n"
3597 "lea " MEMLEA(0x10,1) ",%1 \n"
3598 "jg 1b \n"
3599 : "+r"(src_yuy2), // %0
3600 "+r"(dst_y), // %1
3601 "+r"(pix) // %2
3602 :
3603 : "memory", "cc"
3604 #if defined(__SSE2__)
3605 , "xmm0", "xmm1", "xmm5"
3606 #endif
3607 );
3608 }
3609
3610 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
3611 int stride_yuy2,
3612 uint8* dst_u, uint8* dst_v, int pix) {
3613 asm volatile (
3614 "pcmpeqb %%xmm5,%%xmm5 \n"
3615 "psrlw $0x8,%%xmm5 \n"
3616 "sub %1,%2 \n"
3617 LABELALIGN
3618 "1: \n"
3619 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3620 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3621 BUNDLEALIGN
3622 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
3623 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
3624 "lea " MEMLEA(0x20,0) ",%0 \n"
3625 "pavgb %%xmm2,%%xmm0 \n"
3626 "pavgb %%xmm3,%%xmm1 \n"
3627 "psrlw $0x8,%%xmm0 \n"
3628 "psrlw $0x8,%%xmm1 \n"
3629 "packuswb %%xmm1,%%xmm0 \n"
3630 "movdqa %%xmm0,%%xmm1 \n"
3631 "pand %%xmm5,%%xmm0 \n"
3632 "packuswb %%xmm0,%%xmm0 \n"
3633 "psrlw $0x8,%%xmm1 \n"
3634 "packuswb %%xmm1,%%xmm1 \n"
3635 "movq %%xmm0," MEMACCESS(1) " \n"
3636 BUNDLEALIGN
3637 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3638 "lea " MEMLEA(0x8,1) ",%1 \n"
3639 "sub $0x10,%3 \n"
3640 "jg 1b \n"
3641 : "+r"(src_yuy2), // %0
3642 "+r"(dst_u), // %1
3643 "+r"(dst_v), // %2
3644 "+r"(pix) // %3
3645 : "r"((intptr_t)(stride_yuy2)) // %4
3646 : "memory", "cc"
3647 #if defined(__native_client__) && defined(__x86_64__)
3648 , "r14"
3649 #endif
3650 #if defined(__SSE2__)
3651 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3652 #endif
3653 );
3654 }
3655
3656 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
3657 uint8* dst_u, uint8* dst_v, int pix) {
3658 asm volatile (
3659 "pcmpeqb %%xmm5,%%xmm5 \n"
3660 "psrlw $0x8,%%xmm5 \n"
3661 "sub %1,%2 \n"
3662 LABELALIGN
3663 "1: \n"
3664 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3665 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3666 "lea " MEMLEA(0x20,0) ",%0 \n"
3667 "psrlw $0x8,%%xmm0 \n"
3668 "psrlw $0x8,%%xmm1 \n"
3669 "packuswb %%xmm1,%%xmm0 \n"
3670 "movdqa %%xmm0,%%xmm1 \n"
3671 "pand %%xmm5,%%xmm0 \n"
3672 "packuswb %%xmm0,%%xmm0 \n"
3673 "psrlw $0x8,%%xmm1 \n"
3674 "packuswb %%xmm1,%%xmm1 \n"
3675 "movq %%xmm0," MEMACCESS(1) " \n"
3676 BUNDLEALIGN
3677 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3678 "lea " MEMLEA(0x8,1) ",%1 \n"
3679 "sub $0x10,%3 \n"
3680 "jg 1b \n"
3681 : "+r"(src_yuy2), // %0
3682 "+r"(dst_u), // %1
3683 "+r"(dst_v), // %2
3684 "+r"(pix) // %3
3685 :
3686 : "memory", "cc"
3687 #if defined(__native_client__) && defined(__x86_64__)
3688 , "r14"
3689 #endif
3690 #if defined(__SSE2__)
3691 , "xmm0", "xmm1", "xmm5"
3692 #endif
3693 );
3694 }
3695
3696 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
3697 asm volatile (
3698 LABELALIGN
3699 "1: \n"
3700 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
3701 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3702 "lea " MEMLEA(0x20,0) ",%0 \n"
3703 "psrlw $0x8,%%xmm0 \n"
3704 "psrlw $0x8,%%xmm1 \n"
3705 "packuswb %%xmm1,%%xmm0 \n"
3706 "sub $0x10,%2 \n"
3707 "movdqa %%xmm0," MEMACCESS(1) " \n"
3708 "lea " MEMLEA(0x10,1) ",%1 \n"
3709 "jg 1b \n"
3710 : "+r"(src_uyvy), // %0
3711 "+r"(dst_y), // %1
3712 "+r"(pix) // %2
3713 :
3714 : "memory", "cc"
3715 #if defined(__SSE2__)
3716 , "xmm0", "xmm1"
3717 #endif
3718 );
3719 }
3720
3721 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
3722 uint8* dst_u, uint8* dst_v, int pix) {
3723 asm volatile (
3724 "pcmpeqb %%xmm5,%%xmm5 \n"
3725 "psrlw $0x8,%%xmm5 \n"
3726 "sub %1,%2 \n"
3727 LABELALIGN
3728 "1: \n"
3729 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
3730 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3731 BUNDLEALIGN
3732 MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
3733 MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
3734 "lea " MEMLEA(0x20,0) ",%0 \n"
3735 "pavgb %%xmm2,%%xmm0 \n"
3736 "pavgb %%xmm3,%%xmm1 \n"
3737 "pand %%xmm5,%%xmm0 \n"
3738 "pand %%xmm5,%%xmm1 \n"
3739 "packuswb %%xmm1,%%xmm0 \n"
3740 "movdqa %%xmm0,%%xmm1 \n"
3741 "pand %%xmm5,%%xmm0 \n"
3742 "packuswb %%xmm0,%%xmm0 \n"
3743 "psrlw $0x8,%%xmm1 \n"
3744 "packuswb %%xmm1,%%xmm1 \n"
3745 "movq %%xmm0," MEMACCESS(1) " \n"
3746 BUNDLEALIGN
3747 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3748 "lea " MEMLEA(0x8,1) ",%1 \n"
3749 "sub $0x10,%3 \n"
3750 "jg 1b \n"
3751 : "+r"(src_uyvy), // %0
3752 "+r"(dst_u), // %1
3753 "+r"(dst_v), // %2
3754 "+r"(pix) // %3
3755 : "r"((intptr_t)(stride_uyvy)) // %4
3756 : "memory", "cc"
3757 #if defined(__native_client__) && defined(__x86_64__)
3758 , "r14"
3759 #endif
3760 #if defined(__SSE2__)
3761 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3762 #endif
3763 );
3764 }
3765
3766 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3767 uint8* dst_u, uint8* dst_v, int pix) {
3768 asm volatile (
3769 "pcmpeqb %%xmm5,%%xmm5 \n"
3770 "psrlw $0x8,%%xmm5 \n"
3771 "sub %1,%2 \n"
3772 LABELALIGN
3773 "1: \n"
3774 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
3775 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3776 "lea " MEMLEA(0x20,0) ",%0 \n"
3777 "pand %%xmm5,%%xmm0 \n"
3778 "pand %%xmm5,%%xmm1 \n"
3779 "packuswb %%xmm1,%%xmm0 \n"
3780 "movdqa %%xmm0,%%xmm1 \n"
3781 "pand %%xmm5,%%xmm0 \n"
3782 "packuswb %%xmm0,%%xmm0 \n"
3783 "psrlw $0x8,%%xmm1 \n"
3784 "packuswb %%xmm1,%%xmm1 \n"
3785 "movq %%xmm0," MEMACCESS(1) " \n"
3786 BUNDLEALIGN
3787 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3788 "lea " MEMLEA(0x8,1) ",%1 \n"
3789 "sub $0x10,%3 \n"
3790 "jg 1b \n"
3791 : "+r"(src_uyvy), // %0
3792 "+r"(dst_u), // %1
3793 "+r"(dst_v), // %2
3794 "+r"(pix) // %3
3795 :
3796 : "memory", "cc"
3797 #if defined(__native_client__) && defined(__x86_64__)
3798 , "r14"
3799 #endif
3800 #if defined(__SSE2__)
3801 , "xmm0", "xmm1", "xmm5"
3802 #endif
3803 );
3804 }
3805
3806 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
3807 uint8* dst_y, int pix) {
3808 asm volatile (
3809 LABELALIGN
3810 "1: \n"
3811 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3812 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3813 "lea " MEMLEA(0x20,0) ",%0 \n"
3814 "psrlw $0x8,%%xmm0 \n"
3815 "psrlw $0x8,%%xmm1 \n"
3816 "packuswb %%xmm1,%%xmm0 \n"
3817 "sub $0x10,%2 \n"
3818 "movdqu %%xmm0," MEMACCESS(1) " \n"
3819 "lea " MEMLEA(0x10,1) ",%1 \n"
3820 "jg 1b \n"
3821 : "+r"(src_uyvy), // %0
3822 "+r"(dst_y), // %1
3823 "+r"(pix) // %2
3824 :
3825 : "memory", "cc"
3826 #if defined(__SSE2__)
3827 , "xmm0", "xmm1"
3828 #endif
3829 );
3830 }
3831
3832 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
3833 uint8* dst_u, uint8* dst_v, int pix) {
3834 asm volatile (
3835 "pcmpeqb %%xmm5,%%xmm5 \n"
3836 "psrlw $0x8,%%xmm5 \n"
3837 "sub %1,%2 \n"
3838 LABELALIGN
3839 "1: \n"
3840 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3841 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3842 BUNDLEALIGN
3843 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
3844 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
3845 "lea " MEMLEA(0x20,0) ",%0 \n"
3846 "pavgb %%xmm2,%%xmm0 \n"
3847 "pavgb %%xmm3,%%xmm1 \n"
3848 "pand %%xmm5,%%xmm0 \n"
3849 "pand %%xmm5,%%xmm1 \n"
3850 "packuswb %%xmm1,%%xmm0 \n"
3851 "movdqa %%xmm0,%%xmm1 \n"
3852 "pand %%xmm5,%%xmm0 \n"
3853 "packuswb %%xmm0,%%xmm0 \n"
3854 "psrlw $0x8,%%xmm1 \n"
3855 "packuswb %%xmm1,%%xmm1 \n"
3856 "movq %%xmm0," MEMACCESS(1) " \n"
3857 BUNDLEALIGN
3858 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3859 "lea " MEMLEA(0x8,1) ",%1 \n"
3860 "sub $0x10,%3 \n"
3861 "jg 1b \n"
3862 : "+r"(src_uyvy), // %0
3863 "+r"(dst_u), // %1
3864 "+r"(dst_v), // %2
3865 "+r"(pix) // %3
3866 : "r"((intptr_t)(stride_uyvy)) // %4
3867 : "memory", "cc"
3868 #if defined(__native_client__) && defined(__x86_64__)
3869 , "r14"
3870 #endif
3871 #if defined(__SSE2__)
3872 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3873 #endif
3874 );
3875 }
3876
3877 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
3878 uint8* dst_u, uint8* dst_v, int pix) {
3879 asm volatile (
3880 "pcmpeqb %%xmm5,%%xmm5 \n"
3881 "psrlw $0x8,%%xmm5 \n"
3882 "sub %1,%2 \n"
3883 LABELALIGN
3884 "1: \n"
3885 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3886 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3887 "lea " MEMLEA(0x20,0) ",%0 \n"
3888 "pand %%xmm5,%%xmm0 \n"
3889 "pand %%xmm5,%%xmm1 \n"
3890 "packuswb %%xmm1,%%xmm0 \n"
3891 "movdqa %%xmm0,%%xmm1 \n"
3892 "pand %%xmm5,%%xmm0 \n"
3893 "packuswb %%xmm0,%%xmm0 \n"
3894 "psrlw $0x8,%%xmm1 \n"
3895 "packuswb %%xmm1,%%xmm1 \n"
3896 "movq %%xmm0," MEMACCESS(1) " \n"
3897 BUNDLEALIGN
3898 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3899 "lea " MEMLEA(0x8,1) ",%1 \n"
3900 "sub $0x10,%3 \n"
3901 "jg 1b \n"
3902 : "+r"(src_uyvy), // %0
3903 "+r"(dst_u), // %1
3904 "+r"(dst_v), // %2
3905 "+r"(pix) // %3
3906 :
3907 : "memory", "cc"
3908 #if defined(__native_client__) && defined(__x86_64__)
3909 , "r14"
3910 #endif
3911 #if defined(__SSE2__)
3912 , "xmm0", "xmm1", "xmm5"
3913 #endif
3914 );
3915 }
3916 #endif // HAS_YUY2TOYROW_SSE2
3917
3918 #ifdef HAS_ARGBBLENDROW_SSE2
3919 // Blend 4 pixels at a time.
3920 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3921 uint8* dst_argb, int width) {
3922 asm volatile (
3923 "pcmpeqb %%xmm7,%%xmm7 \n"
3924 "psrlw $0xf,%%xmm7 \n"
3925 "pcmpeqb %%xmm6,%%xmm6 \n"
3926 "psrlw $0x8,%%xmm6 \n"
3927 "pcmpeqb %%xmm5,%%xmm5 \n"
3928 "psllw $0x8,%%xmm5 \n"
3929 "pcmpeqb %%xmm4,%%xmm4 \n"
3930 "pslld $0x18,%%xmm4 \n"
3931 "sub $0x1,%3 \n"
3932 "je 91f \n"
3933 "jl 99f \n"
3934
3935 // 1 pixel loop until destination pointer is aligned.
3936 "10: \n"
3937 "test $0xf,%2 \n"
3938 "je 19f \n"
3939 "movd " MEMACCESS(0) ",%%xmm3 \n"
3940 "lea " MEMLEA(0x4,0) ",%0 \n"
3941 "movdqa %%xmm3,%%xmm0 \n"
3942 "pxor %%xmm4,%%xmm3 \n"
3943 "movd " MEMACCESS(1) ",%%xmm2 \n"
3944 "psrlw $0x8,%%xmm3 \n"
3945 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3946 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3947 "pand %%xmm6,%%xmm2 \n"
3948 "paddw %%xmm7,%%xmm3 \n"
3949 "pmullw %%xmm3,%%xmm2 \n"
3950 "movd " MEMACCESS(1) ",%%xmm1 \n"
3951 "lea " MEMLEA(0x4,1) ",%1 \n"
3952 "psrlw $0x8,%%xmm1 \n"
3953 "por %%xmm4,%%xmm0 \n"
3954 "pmullw %%xmm3,%%xmm1 \n"
3955 "psrlw $0x8,%%xmm2 \n"
3956 "paddusb %%xmm2,%%xmm0 \n"
3957 "pand %%xmm5,%%xmm1 \n"
3958 "paddusb %%xmm1,%%xmm0 \n"
3959 "sub $0x1,%3 \n"
3960 "movd %%xmm0," MEMACCESS(2) " \n"
3961 "lea " MEMLEA(0x4,2) ",%2 \n"
3962 "jge 10b \n"
3963
3964 "19: \n"
3965 "add $1-4,%3 \n"
3966 "jl 49f \n"
3967
3968 // 4 pixel loop.
3969 LABELALIGN
3970 "41: \n"
3971 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
3972 "lea " MEMLEA(0x10,0) ",%0 \n"
3973 "movdqa %%xmm3,%%xmm0 \n"
3974 "pxor %%xmm4,%%xmm3 \n"
3975 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3976 "psrlw $0x8,%%xmm3 \n"
3977 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3978 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3979 "pand %%xmm6,%%xmm2 \n"
3980 "paddw %%xmm7,%%xmm3 \n"
3981 "pmullw %%xmm3,%%xmm2 \n"
3982 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3983 "lea " MEMLEA(0x10,1) ",%1 \n"
3984 "psrlw $0x8,%%xmm1 \n"
3985 "por %%xmm4,%%xmm0 \n"
3986 "pmullw %%xmm3,%%xmm1 \n"
3987 "psrlw $0x8,%%xmm2 \n"
3988 "paddusb %%xmm2,%%xmm0 \n"
3989 "pand %%xmm5,%%xmm1 \n"
3990 "paddusb %%xmm1,%%xmm0 \n"
3991 "sub $0x4,%3 \n"
3992 "movdqa %%xmm0," MEMACCESS(2) " \n"
3993 "lea " MEMLEA(0x10,2) ",%2 \n"
3994 "jge 41b \n"
3995
3996 "49: \n"
3997 "add $0x3,%3 \n"
3998 "jl 99f \n"
3999
4000 // 1 pixel loop.
4001 "91: \n"
4002 "movd " MEMACCESS(0) ",%%xmm3 \n"
4003 "lea " MEMLEA(0x4,0) ",%0 \n"
4004 "movdqa %%xmm3,%%xmm0 \n"
4005 "pxor %%xmm4,%%xmm3 \n"
4006 "movd " MEMACCESS(1) ",%%xmm2 \n"
4007 "psrlw $0x8,%%xmm3 \n"
4008 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
4009 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
4010 "pand %%xmm6,%%xmm2 \n"
4011 "paddw %%xmm7,%%xmm3 \n"
4012 "pmullw %%xmm3,%%xmm2 \n"
4013 "movd " MEMACCESS(1) ",%%xmm1 \n"
4014 "lea " MEMLEA(0x4,1) ",%1 \n"
4015 "psrlw $0x8,%%xmm1 \n"
4016 "por %%xmm4,%%xmm0 \n"
4017 "pmullw %%xmm3,%%xmm1 \n"
4018 "psrlw $0x8,%%xmm2 \n"
4019 "paddusb %%xmm2,%%xmm0 \n"
4020 "pand %%xmm5,%%xmm1 \n"
4021 "paddusb %%xmm1,%%xmm0 \n"
4022 "sub $0x1,%3 \n"
4023 "movd %%xmm0," MEMACCESS(2) " \n"
4024 "lea " MEMLEA(0x4,2) ",%2 \n"
4025 "jge 91b \n"
4026 "99: \n"
4027 : "+r"(src_argb0), // %0
4028 "+r"(src_argb1), // %1
4029 "+r"(dst_argb), // %2
4030 "+r"(width) // %3
4031 :
4032 : "memory", "cc"
4033 #if defined(__SSE2__)
4034 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4035 #endif
4036 );
4037 }
4038 #endif // HAS_ARGBBLENDROW_SSE2
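
// Illustrative scalar sketch of the blend math above (hypothetical
// helper): out = src0 + src1 * (256 - src0_alpha) / 256, saturated,
// with the result alpha forced to 255.  The +1 in (256 - a) comes from
// the paddw of xmm7 (a register of 1s) in the SIMD path.
static void ARGBBlendPixel_C_Sketch(const uint8* src0, const uint8* src1,
                                    uint8* dst) {
  const int na = 256 - src0[3];  // 256 - foreground alpha
  for (int i = 0; i < 3; ++i) {  // B, G, R
    int v = src0[i] + ((src1[i] * na) >> 8);
    dst[i] = (uint8)(v > 255 ? 255 : v);  // paddusb saturates
  }
  dst[3] = 255;  // por with 0xff000000 forces opaque alpha
}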
4039
4040 #ifdef HAS_ARGBBLENDROW_SSSE3
4041 // Shuffle table for isolating alpha.
4042 static uvec8 kShuffleAlpha = {
4043 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4044 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4045 };
4046
4047 // Blend 4 pixels at a time.
4048 
4049 // Same as the SSE2 version, but replaces
4050 //    psrlw      xmm3, 8          // alpha
4051 //    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
4052 //    pshuflw    xmm3, xmm3,0F5h
4053 // with
4054 //    pshufb     xmm3, kShuffleAlpha // alpha
4055 
4056
4057 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
4058 uint8* dst_argb, int width) {
4059 asm volatile (
4060 "pcmpeqb %%xmm7,%%xmm7 \n"
4061 "psrlw $0xf,%%xmm7 \n"
4062 "pcmpeqb %%xmm6,%%xmm6 \n"
4063 "psrlw $0x8,%%xmm6 \n"
4064 "pcmpeqb %%xmm5,%%xmm5 \n"
4065 "psllw $0x8,%%xmm5 \n"
4066 "pcmpeqb %%xmm4,%%xmm4 \n"
4067 "pslld $0x18,%%xmm4 \n"
4068 "sub $0x1,%3 \n"
4069 "je 91f \n"
4070 "jl 99f \n"
4071
4072 // 1 pixel loop until destination pointer is aligned.
4073 "10: \n"
4074 "test $0xf,%2 \n"
4075 "je 19f \n"
4076 "movd " MEMACCESS(0) ",%%xmm3 \n"
4077 "lea " MEMLEA(0x4,0) ",%0 \n"
4078 "movdqa %%xmm3,%%xmm0 \n"
4079 "pxor %%xmm4,%%xmm3 \n"
4080 "movd " MEMACCESS(1) ",%%xmm2 \n"
4081 "pshufb %4,%%xmm3 \n"
4082 "pand %%xmm6,%%xmm2 \n"
4083 "paddw %%xmm7,%%xmm3 \n"
4084 "pmullw %%xmm3,%%xmm2 \n"
4085 "movd " MEMACCESS(1) ",%%xmm1 \n"
4086 "lea " MEMLEA(0x4,1) ",%1 \n"
4087 "psrlw $0x8,%%xmm1 \n"
4088 "por %%xmm4,%%xmm0 \n"
4089 "pmullw %%xmm3,%%xmm1 \n"
4090 "psrlw $0x8,%%xmm2 \n"
4091 "paddusb %%xmm2,%%xmm0 \n"
4092 "pand %%xmm5,%%xmm1 \n"
4093 "paddusb %%xmm1,%%xmm0 \n"
4094 "sub $0x1,%3 \n"
4095 "movd %%xmm0," MEMACCESS(2) " \n"
4096 "lea " MEMLEA(0x4,2) ",%2 \n"
4097 "jge 10b \n"
4098
4099 "19: \n"
4100 "add $1-4,%3 \n"
4101 "jl 49f \n"
4102 "test $0xf,%0 \n"
4103 "jne 41f \n"
4104 "test $0xf,%1 \n"
4105 "jne 41f \n"
4106
4107 // 4 pixel loop.
4108 LABELALIGN
4109 "40: \n"
4110 "movdqa " MEMACCESS(0) ",%%xmm3 \n"
4111 "lea " MEMLEA(0x10,0) ",%0 \n"
4112 "movdqa %%xmm3,%%xmm0 \n"
4113 "pxor %%xmm4,%%xmm3 \n"
4114 "movdqa " MEMACCESS(1) ",%%xmm2 \n"
4115 "pshufb %4,%%xmm3 \n"
4116 "pand %%xmm6,%%xmm2 \n"
4117 "paddw %%xmm7,%%xmm3 \n"
4118 "pmullw %%xmm3,%%xmm2 \n"
4119 "movdqa " MEMACCESS(1) ",%%xmm1 \n"
4120 "lea " MEMLEA(0x10,1) ",%1 \n"
4121 "psrlw $0x8,%%xmm1 \n"
4122 "por %%xmm4,%%xmm0 \n"
4123 "pmullw %%xmm3,%%xmm1 \n"
4124 "psrlw $0x8,%%xmm2 \n"
4125 "paddusb %%xmm2,%%xmm0 \n"
4126 "pand %%xmm5,%%xmm1 \n"
4127 "paddusb %%xmm1,%%xmm0 \n"
4128 "sub $0x4,%3 \n"
4129 "movdqa %%xmm0," MEMACCESS(2) " \n"
4130 "lea " MEMLEA(0x10,2) ",%2 \n"
4131 "jge 40b \n"
4132 "jmp 49f \n"
4133
4134 // 4 pixel unaligned loop.
4135 LABELALIGN
4136 "41: \n"
4137 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
4138 "lea " MEMLEA(0x10,0) ",%0 \n"
4139 "movdqa %%xmm3,%%xmm0 \n"
4140 "pxor %%xmm4,%%xmm3 \n"
4141 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
4142 "pshufb %4,%%xmm3 \n"
4143 "pand %%xmm6,%%xmm2 \n"
4144 "paddw %%xmm7,%%xmm3 \n"
4145 "pmullw %%xmm3,%%xmm2 \n"
4146 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4147 "lea " MEMLEA(0x10,1) ",%1 \n"
4148 "psrlw $0x8,%%xmm1 \n"
4149 "por %%xmm4,%%xmm0 \n"
4150 "pmullw %%xmm3,%%xmm1 \n"
4151 "psrlw $0x8,%%xmm2 \n"
4152 "paddusb %%xmm2,%%xmm0 \n"
4153 "pand %%xmm5,%%xmm1 \n"
4154 "paddusb %%xmm1,%%xmm0 \n"
4155 "sub $0x4,%3 \n"
4156 "movdqa %%xmm0," MEMACCESS(2) " \n"
4157 "lea " MEMLEA(0x10,2) ",%2 \n"
4158 "jge 41b \n"
4159
4160 "49: \n"
4161 "add $0x3,%3 \n"
4162 "jl 99f \n"
4163
4164 // 1 pixel loop.
4165 "91: \n"
4166 "movd " MEMACCESS(0) ",%%xmm3 \n"
4167 "lea " MEMLEA(0x4,0) ",%0 \n"
4168 "movdqa %%xmm3,%%xmm0 \n"
4169 "pxor %%xmm4,%%xmm3 \n"
4170 "movd " MEMACCESS(1) ",%%xmm2 \n"
4171 "pshufb %4,%%xmm3 \n"
4172 "pand %%xmm6,%%xmm2 \n"
4173 "paddw %%xmm7,%%xmm3 \n"
4174 "pmullw %%xmm3,%%xmm2 \n"
4175 "movd " MEMACCESS(1) ",%%xmm1 \n"
4176 "lea " MEMLEA(0x4,1) ",%1 \n"
4177 "psrlw $0x8,%%xmm1 \n"
4178 "por %%xmm4,%%xmm0 \n"
4179 "pmullw %%xmm3,%%xmm1 \n"
4180 "psrlw $0x8,%%xmm2 \n"
4181 "paddusb %%xmm2,%%xmm0 \n"
4182 "pand %%xmm5,%%xmm1 \n"
4183 "paddusb %%xmm1,%%xmm0 \n"
4184 "sub $0x1,%3 \n"
4185 "movd %%xmm0," MEMACCESS(2) " \n"
4186 "lea " MEMLEA(0x4,2) ",%2 \n"
4187 "jge 91b \n"
4188 "99: \n"
4189 : "+r"(src_argb0), // %0
4190 "+r"(src_argb1), // %1
4191 "+r"(dst_argb), // %2
4192 "+r"(width) // %3
4193 : "m"(kShuffleAlpha) // %4
4194 : "memory", "cc"
4195 #if defined(__SSE2__)
4196 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4197 #endif
4198 );
4199 }
4200 #endif // HAS_ARGBBLENDROW_SSSE3
4201
4202 #ifdef HAS_ARGBATTENUATEROW_SSE2
4203 // Attenuate 4 pixels at a time.
4204 // aligned to 16 bytes
4205 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
4206 asm volatile (
4207 "pcmpeqb %%xmm4,%%xmm4 \n"
4208 "pslld $0x18,%%xmm4 \n"
4209 "pcmpeqb %%xmm5,%%xmm5 \n"
4210 "psrld $0x8,%%xmm5 \n"
4211
4212 // 4 pixel loop.
4213 LABELALIGN
4214 "1: \n"
4215 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
4216 "punpcklbw %%xmm0,%%xmm0 \n"
4217 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
4218 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
4219 "pmulhuw %%xmm2,%%xmm0 \n"
4220 "movdqa " MEMACCESS(0) ",%%xmm1 \n"
4221 "punpckhbw %%xmm1,%%xmm1 \n"
4222 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
4223 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
4224 "pmulhuw %%xmm2,%%xmm1 \n"
4225 "movdqa " MEMACCESS(0) ",%%xmm2 \n"
4226 "lea " MEMLEA(0x10,0) ",%0 \n"
4227 "psrlw $0x8,%%xmm0 \n"
4228 "pand %%xmm4,%%xmm2 \n"
4229 "psrlw $0x8,%%xmm1 \n"
4230 "packuswb %%xmm1,%%xmm0 \n"
4231 "pand %%xmm5,%%xmm0 \n"
4232 "por %%xmm2,%%xmm0 \n"
4233 "sub $0x4,%2 \n"
4234 "movdqa %%xmm0," MEMACCESS(1) " \n"
4235 "lea " MEMLEA(0x10,1) ",%1 \n"
4236 "jg 1b \n"
4237 : "+r"(src_argb), // %0
4238 "+r"(dst_argb), // %1
4239 "+r"(width) // %2
4240 :
4241 : "memory", "cc"
4242 #if defined(__SSE2__)
4243 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4244 #endif
4245 );
4246 }
4247 #endif // HAS_ARGBATTENUATEROW_SSE2
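
// Illustrative scalar sketch of the attenuation above (hypothetical
// helper): each color channel is scaled by the pixel's own alpha.  The
// SIMD path widens bytes via punpcklbw (c -> c * 0x101), multiplies with
// pmulhuw and shifts 8 more, so the net effect is roughly c * a / 255
// in fixed point; this mirrors that rounding exactly.
static uint8 AttenuateChannel_C_Sketch(uint8 c, uint8 a) {
  return (uint8)(((uint32)(c * 0x101) * (uint32)(a * 0x101)) >> 24);
}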
4248
4249 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4250 // Shuffle table duplicating alpha
4251 static uvec8 kShuffleAlpha0 = {
4252 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4253 };
4254 static uvec8 kShuffleAlpha1 = {
4255 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4256 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4257 };
4258 // Attenuate 4 pixels at a time.
4259 // Alignment is not required (loads use movdqu).
4260 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4261 asm volatile (
4262 "pcmpeqb %%xmm3,%%xmm3 \n"
4263 "pslld $0x18,%%xmm3 \n"
4264 "movdqa %3,%%xmm4 \n"
4265 "movdqa %4,%%xmm5 \n"
4266
4267 // 4 pixel loop.
4268 LABELALIGN
4269 "1: \n"
4270 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4271 "pshufb %%xmm4,%%xmm0 \n"
4272 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
4273 "punpcklbw %%xmm1,%%xmm1 \n"
4274 "pmulhuw %%xmm1,%%xmm0 \n"
4275 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
4276 "pshufb %%xmm5,%%xmm1 \n"
4277 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
4278 "punpckhbw %%xmm2,%%xmm2 \n"
4279 "pmulhuw %%xmm2,%%xmm1 \n"
4280 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
4281 "lea " MEMLEA(0x10,0) ",%0 \n"
4282 "pand %%xmm3,%%xmm2 \n"
4283 "psrlw $0x8,%%xmm0 \n"
4284 "psrlw $0x8,%%xmm1 \n"
4285 "packuswb %%xmm1,%%xmm0 \n"
4286 "por %%xmm2,%%xmm0 \n"
4287 "sub $0x4,%2 \n"
4288 "movdqu %%xmm0," MEMACCESS(1) " \n"
4289 "lea " MEMLEA(0x10,1) ",%1 \n"
4290 "jg 1b \n"
4291 : "+r"(src_argb), // %0
4292 "+r"(dst_argb), // %1
4293 "+r"(width) // %2
4294 : "m"(kShuffleAlpha0), // %3
4295 "m"(kShuffleAlpha1) // %4
4296 : "memory", "cc"
4297 #if defined(__SSE2__)
4298 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4299 #endif
4300 );
4301 }
4302 #endif // HAS_ARGBATTENUATEROW_SSSE3
4303
4304 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4305 // Unattenuate 4 pixels at a time.
4306 // Alignment is not required (loads use movdqu).
4307 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4308 int width) {
4309 uintptr_t alpha = 0;
4310 asm volatile (
4311 // 4 pixel loop.
4312 LABELALIGN
4313 "1: \n"
4314 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4315 "movzb " MEMACCESS2(0x03,0) ",%3 \n"
4316 "punpcklbw %%xmm0,%%xmm0 \n"
4317 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
4318 "movzb " MEMACCESS2(0x07,0) ",%3 \n"
4319 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
4320 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
4321 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
4322 "movlhps %%xmm3,%%xmm2 \n"
4323 "pmulhuw %%xmm2,%%xmm0 \n"
4324 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
4325 "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
4326 "punpckhbw %%xmm1,%%xmm1 \n"
4327 BUNDLEALIGN
4328 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
4329 "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
4330 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
4331 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
4332 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
4333 "movlhps %%xmm3,%%xmm2 \n"
4334 "pmulhuw %%xmm2,%%xmm1 \n"
4335 "lea " MEMLEA(0x10,0) ",%0 \n"
4336 "packuswb %%xmm1,%%xmm0 \n"
4337 "sub $0x4,%2 \n"
4338 "movdqu %%xmm0," MEMACCESS(1) " \n"
4339 "lea " MEMLEA(0x10,1) ",%1 \n"
4340 "jg 1b \n"
4341 : "+r"(src_argb), // %0
4342 "+r"(dst_argb), // %1
4343 "+r"(width), // %2
4344 "+r"(alpha) // %3
4345 : "r"(fixed_invtbl8) // %4
4346 : "memory", "cc"
4347 #if defined(__native_client__) && defined(__x86_64__)
4348 , "r14"
4349 #endif
4350 #if defined(__SSE2__)
4351 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4352 #endif
4353 );
4354 }
4355 #endif // HAS_ARGBUNATTENUATEROW_SSE2
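
// Illustrative scalar model of unattenuation (hypothetical helper):
// conceptually each channel is divided by its alpha, i.e. the inverse
// of attenuation.  The SIMD path avoids division by multiplying with a
// fixed-point reciprocal looked up per alpha in fixed_invtbl8; that
// table's exact rounding is not reproduced here.
static uint8 UnattenuateChannel_C_Sketch(uint8 c, uint8 a) {
  if (a == 0) return c;
  uint32 v = ((uint32)c * 255) / a;
  return (uint8)(v > 255 ? 255 : v);  // packuswb saturates
}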
4356
4357 #ifdef HAS_ARGBGRAYROW_SSSE3
4358 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
4359 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4360 asm volatile (
4361 "movdqa %3,%%xmm4 \n"
4362 "movdqa %4,%%xmm5 \n"
4363
4364 // 8 pixel loop.
4365 LABELALIGN
4366 "1: \n"
4367 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
4368 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4369 "pmaddubsw %%xmm4,%%xmm0 \n"
4370 "pmaddubsw %%xmm4,%%xmm1 \n"
4371 "phaddw %%xmm1,%%xmm0 \n"
4372 "paddw %%xmm5,%%xmm0 \n"
4373 "psrlw $0x7,%%xmm0 \n"
4374 "packuswb %%xmm0,%%xmm0 \n"
4375 "movdqa " MEMACCESS(0) ",%%xmm2 \n"
4376 "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
4377 "lea " MEMLEA(0x20,0) ",%0 \n"
4378 "psrld $0x18,%%xmm2 \n"
4379 "psrld $0x18,%%xmm3 \n"
4380 "packuswb %%xmm3,%%xmm2 \n"
4381 "packuswb %%xmm2,%%xmm2 \n"
4382 "movdqa %%xmm0,%%xmm3 \n"
4383 "punpcklbw %%xmm0,%%xmm0 \n"
4384 "punpcklbw %%xmm2,%%xmm3 \n"
4385 "movdqa %%xmm0,%%xmm1 \n"
4386 "punpcklwd %%xmm3,%%xmm0 \n"
4387 "punpckhwd %%xmm3,%%xmm1 \n"
4388 "sub $0x8,%2 \n"
4389 "movdqa %%xmm0," MEMACCESS(1) " \n"
4390 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
4391 "lea " MEMLEA(0x20,1) ",%1 \n"
4392 "jg 1b \n"
4393 : "+r"(src_argb), // %0
4394 "+r"(dst_argb), // %1
4395 "+r"(width) // %2
4396 : "m"(kARGBToYJ), // %3
4397 "m"(kAddYJ64) // %4
4398 : "memory", "cc"
4399 #if defined(__SSE2__)
4400 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4401 #endif
4402 );
4403 }
4404 #endif // HAS_ARGBGRAYROW_SSSE3
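
// Illustrative scalar sketch of the gray conversion above (hypothetical
// helper): pmaddubsw with kARGBToYJ plus the kAddYJ64 rounding bias,
// divided by 128 via psrlw $0x7.  The gray value replaces B, G and R;
// alpha is carried through unchanged.
static uint8 ARGBToGray_C_Sketch(uint8 b, uint8 g, uint8 r) {
  return (uint8)((b * 15 + g * 75 + r * 38 + 64) >> 7);
}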
4405
4406 #ifdef HAS_ARGBSEPIAROW_SSSE3
4407 // b = (r * 35 + g * 68 + b * 17) >> 7
4408 // g = (r * 45 + g * 88 + b * 22) >> 7
4409 // r = (r * 50 + g * 98 + b * 24) >> 7
4410 // Constant for ARGB color to sepia tone
4411 static vec8 kARGBToSepiaB = {
4412 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
4413 };
4414
4415 static vec8 kARGBToSepiaG = {
4416 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
4417 };
4418
4419 static vec8 kARGBToSepiaR = {
4420 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
4421 };
4422
4423 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4424 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
4425 asm volatile (
4426 "movdqa %2,%%xmm2 \n"
4427 "movdqa %3,%%xmm3 \n"
4428 "movdqa %4,%%xmm4 \n"
4429
4430 // 8 pixel loop.
4431 LABELALIGN
4432 "1: \n"
4433 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
4434 "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
4435 "pmaddubsw %%xmm2,%%xmm0 \n"
4436 "pmaddubsw %%xmm2,%%xmm6 \n"
4437 "phaddw %%xmm6,%%xmm0 \n"
4438 "psrlw $0x7,%%xmm0 \n"
4439 "packuswb %%xmm0,%%xmm0 \n"
4440 "movdqa " MEMACCESS(0) ",%%xmm5 \n"
4441 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4442 "pmaddubsw %%xmm3,%%xmm5 \n"
4443 "pmaddubsw %%xmm3,%%xmm1 \n"
4444 "phaddw %%xmm1,%%xmm5 \n"
4445 "psrlw $0x7,%%xmm5 \n"
4446 "packuswb %%xmm5,%%xmm5 \n"
4447 "punpcklbw %%xmm5,%%xmm0 \n"
4448 "movdqa " MEMACCESS(0) ",%%xmm5 \n"
4449 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4450 "pmaddubsw %%xmm4,%%xmm5 \n"
4451 "pmaddubsw %%xmm4,%%xmm1 \n"
4452 "phaddw %%xmm1,%%xmm5 \n"
4453 "psrlw $0x7,%%xmm5 \n"
4454 "packuswb %%xmm5,%%xmm5 \n"
4455 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
4456 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4457 "psrld $0x18,%%xmm6 \n"
4458 "psrld $0x18,%%xmm1 \n"
4459 "packuswb %%xmm1,%%xmm6 \n"
4460 "packuswb %%xmm6,%%xmm6 \n"
4461 "punpcklbw %%xmm6,%%xmm5 \n"
4462 "movdqa %%xmm0,%%xmm1 \n"
4463 "punpcklwd %%xmm5,%%xmm0 \n"
4464 "punpckhwd %%xmm5,%%xmm1 \n"
4465 "sub $0x8,%1 \n"
4466 "movdqa %%xmm0," MEMACCESS(0) " \n"
4467 "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
4468 "lea " MEMLEA(0x20,0) ",%0 \n"
4469 "jg 1b \n"
4470 : "+r"(dst_argb), // %0
4471 "+r"(width) // %1
4472 : "m"(kARGBToSepiaB), // %2
4473 "m"(kARGBToSepiaG), // %3
4474 "m"(kARGBToSepiaR) // %4
4475 : "memory", "cc"
4476 #if defined(__SSE2__)
4477 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4478 #endif
4479 );
4480 }
4481 #endif // HAS_ARGBSEPIAROW_SSSE3
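
// Illustrative scalar sketch of the sepia transform above (hypothetical
// helper): the three weighted sums from the comment block are computed
// by pmaddubsw/phaddw and divided by 128 via psrlw $0x7.  p points at
// one pixel in B,G,R,A memory order; alpha is preserved.
static void SepiaPixel_C_Sketch(uint8* p) {
  int b = p[0], g = p[1], r = p[2];
  int nb = (r * 35 + g * 68 + b * 17) >> 7;
  int ng = (r * 45 + g * 88 + b * 22) >> 7;
  int nr = (r * 50 + g * 98 + b * 24) >> 7;
  p[0] = (uint8)(nb > 255 ? 255 : nb);  // packuswb saturates
  p[1] = (uint8)(ng > 255 ? 255 : ng);
  p[2] = (uint8)(nr > 255 ? 255 : nr);
}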
4482
4483 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4484 // Transform 8 ARGB pixels (32 bytes) with color matrix.
4485 // Same as Sepia except matrix is provided.
4486 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4487 const int8* matrix_argb, int width) {
4488 asm volatile (
4489 "movdqu " MEMACCESS(3) ",%%xmm5 \n"
4490 "pshufd $0x00,%%xmm5,%%xmm2 \n"
4491 "pshufd $0x55,%%xmm5,%%xmm3 \n"
4492 "pshufd $0xaa,%%xmm5,%%xmm4 \n"
4493 "pshufd $0xff,%%xmm5,%%xmm5 \n"
4494
4495 // 8 pixel loop.
4496 LABELALIGN
4497 "1: \n"
4498 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
4499 "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
4500 "pmaddubsw %%xmm2,%%xmm0 \n"
4501 "pmaddubsw %%xmm2,%%xmm7 \n"
4502 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
4503 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4504 "pmaddubsw %%xmm3,%%xmm6 \n"
4505 "pmaddubsw %%xmm3,%%xmm1 \n"
4506 "phaddsw %%xmm7,%%xmm0 \n"
4507 "phaddsw %%xmm1,%%xmm6 \n"
4508 "psraw $0x6,%%xmm0 \n"
4509 "psraw $0x6,%%xmm6 \n"
4510 "packuswb %%xmm0,%%xmm0 \n"
4511 "packuswb %%xmm6,%%xmm6 \n"
4512 "punpcklbw %%xmm6,%%xmm0 \n"
4513 "movdqa " MEMACCESS(0) ",%%xmm1 \n"
4514 "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
4515 "pmaddubsw %%xmm4,%%xmm1 \n"
4516 "pmaddubsw %%xmm4,%%xmm7 \n"
4517 "phaddsw %%xmm7,%%xmm1 \n"
4518 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
4519 "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
4520 "pmaddubsw %%xmm5,%%xmm6 \n"
4521 "pmaddubsw %%xmm5,%%xmm7 \n"
4522 "phaddsw %%xmm7,%%xmm6 \n"
4523 "psraw $0x6,%%xmm1 \n"
4524 "psraw $0x6,%%xmm6 \n"
4525 "packuswb %%xmm1,%%xmm1 \n"
4526 "packuswb %%xmm6,%%xmm6 \n"
4527 "punpcklbw %%xmm6,%%xmm1 \n"
4528 "movdqa %%xmm0,%%xmm6 \n"
4529 "punpcklwd %%xmm1,%%xmm0 \n"
4530 "punpckhwd %%xmm1,%%xmm6 \n"
4531 "sub $0x8,%2 \n"
4532 "movdqa %%xmm0," MEMACCESS(1) " \n"
4533 "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n"
4534 "lea " MEMLEA(0x20,0) ",%0 \n"
4535 "lea " MEMLEA(0x20,1) ",%1 \n"
4536 "jg 1b \n"
4537 : "+r"(src_argb), // %0
4538 "+r"(dst_argb), // %1
4539 "+r"(width) // %2
4540 : "r"(matrix_argb) // %3
4541 : "memory", "cc"
4542 #if defined(__SSE2__)
4543 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4544 #endif
4545 );
4546 }
4547 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
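
// Illustrative scalar sketch of the matrix transform above (hypothetical
// helper): each output channel is a signed dot product of the B,G,R,A
// bytes with one 4-entry row of matrix_argb, shifted down by psraw $0x6
// and clamped by packuswb.  The intermediate phaddsw saturation is not
// modelled; it only matters for extreme matrices.
static uint8 ColorMatrixChannel_C_Sketch(const uint8* p, const int8* m) {
  int v = (p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3]) >> 6;
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}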
4548
4549 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4550 // Quantize 4 ARGB pixels (16 bytes).
4551 // aligned to 16 bytes
4552 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
4553 int interval_offset, int width) {
4554 asm volatile (
4555 "movd %2,%%xmm2 \n"
4556 "movd %3,%%xmm3 \n"
4557 "movd %4,%%xmm4 \n"
4558 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
4559 "pshufd $0x44,%%xmm2,%%xmm2 \n"
4560 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
4561 "pshufd $0x44,%%xmm3,%%xmm3 \n"
4562 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
4563 "pshufd $0x44,%%xmm4,%%xmm4 \n"
4564 "pxor %%xmm5,%%xmm5 \n"
4565 "pcmpeqb %%xmm6,%%xmm6 \n"
4566 "pslld $0x18,%%xmm6 \n"
4567
4568 // 4 pixel loop.
4569 LABELALIGN
4570 "1: \n"
4571 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
4572 "punpcklbw %%xmm5,%%xmm0 \n"
4573 "pmulhuw %%xmm2,%%xmm0 \n"
4574 "movdqa " MEMACCESS(0) ",%%xmm1 \n"
4575 "punpckhbw %%xmm5,%%xmm1 \n"
4576 "pmulhuw %%xmm2,%%xmm1 \n"
4577 "pmullw %%xmm3,%%xmm0 \n"
4578 "movdqa " MEMACCESS(0) ",%%xmm7 \n"
4579 "pmullw %%xmm3,%%xmm1 \n"
4580 "pand %%xmm6,%%xmm7 \n"
4581 "paddw %%xmm4,%%xmm0 \n"
4582 "paddw %%xmm4,%%xmm1 \n"
4583 "packuswb %%xmm1,%%xmm0 \n"
4584 "por %%xmm7,%%xmm0 \n"
4585 "sub $0x4,%1 \n"
4586 "movdqa %%xmm0," MEMACCESS(0) " \n"
4587 "lea " MEMLEA(0x10,0) ",%0 \n"
4588 "jg 1b \n"
4589 : "+r"(dst_argb), // %0
4590 "+r"(width) // %1
4591 : "r"(scale), // %2
4592 "r"(interval_size), // %3
4593 "r"(interval_offset) // %4
4594 : "memory", "cc"
4595 #if defined(__SSE2__)
4596 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4597 #endif
4598 );
4599 }
4600 #endif // HAS_ARGBQUANTIZEROW_SSE2
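
// Illustrative scalar sketch of the quantizer above (hypothetical
// helper): pmulhuw applies (v * scale) >> 16, pmullw snaps the result to
// a multiple of interval_size and paddw adds interval_offset.  Alpha is
// restored afterwards via the 0xff000000 mask.
static uint8 QuantizeChannel_C_Sketch(uint8 v, int scale, int interval_size,
                                      int interval_offset) {
  return (uint8)(((v * scale) >> 16) * interval_size + interval_offset);
}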
4601
4602 #ifdef HAS_ARGBSHADEROW_SSE2
4603 // Shade 4 pixels at a time by specified value.
4604 // Aligned to 16 bytes.
4605 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
4606 uint32 value) {
4607 asm volatile (
4608 "movd %3,%%xmm2 \n"
4609 "punpcklbw %%xmm2,%%xmm2 \n"
4610 "punpcklqdq %%xmm2,%%xmm2 \n"
4611
4612 // 4 pixel loop.
4613 LABELALIGN
4614 "1: \n"
4615 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
4616 "lea " MEMLEA(0x10,0) ",%0 \n"
4617 "movdqa %%xmm0,%%xmm1 \n"
4618 "punpcklbw %%xmm0,%%xmm0 \n"
4619 "punpckhbw %%xmm1,%%xmm1 \n"
4620 "pmulhuw %%xmm2,%%xmm0 \n"
4621 "pmulhuw %%xmm2,%%xmm1 \n"
4622 "psrlw $0x8,%%xmm0 \n"
4623 "psrlw $0x8,%%xmm1 \n"
4624 "packuswb %%xmm1,%%xmm0 \n"
4625 "sub $0x4,%2 \n"
4626 "movdqa %%xmm0," MEMACCESS(1) " \n"
4627 "lea " MEMLEA(0x10,1) ",%1 \n"
4628 "jg 1b \n"
4629 : "+r"(src_argb), // %0
4630 "+r"(dst_argb), // %1
4631 "+r"(width) // %2
4632 : "r"(value) // %3
4633 : "memory", "cc"
4634 #if defined(__SSE2__)
4635 , "xmm0", "xmm1", "xmm2"
4636 #endif
4637 );
4638 }
4639 #endif // HAS_ARGBSHADEROW_SSE2
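
// Illustrative scalar sketch of shading (hypothetical helper): the same
// punpcklbw/pmulhuw fixed-point trick as attenuation, except the
// multiplier is the matching byte of 'value' rather than the pixel's
// own alpha.
static uint8 ShadeChannel_C_Sketch(uint8 c, uint8 v) {
  return (uint8)(((uint32)(c * 0x101) * (uint32)(v * 0x101)) >> 24);
}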
4640
4641 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4642 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4643 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4644 uint8* dst_argb, int width) {
4645 asm volatile (
4646 "pxor %%xmm5,%%xmm5 \n"
4647
4648 // 4 pixel loop.
4649 LABELALIGN
4650 "1: \n"
4651 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4652 "lea " MEMLEA(0x10,0) ",%0 \n"
4653 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
4654 "lea " MEMLEA(0x10,1) ",%1 \n"
4655 "movdqu %%xmm0,%%xmm1 \n"
4656 "movdqu %%xmm2,%%xmm3 \n"
4657 "punpcklbw %%xmm0,%%xmm0 \n"
4658 "punpckhbw %%xmm1,%%xmm1 \n"
4659 "punpcklbw %%xmm5,%%xmm2 \n"
4660 "punpckhbw %%xmm5,%%xmm3 \n"
4661 "pmulhuw %%xmm2,%%xmm0 \n"
4662 "pmulhuw %%xmm3,%%xmm1 \n"
4663 "packuswb %%xmm1,%%xmm0 \n"
4664 "sub $0x4,%3 \n"
4665 "movdqu %%xmm0," MEMACCESS(2) " \n"
4666 "lea " MEMLEA(0x10,2) ",%2 \n"
4667 "jg 1b \n"
4668 : "+r"(src_argb0), // %0
4669 "+r"(src_argb1), // %1
4670 "+r"(dst_argb), // %2
4671 "+r"(width) // %3
4672 :
4673 : "memory", "cc"
4674 #if defined(__SSE2__)
4675 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4676 #endif
4677 );
4678 }
4679 #endif // HAS_ARGBMULTIPLYROW_SSE2
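
// Illustrative scalar sketch of the multiply above (hypothetical
// helper): one operand is widened as c0 * 0x101 (punpcklbw with itself),
// the other as plain c1 (punpcklbw with zero), so pmulhuw yields
// (c0 * 0x101 * c1) >> 16 -- a close approximation of c0 * c1 / 255.
static uint8 MultiplyChannel_C_Sketch(uint8 c0, uint8 c1) {
  return (uint8)(((uint32)(c0 * 0x101) * c1) >> 16);
}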
4680
4681 #ifdef HAS_ARGBADDROW_SSE2
4682 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4683 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4684 uint8* dst_argb, int width) {
4685 asm volatile (
4686 // 4 pixel loop.
4687 LABELALIGN
4688 "1: \n"
4689 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4690 "lea " MEMLEA(0x10,0) ",%0 \n"
4691 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4692 "lea " MEMLEA(0x10,1) ",%1 \n"
4693 "paddusb %%xmm1,%%xmm0 \n"
4694 "sub $0x4,%3 \n"
4695 "movdqu %%xmm0," MEMACCESS(2) " \n"
4696 "lea " MEMLEA(0x10,2) ",%2 \n"
4697 "jg 1b \n"
4698 : "+r"(src_argb0), // %0
4699 "+r"(src_argb1), // %1
4700 "+r"(dst_argb), // %2
4701 "+r"(width) // %3
4702 :
4703 : "memory", "cc"
4704 #if defined(__SSE2__)
4705 , "xmm0", "xmm1"
4706 #endif
4707 );
4708 }
4709 #endif // HAS_ARGBADDROW_SSE2
4710
4711 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4712 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
4713 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4714 uint8* dst_argb, int width) {
4715 asm volatile (
4716 // 4 pixel loop.
4717 LABELALIGN
4718 "1: \n"
4719 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4720 "lea " MEMLEA(0x10,0) ",%0 \n"
4721 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4722 "lea " MEMLEA(0x10,1) ",%1 \n"
4723 "psubusb %%xmm1,%%xmm0 \n"
4724 "sub $0x4,%3 \n"
4725 "movdqu %%xmm0," MEMACCESS(2) " \n"
4726 "lea " MEMLEA(0x10,2) ",%2 \n"
4727 "jg 1b \n"
4728 : "+r"(src_argb0), // %0
4729 "+r"(src_argb1), // %1
4730 "+r"(dst_argb), // %2
4731 "+r"(width) // %3
4732 :
4733 : "memory", "cc"
4734 #if defined(__SSE2__)
4735 , "xmm0", "xmm1"
4736 #endif
4737 );
4738 }
4739 #endif // HAS_ARGBSUBTRACTROW_SSE2
4740
4741 #ifdef HAS_SOBELXROW_SSE2
4742 // SobelX as a matrix is
4743 // -1 0 1
4744 // -2 0 2
4745 // -1 0 1
4746 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4747 const uint8* src_y2, uint8* dst_sobelx, int width) {
4748 asm volatile (
4749 "sub %0,%1 \n"
4750 "sub %0,%2 \n"
4751 "sub %0,%3 \n"
4752 "pxor %%xmm5,%%xmm5 \n"
4753
4754 // 8 pixel loop.
4755 LABELALIGN
4756 "1: \n"
4757 "movq " MEMACCESS(0) ",%%xmm0 \n"
4758 "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
4759 "punpcklbw %%xmm5,%%xmm0 \n"
4760 "punpcklbw %%xmm5,%%xmm1 \n"
4761 "psubw %%xmm1,%%xmm0 \n"
4762 BUNDLEALIGN
4763 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
4764 MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
4765 "punpcklbw %%xmm5,%%xmm1 \n"
4766 "punpcklbw %%xmm5,%%xmm2 \n"
4767 "psubw %%xmm2,%%xmm1 \n"
4768 BUNDLEALIGN
4769 MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
4770 MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
4771 "punpcklbw %%xmm5,%%xmm2 \n"
4772 "punpcklbw %%xmm5,%%xmm3 \n"
4773 "psubw %%xmm3,%%xmm2 \n"
4774 "paddw %%xmm2,%%xmm0 \n"
4775 "paddw %%xmm1,%%xmm0 \n"
4776 "paddw %%xmm1,%%xmm0 \n"
4777 "pxor %%xmm1,%%xmm1 \n"
4778 "psubw %%xmm0,%%xmm1 \n"
4779 "pmaxsw %%xmm1,%%xmm0 \n"
4780 "packuswb %%xmm0,%%xmm0 \n"
4781 "sub $0x8,%4 \n"
4782 BUNDLEALIGN
4783 MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
4784 "lea " MEMLEA(0x8,0) ",%0 \n"
4785 "jg 1b \n"
4786 : "+r"(src_y0), // %0
4787 "+r"(src_y1), // %1
4788 "+r"(src_y2), // %2
4789 "+r"(dst_sobelx), // %3
4790 "+r"(width) // %4
4791 :
4792 : "memory", "cc"
4793 #if defined(__native_client__) && defined(__x86_64__)
4794 , "r14"
4795 #endif
4796 #if defined(__SSE2__)
4797 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4798 #endif
4799 );
4800 }
4801 #endif // HAS_SOBELXROW_SSE2
4802
4803 #ifdef HAS_SOBELYROW_SSE2
4804 // SobelY as a matrix is
4805 // -1 -2 -1
4806 // 0 0 0
4807 // 1 2 1
4808 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4809 uint8* dst_sobely, int width) {
4810 asm volatile (
4811 "sub %0,%1 \n"
4812 "sub %0,%2 \n"
4813 "pxor %%xmm5,%%xmm5 \n"
4814
4815 // 8 pixel loop.
4816 LABELALIGN
4817 "1: \n"
4818 "movq " MEMACCESS(0) ",%%xmm0 \n"
4819 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
4820 "punpcklbw %%xmm5,%%xmm0 \n"
4821 "punpcklbw %%xmm5,%%xmm1 \n"
4822 "psubw %%xmm1,%%xmm0 \n"
4823 BUNDLEALIGN
4824 "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
4825 MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
4826 "punpcklbw %%xmm5,%%xmm1 \n"
4827 "punpcklbw %%xmm5,%%xmm2 \n"
4828 "psubw %%xmm2,%%xmm1 \n"
4829 BUNDLEALIGN
4830 "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
4831 MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
4832 "punpcklbw %%xmm5,%%xmm2 \n"
4833 "punpcklbw %%xmm5,%%xmm3 \n"
4834 "psubw %%xmm3,%%xmm2 \n"
4835 "paddw %%xmm2,%%xmm0 \n"
4836 "paddw %%xmm1,%%xmm0 \n"
4837 "paddw %%xmm1,%%xmm0 \n"
4838 "pxor %%xmm1,%%xmm1 \n"
4839 "psubw %%xmm0,%%xmm1 \n"
4840 "pmaxsw %%xmm1,%%xmm0 \n"
4841 "packuswb %%xmm0,%%xmm0 \n"
4842 "sub $0x8,%3 \n"
4843 BUNDLEALIGN
4844 MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
4845 "lea " MEMLEA(0x8,0) ",%0 \n"
4846 "jg 1b \n"
4847 : "+r"(src_y0), // %0
4848 "+r"(src_y1), // %1
4849 "+r"(dst_sobely), // %2
4850 "+r"(width) // %3
4851 :
4852 : "memory", "cc"
4853 #if defined(__native_client__) && defined(__x86_64__)
4854 , "r14"
4855 #endif
4856 #if defined(__SSE2__)
4857 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4858 #endif
4859 );
4860 }
4861 #endif // HAS_SOBELYROW_SSE2
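
// Illustrative scalar sketch of the two kernels above (hypothetical
// helper): the result is |a + 2*b + c| over the edge differences, which
// the SIMD code builds with two paddw of the centre row and pmaxsw
// against the negation for the absolute value.  SobelY is the same with
// rows and columns swapped.
static uint8 SobelXPixel_C_Sketch(const uint8* y0, const uint8* y1,
                                  const uint8* y2, int i) {
  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
          (y2[i] - y2[i + 2]);
  if (s < 0) s = -s;                  // pmaxsw(x, -x)
  return (uint8)(s > 255 ? 255 : s);  // packuswb saturates
}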
4862
4863 #ifdef HAS_SOBELROW_SSE2
4864 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
4865 // A = 255
4866 // R = Sobel
4867 // G = Sobel
4868 // B = Sobel
4869 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4870 uint8* dst_argb, int width) {
4871 asm volatile (
4872 "sub %0,%1 \n"
4873 "pcmpeqb %%xmm5,%%xmm5 \n"
4874 "pslld $0x18,%%xmm5 \n"
4875
4876 // 8 pixel loop.
4877 LABELALIGN
4878 "1: \n"
4879 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
4880 MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
4881 "lea " MEMLEA(0x10,0) ",%0 \n"
4882 "paddusb %%xmm1,%%xmm0 \n"
4883 "movdqa %%xmm0,%%xmm2 \n"
4884 "punpcklbw %%xmm0,%%xmm2 \n"
4885 "punpckhbw %%xmm0,%%xmm0 \n"
4886 "movdqa %%xmm2,%%xmm1 \n"
4887 "punpcklwd %%xmm2,%%xmm1 \n"
4888 "punpckhwd %%xmm2,%%xmm2 \n"
4889 "por %%xmm5,%%xmm1 \n"
4890 "por %%xmm5,%%xmm2 \n"
4891 "movdqa %%xmm0,%%xmm3 \n"
4892 "punpcklwd %%xmm0,%%xmm3 \n"
4893 "punpckhwd %%xmm0,%%xmm0 \n"
4894 "por %%xmm5,%%xmm3 \n"
4895 "por %%xmm5,%%xmm0 \n"
4896 "sub $0x10,%3 \n"
4897 "movdqa %%xmm1," MEMACCESS(2) " \n"
4898 "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
4899 "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n"
4900 "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n"
4901 "lea " MEMLEA(0x40,2) ",%2 \n"
4902 "jg 1b \n"
4903 : "+r"(src_sobelx), // %0
4904 "+r"(src_sobely), // %1
4905 "+r"(dst_argb), // %2
4906 "+r"(width) // %3
4907 :
4908 : "memory", "cc"
4909 #if defined(__native_client__) && defined(__x86_64__)
4910 , "r14"
4911 #endif
4912 #if defined(__SSE2__)
4913 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4914 #endif
4915 );
4916 }
4917 #endif // HAS_SOBELROW_SSE2
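
// Illustrative scalar sketch of the packing above (hypothetical helper):
// the saturated sum of Sobel X and Sobel Y is replicated into B, G and R
// with alpha forced to 255 -- the punpcklbw/punpcklwd ladder does this
// for 16 pixels per iteration.
static uint32 SobelToARGB_C_Sketch(uint8 sx, uint8 sy) {
  int s = sx + sy;
  uint8 v = (uint8)(s > 255 ? 255 : s);  // paddusb saturates
  return 0xff000000u | ((uint32)v << 16) | ((uint32)v << 8) | v;
}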
4918
4919 #ifdef HAS_SOBELTOPLANEROW_SSE2
4920 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
4921 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4922 uint8* dst_y, int width) {
4923 asm volatile (
4924 "sub %0,%1 \n"
4925 "pcmpeqb %%xmm5,%%xmm5 \n"
4926 "pslld $0x18,%%xmm5 \n"
4927
4928 // 16 pixel loop.
4929 LABELALIGN
4930 "1: \n"
4931 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
4932 MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
4933 "lea " MEMLEA(0x10,0) ",%0 \n"
4934 "paddusb %%xmm1,%%xmm0 \n"
4935 "sub $0x10,%3 \n"
4936 "movdqa %%xmm0," MEMACCESS(2) " \n"
4937 "lea " MEMLEA(0x10,2) ",%2 \n"
4938 "jg 1b \n"
4939 : "+r"(src_sobelx), // %0
4940 "+r"(src_sobely), // %1
4941 "+r"(dst_y), // %2
4942 "+r"(width) // %3
4943 :
4944 : "memory", "cc"
4945 #if defined(__native_client__) && defined(__x86_64__)
4946 , "r14"
4947 #endif
4948 #if defined(__SSE2__)
4949 , "xmm0", "xmm1"
4950 #endif
4951 );
4952 }
4953 #endif // HAS_SOBELTOPLANEROW_SSE2
4954
4955 #ifdef HAS_SOBELXYROW_SSE2
4956 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
4957 // A = 255
4958 // R = Sobel X
4959 // G = Sobel
4960 // B = Sobel Y
4961 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4962 uint8* dst_argb, int width) {
4963 asm volatile (
4964 "sub %0,%1 \n"
4965 "pcmpeqb %%xmm5,%%xmm5 \n"
4966
4967 // 8 pixel loop.
4968 LABELALIGN
4969 "1: \n"
4970 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
4971 MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
4972 "lea " MEMLEA(0x10,0) ",%0 \n"
4973 "movdqa %%xmm0,%%xmm2 \n"
4974 "paddusb %%xmm1,%%xmm2 \n"
4975 "movdqa %%xmm0,%%xmm3 \n"
4976 "punpcklbw %%xmm5,%%xmm3 \n"
4977 "punpckhbw %%xmm5,%%xmm0 \n"
4978 "movdqa %%xmm1,%%xmm4 \n"
4979 "punpcklbw %%xmm2,%%xmm4 \n"
4980 "punpckhbw %%xmm2,%%xmm1 \n"
4981 "movdqa %%xmm4,%%xmm6 \n"
4982 "punpcklwd %%xmm3,%%xmm6 \n"
4983 "punpckhwd %%xmm3,%%xmm4 \n"
4984 "movdqa %%xmm1,%%xmm7 \n"
4985 "punpcklwd %%xmm0,%%xmm7 \n"
4986 "punpckhwd %%xmm0,%%xmm1 \n"
4987 "sub $0x10,%3 \n"
4988 "movdqa %%xmm6," MEMACCESS(2) " \n"
4989 "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n"
4990 "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n"
4991 "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n"
4992 "lea " MEMLEA(0x40,2) ",%2 \n"
4993 "jg 1b \n"
4994 : "+r"(src_sobelx), // %0
4995 "+r"(src_sobely), // %1
4996 "+r"(dst_argb), // %2
4997 "+r"(width) // %3
4998 :
4999 : "memory", "cc"
5000 #if defined(__native_client__) && defined(__x86_64__)
5001 , "r14"
5002 #endif
5003 #if defined(__SSE2__)
5004 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
5005 #endif
5006 );
5007 }
5008 #endif // HAS_SOBELXYROW_SSE2
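
// Illustrative scalar sketch (hypothetical helper): here the channels
// differ -- R gets Sobel X, B gets Sobel Y and G their saturated sum,
// with alpha 255.
static uint32 SobelXYToARGB_C_Sketch(uint8 sx, uint8 sy) {
  int s = sx + sy;
  uint8 g = (uint8)(s > 255 ? 255 : s);  // paddusb saturates
  return 0xff000000u | ((uint32)sx << 16) | ((uint32)g << 8) | sy;
}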
5009
5010 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5011 // Creates a table of cumulative sums where each value is a sum of all values
5012 // above and to the left of the value, inclusive of the value.
5013 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
5014 const int32* previous_cumsum, int width) {
5015 asm volatile (
5016 "pxor %%xmm0,%%xmm0 \n"
5017 "pxor %%xmm1,%%xmm1 \n"
5018 "sub $0x4,%3 \n"
5019 "jl 49f \n"
5020 "test $0xf,%1 \n"
5021 "jne 49f \n"
5022
5023 // 4 pixel loop
5024 LABELALIGN
5025 "40: \n"
5026 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
5027 "lea " MEMLEA(0x10,0) ",%0 \n"
5028 "movdqa %%xmm2,%%xmm4 \n"
5029 "punpcklbw %%xmm1,%%xmm2 \n"
5030 "movdqa %%xmm2,%%xmm3 \n"
5031 "punpcklwd %%xmm1,%%xmm2 \n"
5032 "punpckhwd %%xmm1,%%xmm3 \n"
5033 "punpckhbw %%xmm1,%%xmm4 \n"
5034 "movdqa %%xmm4,%%xmm5 \n"
5035 "punpcklwd %%xmm1,%%xmm4 \n"
5036 "punpckhwd %%xmm1,%%xmm5 \n"
5037 "paddd %%xmm2,%%xmm0 \n"
5038 "movdqa " MEMACCESS(2) ",%%xmm2 \n"
5039 "paddd %%xmm0,%%xmm2 \n"
5040 "paddd %%xmm3,%%xmm0 \n"
5041 "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n"
5042 "paddd %%xmm0,%%xmm3 \n"
5043 "paddd %%xmm4,%%xmm0 \n"
5044 "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n"
5045 "paddd %%xmm0,%%xmm4 \n"
5046 "paddd %%xmm5,%%xmm0 \n"
5047 "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n"
5048 "lea " MEMLEA(0x40,2) ",%2 \n"
5049 "paddd %%xmm0,%%xmm5 \n"
5050 "movdqa %%xmm2," MEMACCESS(1) " \n"
5051 "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
5052 "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n"
5053 "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n"
5054 "lea " MEMLEA(0x40,1) ",%1 \n"
5055 "sub $0x4,%3 \n"
5056 "jge 40b \n"
5057
5058 "49: \n"
5059 "add $0x3,%3 \n"
5060 "jl 19f \n"
5061
5062 // 1 pixel loop
5063 LABELALIGN
5064 "10: \n"
5065 "movd " MEMACCESS(0) ",%%xmm2 \n"
5066 "lea " MEMLEA(0x4,0) ",%0 \n"
5067 "punpcklbw %%xmm1,%%xmm2 \n"
5068 "punpcklwd %%xmm1,%%xmm2 \n"
5069 "paddd %%xmm2,%%xmm0 \n"
5070 "movdqu " MEMACCESS(2) ",%%xmm2 \n"
5071 "lea " MEMLEA(0x10,2) ",%2 \n"
5072 "paddd %%xmm0,%%xmm2 \n"
5073 "movdqu %%xmm2," MEMACCESS(1) " \n"
5074 "lea " MEMLEA(0x10,1) ",%1 \n"
5075 "sub $0x1,%3 \n"
5076 "jge 10b \n"
5077
5078 "19: \n"
5079 : "+r"(row), // %0
5080 "+r"(cumsum), // %1
5081 "+r"(previous_cumsum), // %2
5082 "+r"(width) // %3
5083 :
5084 : "memory", "cc"
5085 #if defined(__SSE2__)
5086 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
5087 #endif
5088 );
5089 }
5090 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
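
// Illustrative scalar sketch of the row above (hypothetical helper):
// every output entry is the running sum of this row's channel values
// plus the entry directly above it, giving the 2-D inclusive prefix sum
// that the box filter consumes.
static void ComputeCumulativeSumRow_C_Sketch(const uint8* row, int32* cumsum,
                                             const int32* previous_cumsum,
                                             int width) {
  int32 sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}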
5091
5092 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5093 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5094 int width, int area, uint8* dst,
5095 int count) {
5096 asm volatile (
5097 "movd %5,%%xmm5 \n"
5098 "cvtdq2ps %%xmm5,%%xmm5 \n"
5099 "rcpss %%xmm5,%%xmm4 \n"
5100 "pshufd $0x0,%%xmm4,%%xmm4 \n"
5101 "sub $0x4,%3 \n"
5102 "jl 49f \n"
5103 "cmpl $0x80,%5 \n"
5104 "ja 40f \n"
5105
5106 "pshufd $0x0,%%xmm5,%%xmm5 \n"
5107 "pcmpeqb %%xmm6,%%xmm6 \n"
5108 "psrld $0x10,%%xmm6 \n"
5109 "cvtdq2ps %%xmm6,%%xmm6 \n"
5110 "addps %%xmm6,%%xmm5 \n"
5111 "mulps %%xmm4,%%xmm5 \n"
5112 "cvtps2dq %%xmm5,%%xmm5 \n"
5113 "packssdw %%xmm5,%%xmm5 \n"
5114
5115 // 4 pixel small loop
5116 LABELALIGN
5117 "4: \n"
5118 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
5119 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
5120 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
5121 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
5122 BUNDLEALIGN
5123 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
5124 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
5125 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
5126 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
5127 "lea " MEMLEA(0x40,0) ",%0 \n"
5128 "psubd " MEMACCESS(1) ",%%xmm0 \n"
5129 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
5130 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
5131 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
5132 BUNDLEALIGN
5133 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
5134 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
5135 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
5136 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
5137 "lea " MEMLEA(0x40,1) ",%1 \n"
5138 "packssdw %%xmm1,%%xmm0 \n"
5139 "packssdw %%xmm3,%%xmm2 \n"
5140 "pmulhuw %%xmm5,%%xmm0 \n"
5141 "pmulhuw %%xmm5,%%xmm2 \n"
5142 "packuswb %%xmm2,%%xmm0 \n"
5143 "movdqu %%xmm0," MEMACCESS(2) " \n"
5144 "lea " MEMLEA(0x10,2) ",%2 \n"
5145 "sub $0x4,%3 \n"
5146 "jge 4b \n"
5147 "jmp 49f \n"
5148
5149 // 4 pixel loop
5150 LABELALIGN
5151 "40: \n"
5152 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
5153 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
5154 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
5155 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
5156 BUNDLEALIGN
5157 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
5158 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
5159 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
5160 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
5161 "lea " MEMLEA(0x40,0) ",%0 \n"
5162 "psubd " MEMACCESS(1) ",%%xmm0 \n"
5163 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
5164 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
5165 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
5166 BUNDLEALIGN
5167 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
5168 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
5169 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
5170 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
5171 "lea " MEMLEA(0x40,1) ",%1 \n"
5172 "cvtdq2ps %%xmm0,%%xmm0 \n"
5173 "cvtdq2ps %%xmm1,%%xmm1 \n"
5174 "mulps %%xmm4,%%xmm0 \n"
5175 "mulps %%xmm4,%%xmm1 \n"
5176 "cvtdq2ps %%xmm2,%%xmm2 \n"
5177 "cvtdq2ps %%xmm3,%%xmm3 \n"
5178 "mulps %%xmm4,%%xmm2 \n"
5179 "mulps %%xmm4,%%xmm3 \n"
5180 "cvtps2dq %%xmm0,%%xmm0 \n"
5181 "cvtps2dq %%xmm1,%%xmm1 \n"
5182 "cvtps2dq %%xmm2,%%xmm2 \n"
5183 "cvtps2dq %%xmm3,%%xmm3 \n"
5184 "packssdw %%xmm1,%%xmm0 \n"
5185 "packssdw %%xmm3,%%xmm2 \n"
5186 "packuswb %%xmm2,%%xmm0 \n"
5187 "movdqu %%xmm0," MEMACCESS(2) " \n"
5188 "lea " MEMLEA(0x10,2) ",%2 \n"
5189 "sub $0x4,%3 \n"
5190 "jge 40b \n"
5191
5192 "49: \n"
5193 "add $0x3,%3 \n"
5194 "jl 19f \n"
5195
5196 // 1 pixel loop
5197 LABELALIGN
5198 "10: \n"
5199 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
5200 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
5201 "lea " MEMLEA(0x10,0) ",%0 \n"
5202 "psubd " MEMACCESS(1) ",%%xmm0 \n"
5203 BUNDLEALIGN
5204 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
5205 "lea " MEMLEA(0x10,1) ",%1 \n"
5206 "cvtdq2ps %%xmm0,%%xmm0 \n"
5207 "mulps %%xmm4,%%xmm0 \n"
5208 "cvtps2dq %%xmm0,%%xmm0 \n"
5209 "packssdw %%xmm0,%%xmm0 \n"
5210 "packuswb %%xmm0,%%xmm0 \n"
5211 "movd %%xmm0," MEMACCESS(2) " \n"
5212 "lea " MEMLEA(0x4,2) ",%2 \n"
5213 "sub $0x1,%3 \n"
5214 "jge 10b \n"
5215 "19: \n"
5216 : "+r"(topleft), // %0
5217 "+r"(botleft), // %1
5218 "+r"(dst), // %2
5219 "+rm"(count) // %3
5220 : "r"((intptr_t)(width)), // %4
5221 "rm"(area) // %5
5222 : "memory", "cc"
5223 #if defined(__native_client__) && defined(__x86_64__)
5224 , "r14"
5225 #endif
5226 #if defined(__SSE2__)
5227 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
5228 #endif
5229 );
5230 }
5231 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
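
// Illustrative scalar sketch of the box-filter average above
// (hypothetical helper): the sum inside a rectangle falls out of four
// corner reads of the integral image, then is divided by its area.  The
// SIMD path multiplies by an rcpss reciprocal instead of dividing, so
// its rounding differs very slightly.
static uint8 BoxAverage_C_Sketch(const int32* topleft, const int32* topright,
                                 const int32* botleft, const int32* botright,
                                 int area, int channel) {
  int32 sum = topleft[channel] - topright[channel] -
              botleft[channel] + botright[channel];
  return (uint8)(sum / area);
}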
5232
5233 #ifdef HAS_ARGBAFFINEROW_SSE2
5234 // Copy a row of ARGB pixels from the source image, stepping through it
5235 // along the affine slope (u/v deltas), into the destination row.
5235 LIBYUV_API
5236 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
5237 uint8* dst_argb, const float* src_dudv, int width) {
5238 intptr_t src_argb_stride_temp = src_argb_stride;
5239 intptr_t temp = 0;
5240 asm volatile (
5241 "movq " MEMACCESS(3) ",%%xmm2 \n"
5242 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
5243 "shl $0x10,%1 \n"
5244 "add $0x4,%1 \n"
5245 "movd %1,%%xmm5 \n"
5246 "sub $0x4,%4 \n"
5247 "jl 49f \n"
5248
5249 "pshufd $0x44,%%xmm7,%%xmm7 \n"
5250 "pshufd $0x0,%%xmm5,%%xmm5 \n"
5251 "movdqa %%xmm2,%%xmm0 \n"
5252 "addps %%xmm7,%%xmm0 \n"
5253 "movlhps %%xmm0,%%xmm2 \n"
5254 "movdqa %%xmm7,%%xmm4 \n"
5255 "addps %%xmm4,%%xmm4 \n"
5256 "movdqa %%xmm2,%%xmm3 \n"
5257 "addps %%xmm4,%%xmm3 \n"
5258 "addps %%xmm4,%%xmm4 \n"
5259
5260 // 4 pixel loop
5261 LABELALIGN
5262 "40: \n"
5263 "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
5264 "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
5265 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
5266 "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
5267 "movd %%xmm0,%k1 \n"
5268 "pshufd $0x39,%%xmm0,%%xmm0 \n"
5269 "movd %%xmm0,%k5 \n"
5270 "pshufd $0x39,%%xmm0,%%xmm0 \n"
5271 BUNDLEALIGN
5272 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
5273 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
5274 "punpckldq %%xmm6,%%xmm1 \n"
5275 "addps %%xmm4,%%xmm2 \n"
5276 "movq %%xmm1," MEMACCESS(2) " \n"
5277 "movd %%xmm0,%k1 \n"
5278 "pshufd $0x39,%%xmm0,%%xmm0 \n"
5279 "movd %%xmm0,%k5 \n"
5280 BUNDLEALIGN
5281 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
5282 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
5283 "punpckldq %%xmm6,%%xmm0 \n"
5284 "addps %%xmm4,%%xmm3 \n"
5285 "sub $0x4,%4 \n"
5286 "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
5287 "lea " MEMLEA(0x10,2) ",%2 \n"
5288 "jge 40b \n"
5289
5290 "49: \n"
5291 "add $0x3,%4 \n"
5292 "jl 19f \n"
5293
5294 // 1 pixel loop
5295 LABELALIGN
5296 "10: \n"
5297 "cvttps2dq %%xmm2,%%xmm0 \n"
5298 "packssdw %%xmm0,%%xmm0 \n"
5299 "pmaddwd %%xmm5,%%xmm0 \n"
5300 "addps %%xmm7,%%xmm2 \n"
5301 "movd %%xmm0,%k1 \n"
5302 BUNDLEALIGN
5303 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
5304 "sub $0x1,%4 \n"
5305 "movd %%xmm0," MEMACCESS(2) " \n"
5306 "lea " MEMLEA(0x04,2) ",%2 \n"
5307 "jge 10b \n"
5308 "19: \n"
5309 : "+r"(src_argb), // %0
5310 "+r"(src_argb_stride_temp), // %1
5311 "+r"(dst_argb), // %2
5312 "+r"(src_dudv), // %3
5313 "+rm"(width), // %4
5314 "+r"(temp) // %5
5315 :
5316 : "memory", "cc"
5317 #if defined(__native_client__) && defined(__x86_64__)
5318 , "r14"
5319 #endif
5320 #if defined(__SSE2__)
5321 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
5322 #endif
5323 );
5324 }
5325 #endif // HAS_ARGBAFFINEROW_SSE2
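
// Illustrative scalar sketch of the affine walk above (hypothetical
// helper): the sample point starts at (src_dudv[0], src_dudv[1]) and
// advances by (src_dudv[2], src_dudv[3]) per destination pixel; each
// truncated (x, y) becomes the byte offset x * 4 + y * stride, the same
// value pmaddwd computes against the packed (4, stride) constant.
static void ARGBAffineRow_C_Sketch(const uint8* src_argb, int src_argb_stride,
                                   uint8* dst_argb, const float* src_dudv,
                                   int width) {
  float x = src_dudv[0];
  float y = src_dudv[1];
  for (int i = 0; i < width; ++i) {
    int xi = (int)x;  // cvttps2dq truncates toward zero
    int yi = (int)y;
    *(uint32*)(dst_argb + i * 4) =
        *(const uint32*)(src_argb + xi * 4 + yi * src_argb_stride);
    x += src_dudv[2];
    y += src_dudv[3];
  }
}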
5326
5327 #ifdef HAS_INTERPOLATEROW_SSSE3
5328 // Bilinear filter 16x2 -> 16x1
5329 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
5330 ptrdiff_t src_stride, int dst_width,
5331 int source_y_fraction) {
5332 asm volatile (
5333 "sub %1,%0 \n"
5334 "shr %3 \n"
5335 "cmp $0x0,%3 \n"
5336 "je 100f \n"
5337 "cmp $0x20,%3 \n"
5338 "je 75f \n"
5339 "cmp $0x40,%3 \n"
5340 "je 50f \n"
5341 "cmp $0x60,%3 \n"
5342 "je 25f \n"
5343
5344 "movd %3,%%xmm0 \n"
5345 "neg %3 \n"
5346 "add $0x80,%3 \n"
5347 "movd %3,%%xmm5 \n"
5348 "punpcklbw %%xmm0,%%xmm5 \n"
5349 "punpcklwd %%xmm5,%%xmm5 \n"
5350 "pshufd $0x0,%%xmm5,%%xmm5 \n"
5351
5352 // General purpose row blend.
5353 LABELALIGN
5354 "1: \n"
5355 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
5356 MEMOPREG(movdqa,0x00,1,4,1,xmm2)
5357 "movdqa %%xmm0,%%xmm1 \n"
5358 "punpcklbw %%xmm2,%%xmm0 \n"
5359 "punpckhbw %%xmm2,%%xmm1 \n"
5360 "pmaddubsw %%xmm5,%%xmm0 \n"
5361 "pmaddubsw %%xmm5,%%xmm1 \n"
5362 "psrlw $0x7,%%xmm0 \n"
5363 "psrlw $0x7,%%xmm1 \n"
5364 "packuswb %%xmm1,%%xmm0 \n"
5365 "sub $0x10,%2 \n"
5366 BUNDLEALIGN
5367 MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
5368 "lea " MEMLEA(0x10,1) ",%1 \n"
5369 "jg 1b \n"
5370 "jmp 99f \n"
5371
5372 // Blend 25 / 75.
5373 LABELALIGN
5374 "25: \n"
5375 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
5376 MEMOPREG(movdqa,0x00,1,4,1,xmm1)
5377 "pavgb %%xmm1,%%xmm0 \n"
5378 "pavgb %%xmm1,%%xmm0 \n"
5379 "sub $0x10,%2 \n"
5380 BUNDLEALIGN
5381 MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
5382 "lea " MEMLEA(0x10,1) ",%1 \n"
5383 "jg 25b \n"
5384 "jmp 99f \n"
5385
5386 // Blend 50 / 50.
5387 LABELALIGN
5388 "50: \n"
5389 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
5390 MEMOPREG(movdqa,0x00,1,4,1,xmm1)
5391 "pavgb %%xmm1,%%xmm0 \n"
5392 "sub $0x10,%2 \n"
5393 BUNDLEALIGN
5394 MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
5395 "lea " MEMLEA(0x10,1) ",%1 \n"
5396 "jg 50b \n"
5397 "jmp 99f \n"
5398
5399 // Blend 75 / 25.
5400 LABELALIGN
5401 "75: \n"
5402 "movdqa " MEMACCESS(1) ",%%xmm1 \n"
5403 MEMOPREG(movdqa,0x00,1,4,1,xmm0)
5404 "pavgb %%xmm1,%%xmm0 \n"
5405 "pavgb %%xmm1,%%xmm0 \n"
5406 "sub $0x10,%2 \n"
5407 BUNDLEALIGN
5408 MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
5409 "lea " MEMLEA(0x10,1) ",%1 \n"
5410 "jg 75b \n"
5411 "jmp 99f \n"
5412
5413 // Blend 100 / 0 - Copy row unchanged.
5414 LABELALIGN
5415 "100: \n"
5416 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
5417 "sub $0x10,%2 \n"
5418 MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
5419 "lea " MEMLEA(0x10,1) ",%1 \n"
5420 "jg 100b \n"
5421
5422 "99: \n"
5423 : "+r"(dst_ptr), // %0
5424 "+r"(src_ptr), // %1
5425 "+r"(dst_width), // %2
5426 "+r"(source_y_fraction) // %3
5427 : "r"((intptr_t)(src_stride)) // %4
5428 : "memory", "cc"
5429 #if defined(__native_client__) && defined(__x86_64__)
5430 , "r14"
5431 #endif
5432 #if defined(__SSE2__)
5433 , "xmm0", "xmm1", "xmm2", "xmm5"
5434 #endif
5435 );
5436 }
5437 #endif // HAS_INTERPOLATEROW_SSSE3
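
// Illustrative scalar sketch of the general blend above (hypothetical
// helper): the fraction is halved to 0..128, then each output byte is
// (s0 * (128 - f) + s1 * f) >> 7, exactly what pmaddubsw computes with
// the packed (128 - f, f) byte pair.  The 0/25/50/75/100% cases are
// special-cased with pavgb or a plain copy.
static uint8 InterpolatePixel_C_Sketch(uint8 s0, uint8 s1,
                                       int source_y_fraction) {
  int f = source_y_fraction >> 1;  // the "shr %3" above
  return (uint8)((s0 * (128 - f) + s1 * f) >> 7);
}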
5438
5439 #ifdef HAS_INTERPOLATEROW_SSE2
5440 // Bilinear filter 16x2 -> 16x1
5441 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
5442 ptrdiff_t src_stride, int dst_width,
5443 int source_y_fraction) {
5444 asm volatile (
5445 "sub %1,%0 \n"
5446 "shr %3 \n"
5447 "cmp $0x0,%3 \n"
5448 "je 100f \n"
5449 "cmp $0x20,%3 \n"
5450 "je 75f \n"
5451 "cmp $0x40,%3 \n"
5452 "je 50f \n"
5453 "cmp $0x60,%3 \n"
5454 "je 25f \n"
5455
5456 "movd %3,%%xmm0 \n"
5457 "neg %3 \n"
5458 "add $0x80,%3 \n"
5459 "movd %3,%%xmm5 \n"
5460 "punpcklbw %%xmm0,%%xmm5 \n"
5461 "punpcklwd %%xmm5,%%xmm5 \n"
5462 "pshufd $0x0,%%xmm5,%%xmm5 \n"
5463 "pxor %%xmm4,%%xmm4 \n"
5464
5465 // General purpose row blend.
5466 LABELALIGN
5467 "1: \n"
5468 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
5469 MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2
5470 "movdqa %%xmm0,%%xmm1 \n"
5471 "movdqa %%xmm2,%%xmm3 \n"
5472 "punpcklbw %%xmm4,%%xmm2 \n"
5473 "punpckhbw %%xmm4,%%xmm3 \n"
5474 "punpcklbw %%xmm4,%%xmm0 \n"
5475 "punpckhbw %%xmm4,%%xmm1 \n"
5476 "psubw %%xmm0,%%xmm2 \n"
5477 "psubw %%xmm1,%%xmm3 \n"
5478 "paddw %%xmm2,%%xmm2 \n"
5479 "paddw %%xmm3,%%xmm3 \n"
5480 "pmulhw %%xmm5,%%xmm2 \n"
5481 "pmulhw %%xmm5,%%xmm3 \n"
5482 "paddw %%xmm2,%%xmm0 \n"
5483 "paddw %%xmm3,%%xmm1 \n"
5484 "packuswb %%xmm1,%%xmm0 \n"
5485 "sub $0x10,%2 \n"
5486 BUNDLEALIGN
5487 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
5488 "lea " MEMLEA(0x10,1) ",%1 \n"
5489 "jg 1b \n"
5490 "jmp 99f \n"
5491
5492 // Blend 25 / 75.
5493 LABELALIGN
5494 "25: \n"
5495 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
5496 MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
5497 "pavgb %%xmm1,%%xmm0 \n"
5498 "pavgb %%xmm1,%%xmm0 \n"
5499 "sub $0x10,%2 \n"
5500 BUNDLEALIGN
5501 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
5502 "lea " MEMLEA(0x10,1) ",%1 \n"
5503 "jg 25b \n"
5504 "jmp 99f \n"
5505
5506 // Blend 50 / 50.
5507 LABELALIGN
5508 "50: \n"
5509 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
5510 MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
5511 "pavgb %%xmm1,%%xmm0 \n"
5512 "sub $0x10,%2 \n"
5513 BUNDLEALIGN
5514 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
5515 "lea " MEMLEA(0x10,1) ",%1 \n"
5516 "jg 50b \n"
5517 "jmp 99f \n"
5518
5519 // Blend 75 / 25.
5520 LABELALIGN
5521 "75: \n"
5522 "movdqa " MEMACCESS(1) ",%%xmm1 \n"
5523 MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0
5524 "pavgb %%xmm1,%%xmm0 \n"
5525 "pavgb %%xmm1,%%xmm0 \n"
5526 "sub $0x10,%2 \n"
5527 BUNDLEALIGN
5528 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
5529 "lea " MEMLEA(0x10,1) ",%1 \n"
5530 "jg 75b \n"
5531 "jmp 99f \n"
5532
5533 // Blend 100 / 0 - Copy row unchanged.
5534 LABELALIGN
5535 "100: \n"
5536 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
5537 "sub $0x10,%2 \n"
5538 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
5539 "lea " MEMLEA(0x10,1) ",%1 \n"
5540 "jg 100b \n"
5541
5542 "99: \n"
5543 : "+r"(dst_ptr), // %0
5544 "+r"(src_ptr), // %1
5545 "+r"(dst_width), // %2
5546 "+r"(source_y_fraction) // %3
5547 : "r"((intptr_t)(src_stride)) // %4
5548 : "memory", "cc"
5549 #if defined(__native_client__) && defined(__x86_64__)
5550 , "r14"
5551 #endif
5552 #if defined(__SSE2__)
5553 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
5554 #endif
5555 );
5556 }
5557 #endif // HAS_INTERPOLATEROW_SSE2
5558
5559 #ifdef HAS_INTERPOLATEROW_SSSE3
5560 // Bilinear filter 16x2 -> 16x1
5561 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
5562 ptrdiff_t src_stride, int dst_width,
5563 int source_y_fraction) {
5564 asm volatile (
5565 "sub %1,%0 \n"
5566 "shr %3 \n"
5567 "cmp $0x0,%3 \n"
5568 "je 100f \n"
5569 "cmp $0x20,%3 \n"
5570 "je 75f \n"
5571 "cmp $0x40,%3 \n"
5572 "je 50f \n"
5573 "cmp $0x60,%3 \n"
5574 "je 25f \n"
5575
5576 "movd %3,%%xmm0 \n"
5577 "neg %3 \n"
5578 "add $0x80,%3 \n"
5579 "movd %3,%%xmm5 \n"
5580 "punpcklbw %%xmm0,%%xmm5 \n"
5581 "punpcklwd %%xmm5,%%xmm5 \n"
5582 "pshufd $0x0,%%xmm5,%%xmm5 \n"
5583
5584 // General purpose row blend.
5585 LABELALIGN
5586 "1: \n"
5587 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
5588 MEMOPREG(movdqu,0x00,1,4,1,xmm2)
5589 "movdqu %%xmm0,%%xmm1 \n"
5590 "punpcklbw %%xmm2,%%xmm0 \n"
5591 "punpckhbw %%xmm2,%%xmm1 \n"
5592 "pmaddubsw %%xmm5,%%xmm0 \n"
5593 "pmaddubsw %%xmm5,%%xmm1 \n"
5594 "psrlw $0x7,%%xmm0 \n"
5595 "psrlw $0x7,%%xmm1 \n"
5596 "packuswb %%xmm1,%%xmm0 \n"
5597 "sub $0x10,%2 \n"
5598 BUNDLEALIGN
5599 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
5600 "lea " MEMLEA(0x10,1) ",%1 \n"
5601 "jg 1b \n"
5602 "jmp 99f \n"
5603
5604 // Blend 25 / 75.
5605 LABELALIGN
5606 "25: \n"
5607 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
5608 MEMOPREG(movdqu,0x00,1,4,1,xmm1)
5609 "pavgb %%xmm1,%%xmm0 \n"
5610 "pavgb %%xmm1,%%xmm0 \n"
5611 "sub $0x10,%2 \n"
5612 BUNDLEALIGN
5613 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
5614 "lea " MEMLEA(0x10,1) ",%1 \n"
5615 "jg 25b \n"
5616 "jmp 99f \n"
5617
5618 // Blend 50 / 50.
5619 LABELALIGN
5620 "50: \n"
5621 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
5622 MEMOPREG(movdqu,0x00,1,4,1,xmm1)
5623 "pavgb %%xmm1,%%xmm0 \n"
5624 "sub $0x10,%2 \n"
5625 BUNDLEALIGN
5626 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
5627 "lea " MEMLEA(0x10,1) ",%1 \n"
5628 "jg 50b \n"
5629 "jmp 99f \n"
5630
5631 // Blend 75 / 25.
5632 LABELALIGN
5633 "75: \n"
5634 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
5635 MEMOPREG(movdqu,0x00,1,4,1,xmm0)
5636 "pavgb %%xmm1,%%xmm0 \n"
5637 "pavgb %%xmm1,%%xmm0 \n"
5638 "sub $0x10,%2 \n"
5639 BUNDLEALIGN
5640 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
5641 "lea " MEMLEA(0x10,1) ",%1 \n"
5642 "jg 75b \n"
5643 "jmp 99f \n"
5644
5645 // Blend 100 / 0 - Copy row unchanged.
5646 LABELALIGN
5647 "100: \n"
5648 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
5649 "sub $0x10,%2 \n"
5650 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
5651 "lea " MEMLEA(0x10,1) ",%1 \n"
5652 "jg 100b \n"
5653
5654 "99: \n"
5655 : "+r"(dst_ptr), // %0
5656 "+r"(src_ptr), // %1
5657 "+r"(dst_width), // %2
5658 "+r"(source_y_fraction) // %3
5659 : "r"((intptr_t)(src_stride)) // %4
5660 : "memory", "cc"
5661 #if defined(__native_client__) && defined(__x86_64__)
5662 , "r14"
5663 #endif
5664 #if defined(__SSE2__)
5665 , "xmm0", "xmm1", "xmm2", "xmm5"
5666 #endif
5667 );
5668 }
5669 #endif // HAS_INTERPOLATEROW_SSSE3
5670
5671 #ifdef HAS_INTERPOLATEROW_SSE2
5672 // Bilinear filter 16x2 -> 16x1
5673 void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
5674 ptrdiff_t src_stride, int dst_width,
5675 int source_y_fraction) {
5676 asm volatile (
5677 "sub %1,%0 \n"
5678 "shr %3 \n"
5679 "cmp $0x0,%3 \n"
5680 "je 100f \n"
5681 "cmp $0x20,%3 \n"
5682 "je 75f \n"
5683 "cmp $0x40,%3 \n"
5684 "je 50f \n"
5685 "cmp $0x60,%3 \n"
5686 "je 25f \n"
5687
5688 "movd %3,%%xmm0 \n"
5689 "neg %3 \n"
5690 "add $0x80,%3 \n"
5691 "movd %3,%%xmm5 \n"
5692 "punpcklbw %%xmm0,%%xmm5 \n"
5693 "punpcklwd %%xmm5,%%xmm5 \n"
5694 "pshufd $0x0,%%xmm5,%%xmm5 \n"
5695 "pxor %%xmm4,%%xmm4 \n"
5696
5697 // General purpose row blend.
5698 LABELALIGN
5699 "1: \n"
5700 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
5701 MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
5702 "movdqu %%xmm0,%%xmm1 \n"
5703 "movdqu %%xmm2,%%xmm3 \n"
5704 "punpcklbw %%xmm4,%%xmm2 \n"
5705 "punpckhbw %%xmm4,%%xmm3 \n"
5706 "punpcklbw %%xmm4,%%xmm0 \n"
5707 "punpckhbw %%xmm4,%%xmm1 \n"
5708 "psubw %%xmm0,%%xmm2 \n"
5709 "psubw %%xmm1,%%xmm3 \n"
5710 "paddw %%xmm2,%%xmm2 \n"
5711 "paddw %%xmm3,%%xmm3 \n"
5712 "pmulhw %%xmm5,%%xmm2 \n"
5713 "pmulhw %%xmm5,%%xmm3 \n"
5714 "paddw %%xmm2,%%xmm0 \n"
5715 "paddw %%xmm3,%%xmm1 \n"
5716 "packuswb %%xmm1,%%xmm0 \n"
5717 "sub $0x10,%2 \n"
5718 BUNDLEALIGN
5719 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
5720 "lea " MEMLEA(0x10,1) ",%1 \n"
5721 "jg 1b \n"
5722 "jmp 99f \n"
5723
5724 // Blend 25 / 75.
5725 LABELALIGN
5726 "25: \n"
5727 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
5728 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
5729 "pavgb %%xmm1,%%xmm0 \n"
5730 "pavgb %%xmm1,%%xmm0 \n"
5731 "sub $0x10,%2 \n"
5732 BUNDLEALIGN
5733 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
5734 "lea " MEMLEA(0x10,1) ",%1 \n"
5735 "jg 25b \n"
5736 "jmp 99f \n"
5737
5738 // Blend 50 / 50.
5739 LABELALIGN
5740 "50: \n"
5741 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
5742 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
5743 "pavgb %%xmm1,%%xmm0 \n"
5744 "sub $0x10,%2 \n"
5745 BUNDLEALIGN
5746 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
5747 "lea " MEMLEA(0x10,1) ",%1 \n"
5748 "jg 50b \n"
5749 "jmp 99f \n"
5750
5751 // Blend 75 / 25.
5752 LABELALIGN
5753 "75: \n"
5754 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
5755 MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
5756 "pavgb %%xmm1,%%xmm0 \n"
5757 "pavgb %%xmm1,%%xmm0 \n"
5758 "sub $0x10,%2 \n"
5759 BUNDLEALIGN
5760 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
5761 "lea " MEMLEA(0x10,1) ",%1 \n"
5762 "jg 75b \n"
5763 "jmp 99f \n"
5764
5765 // Blend 100 / 0 - Copy row unchanged.
5766 LABELALIGN
5767 "100: \n"
5768 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
5769 "sub $0x10,%2 \n"
5770 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
5771 "lea " MEMLEA(0x10,1) ",%1 \n"
5772 "jg 100b \n"
5773
5774 "99: \n"
5775 : "+r"(dst_ptr), // %0
5776 "+r"(src_ptr), // %1
5777 "+r"(dst_width), // %2
5778 "+r"(source_y_fraction) // %3
5779 : "r"((intptr_t)(src_stride)) // %4
5780 : "memory", "cc"
5781 #if defined(__native_client__) && defined(__x86_64__)
5782 , "r14"
5783 #endif
5784 #if defined(__SSE2__)
5785 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
5786 #endif
5787 );
5788 }
5789 #endif // HAS_INTERPOLATEROW_SSE2
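
// A scalar sketch, not part of this patch, of the general fixed-point blend
// above (function name illustrative). The asm halves source_y_fraction to f,
// builds the word ((f << 8) | (128 - f)) in xmm5, and uses pmulhw so that
// row0 + (((row1 - row0) * 2 * word) >> 16) ~= row0 + diff * fraction / 256.
// Rounding differs slightly from this exact-integer version.
static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  ptrdiff_t src_stride, int dst_width,
                                  int source_y_fraction) {
  int y1_fraction = source_y_fraction;   // weight of the second row
  int y0_fraction = 256 - y1_fraction;   // weight of the first row
  const uint8* src_ptr1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] =
        (uint8)((src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction) >> 8);
  }
}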
5790
5791 #ifdef HAS_HALFROW_SSE2
5792 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
5793 uint8* dst_uv, int pix) {
5794 asm volatile (
5795 "sub %0,%1 \n"
5796 LABELALIGN
5797 "1: \n"
5798 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
5799 MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0
5800 "sub $0x10,%2 \n"
5801 MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1)
5802 "lea " MEMLEA(0x10,0) ",%0 \n"
5803 "jg 1b \n"
5804 : "+r"(src_uv), // %0
5805 "+r"(dst_uv), // %1
5806 "+r"(pix) // %2
5807 : "r"((intptr_t)(src_uv_stride)) // %3
5808 : "memory", "cc"
5809 #if defined(__SSE2__)
5810 , "xmm0"
5811 #endif
5812 );
5813 }
5814 #endif // HAS_HALFROW_SSE2
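
// A scalar sketch, not part of this patch, of HalfRow above (function name
// illustrative): two UV rows are averaged into one, which is how chroma is
// downsampled vertically. pavgb rounds up, hence the +1.
static void HalfRow_Sketch(const uint8* src_uv, int src_uv_stride,
                           uint8* dst_uv, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_uv[x] = (uint8)((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1);
  }
}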
5815
5816 #ifdef HAS_ARGBTOBAYERROW_SSSE3
5817 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
5818 uint32 selector, int pix) {
5819 asm volatile (
5820 // NaCl caveat - assumes the movd source is a GPR
5821 "movd %3,%%xmm5 \n"
5822 "pshufd $0x0,%%xmm5,%%xmm5 \n"
5823 LABELALIGN
5824 "1: \n"
5825 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
5826 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
5827 "lea " MEMLEA(0x20,0) ",%0 \n"
5828 "pshufb %%xmm5,%%xmm0 \n"
5829 "pshufb %%xmm5,%%xmm1 \n"
5830 "punpckldq %%xmm1,%%xmm0 \n"
5831 "sub $0x8,%2 \n"
5832 "movq %%xmm0," MEMACCESS(1) " \n"
5833 "lea " MEMLEA(0x8,1) ",%1 \n"
5834 "jg 1b \n"
5835 : "+r"(src_argb), // %0
5836 "+r"(dst_bayer), // %1
5837 "+r"(pix) // %2
5838 : "g"(selector) // %3
5839 : "memory", "cc"
5840 #if defined(__SSE2__)
5841 , "xmm0", "xmm1", "xmm5"
5842 #endif
5843 );
5844 }
5845 #endif // HAS_ARGBTOBAYERROW_SSSE3
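
// A scalar sketch, not part of this patch, of the selector above (function
// name illustrative). The four bytes of 'selector' are byte indices into
// each 16-byte group of 4 ARGB pixels; broadcasting them with movd/pshufd
// turns the value into a pshufb mask that keeps one byte per pixel. Assumes
// each selector byte is < 16, since pshufb only uses the low 4 bits.
static void ARGBToBayerRow_Sketch(const uint8* src_argb, uint8* dst_bayer,
                                  uint32 selector, int pix) {
  int i, j;
  for (i = 0; i < pix; i += 4) {  // 4 pixels = 16 source bytes per step
    for (j = 0; j < 4; ++j) {
      uint32 index = (selector >> (8 * j)) & 0x0f;
      dst_bayer[i + j] = src_argb[i * 4 + index];
    }
  }
}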
5846
5847 #ifdef HAS_ARGBTOBAYERGGROW_SSE2
5848 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
5849 uint32 selector, int pix) {
5850 asm volatile (
5851 "pcmpeqb %%xmm5,%%xmm5 \n"
5852 "psrld $0x18,%%xmm5 \n"
5853 LABELALIGN
5854 "1: \n"
5855 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
5856 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
5857 "lea " MEMLEA(0x20,0) ",%0 \n"
5858 "psrld $0x8,%%xmm0 \n"
5859 "psrld $0x8,%%xmm1 \n"
5860 "pand %%xmm5,%%xmm0 \n"
5861 "pand %%xmm5,%%xmm1 \n"
5862 "packssdw %%xmm1,%%xmm0 \n"
5863 "packuswb %%xmm1,%%xmm0 \n"
5864 "sub $0x8,%2 \n"
5865 "movq %%xmm0," MEMACCESS(1) " \n"
5866 "lea " MEMLEA(0x8,1) ",%1 \n"
5867 "jg 1b \n"
5868 : "+r"(src_argb), // %0
5869 "+r"(dst_bayer), // %1
5870 "+r"(pix) // %2
5871 :
5872 : "memory", "cc"
5873 #if defined(__SSE2__)
5874 , "xmm0", "xmm1", "xmm5"
5875 #endif
5876 );
5877 }
5878 #endif // HAS_ARGBTOBAYERGGROW_SSE2
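
// A scalar sketch, not part of this patch, of the GG variant above (function
// name illustrative). It ignores 'selector': psrld $8 shifts G into the low
// byte of each 32-bit pixel and the pand mask keeps it, so the row reduces
// to byte 1 of every ARGB pixel (B,G,R,A in memory).
static void ARGBToBayerGGRow_Sketch(const uint8* src_argb, uint8* dst_bayer,
                                    int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_bayer[x] = src_argb[x * 4 + 1];  // the G channel
  }
}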
5879
5880 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
5881 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5882 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5883 const uint8* shuffler, int pix) {
5884 asm volatile (
5885 "movdqa " MEMACCESS(3) ",%%xmm5 \n"
5886 LABELALIGN
5887 "1: \n"
5888 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
5889 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
5890 "lea " MEMLEA(0x20,0) ",%0 \n"
5891 "pshufb %%xmm5,%%xmm0 \n"
5892 "pshufb %%xmm5,%%xmm1 \n"
5893 "sub $0x8,%2 \n"
5894 "movdqa %%xmm0," MEMACCESS(1) " \n"
5895 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
5896 "lea " MEMLEA(0x20,1) ",%1 \n"
5897 "jg 1b \n"
5898 : "+r"(src_argb), // %0
5899 "+r"(dst_argb), // %1
5900 "+r"(pix) // %2
5901 : "r"(shuffler) // %3
5902 : "memory", "cc"
5903 #if defined(__SSE2__)
5904 , "xmm0", "xmm1", "xmm5"
5905 #endif
5906 );
5907 }
5908
5909 void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
5910 const uint8* shuffler, int pix) {
5911 asm volatile (
5912 "movdqa " MEMACCESS(3) ",%%xmm5 \n"
5913 LABELALIGN
5914 "1: \n"
5915 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5916 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
5917 "lea " MEMLEA(0x20,0) ",%0 \n"
5918 "pshufb %%xmm5,%%xmm0 \n"
5919 "pshufb %%xmm5,%%xmm1 \n"
5920 "sub $0x8,%2 \n"
5921 "movdqu %%xmm0," MEMACCESS(1) " \n"
5922 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
5923 "lea " MEMLEA(0x20,1) ",%1 \n"
5924 "jg 1b \n"
5925 : "+r"(src_argb), // %0
5926 "+r"(dst_argb), // %1
5927 "+r"(pix) // %2
5928 : "r"(shuffler) // %3
5929 : "memory", "cc"
5930 #if defined(__SSE2__)
5931 , "xmm0", "xmm1", "xmm5"
5932 #endif
5933 );
5934 }
5935 #endif // HAS_ARGBSHUFFLEROW_SSSE3
5936
5937 #ifdef HAS_ARGBSHUFFLEROW_AVX2
5938 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5939 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
5940 const uint8* shuffler, int pix) {
5941 asm volatile (
5942 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
5943 LABELALIGN
5944 "1: \n"
5945 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
5946 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
5947 "lea " MEMLEA(0x40,0) ",%0 \n"
5948 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
5949 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
5950 "sub $0x10,%2 \n"
5951 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
5952 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
5953 "lea " MEMLEA(0x40,1) ",%1 \n"
5954 "jg 1b \n"
5955 : "+r"(src_argb), // %0
5956 "+r"(dst_argb), // %1
5957 "+r"(pix) // %2
5958 : "r"(shuffler) // %3
5959 : "memory", "cc"
5960 #if defined(__SSE2__)
5961 , "xmm0", "xmm1", "xmm5"
5962 #endif
5963 );
5964 }
5965 #endif // HAS_ARGBSHUFFLEROW_AVX2
5966
5967 #ifdef HAS_ARGBSHUFFLEROW_SSE2
5968 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5969 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5970 const uint8* shuffler, int pix) {
5971 uintptr_t pixel_temp = 0u;
5972 asm volatile (
5973 "pxor %%xmm5,%%xmm5 \n"
5974 "mov " MEMACCESS(4) ",%k2 \n"
5975 "cmp $0x3000102,%k2 \n"
5976 "je 3012f \n"
5977 "cmp $0x10203,%k2 \n"
5978 "je 123f \n"
5979 "cmp $0x30201,%k2 \n"
5980 "je 321f \n"
5981 "cmp $0x2010003,%k2 \n"
5982 "je 2103f \n"
5983
5984 LABELALIGN
5985 "1: \n"
5986 "movzb " MEMACCESS(4) ",%2 \n"
5987 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5988 "mov %b2," MEMACCESS(1) " \n"
5989 "movzb " MEMACCESS2(0x1,4) ",%2 \n"
5990 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5991 "mov %b2," MEMACCESS2(0x1,1) " \n"
5992 BUNDLEALIGN
5993 "movzb " MEMACCESS2(0x2,4) ",%2 \n"
5994 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5995 "mov %b2," MEMACCESS2(0x2,1) " \n"
5996 "movzb " MEMACCESS2(0x3,4) ",%2 \n"
5997 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5998 "mov %b2," MEMACCESS2(0x3,1) " \n"
5999 "lea " MEMLEA(0x4,0) ",%0 \n"
6000 "lea " MEMLEA(0x4,1) ",%1 \n"
6001 "sub $0x1,%3 \n"
6002 "jg 1b \n"
6003 "jmp 99f \n"
6004
6005 LABELALIGN
6006 "123: \n"
6007 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
6008 "lea " MEMLEA(0x10,0) ",%0 \n"
6009 "movdqa %%xmm0,%%xmm1 \n"
6010 "punpcklbw %%xmm5,%%xmm0 \n"
6011 "punpckhbw %%xmm5,%%xmm1 \n"
6012 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
6013 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
6014 "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
6015 "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
6016 "packuswb %%xmm1,%%xmm0 \n"
6017 "sub $0x4,%3 \n"
6018 "movdqu %%xmm0," MEMACCESS(1) " \n"
6019 "lea " MEMLEA(0x10,1) ",%1 \n"
6020 "jg 123b \n"
6021 "jmp 99f \n"
6022
6023 LABELALIGN
6024 "321: \n"
6025 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
6026 "lea " MEMLEA(0x10,0) ",%0 \n"
6027 "movdqa %%xmm0,%%xmm1 \n"
6028 "punpcklbw %%xmm5,%%xmm0 \n"
6029 "punpckhbw %%xmm5,%%xmm1 \n"
6030 "pshufhw $0x39,%%xmm0,%%xmm0 \n"
6031 "pshuflw $0x39,%%xmm0,%%xmm0 \n"
6032 "pshufhw $0x39,%%xmm1,%%xmm1 \n"
6033 "pshuflw $0x39,%%xmm1,%%xmm1 \n"
6034 "packuswb %%xmm1,%%xmm0 \n"
6035 "sub $0x4,%3 \n"
6036 "movdqu %%xmm0," MEMACCESS(1) " \n"
6037 "lea " MEMLEA(0x10,1) ",%1 \n"
6038 "jg 321b \n"
6039 "jmp 99f \n"
6040
6041 LABELALIGN
6042 "2103: \n"
6043 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
6044 "lea " MEMLEA(0x10,0) ",%0 \n"
6045 "movdqa %%xmm0,%%xmm1 \n"
6046 "punpcklbw %%xmm5,%%xmm0 \n"
6047 "punpckhbw %%xmm5,%%xmm1 \n"
6048 "pshufhw $0x93,%%xmm0,%%xmm0 \n"
6049 "pshuflw $0x93,%%xmm0,%%xmm0 \n"
6050 "pshufhw $0x93,%%xmm1,%%xmm1 \n"
6051 "pshuflw $0x93,%%xmm1,%%xmm1 \n"
6052 "packuswb %%xmm1,%%xmm0 \n"
6053 "sub $0x4,%3 \n"
6054 "movdqu %%xmm0," MEMACCESS(1) " \n"
6055 "lea " MEMLEA(0x10,1) ",%1 \n"
6056 "jg 2103b \n"
6057 "jmp 99f \n"
6058
6059 LABELALIGN
6060 "3012: \n"
6061 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
6062 "lea " MEMLEA(0x10,0) ",%0 \n"
6063 "movdqa %%xmm0,%%xmm1 \n"
6064 "punpcklbw %%xmm5,%%xmm0 \n"
6065 "punpckhbw %%xmm5,%%xmm1 \n"
6066 "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
6067 "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
6068 "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
6069 "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
6070 "packuswb %%xmm1,%%xmm0 \n"
6071 "sub $0x4,%3 \n"
6072 "movdqu %%xmm0," MEMACCESS(1) " \n"
6073 "lea " MEMLEA(0x10,1) ",%1 \n"
6074 "jg 3012b \n"
6075
6076 "99: \n"
6077 : "+r"(src_argb), // %0
6078 "+r"(dst_argb), // %1
6079 "+d"(pixel_temp), // %2
6080 "+r"(pix) // %3
6081 : "r"(shuffler) // %4
6082 : "memory", "cc"
6083 #if defined(__native_client__) && defined(__x86_64__)
6084 , "r14"
6085 #endif
6086 #if defined(__SSE2__)
6087 , "xmm0", "xmm1", "xmm5"
6088 #endif
6089 );
6090 }
6091 #endif // HAS_ARGBSHUFFLEROW_SSE2
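
// A scalar sketch, not part of this patch, of the shuffle all the variants
// above implement (function name illustrative). The first four bytes of
// 'shuffler' map destination channel j to source channel shuffler[j] within
// each pixel; the SSE2 version merely special-cases four common masks with
// pshuflw/pshufhw and falls back to a byte loop equivalent to this one.
static void ARGBShuffleRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  const uint8* shuffler, int pix) {
  int x, j;
  for (x = 0; x < pix; ++x) {
    for (j = 0; j < 4; ++j) {
      dst_argb[x * 4 + j] = src_argb[x * 4 + shuffler[j]];
    }
  }
}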
6092
6093 #ifdef HAS_I422TOYUY2ROW_SSE2
6094 void I422ToYUY2Row_SSE2(const uint8* src_y,
6095 const uint8* src_u,
6096 const uint8* src_v,
6097 uint8* dst_frame, int width) {
6098 asm volatile (
6099 "sub %1,%2 \n"
6100 LABELALIGN
6101 "1: \n"
6102 "movq " MEMACCESS(1) ",%%xmm2 \n"
6103 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
6104 "lea " MEMLEA(0x8,1) ",%1 \n"
6105 "punpcklbw %%xmm3,%%xmm2 \n"
6106 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
6107 "lea " MEMLEA(0x10,0) ",%0 \n"
6108 "movdqa %%xmm0,%%xmm1 \n"
6109 "punpcklbw %%xmm2,%%xmm0 \n"
6110 "punpckhbw %%xmm2,%%xmm1 \n"
6111 "movdqu %%xmm0," MEMACCESS(3) " \n"
6112 "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
6113 "lea " MEMLEA(0x20,3) ",%3 \n"
6114 "sub $0x10,%4 \n"
6115 "jg 1b \n"
6116 : "+r"(src_y), // %0
6117 "+r"(src_u), // %1
6118 "+r"(src_v), // %2
6119 "+r"(dst_frame), // %3
6120 "+rm"(width) // %4
6121 :
6122 : "memory", "cc"
6123 #if defined(__native_client__) && defined(__x86_64__)
6124 , "r14"
6125 #endif
6126 #if defined(__SSE2__)
6127 , "xmm0", "xmm1", "xmm2", "xmm3"
6128 #endif
6129 );
6130 }
6131 #endif // HAS_I422TOYUY2ROW_SSE2
6132
6133 #ifdef HAS_I422TOUYVYROW_SSE2
6134 void I422ToUYVYRow_SSE2(const uint8* src_y,
6135 const uint8* src_u,
6136 const uint8* src_v,
6137 uint8* dst_frame, int width) {
6138 asm volatile (
6139 "sub %1,%2 \n"
6140 LABELALIGN
6141 "1: \n"
6142 "movq " MEMACCESS(1) ",%%xmm2 \n"
6143 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
6144 "lea " MEMLEA(0x8,1) ",%1 \n"
6145 "punpcklbw %%xmm3,%%xmm2 \n"
6146 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
6147 "movdqa %%xmm2,%%xmm1 \n"
6148 "lea " MEMLEA(0x10,0) ",%0 \n"
6149 "punpcklbw %%xmm0,%%xmm1 \n"
6150 "punpckhbw %%xmm0,%%xmm2 \n"
6151 "movdqu %%xmm1," MEMACCESS(3) " \n"
6152 "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
6153 "lea " MEMLEA(0x20,3) ",%3 \n"
6154 "sub $0x10,%4 \n"
6155 "jg 1b \n"
6156 : "+r"(src_y), // %0
6157 "+r"(src_u), // %1
6158 "+r"(src_v), // %2
6159 "+r"(dst_frame), // %3
6160 "+rm"(width) // %4
6161 :
6162 : "memory", "cc"
6163 #if defined(__native_client__) && defined(__x86_64__)
6164 , "r14"
6165 #endif
6166 #if defined(__SSE2__)
6167 , "xmm0", "xmm1", "xmm2", "xmm3"
6168 #endif
6169 );
6170 }
6171 #endif // HAS_I422TOUYVYROW_SSE2
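
// A scalar sketch, not part of this patch, of the two packings above
// (function name illustrative; odd widths ignored). YUY2 interleaves as
// Y0 U Y1 V and UYVY as U Y0 V Y1; each U/V pair is shared by two Y samples
// (4:2:2), which is why the asm interleaves U with V first and then with Y.
static void I422ToYUY2Row_Sketch(const uint8* src_y, const uint8* src_u,
                                 const uint8* src_v, uint8* dst_frame,
                                 int width) {
  int x;
  for (x = 0; x + 1 < width; x += 2) {
    dst_frame[0] = src_y[0];  // for UYVY the order is: U Y0 V Y1
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}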
6172
6173 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
6174 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
6175 uint8* dst_argb, const float* poly,
6176 int width) {
6177 asm volatile (
6178 "pxor %%xmm3,%%xmm3 \n"
6179
6180 // 2 pixel loop.
6181 LABELALIGN
6182 "1: \n"
6183 "movq " MEMACCESS(0) ",%%xmm0 \n"
6184 "lea " MEMLEA(0x8,0) ",%0 \n"
6185 "punpcklbw %%xmm3,%%xmm0 \n"
6186 "movdqa %%xmm0,%%xmm4 \n"
6187 "punpcklwd %%xmm3,%%xmm0 \n"
6188 "punpckhwd %%xmm3,%%xmm4 \n"
6189 "cvtdq2ps %%xmm0,%%xmm0 \n"
6190 "cvtdq2ps %%xmm4,%%xmm4 \n"
6191 "movdqa %%xmm0,%%xmm1 \n"
6192 "movdqa %%xmm4,%%xmm5 \n"
6193 "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
6194 "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
6195 "addps " MEMACCESS(3) ",%%xmm0 \n"
6196 "addps " MEMACCESS(3) ",%%xmm4 \n"
6197 "movdqa %%xmm1,%%xmm2 \n"
6198 "movdqa %%xmm5,%%xmm6 \n"
6199 "mulps %%xmm1,%%xmm2 \n"
6200 "mulps %%xmm5,%%xmm6 \n"
6201 "mulps %%xmm2,%%xmm1 \n"
6202 "mulps %%xmm6,%%xmm5 \n"
6203 "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
6204 "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
6205 "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
6206 "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
6207 "addps %%xmm2,%%xmm0 \n"
6208 "addps %%xmm6,%%xmm4 \n"
6209 "addps %%xmm1,%%xmm0 \n"
6210 "addps %%xmm5,%%xmm4 \n"
6211 "cvttps2dq %%xmm0,%%xmm0 \n"
6212 "cvttps2dq %%xmm4,%%xmm4 \n"
6213 "packuswb %%xmm4,%%xmm0 \n"
6214 "packuswb %%xmm0,%%xmm0 \n"
6215 "sub $0x2,%2 \n"
6216 "movq %%xmm0," MEMACCESS(1) " \n"
6217 "lea " MEMLEA(0x8,1) ",%1 \n"
6218 "jg 1b \n"
6219 : "+r"(src_argb), // %0
6220 "+r"(dst_argb), // %1
6221 "+r"(width) // %2
6222 : "r"(poly) // %3
6223 : "memory", "cc"
6224 #if defined(__SSE2__)
6225 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
6226 #endif
6227 );
6228 }
6229 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
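
// A scalar sketch, not part of this patch, of the cubic evaluated above
// (function name illustrative). 'poly' is laid out as the asm reads it:
// four 4-float vectors C0..C3 at offsets 0x00/0x10/0x20/0x30, one
// coefficient per channel, so each channel value v becomes
// C0 + C1*v + C2*v^2 + C3*v^3, truncated and saturated to 8 bits.
static void ARGBPolynomialRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                     const float* poly, int width) {
  int i, c;
  for (i = 0; i < width; ++i) {
    for (c = 0; c < 4; ++c) {
      float x = (float)src_argb[i * 4 + c];
      float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      if (v < 0.f) v = 0.f;      // packuswb saturation, approximately
      if (v > 255.f) v = 255.f;
      dst_argb[i * 4 + c] = (uint8)v;
    }
  }
}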
6230
6231 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
6232 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
6233 uint8* dst_argb, const float* poly,
6234 int width) {
6235 asm volatile (
6236 "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
6237 "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
6238 "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
6239 "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
6240
6241 // 2 pixel loop.
6242 LABELALIGN
6243 "1: \n"
6244 "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
6245 "lea " MEMLEA(0x8,0) ",%0 \n"
6246 "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
6247 "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
6248 "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
6249 "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
6250 "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
6251 "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
6252 "vcvttps2dq %%ymm0,%%ymm0 \n"
6253 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
6254 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
6255 "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
6256 "sub $0x2,%2 \n"
6257 "vmovq %%xmm0," MEMACCESS(1) " \n"
6258 "lea " MEMLEA(0x8,1) ",%1 \n"
6259 "jg 1b \n"
6260 "vzeroupper \n"
6261 : "+r"(src_argb), // %0
6262 "+r"(dst_argb), // %1
6263 "+r"(width) // %2
6264 : "r"(poly) // %3
6265 : "memory", "cc"
6266 #if defined(__SSE2__)
6267 // TODO(fbarchard): declare ymm usage when applicable.
6268 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
6269 #endif
6270 );
6271 }
6272 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
6273
6274 #ifdef HAS_ARGBCOLORTABLEROW_X86
6275 // Transform ARGB pixels with color table.
6276 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
6277 int width) {
6278 uintptr_t pixel_temp = 0u;
6279 asm volatile (
6280 // 1 pixel loop.
6281 LABELALIGN
6282 "1: \n"
6283 "movzb " MEMACCESS(0) ",%1 \n"
6284 "lea " MEMLEA(0x4,0) ",%0 \n"
6285 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
6286 "mov %b1," MEMACCESS2(-0x4,0) " \n"
6287 "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
6288 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
6289 "mov %b1," MEMACCESS2(-0x3,0) " \n"
6290 "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
6291 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
6292 "mov %b1," MEMACCESS2(-0x2,0) " \n"
6293 "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
6294 MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
6295 "mov %b1," MEMACCESS2(-0x1,0) " \n"
6296 "dec %2 \n"
6297 "jg 1b \n"
6298 : "+r"(dst_argb), // %0
6299 "+d"(pixel_temp), // %1
6300 "+r"(width) // %2
6301 : "r"(table_argb) // %3
6302 : "memory", "cc");
6303 }
6304 #endif // HAS_ARGBCOLORTABLEROW_X86
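
// A scalar sketch, not part of this patch, of the in-place lookup above
// (function name illustrative). table_argb is assumed to hold four
// interleaved 256-entry channel tables, so channel c of a pixel with value v
// maps to table_argb[v * 4 + c]. The RGB variant that follows applies the
// same lookup to B, G and R only and leaves alpha untouched.
static void ARGBColorTableRow_Sketch(uint8* dst_argb, const uint8* table_argb,
                                     int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      uint8 v = dst_argb[x * 4 + c];
      dst_argb[x * 4 + c] = table_argb[v * 4 + c];
    }
  }
}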
6305
6306 #ifdef HAS_RGBCOLORTABLEROW_X86
6307 // Transform RGB pixels with color table.
6308 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
6309 uintptr_t pixel_temp = 0u;
6310 asm volatile (
6311 // 1 pixel loop.
6312 LABELALIGN
6313 "1: \n"
6314 "movzb " MEMACCESS(0) ",%1 \n"
6315 "lea " MEMLEA(0x4,0) ",%0 \n"
6316 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
6317 "mov %b1," MEMACCESS2(-0x4,0) " \n"
6318 "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
6319 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
6320 "mov %b1," MEMACCESS2(-0x3,0) " \n"
6321 "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
6322 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
6323 "mov %b1," MEMACCESS2(-0x2,0) " \n"
6324 "dec %2 \n"
6325 "jg 1b \n"
6326 : "+r"(dst_argb), // %0
6327 "+d"(pixel_temp), // %1
6328 "+r"(width) // %2
6329 : "r"(table_argb) // %3
6330 : "memory", "cc");
6331 }
6332 #endif // HAS_RGBCOLORTABLEROW_X86
6333
6334 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6335 // Transform RGB pixels with luma table.
6336 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
6337 int width,
6338 const uint8* luma, uint32 lumacoeff) {
6339 uintptr_t pixel_temp = 0u;
6340 uintptr_t table_temp = 0u;
6341 asm volatile (
6342 "movd %6,%%xmm3 \n"
6343 "pshufd $0x0,%%xmm3,%%xmm3 \n"
6344 "pcmpeqb %%xmm4,%%xmm4 \n"
6345 "psllw $0x8,%%xmm4 \n"
6346 "pxor %%xmm5,%%xmm5 \n"
6347
6348 // 4 pixel loop.
6349 LABELALIGN
6350 "1: \n"
6351 "movdqu " MEMACCESS(2) ",%%xmm0 \n"
6352 "pmaddubsw %%xmm3,%%xmm0 \n"
6353 "phaddw %%xmm0,%%xmm0 \n"
6354 "pand %%xmm4,%%xmm0 \n"
6355 "punpcklwd %%xmm5,%%xmm0 \n"
6356 "movd %%xmm0,%k1 \n" // 32 bit offset
6357 "add %5,%1 \n"
6358 "pshufd $0x39,%%xmm0,%%xmm0 \n"
6359
6360 "movzb " MEMACCESS(2) ",%0 \n"
6361 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
6362 "mov %b0," MEMACCESS(3) " \n"
6363 "movzb " MEMACCESS2(0x1,2) ",%0 \n"
6364 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
6365 "mov %b0," MEMACCESS2(0x1,3) " \n"
6366 "movzb " MEMACCESS2(0x2,2) ",%0 \n"
6367 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
6368 "mov %b0," MEMACCESS2(0x2,3) " \n"
6369 "movzb " MEMACCESS2(0x3,2) ",%0 \n"
6370 "mov %b0," MEMACCESS2(0x3,3) " \n"
6371
6372 "movd %%xmm0,%k1 \n" // 32 bit offset
6373 "add %5,%1 \n"
6374 "pshufd $0x39,%%xmm0,%%xmm0 \n"
6375
6376 "movzb " MEMACCESS2(0x4,2) ",%0 \n"
6377 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
6378 "mov %b0," MEMACCESS2(0x4,3) " \n"
6379 BUNDLEALIGN
6380 "movzb " MEMACCESS2(0x5,2) ",%0 \n"
6381 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
6382 "mov %b0," MEMACCESS2(0x5,3) " \n"
6383 "movzb " MEMACCESS2(0x6,2) ",%0 \n"
6384 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
6385 "mov %b0," MEMACCESS2(0x6,3) " \n"
6386 "movzb " MEMACCESS2(0x7,2) ",%0 \n"
6387 "mov %b0," MEMACCESS2(0x7,3) " \n"
6388
6389 "movd %%xmm0,%k1 \n" // 32 bit offset
6390 "add %5,%1 \n"
6391 "pshufd $0x39,%%xmm0,%%xmm0 \n"
6392
6393 "movzb " MEMACCESS2(0x8,2) ",%0 \n"
6394 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
6395 "mov %b0," MEMACCESS2(0x8,3) " \n"
6396 "movzb " MEMACCESS2(0x9,2) ",%0 \n"
6397 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
6398 "mov %b0," MEMACCESS2(0x9,3) " \n"
6399 "movzb " MEMACCESS2(0xa,2) ",%0 \n"
6400 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
6401 "mov %b0," MEMACCESS2(0xa,3) " \n"
6402 "movzb " MEMACCESS2(0xb,2) ",%0 \n"
6403 "mov %b0," MEMACCESS2(0xb,3) " \n"
6404
6405 "movd %%xmm0,%k1 \n" // 32 bit offset
6406 "add %5,%1 \n"
6407
6408 "movzb " MEMACCESS2(0xc,2) ",%0 \n"
6409 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
6410 "mov %b0," MEMACCESS2(0xc,3) " \n"
6411 "movzb " MEMACCESS2(0xd,2) ",%0 \n"
6412 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
6413 "mov %b0," MEMACCESS2(0xd,3) " \n"
6414 "movzb " MEMACCESS2(0xe,2) ",%0 \n"
6415 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
6416 "mov %b0," MEMACCESS2(0xe,3) " \n"
6417 "movzb " MEMACCESS2(0xf,2) ",%0 \n"
6418 "mov %b0," MEMACCESS2(0xf,3) " \n"
6419 "sub $0x4,%4 \n"
6420 "lea " MEMLEA(0x10,2) ",%2 \n"
6421 "lea " MEMLEA(0x10,3) ",%3 \n"
6422 "jg 1b \n"
6423 : "+d"(pixel_temp), // %0
6424 "+a"(table_temp), // %1
6425 "+r"(src_argb), // %2
6426 "+r"(dst_argb), // %3
6427 "+rm"(width) // %4
6428 : "r"(luma), // %5
6429 "rm"(lumacoeff) // %6
6430 : "memory", "cc"
6431 #if defined(__SSE2__)
6432 , "xmm0", "xmm3", "xmm4", "xmm5"
6433 #endif
6434 );
6435 }
6436 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
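
// A scalar sketch, not part of this patch, of the luma-dependent lookup
// above (function name illustrative). 'luma' is assumed to be a stack of
// 256-byte tables selected by a weighted sum of B, G and R: the asm computes
// the weight with pmaddubsw/phaddw and masks it to a multiple of 256 (the
// pand with 0xff00 words), then alpha is copied through unchanged.
static void ARGBLumaColorTableRow_Sketch(const uint8* src_argb,
                                         uint8* dst_argb, int width,
                                         const uint8* luma, uint32 lumacoeff) {
  uint32 bc = lumacoeff & 0xff;          // low byte multiplies B
  uint32 gc = (lumacoeff >> 8) & 0xff;
  uint32 rc = (lumacoeff >> 16) & 0xff;
  int x;
  for (x = 0; x < width; ++x) {
    const uint8* luma0 =
        luma + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) &
                0xff00u);
    dst_argb[0] = luma0[src_argb[0]];
    dst_argb[1] = luma0[src_argb[1]];
    dst_argb[2] = luma0[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha copied unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}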
6437
6438 #endif // defined(__x86_64__) || defined(__i386__)
6439
6440 #ifdef __cplusplus
6441 } // extern "C"
6442 } // namespace libyuv
6443 #endif