Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/libyuv/source/row_posix.cc

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
1 // VERSION 2
2 /*
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
4 *
5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree.
10 */
11
12 #include "libyuv/row.h"
13
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18
19 // This module is for GCC x86 and x64.
20 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
21
22 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
23
24 // Constants for ARGB
25 static vec8 kARGBToY = {
26 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
27 };
28
29 // JPEG full range.
30 static vec8 kARGBToYJ = {
31 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
32 };
33 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
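
For reference, the Y row functions below compute the following per pixel, shown as a scalar sketch (illustrative only, not part of the patch; helper names are hypothetical, and the uint8 type comes from libyuv's basic types). The BT601 video-range path truncates after the 7-bit shift and adds the +16 bias (kAddY16, defined below); the JPEG path instead adds 64, i.e. 0.5 in 7-bit fixed point (kAddYJ64), to round, and applies no bias.

static uint8 ARGBToY_C(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);  // kARGBToY + kAddY16
}
static uint8 ARGBToYJ_C(uint8 b, uint8 g, uint8 r) {
  return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);    // kARGBToYJ + kAddYJ64
}
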
34
35 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
36
37 static vec8 kARGBToU = {
38 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
39 };
40
41 static vec8 kARGBToUJ = {
42 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
43 };
44
45 static vec8 kARGBToV = {
46 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
47 };
48
49 static vec8 kARGBToVJ = {
50 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
51 };
52
53 // Constants for BGRA
54 static vec8 kBGRAToY = {
55 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
56 };
57
58 static vec8 kBGRAToU = {
59 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
60 };
61
62 static vec8 kBGRAToV = {
63 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
64 };
65
66 // Constants for ABGR
67 static vec8 kABGRToY = {
68 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
69 };
70
71 static vec8 kABGRToU = {
72 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
73 };
74
75 static vec8 kABGRToV = {
76 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
77 };
78
79 // Constants for RGBA.
80 static vec8 kRGBAToY = {
81 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
82 };
83
84 static vec8 kRGBAToU = {
85 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
86 };
87
88 static vec8 kRGBAToV = {
89 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
90 };
91
92 static uvec8 kAddY16 = {
93 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
94 };
95
96 // 0.5 in 7 bit fixed point (64), added to round before the shift.
97 static vec16 kAddYJ64 = {
98 64, 64, 64, 64, 64, 64, 64, 64
99 };
100
101 static uvec8 kAddUV128 = {
102 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
103 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
104 };
105
106 static uvec16 kAddUVJ128 = {
107 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
108 };
109 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
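
Likewise, a scalar sketch of the chroma math these constants drive (illustrative only): the vector code applies the 8-bit fixed-point coefficients with pmaddubsw, shifts with psraw $0x8, packs with signed saturation, and adds the 128 bias from kAddUV128.

static uint8 ARGBToU_C(uint8 b, uint8 g, uint8 r) {
  // Assumes arithmetic >> on negative ints, matching psraw.
  return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);   // kARGBToU + kAddUV128
}
static uint8 ARGBToV_C(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);  // kARGBToV + kAddUV128
}

The subsampling row functions additionally average a 2x2 block before applying these coefficients (see ARGBToUVRow_SSSE3 below); ARGBToUV444Row_SSSE3 is the unsubsampled case that matches this sketch directly.
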
110
111 #ifdef HAS_RGB24TOARGBROW_SSSE3
112
113 // Shuffle table for converting RGB24 to ARGB.
114 static uvec8 kShuffleMaskRGB24ToARGB = {
115 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
116 };
117
118 // Shuffle table for converting RAW to ARGB.
119 static uvec8 kShuffleMaskRAWToARGB = {
120 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
121 };
122
123 // Shuffle table for converting ARGB to RGB24.
124 static uvec8 kShuffleMaskARGBToRGB24 = {
125 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
126 };
127
128 // Shuffle table for converting ARGB to RAW.
129 static uvec8 kShuffleMaskARGBToRAW = {
130 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
131 };
132
133 // Shuffle table for converting ARGB to RGB24 for I422ToRGB24: first 8 bytes + next 4 bytes.
134 static uvec8 kShuffleMaskARGBToRGB24_0 = {
135 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
136 };
137
138 // Shuffle table for converting ARGB to RAW for I422ToRAW: first 8 bytes + next 4 bytes.
139 static uvec8 kShuffleMaskARGBToRAW_0 = {
140 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
141 };
142 #endif // HAS_RGB24TOARGBROW_SSSE3
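
The tables above are pshufb control vectors: each output byte selects the source byte indexed by the low 4 bits of the mask byte, and a mask byte with the high bit set (the 128u entries) produces zero, which is how the ARGB-to-RGB24/RAW masks blank the slots that palignr or the 0xff000000 alpha mask fills in later. A scalar model of the instruction (illustrative only):

// Scalar model of pshufb on one 128-bit register.
static void PShufB_C(const uint8 src[16], const uint8 mask[16], uint8 dst[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
  }
}
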
143
144 #if defined(TESTING) && defined(__x86_64__)
145 void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
146 asm volatile (
147 ".p2align 5 \n"
148 "mov %%eax,%%eax \n"
149 "mov %%ebx,%%ebx \n"
150 "mov %%ecx,%%ecx \n"
151 "mov %%edx,%%edx \n"
152 "mov %%esi,%%esi \n"
153 "mov %%edi,%%edi \n"
154 "mov %%ebp,%%ebp \n"
155 "mov %%esp,%%esp \n"
156 ".p2align 5 \n"
157 "mov %%r8d,%%r8d \n"
158 "mov %%r9d,%%r9d \n"
159 "mov %%r10d,%%r10d \n"
160 "mov %%r11d,%%r11d \n"
161 "mov %%r12d,%%r12d \n"
162 "mov %%r13d,%%r13d \n"
163 "mov %%r14d,%%r14d \n"
164 "mov %%r15d,%%r15d \n"
165 ".p2align 5 \n"
166 "lea (%%rax),%%eax \n"
167 "lea (%%rbx),%%ebx \n"
168 "lea (%%rcx),%%ecx \n"
169 "lea (%%rdx),%%edx \n"
170 "lea (%%rsi),%%esi \n"
171 "lea (%%rdi),%%edi \n"
172 "lea (%%rbp),%%ebp \n"
173 "lea (%%rsp),%%esp \n"
174 ".p2align 5 \n"
175 "lea (%%r8),%%r8d \n"
176 "lea (%%r9),%%r9d \n"
177 "lea (%%r10),%%r10d \n"
178 "lea (%%r11),%%r11d \n"
179 "lea (%%r12),%%r12d \n"
180 "lea (%%r13),%%r13d \n"
181 "lea (%%r14),%%r14d \n"
182 "lea (%%r15),%%r15d \n"
183
184 ".p2align 5 \n"
185 "lea 0x10(%%rax),%%eax \n"
186 "lea 0x10(%%rbx),%%ebx \n"
187 "lea 0x10(%%rcx),%%ecx \n"
188 "lea 0x10(%%rdx),%%edx \n"
189 "lea 0x10(%%rsi),%%esi \n"
190 "lea 0x10(%%rdi),%%edi \n"
191 "lea 0x10(%%rbp),%%ebp \n"
192 "lea 0x10(%%rsp),%%esp \n"
193 ".p2align 5 \n"
194 "lea 0x10(%%r8),%%r8d \n"
195 "lea 0x10(%%r9),%%r9d \n"
196 "lea 0x10(%%r10),%%r10d \n"
197 "lea 0x10(%%r11),%%r11d \n"
198 "lea 0x10(%%r12),%%r12d \n"
199 "lea 0x10(%%r13),%%r13d \n"
200 "lea 0x10(%%r14),%%r14d \n"
201 "lea 0x10(%%r15),%%r15d \n"
202
203 ".p2align 5 \n"
204 "add 0x10,%%eax \n"
205 "add 0x10,%%ebx \n"
206 "add 0x10,%%ecx \n"
207 "add 0x10,%%edx \n"
208 "add 0x10,%%esi \n"
209 "add 0x10,%%edi \n"
210 "add 0x10,%%ebp \n"
211 "add 0x10,%%esp \n"
212 ".p2align 5 \n"
213 "add 0x10,%%r8d \n"
214 "add 0x10,%%r9d \n"
215 "add 0x10,%%r10d \n"
216 "add 0x10,%%r11d \n"
217 "add 0x10,%%r12d \n"
218 "add 0x10,%%r13d \n"
219 "add 0x10,%%r14d \n"
220 "add 0x10,%%r15d \n"
221
222 ".p2align 2 \n"
223 "1: \n"
224 "movq " MEMACCESS(0) ",%%xmm0 \n"
225 "lea " MEMLEA(0x8,0) ",%0 \n"
226 "movdqu %%xmm0," MEMACCESS(1) " \n"
227 "lea " MEMLEA(0x20,1) ",%1 \n"
228 "sub $0x8,%2 \n"
229 "jg 1b \n"
230 : "+r"(src_y), // %0
231 "+r"(dst_argb), // %1
232 "+r"(pix) // %2
233 :
234 : "memory", "cc", "xmm0", "xmm1", "xmm5"
235 );
236 }
237 #endif // defined(TESTING) && defined(__x86_64__)
238
239 #ifdef HAS_I400TOARGBROW_SSE2
240 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
241 asm volatile (
242 "pcmpeqb %%xmm5,%%xmm5 \n"
243 "pslld $0x18,%%xmm5 \n"
244 LABELALIGN
245 "1: \n"
246 "movq " MEMACCESS(0) ",%%xmm0 \n"
247 "lea " MEMLEA(0x8,0) ",%0 \n"
248 "punpcklbw %%xmm0,%%xmm0 \n"
249 "movdqa %%xmm0,%%xmm1 \n"
250 "punpcklwd %%xmm0,%%xmm0 \n"
251 "punpckhwd %%xmm1,%%xmm1 \n"
252 "por %%xmm5,%%xmm0 \n"
253 "por %%xmm5,%%xmm1 \n"
254 "movdqu %%xmm0," MEMACCESS(1) " \n"
255 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
256 "lea " MEMLEA(0x20,1) ",%1 \n"
257 "sub $0x8,%2 \n"
258 "jg 1b \n"
259 : "+r"(src_y), // %0
260 "+r"(dst_argb), // %1
261 "+r"(pix) // %2
262 :: "memory", "cc", "xmm0", "xmm1", "xmm5"
263 );
264 }
265 #endif // HAS_I400TOARGBROW_SSE2
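
What I400ToARGBRow_SSE2 computes, as a scalar sketch (illustrative only): the punpcklbw/punpcklwd pair replicates each luma byte into B, G and R, and the 0xff000000 mask built by pcmpeqb/pslld supplies opaque alpha.

static void I400ToARGB_C(const uint8* src_y, uint8* dst_argb, int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst_argb[0] = src_y[i];  // B
    dst_argb[1] = src_y[i];  // G
    dst_argb[2] = src_y[i];  // R
    dst_argb[3] = 255u;      // A, from the pslld $0x18 mask
    dst_argb += 4;
  }
}
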
266
267 #ifdef HAS_RGB24TOARGBROW_SSSE3
268 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
269 asm volatile (
270 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
271 "pslld $0x18,%%xmm5 \n"
272 "movdqa %3,%%xmm4 \n"
273 LABELALIGN
274 "1: \n"
275 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
276 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
277 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
278 "lea " MEMLEA(0x30,0) ",%0 \n"
279 "movdqa %%xmm3,%%xmm2 \n"
280 "palignr $0x8,%%xmm1,%%xmm2 \n"
281 "pshufb %%xmm4,%%xmm2 \n"
282 "por %%xmm5,%%xmm2 \n"
283 "palignr $0xc,%%xmm0,%%xmm1 \n"
284 "pshufb %%xmm4,%%xmm0 \n"
285 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
286 "por %%xmm5,%%xmm0 \n"
287 "pshufb %%xmm4,%%xmm1 \n"
288 "movdqu %%xmm0," MEMACCESS(1) " \n"
289 "por %%xmm5,%%xmm1 \n"
290 "palignr $0x4,%%xmm3,%%xmm3 \n"
291 "pshufb %%xmm4,%%xmm3 \n"
292 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
293 "por %%xmm5,%%xmm3 \n"
294 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
295 "lea " MEMLEA(0x40,1) ",%1 \n"
296 "sub $0x10,%2 \n"
297 "jg 1b \n"
298 : "+r"(src_rgb24), // %0
299 "+r"(dst_argb), // %1
300 "+r"(pix) // %2
301 : "m"(kShuffleMaskRGB24ToARGB) // %3
302 : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
303 );
304 }
305
306 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
307 asm volatile (
308 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
309 "pslld $0x18,%%xmm5 \n"
310 "movdqa %3,%%xmm4 \n"
311 LABELALIGN
312 "1: \n"
313 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
314 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
315 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
316 "lea " MEMLEA(0x30,0) ",%0 \n"
317 "movdqa %%xmm3,%%xmm2 \n"
318 "palignr $0x8,%%xmm1,%%xmm2 \n"
319 "pshufb %%xmm4,%%xmm2 \n"
320 "por %%xmm5,%%xmm2 \n"
321 "palignr $0xc,%%xmm0,%%xmm1 \n"
322 "pshufb %%xmm4,%%xmm0 \n"
323 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
324 "por %%xmm5,%%xmm0 \n"
325 "pshufb %%xmm4,%%xmm1 \n"
326 "movdqu %%xmm0," MEMACCESS(1) " \n"
327 "por %%xmm5,%%xmm1 \n"
328 "palignr $0x4,%%xmm3,%%xmm3 \n"
329 "pshufb %%xmm4,%%xmm3 \n"
330 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
331 "por %%xmm5,%%xmm3 \n"
332 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
333 "lea " MEMLEA(0x40,1) ",%1 \n"
334 "sub $0x10,%2 \n"
335 "jg 1b \n"
336 : "+r"(src_raw), // %0
337 "+r"(dst_argb), // %1
338 "+r"(pix) // %2
339 : "m"(kShuffleMaskRAWToARGB) // %3
340 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
341 );
342 }
343
344 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
345 asm volatile (
346 "mov $0x1080108,%%eax \n"
347 "movd %%eax,%%xmm5 \n"
348 "pshufd $0x0,%%xmm5,%%xmm5 \n"
349 "mov $0x20802080,%%eax \n"
350 "movd %%eax,%%xmm6 \n"
351 "pshufd $0x0,%%xmm6,%%xmm6 \n"
352 "pcmpeqb %%xmm3,%%xmm3 \n"
353 "psllw $0xb,%%xmm3 \n"
354 "pcmpeqb %%xmm4,%%xmm4 \n"
355 "psllw $0xa,%%xmm4 \n"
356 "psrlw $0x5,%%xmm4 \n"
357 "pcmpeqb %%xmm7,%%xmm7 \n"
358 "psllw $0x8,%%xmm7 \n"
359 "sub %0,%1 \n"
360 "sub %0,%1 \n"
361 LABELALIGN
362 "1: \n"
363 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
364 "movdqa %%xmm0,%%xmm1 \n"
365 "movdqa %%xmm0,%%xmm2 \n"
366 "pand %%xmm3,%%xmm1 \n"
367 "psllw $0xb,%%xmm2 \n"
368 "pmulhuw %%xmm5,%%xmm1 \n"
369 "pmulhuw %%xmm5,%%xmm2 \n"
370 "psllw $0x8,%%xmm1 \n"
371 "por %%xmm2,%%xmm1 \n"
372 "pand %%xmm4,%%xmm0 \n"
373 "pmulhuw %%xmm6,%%xmm0 \n"
374 "por %%xmm7,%%xmm0 \n"
375 "movdqa %%xmm1,%%xmm2 \n"
376 "punpcklbw %%xmm0,%%xmm1 \n"
377 "punpckhbw %%xmm0,%%xmm2 \n"
378 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
379 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
380 "lea " MEMLEA(0x10,0) ",%0 \n"
381 "sub $0x8,%2 \n"
382 "jg 1b \n"
383 : "+r"(src), // %0
384 "+r"(dst), // %1
385 "+r"(pix) // %2
386 :
387 : "memory", "cc", "eax", NACL_R14
388 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
389 );
390 }
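
The two magic multipliers above (0x0108 per 16-bit lane in xmm5, 0x2080 in xmm6) implement bit replication via pmulhuw: with a field shifted to the top of a 16-bit lane, keeping the high word of the product reproduces (v << 3) | (v >> 2) for the 5-bit fields and (v << 2) | (v >> 4) for the 6-bit green field. A scalar sketch of one pixel (illustrative only); the ARGB1555 row below uses the same trick with its own constants.

static void RGB565ToARGB_C(uint16 rgb565, uint8 dst[4]) {
  uint8 b = (uint8)(rgb565 & 0x1f);
  uint8 g = (uint8)((rgb565 >> 5) & 0x3f);
  uint8 r = (uint8)(rgb565 >> 11);
  dst[0] = (uint8)((b << 3) | (b >> 2));  // replicate top bits into low bits
  dst[1] = (uint8)((g << 2) | (g >> 4));
  dst[2] = (uint8)((r << 3) | (r >> 2));
  dst[3] = 255u;
}
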
391
392 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
393 asm volatile (
394 "mov $0x1080108,%%eax \n"
395 "movd %%eax,%%xmm5 \n"
396 "pshufd $0x0,%%xmm5,%%xmm5 \n"
397 "mov $0x42004200,%%eax \n"
398 "movd %%eax,%%xmm6 \n"
399 "pshufd $0x0,%%xmm6,%%xmm6 \n"
400 "pcmpeqb %%xmm3,%%xmm3 \n"
401 "psllw $0xb,%%xmm3 \n"
402 "movdqa %%xmm3,%%xmm4 \n"
403 "psrlw $0x6,%%xmm4 \n"
404 "pcmpeqb %%xmm7,%%xmm7 \n"
405 "psllw $0x8,%%xmm7 \n"
406 "sub %0,%1 \n"
407 "sub %0,%1 \n"
408 LABELALIGN
409 "1: \n"
410 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
411 "movdqa %%xmm0,%%xmm1 \n"
412 "movdqa %%xmm0,%%xmm2 \n"
413 "psllw $0x1,%%xmm1 \n"
414 "psllw $0xb,%%xmm2 \n"
415 "pand %%xmm3,%%xmm1 \n"
416 "pmulhuw %%xmm5,%%xmm2 \n"
417 "pmulhuw %%xmm5,%%xmm1 \n"
418 "psllw $0x8,%%xmm1 \n"
419 "por %%xmm2,%%xmm1 \n"
420 "movdqa %%xmm0,%%xmm2 \n"
421 "pand %%xmm4,%%xmm0 \n"
422 "psraw $0x8,%%xmm2 \n"
423 "pmulhuw %%xmm6,%%xmm0 \n"
424 "pand %%xmm7,%%xmm2 \n"
425 "por %%xmm2,%%xmm0 \n"
426 "movdqa %%xmm1,%%xmm2 \n"
427 "punpcklbw %%xmm0,%%xmm1 \n"
428 "punpckhbw %%xmm0,%%xmm2 \n"
429 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
430 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
431 "lea " MEMLEA(0x10,0) ",%0 \n"
432 "sub $0x8,%2 \n"
433 "jg 1b \n"
434 : "+r"(src), // %0
435 "+r"(dst), // %1
436 "+r"(pix) // %2
437 :
438 : "memory", "cc", "eax", NACL_R14
439 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
440 );
441 }
442
443 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
444 asm volatile (
445 "mov $0xf0f0f0f,%%eax \n"
446 "movd %%eax,%%xmm4 \n"
447 "pshufd $0x0,%%xmm4,%%xmm4 \n"
448 "movdqa %%xmm4,%%xmm5 \n"
449 "pslld $0x4,%%xmm5 \n"
450 "sub %0,%1 \n"
451 "sub %0,%1 \n"
452 LABELALIGN
453 "1: \n"
454 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
455 "movdqa %%xmm0,%%xmm2 \n"
456 "pand %%xmm4,%%xmm0 \n"
457 "pand %%xmm5,%%xmm2 \n"
458 "movdqa %%xmm0,%%xmm1 \n"
459 "movdqa %%xmm2,%%xmm3 \n"
460 "psllw $0x4,%%xmm1 \n"
461 "psrlw $0x4,%%xmm3 \n"
462 "por %%xmm1,%%xmm0 \n"
463 "por %%xmm3,%%xmm2 \n"
464 "movdqa %%xmm0,%%xmm1 \n"
465 "punpcklbw %%xmm2,%%xmm0 \n"
466 "punpckhbw %%xmm2,%%xmm1 \n"
467 MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
468 MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
469 "lea " MEMLEA(0x10,0) ",%0 \n"
470 "sub $0x8,%2 \n"
471 "jg 1b \n"
472 : "+r"(src), // %0
473 "+r"(dst), // %1
474 "+r"(pix) // %2
475 :
476 : "memory", "cc", "eax", NACL_R14
477 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
478 );
479 }
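
The 4444 case needs no multiplier: a 4-bit field expands to 8 bits as (n << 4) | n, i.e. n * 17, which the pand/psllw/psrlw/por sequence above computes for the low and high nibbles of every byte in parallel. Scalar form (illustrative only):

static uint8 Expand4To8(uint8 n) {  // n in [0, 15]
  return (uint8)((n << 4) | n);     // == n * 17
}
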
480
481 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
482 asm volatile (
483 "movdqa %3,%%xmm6 \n"
484 LABELALIGN
485 "1: \n"
486 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
487 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
488 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
489 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
490 "lea " MEMLEA(0x40,0) ",%0 \n"
491 "pshufb %%xmm6,%%xmm0 \n"
492 "pshufb %%xmm6,%%xmm1 \n"
493 "pshufb %%xmm6,%%xmm2 \n"
494 "pshufb %%xmm6,%%xmm3 \n"
495 "movdqa %%xmm1,%%xmm4 \n"
496 "psrldq $0x4,%%xmm1 \n"
497 "pslldq $0xc,%%xmm4 \n"
498 "movdqa %%xmm2,%%xmm5 \n"
499 "por %%xmm4,%%xmm0 \n"
500 "pslldq $0x8,%%xmm5 \n"
501 "movdqu %%xmm0," MEMACCESS(1) " \n"
502 "por %%xmm5,%%xmm1 \n"
503 "psrldq $0x8,%%xmm2 \n"
504 "pslldq $0x4,%%xmm3 \n"
505 "por %%xmm3,%%xmm2 \n"
506 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
507 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
508 "lea " MEMLEA(0x30,1) ",%1 \n"
509 "sub $0x10,%2 \n"
510 "jg 1b \n"
511 : "+r"(src), // %0
512 "+r"(dst), // %1
513 "+r"(pix) // %2
514 : "m"(kShuffleMaskARGBToRGB24) // %3
515 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
516 );
517 }
518
519 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
520 asm volatile (
521 "movdqa %3,%%xmm6 \n"
522 LABELALIGN
523 "1: \n"
524 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
525 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
526 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
527 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
528 "lea " MEMLEA(0x40,0) ",%0 \n"
529 "pshufb %%xmm6,%%xmm0 \n"
530 "pshufb %%xmm6,%%xmm1 \n"
531 "pshufb %%xmm6,%%xmm2 \n"
532 "pshufb %%xmm6,%%xmm3 \n"
533 "movdqa %%xmm1,%%xmm4 \n"
534 "psrldq $0x4,%%xmm1 \n"
535 "pslldq $0xc,%%xmm4 \n"
536 "movdqa %%xmm2,%%xmm5 \n"
537 "por %%xmm4,%%xmm0 \n"
538 "pslldq $0x8,%%xmm5 \n"
539 "movdqu %%xmm0," MEMACCESS(1) " \n"
540 "por %%xmm5,%%xmm1 \n"
541 "psrldq $0x8,%%xmm2 \n"
542 "pslldq $0x4,%%xmm3 \n"
543 "por %%xmm3,%%xmm2 \n"
544 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
545 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
546 "lea " MEMLEA(0x30,1) ",%1 \n"
547 "sub $0x10,%2 \n"
548 "jg 1b \n"
549 : "+r"(src), // %0
550 "+r"(dst), // %1
551 "+r"(pix) // %2
552 : "m"(kShuffleMaskARGBToRAW) // %3
553 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
554 );
555 }
556
557 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
558 asm volatile (
559 "pcmpeqb %%xmm3,%%xmm3 \n"
560 "psrld $0x1b,%%xmm3 \n"
561 "pcmpeqb %%xmm4,%%xmm4 \n"
562 "psrld $0x1a,%%xmm4 \n"
563 "pslld $0x5,%%xmm4 \n"
564 "pcmpeqb %%xmm5,%%xmm5 \n"
565 "pslld $0xb,%%xmm5 \n"
566 LABELALIGN
567 "1: \n"
568 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
569 "movdqa %%xmm0,%%xmm1 \n"
570 "movdqa %%xmm0,%%xmm2 \n"
571 "pslld $0x8,%%xmm0 \n"
572 "psrld $0x3,%%xmm1 \n"
573 "psrld $0x5,%%xmm2 \n"
574 "psrad $0x10,%%xmm0 \n"
575 "pand %%xmm3,%%xmm1 \n"
576 "pand %%xmm4,%%xmm2 \n"
577 "pand %%xmm5,%%xmm0 \n"
578 "por %%xmm2,%%xmm1 \n"
579 "por %%xmm1,%%xmm0 \n"
580 "packssdw %%xmm0,%%xmm0 \n"
581 "lea " MEMLEA(0x10,0) ",%0 \n"
582 "movq %%xmm0," MEMACCESS(1) " \n"
583 "lea " MEMLEA(0x8,1) ",%1 \n"
584 "sub $0x4,%2 \n"
585 "jg 1b \n"
586 : "+r"(src), // %0
587 "+r"(dst), // %1
588 "+r"(pix) // %2
589 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
590 );
591 }
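
Going the other way is simpler: ARGBToRGB565Row_SSE2 truncates each channel to its field width and packs. One pixel, as a scalar sketch (illustrative only):

static uint16 ARGBToRGB565_C(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}
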
592
593 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
594 asm volatile (
595 "pcmpeqb %%xmm4,%%xmm4 \n"
596 "psrld $0x1b,%%xmm4 \n"
597 "movdqa %%xmm4,%%xmm5 \n"
598 "pslld $0x5,%%xmm5 \n"
599 "movdqa %%xmm4,%%xmm6 \n"
600 "pslld $0xa,%%xmm6 \n"
601 "pcmpeqb %%xmm7,%%xmm7 \n"
602 "pslld $0xf,%%xmm7 \n"
603 LABELALIGN
604 "1: \n"
605 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
606 "movdqa %%xmm0,%%xmm1 \n"
607 "movdqa %%xmm0,%%xmm2 \n"
608 "movdqa %%xmm0,%%xmm3 \n"
609 "psrad $0x10,%%xmm0 \n"
610 "psrld $0x3,%%xmm1 \n"
611 "psrld $0x6,%%xmm2 \n"
612 "psrld $0x9,%%xmm3 \n"
613 "pand %%xmm7,%%xmm0 \n"
614 "pand %%xmm4,%%xmm1 \n"
615 "pand %%xmm5,%%xmm2 \n"
616 "pand %%xmm6,%%xmm3 \n"
617 "por %%xmm1,%%xmm0 \n"
618 "por %%xmm3,%%xmm2 \n"
619 "por %%xmm2,%%xmm0 \n"
620 "packssdw %%xmm0,%%xmm0 \n"
621 "lea " MEMLEA(0x10,0) ",%0 \n"
622 "movq %%xmm0," MEMACCESS(1) " \n"
623 "lea " MEMLEA(0x8,1) ",%1 \n"
624 "sub $0x4,%2 \n"
625 "jg 1b \n"
626 : "+r"(src), // %0
627 "+r"(dst), // %1
628 "+r"(pix) // %2
629 :: "memory", "cc",
630 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
631 );
632 }
633
634 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
635 asm volatile (
636 "pcmpeqb %%xmm4,%%xmm4 \n"
637 "psllw $0xc,%%xmm4 \n"
638 "movdqa %%xmm4,%%xmm3 \n"
639 "psrlw $0x8,%%xmm3 \n"
640 LABELALIGN
641 "1: \n"
642 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
643 "movdqa %%xmm0,%%xmm1 \n"
644 "pand %%xmm3,%%xmm0 \n"
645 "pand %%xmm4,%%xmm1 \n"
646 "psrlq $0x4,%%xmm0 \n"
647 "psrlq $0x8,%%xmm1 \n"
648 "por %%xmm1,%%xmm0 \n"
649 "packuswb %%xmm0,%%xmm0 \n"
650 "lea " MEMLEA(0x10,0) ",%0 \n"
651 "movq %%xmm0," MEMACCESS(1) " \n"
652 "lea " MEMLEA(0x8,1) ",%1 \n"
653 "sub $0x4,%2 \n"
654 "jg 1b \n"
655 : "+r"(src), // %0
656 "+r"(dst), // %1
657 "+r"(pix) // %2
658 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
659 );
660 }
661 #endif // HAS_RGB24TOARGBROW_SSSE3
662
663 #ifdef HAS_ARGBTOYROW_SSSE3
664 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
665 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
666 asm volatile (
667 "movdqa %3,%%xmm4 \n"
668 "movdqa %4,%%xmm5 \n"
669 LABELALIGN
670 "1: \n"
671 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
672 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
673 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
674 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
675 "pmaddubsw %%xmm4,%%xmm0 \n"
676 "pmaddubsw %%xmm4,%%xmm1 \n"
677 "pmaddubsw %%xmm4,%%xmm2 \n"
678 "pmaddubsw %%xmm4,%%xmm3 \n"
679 "lea " MEMLEA(0x40,0) ",%0 \n"
680 "phaddw %%xmm1,%%xmm0 \n"
681 "phaddw %%xmm3,%%xmm2 \n"
682 "psrlw $0x7,%%xmm0 \n"
683 "psrlw $0x7,%%xmm2 \n"
684 "packuswb %%xmm2,%%xmm0 \n"
685 "paddb %%xmm5,%%xmm0 \n"
686 "movdqu %%xmm0," MEMACCESS(1) " \n"
687 "lea " MEMLEA(0x10,1) ",%1 \n"
688 "sub $0x10,%2 \n"
689 "jg 1b \n"
690 : "+r"(src_argb), // %0
691 "+r"(dst_y), // %1
692 "+r"(pix) // %2
693 : "m"(kARGBToY), // %3
694 "m"(kAddY16) // %4
695 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
696 );
697 }
698 #endif // HAS_ARGBTOYROW_SSSE3
699
700 #ifdef HAS_ARGBTOYJROW_SSSE3
701 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
702 // Same as ARGBToYRow but with different coefficients: no +16 bias, and it rounds before the shift.
703 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
704 asm volatile (
705 "movdqa %3,%%xmm4 \n"
706 "movdqa %4,%%xmm5 \n"
707 LABELALIGN
708 "1: \n"
709 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
710 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
711 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
712 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
713 "pmaddubsw %%xmm4,%%xmm0 \n"
714 "pmaddubsw %%xmm4,%%xmm1 \n"
715 "pmaddubsw %%xmm4,%%xmm2 \n"
716 "pmaddubsw %%xmm4,%%xmm3 \n"
717 "lea " MEMLEA(0x40,0) ",%0 \n"
718 "phaddw %%xmm1,%%xmm0 \n"
719 "phaddw %%xmm3,%%xmm2 \n"
720 "paddw %%xmm5,%%xmm0 \n"
721 "paddw %%xmm5,%%xmm2 \n"
722 "psrlw $0x7,%%xmm0 \n"
723 "psrlw $0x7,%%xmm2 \n"
724 "packuswb %%xmm2,%%xmm0 \n"
725 "movdqu %%xmm0," MEMACCESS(1) " \n"
726 "lea " MEMLEA(0x10,1) ",%1 \n"
727 "sub $0x10,%2 \n"
728 "jg 1b \n"
729 : "+r"(src_argb), // %0
730 "+r"(dst_y), // %1
731 "+r"(pix) // %2
732 : "m"(kARGBToYJ), // %3
733 "m"(kAddYJ64) // %4
734 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
735 );
736 }
737 #endif // HAS_ARGBTOYJROW_SSSE3
738
739 #ifdef HAS_ARGBTOYROW_AVX2
740 // vpermd mask to restore the dword order mutated by the per-lane vphaddw + vpackuswb.
741 static const lvec32 kPermdARGBToY_AVX = {
742 0, 4, 1, 5, 2, 6, 3, 7
743 };
744
745 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
746 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
747 asm volatile (
748 "vbroadcastf128 %3,%%ymm4 \n"
749 "vbroadcastf128 %4,%%ymm5 \n"
750 "vmovdqu %5,%%ymm6 \n"
751 LABELALIGN
752 "1: \n"
753 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
754 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
755 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
756 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
757 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
758 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
759 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
760 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
761 "lea " MEMLEA(0x80,0) ",%0 \n"
762 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
763 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
764 "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
765 "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
766 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
767 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
768 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
769 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
770 "lea " MEMLEA(0x20,1) ",%1 \n"
771 "sub $0x20,%2 \n"
772 "jg 1b \n"
773 "vzeroupper \n"
774 : "+r"(src_argb), // %0
775 "+r"(dst_y), // %1
776 "+r"(pix) // %2
777 : "m"(kARGBToY), // %3
778 "m"(kAddY16), // %4
779 "m"(kPermdARGBToY_AVX) // %5
780 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
781 );
782 }
783 #endif // HAS_ARGBTOYROW_AVX2
784
785 #ifdef HAS_ARGBTOYJROW_AVX2
786 // Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
787 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
788 asm volatile (
789 "vbroadcastf128 %3,%%ymm4 \n"
790 "vbroadcastf128 %4,%%ymm5 \n"
791 "vmovdqu %5,%%ymm6 \n"
792 LABELALIGN
793 "1: \n"
794 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
795 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
796 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
797 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
798 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
799 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
800 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
801 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
802 "lea " MEMLEA(0x80,0) ",%0 \n"
803 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
804 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
805 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
806 "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
807 "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
808 "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
809 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
810 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
811 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
812 "lea " MEMLEA(0x20,1) ",%1 \n"
813 "sub $0x20,%2 \n"
814 "jg 1b \n"
815 "vzeroupper \n"
816 : "+r"(src_argb), // %0
817 "+r"(dst_y), // %1
818 "+r"(pix) // %2
819 : "m"(kARGBToYJ), // %3
820 "m"(kAddYJ64), // %4
821 "m"(kPermdARGBToY_AVX) // %5
822 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
823 );
824 }
825 #endif // HAS_ARGBTOYJROW_AVX2
826
827 #ifdef HAS_ARGBTOUVROW_SSSE3
828 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
829 uint8* dst_u, uint8* dst_v, int width) {
830 asm volatile (
831 "movdqa %5,%%xmm3 \n"
832 "movdqa %6,%%xmm4 \n"
833 "movdqa %7,%%xmm5 \n"
834 "sub %1,%2 \n"
835 LABELALIGN
836 "1: \n"
837 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
838 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
839 "pavgb %%xmm7,%%xmm0 \n"
840 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
841 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
842 "pavgb %%xmm7,%%xmm1 \n"
843 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
844 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
845 "pavgb %%xmm7,%%xmm2 \n"
846 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
847 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
848 "pavgb %%xmm7,%%xmm6 \n"
849
850 "lea " MEMLEA(0x40,0) ",%0 \n"
851 "movdqa %%xmm0,%%xmm7 \n"
852 "shufps $0x88,%%xmm1,%%xmm0 \n"
853 "shufps $0xdd,%%xmm1,%%xmm7 \n"
854 "pavgb %%xmm7,%%xmm0 \n"
855 "movdqa %%xmm2,%%xmm7 \n"
856 "shufps $0x88,%%xmm6,%%xmm2 \n"
857 "shufps $0xdd,%%xmm6,%%xmm7 \n"
858 "pavgb %%xmm7,%%xmm2 \n"
859 "movdqa %%xmm0,%%xmm1 \n"
860 "movdqa %%xmm2,%%xmm6 \n"
861 "pmaddubsw %%xmm4,%%xmm0 \n"
862 "pmaddubsw %%xmm4,%%xmm2 \n"
863 "pmaddubsw %%xmm3,%%xmm1 \n"
864 "pmaddubsw %%xmm3,%%xmm6 \n"
865 "phaddw %%xmm2,%%xmm0 \n"
866 "phaddw %%xmm6,%%xmm1 \n"
867 "psraw $0x8,%%xmm0 \n"
868 "psraw $0x8,%%xmm1 \n"
869 "packsswb %%xmm1,%%xmm0 \n"
870 "paddb %%xmm5,%%xmm0 \n"
871 "movlps %%xmm0," MEMACCESS(1) " \n"
872 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
873 "lea " MEMLEA(0x8,1) ",%1 \n"
874 "sub $0x10,%3 \n"
875 "jg 1b \n"
876 : "+r"(src_argb0), // %0
877 "+r"(dst_u), // %1
878 "+r"(dst_v), // %2
879 "+rm"(width) // %3
880 : "r"((intptr_t)(src_stride_argb)), // %4
881 "m"(kARGBToV), // %5
882 "m"(kARGBToU), // %6
883 "m"(kAddUV128) // %7
884 : "memory", "cc", NACL_R14
885 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
886 );
887 }
888 #endif // HAS_ARGBTOUVROW_SSSE3
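
The sampling structure above: each iteration reads 16 ARGB pixels from two adjacent rows, pavgb averages the rows vertically, the shufps $0x88/$0xdd pair separates even and odd pixels so a second pavgb averages horizontally, and the coefficient math then runs once per 2x2 block. A scalar sketch of one output pair (illustrative only; pavgb rounds its averages, which the plain divisions here gloss over):

static void ARGBToUV_C(const uint8* row0, const uint8* row1,
                       uint8* u, uint8* v) {
  // row0/row1 point at the same x offset on two adjacent rows.
  int b = (row0[0] + row0[4] + row1[0] + row1[4]) / 4;
  int g = (row0[1] + row0[5] + row1[1] + row1[5]) / 4;
  int r = (row0[2] + row0[6] + row1[2] + row1[6]) / 4;
  *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}
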
889
890 #ifdef HAS_ARGBTOUVROW_AVX2
891 // vpshufb mask, word granular, to restore the order mutated by the per-lane vphaddw and pack.
892 static const lvec8 kShufARGBToUV_AVX = {
893 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
894 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
895 };
896 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
897 uint8* dst_u, uint8* dst_v, int width) {
898 asm volatile (
899 "vbroadcastf128 %5,%%ymm5 \n"
900 "vbroadcastf128 %6,%%ymm6 \n"
901 "vbroadcastf128 %7,%%ymm7 \n"
902 "sub %1,%2 \n"
903 LABELALIGN
904 "1: \n"
905 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
906 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
907 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
908 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
909 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
910 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
911 VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
912 VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
913 "lea " MEMLEA(0x80,0) ",%0 \n"
914 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
915 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
916 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
917 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
918 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
919 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
920
921 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
922 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
923 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
924 "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
925 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
926 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
927 "vpsraw $0x8,%%ymm1,%%ymm1 \n"
928 "vpsraw $0x8,%%ymm0,%%ymm0 \n"
929 "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
930 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
931 "vpshufb %8,%%ymm0,%%ymm0 \n"
932 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
933
934 "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
935 VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
936 "lea " MEMLEA(0x10,1) ",%1 \n"
937 "sub $0x20,%3 \n"
938 "jg 1b \n"
939 "vzeroupper \n"
940 : "+r"(src_argb0), // %0
941 "+r"(dst_u), // %1
942 "+r"(dst_v), // %2
943 "+rm"(width) // %3
944 : "r"((intptr_t)(src_stride_argb)), // %4
945 "m"(kAddUV128), // %5
946 "m"(kARGBToV), // %6
947 "m"(kARGBToU), // %7
948 "m"(kShufARGBToUV_AVX) // %8
949 : "memory", "cc", NACL_R14
950 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
951 );
952 }
953 #endif // HAS_ARGBTOUVROW_AVX2
954
955 #ifdef HAS_ARGBTOUVJROW_SSSE3
956 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
957 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
958 uint8* dst_u, uint8* dst_v, int width) {
959 asm volatile (
960 "movdqa %5,%%xmm3 \n"
961 "movdqa %6,%%xmm4 \n"
962 "movdqa %7,%%xmm5 \n"
963 "sub %1,%2 \n"
964 LABELALIGN
965 "1: \n"
966 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
967 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
968 "pavgb %%xmm7,%%xmm0 \n"
969 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
970 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
971 "pavgb %%xmm7,%%xmm1 \n"
972 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
973 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
974 "pavgb %%xmm7,%%xmm2 \n"
975 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
976 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
977 "pavgb %%xmm7,%%xmm6 \n"
978
979 "lea " MEMLEA(0x40,0) ",%0 \n"
980 "movdqa %%xmm0,%%xmm7 \n"
981 "shufps $0x88,%%xmm1,%%xmm0 \n"
982 "shufps $0xdd,%%xmm1,%%xmm7 \n"
983 "pavgb %%xmm7,%%xmm0 \n"
984 "movdqa %%xmm2,%%xmm7 \n"
985 "shufps $0x88,%%xmm6,%%xmm2 \n"
986 "shufps $0xdd,%%xmm6,%%xmm7 \n"
987 "pavgb %%xmm7,%%xmm2 \n"
988 "movdqa %%xmm0,%%xmm1 \n"
989 "movdqa %%xmm2,%%xmm6 \n"
990 "pmaddubsw %%xmm4,%%xmm0 \n"
991 "pmaddubsw %%xmm4,%%xmm2 \n"
992 "pmaddubsw %%xmm3,%%xmm1 \n"
993 "pmaddubsw %%xmm3,%%xmm6 \n"
994 "phaddw %%xmm2,%%xmm0 \n"
995 "phaddw %%xmm6,%%xmm1 \n"
996 "paddw %%xmm5,%%xmm0 \n"
997 "paddw %%xmm5,%%xmm1 \n"
998 "psraw $0x8,%%xmm0 \n"
999 "psraw $0x8,%%xmm1 \n"
1000 "packsswb %%xmm1,%%xmm0 \n"
1001 "movlps %%xmm0," MEMACCESS(1) " \n"
1002 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1003 "lea " MEMLEA(0x8,1) ",%1 \n"
1004 "sub $0x10,%3 \n"
1005 "jg 1b \n"
1006 : "+r"(src_argb0), // %0
1007 "+r"(dst_u), // %1
1008 "+r"(dst_v), // %2
1009 "+rm"(width) // %3
1010 : "r"((intptr_t)(src_stride_argb)), // %4
1011 "m"(kARGBToVJ), // %5
1012 "m"(kARGBToUJ), // %6
1013 "m"(kAddUVJ128) // %7
1014 : "memory", "cc", NACL_R14
1015 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1016 );
1017 }
1018 #endif // HAS_ARGBTOUVJROW_SSSE3
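
The one structural difference from ARGBToUVRow_SSSE3 is the rounding: instead of psraw/packsswb/paddb, this variant adds kAddUVJ128 (0x8080 in every 16-bit lane) before the shift. Through 16-bit wraparound and the signed pack, that single constant supplies both the +128 rounding term and the +128 chroma bias, equivalent to this scalar sketch (illustrative only):

static uint8 ARGBToUJ_C(uint8 b, uint8 g, uint8 r) {
  return (uint8)((127 * b - 84 * g - 43 * r + 0x8080) >> 8);  // kARGBToUJ + kAddUVJ128
}
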
1019
1020 #ifdef HAS_ARGBTOUV444ROW_SSSE3
1021 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1022 int width) {
1023 asm volatile (
1024 "movdqa %4,%%xmm3 \n"
1025 "movdqa %5,%%xmm4 \n"
1026 "movdqa %6,%%xmm5 \n"
1027 "sub %1,%2 \n"
1028 LABELALIGN
1029 "1: \n"
1030 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1031 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1032 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1033 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1034 "pmaddubsw %%xmm4,%%xmm0 \n"
1035 "pmaddubsw %%xmm4,%%xmm1 \n"
1036 "pmaddubsw %%xmm4,%%xmm2 \n"
1037 "pmaddubsw %%xmm4,%%xmm6 \n"
1038 "phaddw %%xmm1,%%xmm0 \n"
1039 "phaddw %%xmm6,%%xmm2 \n"
1040 "psraw $0x8,%%xmm0 \n"
1041 "psraw $0x8,%%xmm2 \n"
1042 "packsswb %%xmm2,%%xmm0 \n"
1043 "paddb %%xmm5,%%xmm0 \n"
1044 "movdqu %%xmm0," MEMACCESS(1) " \n"
1045 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1046 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1047 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1048 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1049 "pmaddubsw %%xmm3,%%xmm0 \n"
1050 "pmaddubsw %%xmm3,%%xmm1 \n"
1051 "pmaddubsw %%xmm3,%%xmm2 \n"
1052 "pmaddubsw %%xmm3,%%xmm6 \n"
1053 "phaddw %%xmm1,%%xmm0 \n"
1054 "phaddw %%xmm6,%%xmm2 \n"
1055 "psraw $0x8,%%xmm0 \n"
1056 "psraw $0x8,%%xmm2 \n"
1057 "packsswb %%xmm2,%%xmm0 \n"
1058 "paddb %%xmm5,%%xmm0 \n"
1059 "lea " MEMLEA(0x40,0) ",%0 \n"
1060 MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
1061 "lea " MEMLEA(0x10,1) ",%1 \n"
1062 "sub $0x10,%3 \n"
1063 "jg 1b \n"
1064 : "+r"(src_argb), // %0
1065 "+r"(dst_u), // %1
1066 "+r"(dst_v), // %2
1067 "+rm"(width) // %3
1068 : "m"(kARGBToV), // %4
1069 "m"(kARGBToU), // %5
1070 "m"(kAddUV128) // %6
1071 : "memory", "cc", NACL_R14
1072 "xmm0", "xmm1", "xmm2", "xmm6"
1073 );
1074 }
1075 #endif // HAS_ARGBTOUV444ROW_SSSE3
1076
1077 #ifdef HAS_ARGBTOUV422ROW_SSSE3
1078 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1079 uint8* dst_u, uint8* dst_v, int width) {
1080 asm volatile (
1081 "movdqa %4,%%xmm3 \n"
1082 "movdqa %5,%%xmm4 \n"
1083 "movdqa %6,%%xmm5 \n"
1084 "sub %1,%2 \n"
1085 LABELALIGN
1086 "1: \n"
1087 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1088 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1089 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1090 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1091 "lea " MEMLEA(0x40,0) ",%0 \n"
1092 "movdqa %%xmm0,%%xmm7 \n"
1093 "shufps $0x88,%%xmm1,%%xmm0 \n"
1094 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1095 "pavgb %%xmm7,%%xmm0 \n"
1096 "movdqa %%xmm2,%%xmm7 \n"
1097 "shufps $0x88,%%xmm6,%%xmm2 \n"
1098 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1099 "pavgb %%xmm7,%%xmm2 \n"
1100 "movdqa %%xmm0,%%xmm1 \n"
1101 "movdqa %%xmm2,%%xmm6 \n"
1102 "pmaddubsw %%xmm4,%%xmm0 \n"
1103 "pmaddubsw %%xmm4,%%xmm2 \n"
1104 "pmaddubsw %%xmm3,%%xmm1 \n"
1105 "pmaddubsw %%xmm3,%%xmm6 \n"
1106 "phaddw %%xmm2,%%xmm0 \n"
1107 "phaddw %%xmm6,%%xmm1 \n"
1108 "psraw $0x8,%%xmm0 \n"
1109 "psraw $0x8,%%xmm1 \n"
1110 "packsswb %%xmm1,%%xmm0 \n"
1111 "paddb %%xmm5,%%xmm0 \n"
1112 "movlps %%xmm0," MEMACCESS(1) " \n"
1113 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1114 "lea " MEMLEA(0x8,1) ",%1 \n"
1115 "sub $0x10,%3 \n"
1116 "jg 1b \n"
1117 : "+r"(src_argb0), // %0
1118 "+r"(dst_u), // %1
1119 "+r"(dst_v), // %2
1120 "+rm"(width) // %3
1121 : "m"(kARGBToV), // %4
1122 "m"(kARGBToU), // %5
1123 "m"(kAddUV128) // %6
1124 : "memory", "cc", NACL_R14
1125 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1126 );
1127 }
1128 #endif // HAS_ARGBTOUV422ROW_SSSE3
1129
1130 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
1131 asm volatile (
1132 "movdqa %4,%%xmm5 \n"
1133 "movdqa %3,%%xmm4 \n"
1134 LABELALIGN
1135 "1: \n"
1136 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1137 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1138 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1139 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1140 "pmaddubsw %%xmm4,%%xmm0 \n"
1141 "pmaddubsw %%xmm4,%%xmm1 \n"
1142 "pmaddubsw %%xmm4,%%xmm2 \n"
1143 "pmaddubsw %%xmm4,%%xmm3 \n"
1144 "lea " MEMLEA(0x40,0) ",%0 \n"
1145 "phaddw %%xmm1,%%xmm0 \n"
1146 "phaddw %%xmm3,%%xmm2 \n"
1147 "psrlw $0x7,%%xmm0 \n"
1148 "psrlw $0x7,%%xmm2 \n"
1149 "packuswb %%xmm2,%%xmm0 \n"
1150 "paddb %%xmm5,%%xmm0 \n"
1151 "movdqu %%xmm0," MEMACCESS(1) " \n"
1152 "lea " MEMLEA(0x10,1) ",%1 \n"
1153 "sub $0x10,%2 \n"
1154 "jg 1b \n"
1155 : "+r"(src_bgra), // %0
1156 "+r"(dst_y), // %1
1157 "+r"(pix) // %2
1158 : "m"(kBGRAToY), // %3
1159 "m"(kAddY16) // %4
1160 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1161 );
1162 }
1163
1164 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1165 uint8* dst_u, uint8* dst_v, int width) {
1166 asm volatile (
1167 "movdqa %5,%%xmm3 \n"
1168 "movdqa %6,%%xmm4 \n"
1169 "movdqa %7,%%xmm5 \n"
1170 "sub %1,%2 \n"
1171 LABELALIGN
1172 "1: \n"
1173 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1174 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1175 "pavgb %%xmm7,%%xmm0 \n"
1176 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1177 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1178 "pavgb %%xmm7,%%xmm1 \n"
1179 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1180 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1181 "pavgb %%xmm7,%%xmm2 \n"
1182 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1183 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1184 "pavgb %%xmm7,%%xmm6 \n"
1185
1186 "lea " MEMLEA(0x40,0) ",%0 \n"
1187 "movdqa %%xmm0,%%xmm7 \n"
1188 "shufps $0x88,%%xmm1,%%xmm0 \n"
1189 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1190 "pavgb %%xmm7,%%xmm0 \n"
1191 "movdqa %%xmm2,%%xmm7 \n"
1192 "shufps $0x88,%%xmm6,%%xmm2 \n"
1193 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1194 "pavgb %%xmm7,%%xmm2 \n"
1195 "movdqa %%xmm0,%%xmm1 \n"
1196 "movdqa %%xmm2,%%xmm6 \n"
1197 "pmaddubsw %%xmm4,%%xmm0 \n"
1198 "pmaddubsw %%xmm4,%%xmm2 \n"
1199 "pmaddubsw %%xmm3,%%xmm1 \n"
1200 "pmaddubsw %%xmm3,%%xmm6 \n"
1201 "phaddw %%xmm2,%%xmm0 \n"
1202 "phaddw %%xmm6,%%xmm1 \n"
1203 "psraw $0x8,%%xmm0 \n"
1204 "psraw $0x8,%%xmm1 \n"
1205 "packsswb %%xmm1,%%xmm0 \n"
1206 "paddb %%xmm5,%%xmm0 \n"
1207 "movlps %%xmm0," MEMACCESS(1) " \n"
1208 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1209 "lea " MEMLEA(0x8,1) ",%1 \n"
1210 "sub $0x10,%3 \n"
1211 "jg 1b \n"
1212 : "+r"(src_bgra0), // %0
1213 "+r"(dst_u), // %1
1214 "+r"(dst_v), // %2
1215 "+rm"(width) // %3
1216 : "r"((intptr_t)(src_stride_bgra)), // %4
1217 "m"(kBGRAToV), // %5
1218 "m"(kBGRAToU), // %6
1219 "m"(kAddUV128) // %7
1220 : "memory", "cc", NACL_R14
1221 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1222 );
1223 }
1224
1225 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
1226 asm volatile (
1227 "movdqa %4,%%xmm5 \n"
1228 "movdqa %3,%%xmm4 \n"
1229 LABELALIGN
1230 "1: \n"
1231 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1232 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1233 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1234 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1235 "pmaddubsw %%xmm4,%%xmm0 \n"
1236 "pmaddubsw %%xmm4,%%xmm1 \n"
1237 "pmaddubsw %%xmm4,%%xmm2 \n"
1238 "pmaddubsw %%xmm4,%%xmm3 \n"
1239 "lea " MEMLEA(0x40,0) ",%0 \n"
1240 "phaddw %%xmm1,%%xmm0 \n"
1241 "phaddw %%xmm3,%%xmm2 \n"
1242 "psrlw $0x7,%%xmm0 \n"
1243 "psrlw $0x7,%%xmm2 \n"
1244 "packuswb %%xmm2,%%xmm0 \n"
1245 "paddb %%xmm5,%%xmm0 \n"
1246 "movdqu %%xmm0," MEMACCESS(1) " \n"
1247 "lea " MEMLEA(0x10,1) ",%1 \n"
1248 "sub $0x10,%2 \n"
1249 "jg 1b \n"
1250 : "+r"(src_abgr), // %0
1251 "+r"(dst_y), // %1
1252 "+r"(pix) // %2
1253 : "m"(kABGRToY), // %3
1254 "m"(kAddY16) // %4
1255 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1256 );
1257 }
1258
1259 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
1260 asm volatile (
1261 "movdqa %4,%%xmm5 \n"
1262 "movdqa %3,%%xmm4 \n"
1263 LABELALIGN
1264 "1: \n"
1265 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1266 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1267 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1268 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1269 "pmaddubsw %%xmm4,%%xmm0 \n"
1270 "pmaddubsw %%xmm4,%%xmm1 \n"
1271 "pmaddubsw %%xmm4,%%xmm2 \n"
1272 "pmaddubsw %%xmm4,%%xmm3 \n"
1273 "lea " MEMLEA(0x40,0) ",%0 \n"
1274 "phaddw %%xmm1,%%xmm0 \n"
1275 "phaddw %%xmm3,%%xmm2 \n"
1276 "psrlw $0x7,%%xmm0 \n"
1277 "psrlw $0x7,%%xmm2 \n"
1278 "packuswb %%xmm2,%%xmm0 \n"
1279 "paddb %%xmm5,%%xmm0 \n"
1280 "movdqu %%xmm0," MEMACCESS(1) " \n"
1281 "lea " MEMLEA(0x10,1) ",%1 \n"
1282 "sub $0x10,%2 \n"
1283 "jg 1b \n"
1284 : "+r"(src_rgba), // %0
1285 "+r"(dst_y), // %1
1286 "+r"(pix) // %2
1287 : "m"(kRGBAToY), // %3
1288 "m"(kAddY16) // %4
1289 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1290 );
1291 }
1292
1293 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1294 uint8* dst_u, uint8* dst_v, int width) {
1295 asm volatile (
1296 "movdqa %5,%%xmm3 \n"
1297 "movdqa %6,%%xmm4 \n"
1298 "movdqa %7,%%xmm5 \n"
1299 "sub %1,%2 \n"
1300 LABELALIGN
1301 "1: \n"
1302 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1303 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1304 "pavgb %%xmm7,%%xmm0 \n"
1305 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1306 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1307 "pavgb %%xmm7,%%xmm1 \n"
1308 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1309 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1310 "pavgb %%xmm7,%%xmm2 \n"
1311 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1312 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1313 "pavgb %%xmm7,%%xmm6 \n"
1314
1315 "lea " MEMLEA(0x40,0) ",%0 \n"
1316 "movdqa %%xmm0,%%xmm7 \n"
1317 "shufps $0x88,%%xmm1,%%xmm0 \n"
1318 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1319 "pavgb %%xmm7,%%xmm0 \n"
1320 "movdqa %%xmm2,%%xmm7 \n"
1321 "shufps $0x88,%%xmm6,%%xmm2 \n"
1322 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1323 "pavgb %%xmm7,%%xmm2 \n"
1324 "movdqa %%xmm0,%%xmm1 \n"
1325 "movdqa %%xmm2,%%xmm6 \n"
1326 "pmaddubsw %%xmm4,%%xmm0 \n"
1327 "pmaddubsw %%xmm4,%%xmm2 \n"
1328 "pmaddubsw %%xmm3,%%xmm1 \n"
1329 "pmaddubsw %%xmm3,%%xmm6 \n"
1330 "phaddw %%xmm2,%%xmm0 \n"
1331 "phaddw %%xmm6,%%xmm1 \n"
1332 "psraw $0x8,%%xmm0 \n"
1333 "psraw $0x8,%%xmm1 \n"
1334 "packsswb %%xmm1,%%xmm0 \n"
1335 "paddb %%xmm5,%%xmm0 \n"
1336 "movlps %%xmm0," MEMACCESS(1) " \n"
1337 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1338 "lea " MEMLEA(0x8,1) ",%1 \n"
1339 "sub $0x10,%3 \n"
1340 "jg 1b \n"
1341 : "+r"(src_abgr0), // %0
1342 "+r"(dst_u), // %1
1343 "+r"(dst_v), // %2
1344 "+rm"(width) // %3
1345 : "r"((intptr_t)(src_stride_abgr)), // %4
1346 "m"(kABGRToV), // %5
1347 "m"(kABGRToU), // %6
1348 "m"(kAddUV128) // %7
1349 : "memory", "cc", NACL_R14
1350 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1351 );
1352 }
1353
1354 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
1355 uint8* dst_u, uint8* dst_v, int width) {
1356 asm volatile (
1357 "movdqa %5,%%xmm3 \n"
1358 "movdqa %6,%%xmm4 \n"
1359 "movdqa %7,%%xmm5 \n"
1360 "sub %1,%2 \n"
1361 LABELALIGN
1362 "1: \n"
1363 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1364 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1365 "pavgb %%xmm7,%%xmm0 \n"
1366 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1367 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1368 "pavgb %%xmm7,%%xmm1 \n"
1369 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1370 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1371 "pavgb %%xmm7,%%xmm2 \n"
1372 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1373 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1374 "pavgb %%xmm7,%%xmm6 \n"
1375
1376 "lea " MEMLEA(0x40,0) ",%0 \n"
1377 "movdqa %%xmm0,%%xmm7 \n"
1378 "shufps $0x88,%%xmm1,%%xmm0 \n"
1379 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1380 "pavgb %%xmm7,%%xmm0 \n"
1381 "movdqa %%xmm2,%%xmm7 \n"
1382 "shufps $0x88,%%xmm6,%%xmm2 \n"
1383 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1384 "pavgb %%xmm7,%%xmm2 \n"
1385 "movdqa %%xmm0,%%xmm1 \n"
1386 "movdqa %%xmm2,%%xmm6 \n"
1387 "pmaddubsw %%xmm4,%%xmm0 \n"
1388 "pmaddubsw %%xmm4,%%xmm2 \n"
1389 "pmaddubsw %%xmm3,%%xmm1 \n"
1390 "pmaddubsw %%xmm3,%%xmm6 \n"
1391 "phaddw %%xmm2,%%xmm0 \n"
1392 "phaddw %%xmm6,%%xmm1 \n"
1393 "psraw $0x8,%%xmm0 \n"
1394 "psraw $0x8,%%xmm1 \n"
1395 "packsswb %%xmm1,%%xmm0 \n"
1396 "paddb %%xmm5,%%xmm0 \n"
1397 "movlps %%xmm0," MEMACCESS(1) " \n"
1398 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1399 "lea " MEMLEA(0x8,1) ",%1 \n"
1400 "sub $0x10,%3 \n"
1401 "jg 1b \n"
1402 : "+r"(src_rgba0), // %0
1403 "+r"(dst_u), // %1
1404 "+r"(dst_v), // %2
1405 "+rm"(width) // %3
1406 : "r"((intptr_t)(src_stride_rgba)), // %4
1407 "m"(kRGBAToV), // %5
1408 "m"(kRGBAToU), // %6
1409 "m"(kAddUV128) // %7
1410 : "memory", "cc", NACL_R14
1411 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1412 );
1413 }
1414
1415 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
1416
1417 // YUV to RGB conversion constants.
1418 // Y contribution to R,G,B. Scale and bias.
1419 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
1420 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
1421
1422 // U and V contributions to R,G,B.
1423 #define UB -128 /* -min(128, round(2.018 * 64)) */
1424 #define UG 25 /* -round(-0.391 * 64) */
1425 #define VG 52 /* -round(-0.813 * 64) */
1426 #define VR -102 /* -round(1.596 * 64) */
1427
1428 // Bias values to subtract 16 from Y and 128 from U and V.
1429 #define BB (UB * 128 - YGB)
1430 #define BG (UG * 128 + VG * 128 - YGB)
1431 #define BR (VR * 128 - YGB)
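
Written out as scalar fixed point, the YUVTORGB macro below combines these constants roughly as follows (illustrative only; the vector code gets its clamping for free from paddsw/packuswb saturation). Y is widened by byte duplication (y * 0x0101) and scaled by YG with pmulhuw; each bias BB/BG/BR already folds in the -16 Y offset (via YGB) and the -128 U/V offsets.

static int Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }
static void YuvPixel_C(uint8 y, uint8 u, uint8 v,
                       uint8* b, uint8* g, uint8* r) {
  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;  // ~(1.164 * 64) per unit of Y
  *b = (uint8)Clamp255(((int)y1 + BB - u * UB) >> 6);
  *g = (uint8)Clamp255(((int)y1 + BG - (u * UG + v * VG)) >> 6);
  *r = (uint8)Clamp255(((int)y1 + BR - v * VR) >> 6);
}
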
1432
1433 struct YuvConstants {
1434 lvec8 kUVToB; // 0
1435 lvec8 kUVToG; // 32
1436 lvec8 kUVToR; // 64
1437 lvec16 kUVBiasB; // 96
1438 lvec16 kUVBiasG; // 128
1439 lvec16 kUVBiasR; // 160
1440 lvec16 kYToRgb; // 192
1441 };
1442
1443 // BT601 constants for YUV to RGB.
1444 static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
1445 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
1446 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
1447 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
1448 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1449 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
1450 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
1451 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
1452 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
1453 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
1454 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
1455 };
1456
1457 // BT601 constants for NV21 where chroma plane is VU instead of UV.
1458 static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
1459 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
1460 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
1461 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
1462 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1463 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
1464 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
1465 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
1466 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
1467 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
1468 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
1469 };
1470
1471 // Read 8 UV from 444
1472 #define READYUV444 \
1473 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1474 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1475 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1476 "punpcklbw %%xmm1,%%xmm0 \n"
1477
1478 // Read 4 UV from 422, upsample to 8 UV
1479 #define READYUV422 \
1480 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1481 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
1482 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
1483 "punpcklbw %%xmm1,%%xmm0 \n" \
1484 "punpcklwd %%xmm0,%%xmm0 \n"
1485
1486 // Read 2 UV from 411, upsample to 8 UV
1487 #define READYUV411 \
1488 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1489 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
1490 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
1491 "punpcklbw %%xmm1,%%xmm0 \n" \
1492 "punpcklwd %%xmm0,%%xmm0 \n" \
1493 "punpckldq %%xmm0,%%xmm0 \n"
1494
1495 // Read 4 UV from NV12, upsample to 8 UV
1496 #define READNV12 \
1497 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
1498 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
1499 "punpcklwd %%xmm0,%%xmm0 \n"
1500
1501 // Convert 8 pixels: 8 UV and 8 Y
1502 #define YUVTORGB(YuvConstants) \
1503 "movdqa %%xmm0,%%xmm1 \n" \
1504 "movdqa %%xmm0,%%xmm2 \n" \
1505 "movdqa %%xmm0,%%xmm3 \n" \
1506 "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \
1507 "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \
1508 "psubw %%xmm1,%%xmm0 \n" \
1509 "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \
1510 "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \
1511 "psubw %%xmm2,%%xmm1 \n" \
1512 "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \
1513 "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \
1514 "psubw %%xmm3,%%xmm2 \n" \
1515 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
1516 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
1517 "punpcklbw %%xmm3,%%xmm3 \n" \
1518 "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \
1519 "paddsw %%xmm3,%%xmm0 \n" \
1520 "paddsw %%xmm3,%%xmm1 \n" \
1521 "paddsw %%xmm3,%%xmm2 \n" \
1522 "psraw $0x6,%%xmm0 \n" \
1523 "psraw $0x6,%%xmm1 \n" \
1524 "psraw $0x6,%%xmm2 \n" \
1525 "packuswb %%xmm0,%%xmm0 \n" \
1526 "packuswb %%xmm1,%%xmm1 \n" \
1527 "packuswb %%xmm2,%%xmm2 \n"
1528
1529 // Store 8 ARGB values. Assumes XMM5 holds 0xff bytes (opaque alpha).
1530 #define STOREARGB \
1531 "punpcklbw %%xmm1,%%xmm0 \n" \
1532 "punpcklbw %%xmm5,%%xmm2 \n" \
1533 "movdqa %%xmm0,%%xmm1 \n" \
1534 "punpcklwd %%xmm2,%%xmm0 \n" \
1535 "punpckhwd %%xmm2,%%xmm1 \n" \
1536 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
1537 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \
1538 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
1539
1540 // Store 8 BGRA values. Sets XMM5 to 0xff bytes for opaque alpha.
1541 #define STOREBGRA \
1542 "pcmpeqb %%xmm5,%%xmm5 \n" \
1543 "punpcklbw %%xmm0,%%xmm1 \n" \
1544 "punpcklbw %%xmm2,%%xmm5 \n" \
1545 "movdqa %%xmm5,%%xmm0 \n" \
1546 "punpcklwd %%xmm1,%%xmm5 \n" \
1547 "punpckhwd %%xmm1,%%xmm0 \n" \
1548 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
1549 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \
1550 "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
1551
1552 // Store 8 ABGR values. Assumes XMM5 holds 0xff bytes (opaque alpha).
1553 #define STOREABGR \
1554 "punpcklbw %%xmm1,%%xmm2 \n" \
1555 "punpcklbw %%xmm5,%%xmm0 \n" \
1556 "movdqa %%xmm2,%%xmm1 \n" \
1557 "punpcklwd %%xmm0,%%xmm2 \n" \
1558 "punpckhwd %%xmm0,%%xmm1 \n" \
1559 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
1560 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \
1561 "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
1562
1563 // Store 8 RGBA values. Sets XMM5 to 0xff bytes for opaque alpha.
1564 #define STORERGBA \
1565 "pcmpeqb %%xmm5,%%xmm5 \n" \
1566 "punpcklbw %%xmm2,%%xmm1 \n" \
1567 "punpcklbw %%xmm0,%%xmm5 \n" \
1568 "movdqa %%xmm5,%%xmm0 \n" \
1569 "punpcklwd %%xmm1,%%xmm5 \n" \
1570 "punpckhwd %%xmm1,%%xmm0 \n" \
1571 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
1572 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \
1573 "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
1574
1575 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1576 const uint8* u_buf,
1577 const uint8* v_buf,
1578 uint8* dst_argb,
1579 int width) {
1580 asm volatile (
1581 "sub %[u_buf],%[v_buf] \n"
1582 "pcmpeqb %%xmm5,%%xmm5 \n"
1583 LABELALIGN
1584 "1: \n"
1585 READYUV444
1586 YUVTORGB(kYuvConstants)
1587 STOREARGB
1588 "sub $0x8,%[width] \n"
1589 "jg 1b \n"
1590 : [y_buf]"+r"(y_buf), // %[y_buf]
1591 [u_buf]"+r"(u_buf), // %[u_buf]
1592 [v_buf]"+r"(v_buf), // %[v_buf]
1593 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1594 [width]"+rm"(width) // %[width]
1595 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1596 : "memory", "cc", NACL_R14
1597 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1598 );
1599 }
1600
1601 // TODO(fbarchard): Consider putting masks into constants.
1602 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1603 const uint8* u_buf,
1604 const uint8* v_buf,
1605 uint8* dst_rgb24,
1606 int width) {
1607 asm volatile (
1608 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1609 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1610 "sub %[u_buf],%[v_buf] \n"
1611 LABELALIGN
1612 "1: \n"
1613 READYUV422
1614 YUVTORGB(kYuvConstants)
1615 "punpcklbw %%xmm1,%%xmm0 \n"
1616 "punpcklbw %%xmm2,%%xmm2 \n"
1617 "movdqa %%xmm0,%%xmm1 \n"
1618 "punpcklwd %%xmm2,%%xmm0 \n"
1619 "punpckhwd %%xmm2,%%xmm1 \n"
1620 "pshufb %%xmm5,%%xmm0 \n"
1621 "pshufb %%xmm6,%%xmm1 \n"
1622 "palignr $0xc,%%xmm0,%%xmm1 \n"
1623 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
1624 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
1625 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
1626 "subl $0x8,%[width] \n"
1627 "jg 1b \n"
1628 : [y_buf]"+r"(y_buf), // %[y_buf]
1629 [u_buf]"+r"(u_buf), // %[u_buf]
1630 [v_buf]"+r"(v_buf), // %[v_buf]
1631 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
1632 // TODO(fbarchard): Make width a register for 32 bit.
1633 #if defined(__i386__) && defined(__pic__)
1634 [width]"+m"(width) // %[width]
1635 #else
1636 [width]"+rm"(width) // %[width]
1637 #endif
1638 : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
1639 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1640 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
1641 : "memory", "cc", NACL_R14
1642 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
1643 );
1644 }
1645
1646 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1647 const uint8* u_buf,
1648 const uint8* v_buf,
1649 uint8* dst_raw,
1650 int width) {
1651 asm volatile (
1652 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1653 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1654 "sub %[u_buf],%[v_buf] \n"
1655 LABELALIGN
1656 "1: \n"
1657 READYUV422
1658 YUVTORGB(kYuvConstants)
1659 "punpcklbw %%xmm1,%%xmm0 \n"
1660 "punpcklbw %%xmm2,%%xmm2 \n"
1661 "movdqa %%xmm0,%%xmm1 \n"
1662 "punpcklwd %%xmm2,%%xmm0 \n"
1663 "punpckhwd %%xmm2,%%xmm1 \n"
1664 "pshufb %%xmm5,%%xmm0 \n"
1665 "pshufb %%xmm6,%%xmm1 \n"
1666 "palignr $0xc,%%xmm0,%%xmm1 \n"
1667 "movq %%xmm0," MEMACCESS([dst_raw]) " \n"
1668 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
1669 "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
1670 "subl $0x8,%[width] \n"
1671 "jg 1b \n"
1672 : [y_buf]"+r"(y_buf), // %[y_buf]
1673 [u_buf]"+r"(u_buf), // %[u_buf]
1674 [v_buf]"+r"(v_buf), // %[v_buf]
1675 [dst_raw]"+r"(dst_raw), // %[dst_raw]
1676 // TODO(fbarchard): Make width a register for 32 bit.
1677 #if defined(__i386__) && defined(__pic__)
1678 [width]"+m"(width) // %[width]
1679 #else
1680 [width]"+rm"(width) // %[width]
1681 #endif
1682 : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
1683 [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1684 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
1685 : "memory", "cc", NACL_R14
1686 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
1687 );
1688 }
1689
1690 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
1691 const uint8* u_buf,
1692 const uint8* v_buf,
1693 uint8* dst_argb,
1694 int width) {
1695 asm volatile (
1696 "sub %[u_buf],%[v_buf] \n"
1697 "pcmpeqb %%xmm5,%%xmm5 \n"
1698 LABELALIGN
1699 "1: \n"
1700 READYUV422
1701 YUVTORGB(kYuvConstants)
1702 STOREARGB
1703 "sub $0x8,%[width] \n"
1704 "jg 1b \n"
1705 : [y_buf]"+r"(y_buf), // %[y_buf]
1706 [u_buf]"+r"(u_buf), // %[u_buf]
1707 [v_buf]"+r"(v_buf), // %[v_buf]
1708 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1709 [width]"+rm"(width) // %[width]
1710 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1711 : "memory", "cc", NACL_R14
1712 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1713 );
1714 }
1715
1716 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1717 const uint8* u_buf,
1718 const uint8* v_buf,
1719 uint8* dst_argb,
1720 int width) {
1721 asm volatile (
1722 "sub %[u_buf],%[v_buf] \n"
1723 "pcmpeqb %%xmm5,%%xmm5 \n"
1724 LABELALIGN
1725 "1: \n"
1726 READYUV411
1727 YUVTORGB(kYuvConstants)
1728 STOREARGB
1729 "sub $0x8,%[width] \n"
1730 "jg 1b \n"
1731 : [y_buf]"+r"(y_buf), // %[y_buf]
1732 [u_buf]"+r"(u_buf), // %[u_buf]
1733 [v_buf]"+r"(v_buf), // %[v_buf]
1734 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1735 [width]"+rm"(width) // %[width]
1736 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1737 : "memory", "cc", NACL_R14
1738 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1739 );
1740 }
1741
1742 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1743 const uint8* uv_buf,
1744 uint8* dst_argb,
1745 int width) {
1746 asm volatile (
1747 "pcmpeqb %%xmm5,%%xmm5 \n"
1748 LABELALIGN
1749 "1: \n"
1750 READNV12
1751 YUVTORGB(kYuvConstants)
1752 STOREARGB
1753 "sub $0x8,%[width] \n"
1754 "jg 1b \n"
1755 : [y_buf]"+r"(y_buf), // %[y_buf]
1756 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1757 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1758 [width]"+rm"(width) // %[width]
1759 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1760 // Does not use r14.
1761 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1762 );
1763 }
1764
1765 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1766 const uint8* uv_buf,
1767 uint8* dst_argb,
1768 int width) {
1769 asm volatile (
1770 "pcmpeqb %%xmm5,%%xmm5 \n"
1771 LABELALIGN
1772 "1: \n"
1773 READNV12
1774 YUVTORGB(kYuvConstants)
1775 STOREARGB
1776 "sub $0x8,%[width] \n"
1777 "jg 1b \n"
1778 : [y_buf]"+r"(y_buf), // %[y_buf]
1779 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1780 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1781 [width]"+rm"(width) // %[width]
1782 : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
1783 // Does not use r14.
1784 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1785 );
1786 }
1787
1788 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1789 const uint8* u_buf,
1790 const uint8* v_buf,
1791 uint8* dst_bgra,
1792 int width) {
1793 asm volatile (
1794 "sub %[u_buf],%[v_buf] \n"
1795 "pcmpeqb %%xmm5,%%xmm5 \n"
1796 LABELALIGN
1797 "1: \n"
1798 READYUV422
1799 YUVTORGB(kYuvConstants)
1800 STOREBGRA
1801 "sub $0x8,%[width] \n"
1802 "jg 1b \n"
1803 : [y_buf]"+r"(y_buf), // %[y_buf]
1804 [u_buf]"+r"(u_buf), // %[u_buf]
1805 [v_buf]"+r"(v_buf), // %[v_buf]
1806 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
1807 [width]"+rm"(width) // %[width]
1808 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1809 : "memory", "cc", NACL_R14
1810 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1811 );
1812 }
1813
1814 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
1815 const uint8* u_buf,
1816 const uint8* v_buf,
1817 uint8* dst_abgr,
1818 int width) {
1819 asm volatile (
1820 "sub %[u_buf],%[v_buf] \n"
1821 "pcmpeqb %%xmm5,%%xmm5 \n"
1822 LABELALIGN
1823 "1: \n"
1824 READYUV422
1825 YUVTORGB(kYuvConstants)
1826 STOREABGR
1827 "sub $0x8,%[width] \n"
1828 "jg 1b \n"
1829 : [y_buf]"+r"(y_buf), // %[y_buf]
1830 [u_buf]"+r"(u_buf), // %[u_buf]
1831 [v_buf]"+r"(v_buf), // %[v_buf]
1832 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
1833 [width]"+rm"(width) // %[width]
1834 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1835 : "memory", "cc", NACL_R14
1836 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1837 );
1838 }
1839
1840 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
1841 const uint8* u_buf,
1842 const uint8* v_buf,
1843 uint8* dst_rgba,
1844 int width) {
1845 asm volatile (
1846 "sub %[u_buf],%[v_buf] \n"
1847 "pcmpeqb %%xmm5,%%xmm5 \n"
1848 LABELALIGN
1849 "1: \n"
1850 READYUV422
1851 YUVTORGB(kYuvConstants)
1852 STORERGBA
1853 "sub $0x8,%[width] \n"
1854 "jg 1b \n"
1855 : [y_buf]"+r"(y_buf), // %[y_buf]
1856 [u_buf]"+r"(u_buf), // %[u_buf]
1857 [v_buf]"+r"(v_buf), // %[v_buf]
1858 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
1859 [width]"+rm"(width) // %[width]
1860 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1861 : "memory", "cc", NACL_R14
1862 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1863 );
1864 }
1865
1866 #endif // HAS_I422TOARGBROW_SSSE3
1867
1868 // Read 8 UV from 422, upsample to 16 UV.
1869 #define READYUV422_AVX2 \
1870 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1871 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1872 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1873 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1874 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1875 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
1876
1877 // Convert 16 pixels: 16 UV and 16 Y.
1878 #define YUVTORGB_AVX2(YuvConstants) \
1879 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \
1880 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \
1881 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
1882 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
1883 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
1884 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \
1885 "vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \
1886 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \
1887 "vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \
1888 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
1889 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
1890 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
1891 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \
1892 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \
1893 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \
1894 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
1895 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
1896 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
1897 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
1898 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
1899 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
1900 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
1901 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
1902
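// A rough scalar model of one output channel of YUVTORGB_AVX2 (an
// illustrative sketch only; "uv_term", "bias" and "yg" stand for values
// loaded from kYuvConstants and are assumptions about its layout):
static inline uint8 YuvChannelSketch(uint8 y, int uv_term, int bias, int yg) {
  int y1 = (y * 0x0101 * yg) >> 16;    // vpunpcklbw + vpmulhuw on Y
  int v = (bias - uv_term + y1) >> 6;  // vpsubw, vpaddsw, vpsraw $0x6
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));  // vpackuswb saturates
}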
1903 #if defined(HAS_I422TOBGRAROW_AVX2)
1904 // 16 pixels
1905 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
1906 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
1907 const uint8* u_buf,
1908 const uint8* v_buf,
1909 uint8* dst_bgra,
1910 int width) {
1911 asm volatile (
1912 "sub %[u_buf],%[v_buf] \n"
1913 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
1914 LABELALIGN
1915 "1: \n"
1916 READYUV422_AVX2
1917 YUVTORGB_AVX2(kYuvConstants)
1918
1919 // Step 3: Weave into BGRA
1920 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB
1921 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
1922 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR
1923 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
1924 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels
1925 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels
1926
1927 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n"
1928 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
1929 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
1930 "sub $0x10,%[width] \n"
1931 "jg 1b \n"
1932 "vzeroupper \n"
1933 : [y_buf]"+r"(y_buf), // %[y_buf]
1934 [u_buf]"+r"(u_buf), // %[u_buf]
1935 [v_buf]"+r"(v_buf), // %[v_buf]
1936 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
1937 [width]"+rm"(width) // %[width]
1938 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1939 : "memory", "cc", NACL_R14
1940 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1941 );
1942 }
1943 #endif // HAS_I422TOBGRAROW_AVX2
1944
1945 #if defined(HAS_I422TOARGBROW_AVX2)
1946 // 16 pixels
1947 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
1948 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
1949 const uint8* u_buf,
1950 const uint8* v_buf,
1951 uint8* dst_argb,
1952 int width) {
1953 asm volatile (
1954 "sub %[u_buf],%[v_buf] \n"
1955 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
1956 LABELALIGN
1957 "1: \n"
1958 READYUV422_AVX2
1959 YUVTORGB_AVX2(kYuvConstants)
1960
1961 // Step 3: Weave into ARGB
1962 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
1963 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
1964 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
1965 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
1966 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
1967 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
1968
1969 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
1970 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
1971 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
1972 "sub $0x10,%[width] \n"
1973 "jg 1b \n"
1974 "vzeroupper \n"
1975 : [y_buf]"+r"(y_buf), // %[y_buf]
1976 [u_buf]"+r"(u_buf), // %[u_buf]
1977 [v_buf]"+r"(v_buf), // %[v_buf]
1978 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1979 [width]"+rm"(width) // %[width]
1980 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1981 : "memory", "cc", NACL_R14
1982 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1983 );
1984 }
1985 #endif // HAS_I422TOARGBROW_AVX2
1986
1987 #if defined(HAS_I422TOABGRROW_AVX2)
1988 // 16 pixels
1989 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
1990 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
1991 const uint8* u_buf,
1992 const uint8* v_buf,
1993 uint8* dst_argb,
1994 int width) {
1995 asm volatile (
1996 "sub %[u_buf],%[v_buf] \n"
1997 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
1998 LABELALIGN
1999 "1: \n"
2000 READYUV422_AVX2
2001 YUVTORGB_AVX2(kYuvConstants)
2002
2003 // Step 3: Weave into ABGR
2004 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG
2005 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2006 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA
2007 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2008 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels
2009 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels
2010 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
2011 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2012 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2013 "sub $0x10,%[width] \n"
2014 "jg 1b \n"
2015 "vzeroupper \n"
2016 : [y_buf]"+r"(y_buf), // %[y_buf]
2017 [u_buf]"+r"(u_buf), // %[u_buf]
2018 [v_buf]"+r"(v_buf), // %[v_buf]
2019 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2020 [width]"+rm"(width) // %[width]
2021 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2022 : "memory", "cc", NACL_R14
2023 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2024 );
2025 }
2026 #endif // HAS_I422TOABGRROW_AVX2
2027
2028 #if defined(HAS_I422TORGBAROW_AVX2)
2029 // 16 pixels
2030 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2031 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2032 const uint8* u_buf,
2033 const uint8* v_buf,
2034 uint8* dst_argb,
2035 int width) {
2036 asm volatile (
2037 "sub %[u_buf],%[v_buf] \n"
2038 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2039 LABELALIGN
2040 "1: \n"
2041 READYUV422_AVX2
2042 YUVTORGB_AVX2(kYuvConstants)
2043
2044 // Step 3: Weave into RGBA
2045 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
2046 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2047 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
2048 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2049 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
2050 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
2051 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
2052 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2053 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2054 "sub $0x10,%[width] \n"
2055 "jg 1b \n"
2056 "vzeroupper \n"
2057 : [y_buf]"+r"(y_buf), // %[y_buf]
2058 [u_buf]"+r"(u_buf), // %[u_buf]
2059 [v_buf]"+r"(v_buf), // %[v_buf]
2060 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2061 [width]"+rm"(width) // %[width]
2062 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2063 : "memory", "cc", NACL_R14
2064 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2065 );
2066 }
2067 #endif // HAS_I422TORGBAROW_AVX2
2068
2069 #ifdef HAS_YTOARGBROW_SSE2
2070 void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2071 asm volatile (
2072 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
2073 "movd %%eax,%%xmm2 \n"
2074 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2075 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
2076 "movd %%eax,%%xmm3 \n"
2077 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2078 "pcmpeqb %%xmm4,%%xmm4 \n"
2079 "pslld $0x18,%%xmm4 \n"
2080 LABELALIGN
2081 "1: \n"
2082 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2083 "movq " MEMACCESS(0) ",%%xmm0 \n"
2084 "lea " MEMLEA(0x8,0) ",%0 \n"
2085 "punpcklbw %%xmm0,%%xmm0 \n"
2086 "pmulhuw %%xmm2,%%xmm0 \n"
2087 "psubusw %%xmm3,%%xmm0 \n"
2088 "psrlw $6, %%xmm0 \n"
2089 "packuswb %%xmm0,%%xmm0 \n"
2090
2091 // Step 2: Weave into ARGB
2092 "punpcklbw %%xmm0,%%xmm0 \n"
2093 "movdqa %%xmm0,%%xmm1 \n"
2094 "punpcklwd %%xmm0,%%xmm0 \n"
2095 "punpckhwd %%xmm1,%%xmm1 \n"
2096 "por %%xmm4,%%xmm0 \n"
2097 "por %%xmm4,%%xmm1 \n"
2098 "movdqu %%xmm0," MEMACCESS(1) " \n"
2099 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
2100 "lea " MEMLEA(0x20,1) ",%1 \n"
2101
2102 "sub $0x8,%2 \n"
2103 "jg 1b \n"
2104 : "+r"(y_buf), // %0
2105 "+r"(dst_argb), // %1
2106 "+rm"(width) // %2
2107 :
2108 : "memory", "cc", "eax"
2109 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2110 );
2111 }
2112 #endif // HAS_YTOARGBROW_SSE2
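// Illustrative scalar equivalent of the Y scaling above (a sketch, not part
// of the library):
static inline uint8 YToGraySketch(uint8 y) {
  int yw = y * 0x0101;                // punpcklbw duplicates the byte
  int g = (yw * 18997) >> 16;         // pmulhuw with 0x4a35
  g = g > 1160 ? g - 1160 : 0;        // psubusw with 0x0488 clamps at zero
  g >>= 6;                            // psrlw $6; net effect ~(y - 16) * 1.164
  return (uint8)(g > 255 ? 255 : g);  // packuswb saturates
}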
2113
2114 #ifdef HAS_YTOARGBROW_AVX2
2115 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2116 // Note: vpunpcklbw operates per 128-bit lane, which permutes ("mutates") the
2116 // byte order; the matching per-lane vpackuswb restores ("unmutates") it.
2117 void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
2118 asm volatile (
2119 "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
2120 "vmovd %%eax,%%xmm2 \n"
2121 "vbroadcastss %%xmm2,%%ymm2 \n"
2122 "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
2123 "vmovd %%eax,%%xmm3 \n"
2124 "vbroadcastss %%xmm3,%%ymm3 \n"
2125 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
2126 "vpslld $0x18,%%ymm4,%%ymm4 \n"
2127
2128 LABELALIGN
2129 "1: \n"
2130 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2131 "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
2132 "lea " MEMLEA(0x10,0) ",%0 \n"
2133 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2134 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
2135 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
2136 "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
2137 "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
2138 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
2139 "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
2140 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2141 "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
2142 "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
2143 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
2144 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
2145 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2146 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
2147 "lea " MEMLEA(0x40,1) ",%1 \n"
2148 "sub $0x10,%2 \n"
2149 "jg 1b \n"
2150 "vzeroupper \n"
2151 : "+r"(y_buf), // %0
2152 "+r"(dst_argb), // %1
2153 "+rm"(width) // %2
2154 :
2155 : "memory", "cc", "eax"
2156 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2157 );
2158 }
2159 #endif // HAS_YTOARGBROW_AVX2
2160
2161 #ifdef HAS_MIRRORROW_SSSE3
2162 // Shuffle table for reversing the bytes.
2163 static uvec8 kShuffleMirror = {
2164 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2165 };
2166
2167 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2168 intptr_t temp_width = (intptr_t)(width);
2169 asm volatile (
2170 "movdqa %3,%%xmm5 \n"
2171 LABELALIGN
2172 "1: \n"
2173 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
2174 "pshufb %%xmm5,%%xmm0 \n"
2175 "movdqu %%xmm0," MEMACCESS(1) " \n"
2176 "lea " MEMLEA(0x10,1) ",%1 \n"
2177 "sub $0x10,%2 \n"
2178 "jg 1b \n"
2179 : "+r"(src), // %0
2180 "+r"(dst), // %1
2181 "+r"(temp_width) // %2
2182 : "m"(kShuffleMirror) // %3
2183 : "memory", "cc", NACL_R14
2184 "xmm0", "xmm5"
2185 );
2186 }
2187 #endif // HAS_MIRRORROW_SSSE3
2188
2189 #ifdef HAS_MIRRORROW_AVX2
2190 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2191 intptr_t temp_width = (intptr_t)(width);
2192 asm volatile (
2193 "vbroadcastf128 %3,%%ymm5 \n"
2194 LABELALIGN
2195 "1: \n"
2196 MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
2197 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
2198 "vpermq $0x4e,%%ymm0,%%ymm0 \n"
2199 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2200 "lea " MEMLEA(0x20,1) ",%1 \n"
2201 "sub $0x20,%2 \n"
2202 "jg 1b \n"
2203 "vzeroupper \n"
2204 : "+r"(src), // %0
2205 "+r"(dst), // %1
2206 "+r"(temp_width) // %2
2207 : "m"(kShuffleMirror) // %3
2208 : "memory", "cc", NACL_R14
2209 "xmm0", "xmm5"
2210 );
2211 }
2212 #endif // HAS_MIRRORROW_AVX2
2213
2214 #ifdef HAS_MIRRORROW_SSE2
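// SSE2 has no pshufb, so the reversal below is done in three steps: swap the
// two bytes within each word (psllw/psrlw/por), reverse the words within each
// 64-bit half (pshuflw/pshufhw), then swap the two halves (pshufd $0x4e).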
2215 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2216 intptr_t temp_width = (intptr_t)(width);
2217 asm volatile (
2218 LABELALIGN
2219 "1: \n"
2220 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
2221 "movdqa %%xmm0,%%xmm1 \n"
2222 "psllw $0x8,%%xmm0 \n"
2223 "psrlw $0x8,%%xmm1 \n"
2224 "por %%xmm1,%%xmm0 \n"
2225 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2226 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2227 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2228 "movdqu %%xmm0," MEMACCESS(1) " \n"
2229 "lea " MEMLEA(0x10,1)",%1 \n"
2230 "sub $0x10,%2 \n"
2231 "jg 1b \n"
2232 : "+r"(src), // %0
2233 "+r"(dst), // %1
2234 "+r"(temp_width) // %2
2235 :
2236 : "memory", "cc", NACL_R14
2237 "xmm0", "xmm1"
2238 );
2239 }
2240 #endif // HAS_MIRRORROW_SSE2
2241
2242 #ifdef HAS_MIRRORROW_UV_SSSE3
2243 // Shuffle table for reversing the bytes of UV channels.
2244 static uvec8 kShuffleMirrorUV = {
2245 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2246 };
2247 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2248 int width) {
2249 intptr_t temp_width = (intptr_t)(width);
2250 asm volatile (
2251 "movdqa %4,%%xmm1 \n"
2252 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
2253 "sub %1,%2 \n"
2254 LABELALIGN
2255 "1: \n"
2256 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2257 "lea " MEMLEA(-0x10,0) ",%0 \n"
2258 "pshufb %%xmm1,%%xmm0 \n"
2259 "movlpd %%xmm0," MEMACCESS(1) " \n"
2260 MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
2261 "lea " MEMLEA(0x8,1) ",%1 \n"
2262 "sub $8,%3 \n"
2263 "jg 1b \n"
2264 : "+r"(src), // %0
2265 "+r"(dst_u), // %1
2266 "+r"(dst_v), // %2
2267 "+r"(temp_width) // %3
2268 : "m"(kShuffleMirrorUV) // %4
2269 : "memory", "cc", NACL_R14
2270 "xmm0", "xmm1"
2271 );
2272 }
2273 #endif // HAS_MIRRORROW_UV_SSSE3
2274
2275 #ifdef HAS_ARGBMIRRORROW_SSE2
2276
2277 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2278 intptr_t temp_width = (intptr_t)(width);
2279 asm volatile (
2280 "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
2281 LABELALIGN
2282 "1: \n"
2283 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2284 "pshufd $0x1b,%%xmm0,%%xmm0 \n"
2285 "lea " MEMLEA(-0x10,0) ",%0 \n"
2286 "movdqu %%xmm0," MEMACCESS(1) " \n"
2287 "lea " MEMLEA(0x10,1) ",%1 \n"
2288 "sub $0x4,%2 \n"
2289 "jg 1b \n"
2290 : "+r"(src), // %0
2291 "+r"(dst), // %1
2292 "+r"(temp_width) // %2
2293 :
2294 : "memory", "cc"
2295 , "xmm0"
2296 );
2297 }
2298 #endif // HAS_ARGBMIRRORROW_SSE2
2299
2300 #ifdef HAS_ARGBMIRRORROW_AVX2
2301 // Shuffle table for reversing the bytes.
2302 static const ulvec32 kARGBShuffleMirror_AVX2 = {
2303 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2304 };
2305 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2306 intptr_t temp_width = (intptr_t)(width);
2307 asm volatile (
2308 "vmovdqu %3,%%ymm5 \n"
2309 LABELALIGN
2310 "1: \n"
2311 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
2312 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2313 "lea " MEMLEA(0x20,1) ",%1 \n"
2314 "sub $0x8,%2 \n"
2315 "jg 1b \n"
2316 "vzeroupper \n"
2317 : "+r"(src), // %0
2318 "+r"(dst), // %1
2319 "+r"(temp_width) // %2
2320 : "m"(kARGBShuffleMirror_AVX2) // %3
2321 : "memory", "cc", NACL_R14
2322 "xmm0", "xmm5"
2323 );
2324 }
2325 #endif // HAS_ARGBMIRRORROW_AVX2
2326
2327 #ifdef HAS_SPLITUVROW_AVX2
2328 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2329 asm volatile (
2330 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2331 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
2332 "sub %1,%2 \n"
2333 LABELALIGN
2334 "1: \n"
2335 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2336 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2337 "lea " MEMLEA(0x40,0) ",%0 \n"
2338 "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
2339 "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
2340 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
2341 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
2342 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2343 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
2344 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2345 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2346 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2347 MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
2348 "lea " MEMLEA(0x20,1) ",%1 \n"
2349 "sub $0x20,%3 \n"
2350 "jg 1b \n"
2351 "vzeroupper \n"
2352 : "+r"(src_uv), // %0
2353 "+r"(dst_u), // %1
2354 "+r"(dst_v), // %2
2355 "+r"(pix) // %3
2356 :
2357 : "memory", "cc", NACL_R14
2358 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2359 );
2360 }
2361 #endif // HAS_SPLITUVROW_AVX2
2362
2363 #ifdef HAS_SPLITUVROW_SSE2
2364 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2365 asm volatile (
2366 "pcmpeqb %%xmm5,%%xmm5 \n"
2367 "psrlw $0x8,%%xmm5 \n"
2368 "sub %1,%2 \n"
2369 LABELALIGN
2370 "1: \n"
2371 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2372 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2373 "lea " MEMLEA(0x20,0) ",%0 \n"
2374 "movdqa %%xmm0,%%xmm2 \n"
2375 "movdqa %%xmm1,%%xmm3 \n"
2376 "pand %%xmm5,%%xmm0 \n"
2377 "pand %%xmm5,%%xmm1 \n"
2378 "packuswb %%xmm1,%%xmm0 \n"
2379 "psrlw $0x8,%%xmm2 \n"
2380 "psrlw $0x8,%%xmm3 \n"
2381 "packuswb %%xmm3,%%xmm2 \n"
2382 "movdqu %%xmm0," MEMACCESS(1) " \n"
2383 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
2384 "lea " MEMLEA(0x10,1) ",%1 \n"
2385 "sub $0x10,%3 \n"
2386 "jg 1b \n"
2387 : "+r"(src_uv), // %0
2388 "+r"(dst_u), // %1
2389 "+r"(dst_v), // %2
2390 "+r"(pix) // %3
2391 :
2392 : "memory", "cc", NACL_R14
2393 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2394 );
2395 }
2396 #endif // HAS_SPLITUVROW_SSE2
2397
2398 #ifdef HAS_MERGEUVROW_AVX2
2399 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2400 int width) {
2401 asm volatile (
2402 "sub %0,%1 \n"
2403 LABELALIGN
2404 "1: \n"
2405 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2406 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
2407 "lea " MEMLEA(0x20,0) ",%0 \n"
2408 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
2409 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
2410 "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
2411 "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
2412 "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
2413 "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
2414 "lea " MEMLEA(0x40,2) ",%2 \n"
2415 "sub $0x20,%3 \n"
2416 "jg 1b \n"
2417 "vzeroupper \n"
2418 : "+r"(src_u), // %0
2419 "+r"(src_v), // %1
2420 "+r"(dst_uv), // %2
2421 "+r"(width) // %3
2422 :
2423 : "memory", "cc", NACL_R14
2424 "xmm0", "xmm1", "xmm2"
2425 );
2426 }
2427 #endif // HAS_MERGEUVROW_AVX2
2428
2429 #ifdef HAS_MERGEUVROW_SSE2
2430 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2431 int width) {
2432 asm volatile (
2433 "sub %0,%1 \n"
2434 LABELALIGN
2435 "1: \n"
2436 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2437 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
2438 "lea " MEMLEA(0x10,0) ",%0 \n"
2439 "movdqa %%xmm0,%%xmm2 \n"
2440 "punpcklbw %%xmm1,%%xmm0 \n"
2441 "punpckhbw %%xmm1,%%xmm2 \n"
2442 "movdqu %%xmm0," MEMACCESS(2) " \n"
2443 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
2444 "lea " MEMLEA(0x20,2) ",%2 \n"
2445 "sub $0x10,%3 \n"
2446 "jg 1b \n"
2447 : "+r"(src_u), // %0
2448 "+r"(src_v), // %1
2449 "+r"(dst_uv), // %2
2450 "+r"(width) // %3
2451 :
2452 : "memory", "cc", NACL_R14
2453 "xmm0", "xmm1", "xmm2"
2454 );
2455 }
2456 #endif // HAS_MERGEUVROW_SSE2
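// Scalar reference for the split/merge pair above (a sketch; libyuv's own C
// fallbacks live in row_common.cc):
static inline void SplitUVSketch(const uint8* uv, uint8* u, uint8* v, int n) {
  for (int x = 0; x < n; ++x) { u[x] = uv[2 * x]; v[x] = uv[2 * x + 1]; }
}
static inline void MergeUVSketch(const uint8* u, const uint8* v, uint8* uv, int n) {
  for (int x = 0; x < n; ++x) { uv[2 * x] = u[x]; uv[2 * x + 1] = v[x]; }
}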
2457
2458 #ifdef HAS_COPYROW_SSE2
2459 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2460 asm volatile (
2461 LABELALIGN
2462 "1: \n"
2463 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2464 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2465 "lea " MEMLEA(0x20,0) ",%0 \n"
2466 "movdqu %%xmm0," MEMACCESS(1) " \n"
2467 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
2468 "lea " MEMLEA(0x20,1) ",%1 \n"
2469 "sub $0x20,%2 \n"
2470 "jg 1b \n"
2471 : "+r"(src), // %0
2472 "+r"(dst), // %1
2473 "+r"(count) // %2
2474 :
2475 : "memory", "cc"
2476 , "xmm0", "xmm1"
2477 );
2478 }
2479 #endif // HAS_COPYROW_SSE2
2480
2481 #ifdef HAS_COPYROW_AVX
2482 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
2483 asm volatile (
2484 LABELALIGN
2485 "1: \n"
2486 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2487 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2488 "lea " MEMLEA(0x40,0) ",%0 \n"
2489 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2490 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
2491 "lea " MEMLEA(0x40,1) ",%1 \n"
2492 "sub $0x40,%2 \n"
2493 "jg 1b \n"
2494 : "+r"(src), // %0
2495 "+r"(dst), // %1
2496 "+r"(count) // %2
2497 :
2498 : "memory", "cc"
2499 , "xmm0", "xmm1"
2500 );
2501 }
2502 #endif // HAS_COPYROW_AVX
2503
2504 #ifdef HAS_COPYROW_ERMS
2505 // Width may be any multiple of 1: rep movsb copies a byte at a time, with no
2505 // alignment or multiple-of-16 requirement.
2506 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
2507 size_t width_tmp = (size_t)(width);
2508 asm volatile (
2509 "rep movsb " MEMMOVESTRING(0,1) " \n"
2510 : "+S"(src), // %0
2511 "+D"(dst), // %1
2512 "+c"(width_tmp) // %2
2513 :
2514 : "memory", "cc"
2515 );
2516 }
2517 #endif // HAS_COPYROW_ERMS
2518
2519 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
2520 // width in pixels
2521 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2522 asm volatile (
2523 "pcmpeqb %%xmm0,%%xmm0 \n"
2524 "pslld $0x18,%%xmm0 \n"
2525 "pcmpeqb %%xmm1,%%xmm1 \n"
2526 "psrld $0x8,%%xmm1 \n"
2527 LABELALIGN
2528 "1: \n"
2529 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
2530 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
2531 "lea " MEMLEA(0x20,0) ",%0 \n"
2532 "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2533 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2534 "pand %%xmm0,%%xmm2 \n"
2535 "pand %%xmm0,%%xmm3 \n"
2536 "pand %%xmm1,%%xmm4 \n"
2537 "pand %%xmm1,%%xmm5 \n"
2538 "por %%xmm4,%%xmm2 \n"
2539 "por %%xmm5,%%xmm3 \n"
2540 "movdqu %%xmm2," MEMACCESS(1) " \n"
2541 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2542 "lea " MEMLEA(0x20,1) ",%1 \n"
2543 "sub $0x8,%2 \n"
2544 "jg 1b \n"
2545 : "+r"(src), // %0
2546 "+r"(dst), // %1
2547 "+r"(width) // %2
2548 :
2549 : "memory", "cc"
2550 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2551 );
2552 }
2553 #endif // HAS_ARGBCOPYALPHAROW_SSE2
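// Per pixel, the masked merge above is simply:
//   dst = (src & 0xff000000u) | (dst & 0x00ffffffu);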
2554
2555 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
2556 // width in pixels
2557 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2558 asm volatile (
2559 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
2560 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
2561 LABELALIGN
2562 "1: \n"
2563 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
2564 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
2565 "lea " MEMLEA(0x40,0) ",%0 \n"
2566 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
2567 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
2568 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
2569 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
2570 "lea " MEMLEA(0x40,1) ",%1 \n"
2571 "sub $0x10,%2 \n"
2572 "jg 1b \n"
2573 "vzeroupper \n"
2574 : "+r"(src), // %0
2575 "+r"(dst), // %1
2576 "+r"(width) // %2
2577 :
2578 : "memory", "cc"
2579 , "xmm0", "xmm1", "xmm2"
2580 );
2581 }
2582 #endif // HAS_ARGBCOPYALPHAROW_AVX2
2583
2584 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
2585 // width in pixels
2586 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2587 asm volatile (
2588 "pcmpeqb %%xmm0,%%xmm0 \n"
2589 "pslld $0x18,%%xmm0 \n"
2590 "pcmpeqb %%xmm1,%%xmm1 \n"
2591 "psrld $0x8,%%xmm1 \n"
2592 LABELALIGN
2593 "1: \n"
2594 "movq " MEMACCESS(0) ",%%xmm2 \n"
2595 "lea " MEMLEA(0x8,0) ",%0 \n"
2596 "punpcklbw %%xmm2,%%xmm2 \n"
2597 "punpckhwd %%xmm2,%%xmm3 \n"
2598 "punpcklwd %%xmm2,%%xmm2 \n"
2599 "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2600 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2601 "pand %%xmm0,%%xmm2 \n"
2602 "pand %%xmm0,%%xmm3 \n"
2603 "pand %%xmm1,%%xmm4 \n"
2604 "pand %%xmm1,%%xmm5 \n"
2605 "por %%xmm4,%%xmm2 \n"
2606 "por %%xmm5,%%xmm3 \n"
2607 "movdqu %%xmm2," MEMACCESS(1) " \n"
2608 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2609 "lea " MEMLEA(0x20,1) ",%1 \n"
2610 "sub $0x8,%2 \n"
2611 "jg 1b \n"
2612 : "+r"(src), // %0
2613 "+r"(dst), // %1
2614 "+r"(width) // %2
2615 :
2616 : "memory", "cc"
2617 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2618 );
2619 }
2620 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
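// Per pixel, the Y-to-alpha copy above is:
//   dst = (dst & 0x00ffffffu) | ((uint32)y << 24);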
2621
2622 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
2623 // width in pixels
2624 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2625 asm volatile (
2626 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
2627 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
2628 LABELALIGN
2629 "1: \n"
2630 "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
2631 "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
2632 "lea " MEMLEA(0x10,0) ",%0 \n"
2633 "vpslld $0x18,%%ymm1,%%ymm1 \n"
2634 "vpslld $0x18,%%ymm2,%%ymm2 \n"
2635 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
2636 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
2637 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
2638 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
2639 "lea " MEMLEA(0x40,1) ",%1 \n"
2640 "sub $0x10,%2 \n"
2641 "jg 1b \n"
2642 "vzeroupper \n"
2643 : "+r"(src), // %0
2644 "+r"(dst), // %1
2645 "+r"(width) // %2
2646 :
2647 : "memory", "cc"
2648 , "xmm0", "xmm1", "xmm2"
2649 );
2650 }
2651 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
2652
2653 #ifdef HAS_SETROW_X86
2654 void SetRow_X86(uint8* dst, uint8 v8, int width) {
2655 size_t width_tmp = (size_t)(width >> 2);
2656 const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes.
2657 asm volatile (
2658 "rep stosl " MEMSTORESTRING(eax,0) " \n"
2659 : "+D"(dst), // %0
2660 "+c"(width_tmp) // %1
2661 : "a"(v32) // %2
2662 : "memory", "cc");
2663 }
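// Example of the byte splat above: v8 = 0x5a gives
// 0x5a * 0x01010101 = 0x5a5a5a5a, so each rep stosl store writes four copies.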
2664
2665 void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
2666 size_t width_tmp = (size_t)(width);
2667 asm volatile (
2668 "rep stosb " MEMSTORESTRING(al,0) " \n"
2669 : "+D"(dst), // %0
2670 "+c"(width_tmp) // %1
2671 : "a"(v8) // %2
2672 : "memory", "cc");
2673 }
2674
2675 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
2676 size_t width_tmp = (size_t)(width);
2677 asm volatile (
2678 "rep stosl " MEMSTORESTRING(eax,0) " \n"
2679 : "+D"(dst_argb), // %0
2680 "+c"(width_tmp) // %1
2681 : "a"(v32) // %2
2682 : "memory", "cc");
2683 }
2684 #endif // HAS_SETROW_X86
2685
2686 #ifdef HAS_YUY2TOYROW_SSE2
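// YUY2 is packed [Y0 U0 Y1 V0 ...], so masking each word with 0x00ff keeps the
// luma and shifting right by 8 keeps the interleaved chroma. UYVY below is the
// byte-swapped layout [U0 Y0 V0 Y1 ...], so the mask and shift swap roles.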
2687 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
2688 asm volatile (
2689 "pcmpeqb %%xmm5,%%xmm5 \n"
2690 "psrlw $0x8,%%xmm5 \n"
2691 LABELALIGN
2692 "1: \n"
2693 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2694 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2695 "lea " MEMLEA(0x20,0) ",%0 \n"
2696 "pand %%xmm5,%%xmm0 \n"
2697 "pand %%xmm5,%%xmm1 \n"
2698 "packuswb %%xmm1,%%xmm0 \n"
2699 "movdqu %%xmm0," MEMACCESS(1) " \n"
2700 "lea " MEMLEA(0x10,1) ",%1 \n"
2701 "sub $0x10,%2 \n"
2702 "jg 1b \n"
2703 : "+r"(src_yuy2), // %0
2704 "+r"(dst_y), // %1
2705 "+r"(pix) // %2
2706 :
2707 : "memory", "cc"
2708 , "xmm0", "xmm1", "xmm5"
2709 );
2710 }
2711
2712 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
2713 uint8* dst_u, uint8* dst_v, int pix) {
2714 asm volatile (
2715 "pcmpeqb %%xmm5,%%xmm5 \n"
2716 "psrlw $0x8,%%xmm5 \n"
2717 "sub %1,%2 \n"
2718 LABELALIGN
2719 "1: \n"
2720 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2721 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2722 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
2723 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
2724 "lea " MEMLEA(0x20,0) ",%0 \n"
2725 "pavgb %%xmm2,%%xmm0 \n"
2726 "pavgb %%xmm3,%%xmm1 \n"
2727 "psrlw $0x8,%%xmm0 \n"
2728 "psrlw $0x8,%%xmm1 \n"
2729 "packuswb %%xmm1,%%xmm0 \n"
2730 "movdqa %%xmm0,%%xmm1 \n"
2731 "pand %%xmm5,%%xmm0 \n"
2732 "packuswb %%xmm0,%%xmm0 \n"
2733 "psrlw $0x8,%%xmm1 \n"
2734 "packuswb %%xmm1,%%xmm1 \n"
2735 "movq %%xmm0," MEMACCESS(1) " \n"
2736 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2737 "lea " MEMLEA(0x8,1) ",%1 \n"
2738 "sub $0x10,%3 \n"
2739 "jg 1b \n"
2740 : "+r"(src_yuy2), // %0
2741 "+r"(dst_u), // %1
2742 "+r"(dst_v), // %2
2743 "+r"(pix) // %3
2744 : "r"((intptr_t)(stride_yuy2)) // %4
2745 : "memory", "cc", NACL_R14
2746 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2747 );
2748 }
2749
2750 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2751 uint8* dst_u, uint8* dst_v, int pix) {
2752 asm volatile (
2753 "pcmpeqb %%xmm5,%%xmm5 \n"
2754 "psrlw $0x8,%%xmm5 \n"
2755 "sub %1,%2 \n"
2756 LABELALIGN
2757 "1: \n"
2758 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2759 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2760 "lea " MEMLEA(0x20,0) ",%0 \n"
2761 "psrlw $0x8,%%xmm0 \n"
2762 "psrlw $0x8,%%xmm1 \n"
2763 "packuswb %%xmm1,%%xmm0 \n"
2764 "movdqa %%xmm0,%%xmm1 \n"
2765 "pand %%xmm5,%%xmm0 \n"
2766 "packuswb %%xmm0,%%xmm0 \n"
2767 "psrlw $0x8,%%xmm1 \n"
2768 "packuswb %%xmm1,%%xmm1 \n"
2769 "movq %%xmm0," MEMACCESS(1) " \n"
2770 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2771 "lea " MEMLEA(0x8,1) ",%1 \n"
2772 "sub $0x10,%3 \n"
2773 "jg 1b \n"
2774 : "+r"(src_yuy2), // %0
2775 "+r"(dst_u), // %1
2776 "+r"(dst_v), // %2
2777 "+r"(pix) // %3
2778 :
2779 : "memory", "cc", NACL_R14
2780 "xmm0", "xmm1", "xmm5"
2781 );
2782 }
2783
2784 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
2785 asm volatile (
2786 LABELALIGN
2787 "1: \n"
2788 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2789 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2790 "lea " MEMLEA(0x20,0) ",%0 \n"
2791 "psrlw $0x8,%%xmm0 \n"
2792 "psrlw $0x8,%%xmm1 \n"
2793 "packuswb %%xmm1,%%xmm0 \n"
2794 "movdqu %%xmm0," MEMACCESS(1) " \n"
2795 "lea " MEMLEA(0x10,1) ",%1 \n"
2796 "sub $0x10,%2 \n"
2797 "jg 1b \n"
2798 : "+r"(src_uyvy), // %0
2799 "+r"(dst_y), // %1
2800 "+r"(pix) // %2
2801 :
2802 : "memory", "cc"
2803 , "xmm0", "xmm1"
2804 );
2805 }
2806
2807 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2808 uint8* dst_u, uint8* dst_v, int pix) {
2809 asm volatile (
2810 "pcmpeqb %%xmm5,%%xmm5 \n"
2811 "psrlw $0x8,%%xmm5 \n"
2812 "sub %1,%2 \n"
2813 LABELALIGN
2814 "1: \n"
2815 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2816 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2817 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
2818 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
2819 "lea " MEMLEA(0x20,0) ",%0 \n"
2820 "pavgb %%xmm2,%%xmm0 \n"
2821 "pavgb %%xmm3,%%xmm1 \n"
2822 "pand %%xmm5,%%xmm0 \n"
2823 "pand %%xmm5,%%xmm1 \n"
2824 "packuswb %%xmm1,%%xmm0 \n"
2825 "movdqa %%xmm0,%%xmm1 \n"
2826 "pand %%xmm5,%%xmm0 \n"
2827 "packuswb %%xmm0,%%xmm0 \n"
2828 "psrlw $0x8,%%xmm1 \n"
2829 "packuswb %%xmm1,%%xmm1 \n"
2830 "movq %%xmm0," MEMACCESS(1) " \n"
2831 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2832 "lea " MEMLEA(0x8,1) ",%1 \n"
2833 "sub $0x10,%3 \n"
2834 "jg 1b \n"
2835 : "+r"(src_uyvy), // %0
2836 "+r"(dst_u), // %1
2837 "+r"(dst_v), // %2
2838 "+r"(pix) // %3
2839 : "r"((intptr_t)(stride_uyvy)) // %4
2840 : "memory", "cc", NACL_R14
2841 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2842 );
2843 }
2844
2845 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2846 uint8* dst_u, uint8* dst_v, int pix) {
2847 asm volatile (
2848 "pcmpeqb %%xmm5,%%xmm5 \n"
2849 "psrlw $0x8,%%xmm5 \n"
2850 "sub %1,%2 \n"
2851 LABELALIGN
2852 "1: \n"
2853 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2854 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2855 "lea " MEMLEA(0x20,0) ",%0 \n"
2856 "pand %%xmm5,%%xmm0 \n"
2857 "pand %%xmm5,%%xmm1 \n"
2858 "packuswb %%xmm1,%%xmm0 \n"
2859 "movdqa %%xmm0,%%xmm1 \n"
2860 "pand %%xmm5,%%xmm0 \n"
2861 "packuswb %%xmm0,%%xmm0 \n"
2862 "psrlw $0x8,%%xmm1 \n"
2863 "packuswb %%xmm1,%%xmm1 \n"
2864 "movq %%xmm0," MEMACCESS(1) " \n"
2865 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2866 "lea " MEMLEA(0x8,1) ",%1 \n"
2867 "sub $0x10,%3 \n"
2868 "jg 1b \n"
2869 : "+r"(src_uyvy), // %0
2870 "+r"(dst_u), // %1
2871 "+r"(dst_v), // %2
2872 "+r"(pix) // %3
2873 :
2874 : "memory", "cc", NACL_R14
2875 "xmm0", "xmm1", "xmm5"
2876 );
2877 }
2878 #endif // HAS_YUY2TOYROW_SSE2
2879
2880 #ifdef HAS_YUY2TOYROW_AVX2
2881 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
2882 asm volatile (
2883 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2884 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
2885 LABELALIGN
2886 "1: \n"
2887 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2888 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2889 "lea " MEMLEA(0x40,0) ",%0 \n"
2890 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
2891 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
2892 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2893 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2894 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2895 "lea " MEMLEA(0x20,1) ",%1 \n"
2896 "sub $0x20,%2 \n"
2897 "jg 1b \n"
2898 "vzeroupper \n"
2899 : "+r"(src_yuy2), // %0
2900 "+r"(dst_y), // %1
2901 "+r"(pix) // %2
2902 :
2903 : "memory", "cc"
2904 , "xmm0", "xmm1", "xmm5"
2905 );
2906 }
2907
2908 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
2909 uint8* dst_u, uint8* dst_v, int pix) {
2910 asm volatile (
2911 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2912 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
2913 "sub %1,%2 \n"
2914 LABELALIGN
2915 "1: \n"
2916 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2917 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2918 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
2919 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
2920 "lea " MEMLEA(0x40,0) ",%0 \n"
2921 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
2922 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
2923 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2924 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2925 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
2926 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
2927 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
2928 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
2929 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2930 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2931 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
2932 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
2933 "lea " MEMLEA(0x10,1) ",%1 \n"
2934 "sub $0x20,%3 \n"
2935 "jg 1b \n"
2936 "vzeroupper \n"
2937 : "+r"(src_yuy2), // %0
2938 "+r"(dst_u), // %1
2939 "+r"(dst_v), // %2
2940 "+r"(pix) // %3
2941 : "r"((intptr_t)(stride_yuy2)) // %4
2942 : "memory", "cc", NACL_R14
2943 "xmm0", "xmm1", "xmm5"
2944 );
2945 }
2946
2947 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
2948 uint8* dst_u, uint8* dst_v, int pix) {
2949 asm volatile (
2950 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2951 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
2952 "sub %1,%2 \n"
2953 LABELALIGN
2954 "1: \n"
2955 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2956 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2957 "lea " MEMLEA(0x40,0) ",%0 \n"
2958 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
2959 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
2960 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2961 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2962 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
2963 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
2964 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
2965 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
2966 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2967 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2968 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
2969 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
2970 "lea " MEMLEA(0x10,1) ",%1 \n"
2971 "sub $0x20,%3 \n"
2972 "jg 1b \n"
2973 "vzeroupper \n"
2974 : "+r"(src_yuy2), // %0
2975 "+r"(dst_u), // %1
2976 "+r"(dst_v), // %2
2977 "+r"(pix) // %3
2978 :
2979 : "memory", "cc", NACL_R14
2980 "xmm0", "xmm1", "xmm5"
2981 );
2982 }
2983
2984 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
2985 asm volatile (
2986 LABELALIGN
2987 "1: \n"
2988 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2989 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2990 "lea " MEMLEA(0x40,0) ",%0 \n"
2991 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
2992 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
2993 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2994 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2995 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2996 "lea " MEMLEA(0x20,1) ",%1 \n"
2997 "sub $0x20,%2 \n"
2998 "jg 1b \n"
2999 "vzeroupper \n"
3000 : "+r"(src_uyvy), // %0
3001 "+r"(dst_y), // %1
3002 "+r"(pix) // %2
3003 :
3004 : "memory", "cc"
3005 , "xmm0", "xmm1", "xmm5"
3006 );
3007 }
3008 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3009 uint8* dst_u, uint8* dst_v, int pix) {
3010 asm volatile (
3011 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3012 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3013 "sub %1,%2 \n"
3014
3015 LABELALIGN
3016 "1: \n"
3017 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3018 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3019 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3020 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3021 "lea " MEMLEA(0x40,0) ",%0 \n"
3022 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3023 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3024 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3025 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3026 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3027 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3028 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3029 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3030 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3031 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3032 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3033 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3034 "lea " MEMLEA(0x10,1) ",%1 \n"
3035 "sub $0x20,%3 \n"
3036 "jg 1b \n"
3037 "vzeroupper \n"
3038 : "+r"(src_uyvy), // %0
3039 "+r"(dst_u), // %1
3040 "+r"(dst_v), // %2
3041 "+r"(pix) // %3
3042 : "r"((intptr_t)(stride_uyvy)) // %4
3043 : "memory", "cc", NACL_R14
3044 "xmm0", "xmm1", "xmm5"
3045 );
3046 }
3047
3048 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3049 uint8* dst_u, uint8* dst_v, int pix) {
3050 asm volatile (
3051 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3052 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3053 "sub %1,%2 \n"
3054 LABELALIGN
3055 "1: \n"
3056 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3057 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3058 "lea " MEMLEA(0x40,0) ",%0 \n"
3059 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3060 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3061 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3062 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3063 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3064 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3065 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3066 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3067 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3068 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3069 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3070 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3071 "lea " MEMLEA(0x10,1) ",%1 \n"
3072 "sub $0x20,%3 \n"
3073 "jg 1b \n"
3074 "vzeroupper \n"
3075 : "+r"(src_uyvy), // %0
3076 "+r"(dst_u), // %1
3077 "+r"(dst_v), // %2
3078 "+r"(pix) // %3
3079 :
3080 : "memory", "cc", NACL_R14
3081 "xmm0", "xmm1", "xmm5"
3082 );
3083 }
3084 #endif // HAS_YUY2TOYROW_AVX2
3085
3086 #ifdef HAS_ARGBBLENDROW_SSE2
3087 // Blend 4 pixels at a time.
3088 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3089 uint8* dst_argb, int width) {
3090 asm volatile (
3091 "pcmpeqb %%xmm7,%%xmm7 \n"
3092 "psrlw $0xf,%%xmm7 \n"
3093 "pcmpeqb %%xmm6,%%xmm6 \n"
3094 "psrlw $0x8,%%xmm6 \n"
3095 "pcmpeqb %%xmm5,%%xmm5 \n"
3096 "psllw $0x8,%%xmm5 \n"
3097 "pcmpeqb %%xmm4,%%xmm4 \n"
3098 "pslld $0x18,%%xmm4 \n"
3099 "sub $0x1,%3 \n"
3100 "je 91f \n"
3101 "jl 99f \n"
3102
3103 // 1 pixel loop until destination pointer is aligned.
3104 "10: \n"
3105 "test $0xf,%2 \n"
3106 "je 19f \n"
3107 "movd " MEMACCESS(0) ",%%xmm3 \n"
3108 "lea " MEMLEA(0x4,0) ",%0 \n"
3109 "movdqa %%xmm3,%%xmm0 \n"
3110 "pxor %%xmm4,%%xmm3 \n"
3111 "movd " MEMACCESS(1) ",%%xmm2 \n"
3112 "psrlw $0x8,%%xmm3 \n"
3113 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3114 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3115 "pand %%xmm6,%%xmm2 \n"
3116 "paddw %%xmm7,%%xmm3 \n"
3117 "pmullw %%xmm3,%%xmm2 \n"
3118 "movd " MEMACCESS(1) ",%%xmm1 \n"
3119 "lea " MEMLEA(0x4,1) ",%1 \n"
3120 "psrlw $0x8,%%xmm1 \n"
3121 "por %%xmm4,%%xmm0 \n"
3122 "pmullw %%xmm3,%%xmm1 \n"
3123 "psrlw $0x8,%%xmm2 \n"
3124 "paddusb %%xmm2,%%xmm0 \n"
3125 "pand %%xmm5,%%xmm1 \n"
3126 "paddusb %%xmm1,%%xmm0 \n"
3127 "movd %%xmm0," MEMACCESS(2) " \n"
3128 "lea " MEMLEA(0x4,2) ",%2 \n"
3129 "sub $0x1,%3 \n"
3130 "jge 10b \n"
3131
3132 "19: \n"
3133 "add $1-4,%3 \n"
3134 "jl 49f \n"
3135
3136 // 4 pixel loop.
3137 LABELALIGN
3138 "41: \n"
3139 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
3140 "lea " MEMLEA(0x10,0) ",%0 \n"
3141 "movdqa %%xmm3,%%xmm0 \n"
3142 "pxor %%xmm4,%%xmm3 \n"
3143 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3144 "psrlw $0x8,%%xmm3 \n"
3145 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3146 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3147 "pand %%xmm6,%%xmm2 \n"
3148 "paddw %%xmm7,%%xmm3 \n"
3149 "pmullw %%xmm3,%%xmm2 \n"
3150 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3151 "lea " MEMLEA(0x10,1) ",%1 \n"
3152 "psrlw $0x8,%%xmm1 \n"
3153 "por %%xmm4,%%xmm0 \n"
3154 "pmullw %%xmm3,%%xmm1 \n"
3155 "psrlw $0x8,%%xmm2 \n"
3156 "paddusb %%xmm2,%%xmm0 \n"
3157 "pand %%xmm5,%%xmm1 \n"
3158 "paddusb %%xmm1,%%xmm0 \n"
3159 "movdqu %%xmm0," MEMACCESS(2) " \n"
3160 "lea " MEMLEA(0x10,2) ",%2 \n"
3161 "sub $0x4,%3 \n"
3162 "jge 41b \n"
3163
3164 "49: \n"
3165 "add $0x3,%3 \n"
3166 "jl 99f \n"
3167
3168 // 1 pixel loop.
3169 "91: \n"
3170 "movd " MEMACCESS(0) ",%%xmm3 \n"
3171 "lea " MEMLEA(0x4,0) ",%0 \n"
3172 "movdqa %%xmm3,%%xmm0 \n"
3173 "pxor %%xmm4,%%xmm3 \n"
3174 "movd " MEMACCESS(1) ",%%xmm2 \n"
3175 "psrlw $0x8,%%xmm3 \n"
3176 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3177 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3178 "pand %%xmm6,%%xmm2 \n"
3179 "paddw %%xmm7,%%xmm3 \n"
3180 "pmullw %%xmm3,%%xmm2 \n"
3181 "movd " MEMACCESS(1) ",%%xmm1 \n"
3182 "lea " MEMLEA(0x4,1) ",%1 \n"
3183 "psrlw $0x8,%%xmm1 \n"
3184 "por %%xmm4,%%xmm0 \n"
3185 "pmullw %%xmm3,%%xmm1 \n"
3186 "psrlw $0x8,%%xmm2 \n"
3187 "paddusb %%xmm2,%%xmm0 \n"
3188 "pand %%xmm5,%%xmm1 \n"
3189 "paddusb %%xmm1,%%xmm0 \n"
3190 "movd %%xmm0," MEMACCESS(2) " \n"
3191 "lea " MEMLEA(0x4,2) ",%2 \n"
3192 "sub $0x1,%3 \n"
3193 "jge 91b \n"
3194 "99: \n"
3195 : "+r"(src_argb0), // %0
3196 "+r"(src_argb1), // %1
3197 "+r"(dst_argb), // %2
3198 "+r"(width) // %3
3199 :
3200 : "memory", "cc"
3201 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3202 );
3203 }
3204 #endif // HAS_ARGBBLENDROW_SSE2
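// Scalar model of the blend above (a sketch, not the library's C path): the
// foreground is attenuated (premultiplied) ARGB, so only the background is
// scaled, by (256 - alpha), before the saturating add.
static inline uint8 SatAdd8Sketch(int a, int b) {
  int s = a + b;
  return (uint8)(s > 255 ? 255 : s);  // paddusb saturates per byte
}
static inline uint32 BlendPixelSketch(uint32 fg, uint32 bg) {
  uint32 a = 256 - (fg >> 24);  // pxor with the alpha mask, then paddw $1
  uint8 b = SatAdd8Sketch(fg & 0xff, ((bg & 0xff) * a) >> 8);
  uint8 g = SatAdd8Sketch((fg >> 8) & 0xff, (((bg >> 8) & 0xff) * a) >> 8);
  uint8 r = SatAdd8Sketch((fg >> 16) & 0xff, (((bg >> 16) & 0xff) * a) >> 8);
  return 0xff000000u | ((uint32)r << 16) | ((uint32)g << 8) | b;  // por %%xmm4
}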
3205
3206 #ifdef HAS_ARGBBLENDROW_SSSE3
3207 // Shuffle table for isolating alpha.
3208 static uvec8 kShuffleAlpha = {
3209 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3210 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3211 };
3212
3213 // Blend 4 pixels at a time.
3214 
3215
3216 // Same as SSE2, but replaces
3217 // psrlw xmm3, 8 // alpha
3218 // pshufhw xmm3, xmm3,0F5h // 8 alpha words
3219 // pshuflw xmm3, xmm3,0F5h
3220 // with..
3221 // pshufb xmm3, kShuffleAlpha // alpha
3222
3223 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3224 uint8* dst_argb, int width) {
3225 asm volatile (
3226 "pcmpeqb %%xmm7,%%xmm7 \n"
3227 "psrlw $0xf,%%xmm7 \n"
3228 "pcmpeqb %%xmm6,%%xmm6 \n"
3229 "psrlw $0x8,%%xmm6 \n"
3230 "pcmpeqb %%xmm5,%%xmm5 \n"
3231 "psllw $0x8,%%xmm5 \n"
3232 "pcmpeqb %%xmm4,%%xmm4 \n"
3233 "pslld $0x18,%%xmm4 \n"
3234 "sub $0x1,%3 \n"
3235 "je 91f \n"
3236 "jl 99f \n"
3237
3238 // 1 pixel loop until destination pointer is aligned.
3239 "10: \n"
3240 "test $0xf,%2 \n"
3241 "je 19f \n"
3242 "movd " MEMACCESS(0) ",%%xmm3 \n"
3243 "lea " MEMLEA(0x4,0) ",%0 \n"
3244 "movdqa %%xmm3,%%xmm0 \n"
3245 "pxor %%xmm4,%%xmm3 \n"
3246 "movd " MEMACCESS(1) ",%%xmm2 \n"
3247 "pshufb %4,%%xmm3 \n"
3248 "pand %%xmm6,%%xmm2 \n"
3249 "paddw %%xmm7,%%xmm3 \n"
3250 "pmullw %%xmm3,%%xmm2 \n"
3251 "movd " MEMACCESS(1) ",%%xmm1 \n"
3252 "lea " MEMLEA(0x4,1) ",%1 \n"
3253 "psrlw $0x8,%%xmm1 \n"
3254 "por %%xmm4,%%xmm0 \n"
3255 "pmullw %%xmm3,%%xmm1 \n"
3256 "psrlw $0x8,%%xmm2 \n"
3257 "paddusb %%xmm2,%%xmm0 \n"
3258 "pand %%xmm5,%%xmm1 \n"
3259 "paddusb %%xmm1,%%xmm0 \n"
3260 "movd %%xmm0," MEMACCESS(2) " \n"
3261 "lea " MEMLEA(0x4,2) ",%2 \n"
3262 "sub $0x1,%3 \n"
3263 "jge 10b \n"
3264
3265 "19: \n"
3266 "add $1-4,%3 \n"
3267 "jl 49f \n"
3268
3269 // 4 pixel loop.
3270 LABELALIGN
3271 "40: \n"
3272 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
3273 "lea " MEMLEA(0x10,0) ",%0 \n"
3274 "movdqa %%xmm3,%%xmm0 \n"
3275 "pxor %%xmm4,%%xmm3 \n"
3276 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3277 "pshufb %4,%%xmm3 \n"
3278 "pand %%xmm6,%%xmm2 \n"
3279 "paddw %%xmm7,%%xmm3 \n"
3280 "pmullw %%xmm3,%%xmm2 \n"
3281 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3282 "lea " MEMLEA(0x10,1) ",%1 \n"
3283 "psrlw $0x8,%%xmm1 \n"
3284 "por %%xmm4,%%xmm0 \n"
3285 "pmullw %%xmm3,%%xmm1 \n"
3286 "psrlw $0x8,%%xmm2 \n"
3287 "paddusb %%xmm2,%%xmm0 \n"
3288 "pand %%xmm5,%%xmm1 \n"
3289 "paddusb %%xmm1,%%xmm0 \n"
3290 "movdqu %%xmm0," MEMACCESS(2) " \n"
3291 "lea " MEMLEA(0x10,2) ",%2 \n"
3292 "sub $0x4,%3 \n"
3293 "jge 40b \n"
3294
3295 "49: \n"
3296 "add $0x3,%3 \n"
3297 "jl 99f \n"
3298
3299 // 1 pixel loop.
3300 "91: \n"
3301 "movd " MEMACCESS(0) ",%%xmm3 \n"
3302 "lea " MEMLEA(0x4,0) ",%0 \n"
3303 "movdqa %%xmm3,%%xmm0 \n"
3304 "pxor %%xmm4,%%xmm3 \n"
3305 "movd " MEMACCESS(1) ",%%xmm2 \n"
3306 "pshufb %4,%%xmm3 \n"
3307 "pand %%xmm6,%%xmm2 \n"
3308 "paddw %%xmm7,%%xmm3 \n"
3309 "pmullw %%xmm3,%%xmm2 \n"
3310 "movd " MEMACCESS(1) ",%%xmm1 \n"
3311 "lea " MEMLEA(0x4,1) ",%1 \n"
3312 "psrlw $0x8,%%xmm1 \n"
3313 "por %%xmm4,%%xmm0 \n"
3314 "pmullw %%xmm3,%%xmm1 \n"
3315 "psrlw $0x8,%%xmm2 \n"
3316 "paddusb %%xmm2,%%xmm0 \n"
3317 "pand %%xmm5,%%xmm1 \n"
3318 "paddusb %%xmm1,%%xmm0 \n"
3319 "movd %%xmm0," MEMACCESS(2) " \n"
3320 "lea " MEMLEA(0x4,2) ",%2 \n"
3321 "sub $0x1,%3 \n"
3322 "jge 91b \n"
3323 "99: \n"
3324 : "+r"(src_argb0), // %0
3325 "+r"(src_argb1), // %1
3326 "+r"(dst_argb), // %2
3327 "+r"(width) // %3
3328 : "m"(kShuffleAlpha) // %4
3329 : "memory", "cc"
3330 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3331 );
3332 }
3333 #endif // HAS_ARGBBLENDROW_SSSE3
3334
3335 #ifdef HAS_ARGBATTENUATEROW_SSE2
3336 // Attenuate 4 pixels at a time.
3337 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3338 asm volatile (
3339 "pcmpeqb %%xmm4,%%xmm4 \n"
3340 "pslld $0x18,%%xmm4 \n"
3341 "pcmpeqb %%xmm5,%%xmm5 \n"
3342 "psrld $0x8,%%xmm5 \n"
3343
3344 // 4 pixel loop.
3345 LABELALIGN
3346 "1: \n"
3347 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3348 "punpcklbw %%xmm0,%%xmm0 \n"
3349 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3350 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3351 "pmulhuw %%xmm2,%%xmm0 \n"
3352 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3353 "punpckhbw %%xmm1,%%xmm1 \n"
3354 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3355 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3356 "pmulhuw %%xmm2,%%xmm1 \n"
3357 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3358 "lea " MEMLEA(0x10,0) ",%0 \n"
3359 "psrlw $0x8,%%xmm0 \n"
3360 "pand %%xmm4,%%xmm2 \n"
3361 "psrlw $0x8,%%xmm1 \n"
3362 "packuswb %%xmm1,%%xmm0 \n"
3363 "pand %%xmm5,%%xmm0 \n"
3364 "por %%xmm2,%%xmm0 \n"
3365 "movdqu %%xmm0," MEMACCESS(1) " \n"
3366 "lea " MEMLEA(0x10,1) ",%1 \n"
3367 "sub $0x4,%2 \n"
3368 "jg 1b \n"
3369 : "+r"(src_argb), // %0
3370 "+r"(dst_argb), // %1
3371 "+r"(width) // %2
3372 :
3373 : "memory", "cc"
3374 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3375 );
3376 }
3377 #endif // HAS_ARGBATTENUATEROW_SSE2
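// Scalar model of the attenuation above (sketch): multiplying the duplicated
// byte (c * 257) by the duplicated alpha (a * 257) and shifting right 24
// approximates c * a / 255 to within the precision the row functions need.
static inline uint8 AttenuateSketch(uint8 c, uint8 a) {
  return (uint8)(((uint32)(c * 0x0101) * (uint32)(a * 0x0101)) >> 24);
}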
3378
3379 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3380 // Shuffle table duplicating alpha
3381 static uvec8 kShuffleAlpha0 = {
3382 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
3383 };
3384 static uvec8 kShuffleAlpha1 = {
3385 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3386 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
3387 };
3388 // Attenuate 4 pixels at a time.
3389 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3390 asm volatile (
3391 "pcmpeqb %%xmm3,%%xmm3 \n"
3392 "pslld $0x18,%%xmm3 \n"
3393 "movdqa %3,%%xmm4 \n"
3394 "movdqa %4,%%xmm5 \n"
3395
3396 // 4 pixel loop.
3397 LABELALIGN
3398 "1: \n"
3399 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3400 "pshufb %%xmm4,%%xmm0 \n"
3401 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3402 "punpcklbw %%xmm1,%%xmm1 \n"
3403 "pmulhuw %%xmm1,%%xmm0 \n"
3404 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3405 "pshufb %%xmm5,%%xmm1 \n"
3406 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3407 "punpckhbw %%xmm2,%%xmm2 \n"
3408 "pmulhuw %%xmm2,%%xmm1 \n"
3409 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3410 "lea " MEMLEA(0x10,0) ",%0 \n"
3411 "pand %%xmm3,%%xmm2 \n"
3412 "psrlw $0x8,%%xmm0 \n"
3413 "psrlw $0x8,%%xmm1 \n"
3414 "packuswb %%xmm1,%%xmm0 \n"
3415 "por %%xmm2,%%xmm0 \n"
3416 "movdqu %%xmm0," MEMACCESS(1) " \n"
3417 "lea " MEMLEA(0x10,1) ",%1 \n"
3418 "sub $0x4,%2 \n"
3419 "jg 1b \n"
3420 : "+r"(src_argb), // %0
3421 "+r"(dst_argb), // %1
3422 "+r"(width) // %2
3423 : "m"(kShuffleAlpha0), // %3
3424 "m"(kShuffleAlpha1) // %4
3425 : "memory", "cc"
3426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3427 );
3428 }
3429 #endif // HAS_ARGBATTENUATEROW_SSSE3
3430
3431 #ifdef HAS_ARGBATTENUATEROW_AVX2
3432 // Shuffle table duplicating alpha.
3433 static const uvec8 kShuffleAlpha_AVX2 = {
3434 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
3435 };
3436 // Attenuate 8 pixels at a time.
3437 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
3438 asm volatile (
3439 "vbroadcastf128 %3,%%ymm4 \n"
3440 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3441 "vpslld $0x18,%%ymm5,%%ymm5 \n"
3442 "sub %0,%1 \n"
3443
3444 // 8 pixel loop.
3445 LABELALIGN
3446 "1: \n"
3447 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
3448 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
3449 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
3450 "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
3451 "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
3452 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3453 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
3454 "vpand %%ymm5,%%ymm6,%%ymm6 \n"
3455 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3456 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3457 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3458 "vpor %%ymm6,%%ymm0,%%ymm0 \n"
3459 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
3460 "lea " MEMLEA(0x20,0) ",%0 \n"
3461 "sub $0x8,%2 \n"
3462 "jg 1b \n"
3463 "vzeroupper \n"
3464 : "+r"(src_argb), // %0
3465 "+r"(dst_argb), // %1
3466 "+r"(width) // %2
3467 : "m"(kShuffleAlpha_AVX2) // %3
3468 : "memory", "cc"
3469 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3470 );
3471 }
3472 #endif // HAS_ARGBATTENUATEROW_AVX2
3473
3474 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3475 // Unattenuate 4 pixels at a time.
3476 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3477 int width) {
3478 uintptr_t alpha = 0;
3479 asm volatile (
3480 // 4 pixel loop.
3481 LABELALIGN
3482 "1: \n"
3483 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3484 "movzb " MEMACCESS2(0x03,0) ",%3 \n"
3485 "punpcklbw %%xmm0,%%xmm0 \n"
3486 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
3487 "movzb " MEMACCESS2(0x07,0) ",%3 \n"
3488 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
3489 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3490 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3491 "movlhps %%xmm3,%%xmm2 \n"
3492 "pmulhuw %%xmm2,%%xmm0 \n"
3493 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3494 "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
3495 "punpckhbw %%xmm1,%%xmm1 \n"
3496 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
3497 "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
3498 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
3499 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3500 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3501 "movlhps %%xmm3,%%xmm2 \n"
3502 "pmulhuw %%xmm2,%%xmm1 \n"
3503 "lea " MEMLEA(0x10,0) ",%0 \n"
3504 "packuswb %%xmm1,%%xmm0 \n"
3505 "movdqu %%xmm0," MEMACCESS(1) " \n"
3506 "lea " MEMLEA(0x10,1) ",%1 \n"
3507 "sub $0x4,%2 \n"
3508 "jg 1b \n"
3509 : "+r"(src_argb), // %0
3510 "+r"(dst_argb), // %1
3511 "+r"(width), // %2
3512 "+r"(alpha) // %3
3513 : "r"(fixed_invtbl8) // %4
3514 : "memory", "cc", NACL_R14
3515 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3516 );
3517 }
3518 #endif // HAS_ARGBUNATTENUATEROW_SSE2
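// Scalar model of the unattenuation above (a sketch; it assumes the low word
// of each fixed_invtbl8[a] entry holds roughly 65536 / a in fixed point,
// which is how the pmulhuw recovers c * 255 / a):
static inline uint8 UnattenuateSketch(uint8 c, uint8 a, const uint32* invtbl) {
  uint32 inv = invtbl[a] & 0xffff;                // word broadcast by pshuflw
  uint32 v = ((uint32)(c * 0x0101) * inv) >> 16;  // punpcklbw + pmulhuw
  return (uint8)(v > 255 ? 255 : v);              // packuswb saturates
}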
3519
3520 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
3521 // Shuffle table duplicating alpha.
3522 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
3523 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
3524 };
3525 // Unattenuate 8 pixels at a time.
3526 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
3527 int width) {
3528 uintptr_t alpha = 0;
3529 asm volatile (
3530 "sub %0,%1 \n"
3531 "vbroadcastf128 %5,%%ymm5 \n"
3532
3533 // 8 pixel loop.
3534 LABELALIGN
3535 "1: \n"
3536 // replace VPGATHER
3537 "movzb " MEMACCESS2(0x03,0) ",%3 \n"
3538 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
3539 "movzb " MEMACCESS2(0x07,0) ",%3 \n"
3540 MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
3541 "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
3542 "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
3543 MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
3544 "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
3545 MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
3546 "movzb " MEMACCESS2(0x13,0) ",%3 \n"
3547 "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
3548 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
3549 "movzb " MEMACCESS2(0x17,0) ",%3 \n"
3550 MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
3551 "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
3552 "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
3553 MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
3554 "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
3555 MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
3556 "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
3557 "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
3558 "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
3559 "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
3560 // end of VPGATHER
3561
3562 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
3563 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
3564 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
3565 "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
3566 "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
3567 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
3568 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
3569 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3570 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
3571 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3572 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
3573 "lea " MEMLEA(0x20,0) ",%0 \n"
3574 "sub $0x8,%2 \n"
3575 "jg 1b \n"
3576 "vzeroupper \n"
3577 : "+r"(src_argb), // %0
3578 "+r"(dst_argb), // %1
3579 "+r"(width), // %2
3580 "+r"(alpha) // %3
3581 : "r"(fixed_invtbl8), // %4
3582 "m"(kUnattenShuffleAlpha_AVX2) // %5
3583 : "memory", "cc", NACL_R14
3584 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3585 );
3586 }
3587 #endif // HAS_ARGBUNATTENUATEROW_AVX2
3588
3589 #ifdef HAS_ARGBGRAYROW_SSSE3
3590 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
3591 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3592 asm volatile (
3593 "movdqa %3,%%xmm4 \n"
3594 "movdqa %4,%%xmm5 \n"
3595
3596 // 8 pixel loop.
3597 LABELALIGN
3598 "1: \n"
3599 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3600 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3601 "pmaddubsw %%xmm4,%%xmm0 \n"
3602 "pmaddubsw %%xmm4,%%xmm1 \n"
3603 "phaddw %%xmm1,%%xmm0 \n"
3604 "paddw %%xmm5,%%xmm0 \n"
3605 "psrlw $0x7,%%xmm0 \n"
3606 "packuswb %%xmm0,%%xmm0 \n"
3607 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3608 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
3609 "lea " MEMLEA(0x20,0) ",%0 \n"
3610 "psrld $0x18,%%xmm2 \n"
3611 "psrld $0x18,%%xmm3 \n"
3612 "packuswb %%xmm3,%%xmm2 \n"
3613 "packuswb %%xmm2,%%xmm2 \n"
3614 "movdqa %%xmm0,%%xmm3 \n"
3615 "punpcklbw %%xmm0,%%xmm0 \n"
3616 "punpcklbw %%xmm2,%%xmm3 \n"
3617 "movdqa %%xmm0,%%xmm1 \n"
3618 "punpcklwd %%xmm3,%%xmm0 \n"
3619 "punpckhwd %%xmm3,%%xmm1 \n"
3620 "movdqu %%xmm0," MEMACCESS(1) " \n"
3621 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
3622 "lea " MEMLEA(0x20,1) ",%1 \n"
3623 "sub $0x8,%2 \n"
3624 "jg 1b \n"
3625 : "+r"(src_argb), // %0
3626 "+r"(dst_argb), // %1
3627 "+r"(width) // %2
3628 : "m"(kARGBToYJ), // %3
3629 "m"(kAddYJ64) // %4
3630 : "memory", "cc"
3631 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3632 );
3633 }
3634 #endif // HAS_ARGBGRAYROW_SSSE3
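The gray row above computes a JPEG-range luma with kARGBToYJ and the kAddYJ64 rounding constant, then replicates it into B, G and R. A scalar sketch of the same arithmetic (hypothetical helper):

static void ARGBGrayRow_C_sketch(const uint8* src_argb, uint8* dst_argb,
                                 int width) {
  for (int i = 0; i < width; ++i) {
    // pmaddubsw/phaddw with kARGBToYJ, paddw kAddYJ64, psrlw $7:
    uint8 y = (uint8)((src_argb[0] * 15 + src_argb[1] * 75 +
                       src_argb[2] * 38 + 64) >> 7);
    dst_argb[0] = dst_argb[1] = dst_argb[2] = y;
    dst_argb[3] = src_argb[3];  // alpha passes through
    src_argb += 4;
    dst_argb += 4;
  }
}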
3635
3636 #ifdef HAS_ARGBSEPIAROW_SSSE3
3637 // b = (r * 35 + g * 68 + b * 17) >> 7
3638 // g = (r * 45 + g * 88 + b * 22) >> 7
3639 // r = (r * 50 + g * 98 + b * 24) >> 7
3640 // Constant for ARGB color to sepia tone
3641 static vec8 kARGBToSepiaB = {
3642 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3643 };
3644
3645 static vec8 kARGBToSepiaG = {
3646 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3647 };
3648
3649 static vec8 kARGBToSepiaR = {
3650 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3651 };
3652
3653 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
3654 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3655 asm volatile (
3656 "movdqa %2,%%xmm2 \n"
3657 "movdqa %3,%%xmm3 \n"
3658 "movdqa %4,%%xmm4 \n"
3659
3660 // 8 pixel loop.
3661 LABELALIGN
3662 "1: \n"
3663 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3664 "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
3665 "pmaddubsw %%xmm2,%%xmm0 \n"
3666 "pmaddubsw %%xmm2,%%xmm6 \n"
3667 "phaddw %%xmm6,%%xmm0 \n"
3668 "psrlw $0x7,%%xmm0 \n"
3669 "packuswb %%xmm0,%%xmm0 \n"
3670 "movdqu " MEMACCESS(0) ",%%xmm5 \n"
3671 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3672 "pmaddubsw %%xmm3,%%xmm5 \n"
3673 "pmaddubsw %%xmm3,%%xmm1 \n"
3674 "phaddw %%xmm1,%%xmm5 \n"
3675 "psrlw $0x7,%%xmm5 \n"
3676 "packuswb %%xmm5,%%xmm5 \n"
3677 "punpcklbw %%xmm5,%%xmm0 \n"
3678 "movdqu " MEMACCESS(0) ",%%xmm5 \n"
3679 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3680 "pmaddubsw %%xmm4,%%xmm5 \n"
3681 "pmaddubsw %%xmm4,%%xmm1 \n"
3682 "phaddw %%xmm1,%%xmm5 \n"
3683 "psrlw $0x7,%%xmm5 \n"
3684 "packuswb %%xmm5,%%xmm5 \n"
3685 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
3686 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3687 "psrld $0x18,%%xmm6 \n"
3688 "psrld $0x18,%%xmm1 \n"
3689 "packuswb %%xmm1,%%xmm6 \n"
3690 "packuswb %%xmm6,%%xmm6 \n"
3691 "punpcklbw %%xmm6,%%xmm5 \n"
3692 "movdqa %%xmm0,%%xmm1 \n"
3693 "punpcklwd %%xmm5,%%xmm0 \n"
3694 "punpckhwd %%xmm5,%%xmm1 \n"
3695 "movdqu %%xmm0," MEMACCESS(0) " \n"
3696 "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
3697 "lea " MEMLEA(0x20,0) ",%0 \n"
3698 "sub $0x8,%1 \n"
3699 "jg 1b \n"
3700 : "+r"(dst_argb), // %0
3701 "+r"(width) // %1
3702 : "m"(kARGBToSepiaB), // %2
3703 "m"(kARGBToSepiaG), // %3
3704 "m"(kARGBToSepiaR) // %4
3705 : "memory", "cc"
3706 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3707 );
3708 }
3709 #endif // HAS_ARGBSEPIAROW_SSSE3
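A scalar sketch of the sepia arithmetic above (hypothetical helper; the packuswb steps saturate to 255, mirrored here with explicit clamps):

static void ARGBSepiaRow_C_sketch(uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int b = dst_argb[0];
    int g = dst_argb[1];
    int r = dst_argb[2];
    int sb = (b * 17 + g * 68 + r * 35) >> 7;
    int sg = (b * 22 + g * 88 + r * 45) >> 7;
    int sr = (b * 24 + g * 98 + r * 50) >> 7;
    dst_argb[0] = sb > 255 ? 255 : (uint8)sb;
    dst_argb[1] = sg > 255 ? 255 : (uint8)sg;
    dst_argb[2] = sr > 255 ? 255 : (uint8)sr;
    // Alpha (dst_argb[3]) is preserved.
    dst_argb += 4;
  }
}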
3710
3711 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3712 // Transform 8 ARGB pixels (32 bytes) with color matrix.
3713 // Same as Sepia except matrix is provided.
3714 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
3715 const int8* matrix_argb, int width) {
3716 asm volatile (
3717 "movdqu " MEMACCESS(3) ",%%xmm5 \n"
3718 "pshufd $0x00,%%xmm5,%%xmm2 \n"
3719 "pshufd $0x55,%%xmm5,%%xmm3 \n"
3720 "pshufd $0xaa,%%xmm5,%%xmm4 \n"
3721 "pshufd $0xff,%%xmm5,%%xmm5 \n"
3722
3723 // 8 pixel loop.
3724 LABELALIGN
3725 "1: \n"
3726 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3727 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
3728 "pmaddubsw %%xmm2,%%xmm0 \n"
3729 "pmaddubsw %%xmm2,%%xmm7 \n"
3730 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
3731 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3732 "pmaddubsw %%xmm3,%%xmm6 \n"
3733 "pmaddubsw %%xmm3,%%xmm1 \n"
3734 "phaddsw %%xmm7,%%xmm0 \n"
3735 "phaddsw %%xmm1,%%xmm6 \n"
3736 "psraw $0x6,%%xmm0 \n"
3737 "psraw $0x6,%%xmm6 \n"
3738 "packuswb %%xmm0,%%xmm0 \n"
3739 "packuswb %%xmm6,%%xmm6 \n"
3740 "punpcklbw %%xmm6,%%xmm0 \n"
3741 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3742 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
3743 "pmaddubsw %%xmm4,%%xmm1 \n"
3744 "pmaddubsw %%xmm4,%%xmm7 \n"
3745 "phaddsw %%xmm7,%%xmm1 \n"
3746 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
3747 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
3748 "pmaddubsw %%xmm5,%%xmm6 \n"
3749 "pmaddubsw %%xmm5,%%xmm7 \n"
3750 "phaddsw %%xmm7,%%xmm6 \n"
3751 "psraw $0x6,%%xmm1 \n"
3752 "psraw $0x6,%%xmm6 \n"
3753 "packuswb %%xmm1,%%xmm1 \n"
3754 "packuswb %%xmm6,%%xmm6 \n"
3755 "punpcklbw %%xmm6,%%xmm1 \n"
3756 "movdqa %%xmm0,%%xmm6 \n"
3757 "punpcklwd %%xmm1,%%xmm0 \n"
3758 "punpckhwd %%xmm1,%%xmm6 \n"
3759 "movdqu %%xmm0," MEMACCESS(1) " \n"
3760 "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
3761 "lea " MEMLEA(0x20,0) ",%0 \n"
3762 "lea " MEMLEA(0x20,1) ",%1 \n"
3763 "sub $0x8,%2 \n"
3764 "jg 1b \n"
3765 : "+r"(src_argb), // %0
3766 "+r"(dst_argb), // %1
3767 "+r"(width) // %2
3768 : "r"(matrix_argb) // %3
3769 : "memory", "cc"
3770 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3771 );
3772 }
3773 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
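The color-matrix row applies four signed byte vectors from matrix_argb, one per output channel, with an arithmetic shift of 6. A scalar sketch (hypothetical helper; packuswb clamping written out explicitly):

static void ARGBColorMatrixRow_C_sketch(const uint8* src_argb, uint8* dst_argb,
                                        const int8* matrix_argb, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      const int8* m = matrix_argb + c * 4;
      int v = (src_argb[0] * m[0] + src_argb[1] * m[1] +
               src_argb[2] * m[2] + src_argb[3] * m[3]) >> 6;  // psraw $6
      dst_argb[c] = v < 0 ? 0 : (v > 255 ? 255 : (uint8)v);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}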
3774
3775 #ifdef HAS_ARGBQUANTIZEROW_SSE2
3776 // Quantize 4 ARGB pixels (16 bytes).
3777 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3778 int interval_offset, int width) {
3779 asm volatile (
3780 "movd %2,%%xmm2 \n"
3781 "movd %3,%%xmm3 \n"
3782 "movd %4,%%xmm4 \n"
3783 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3784 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3785 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3786 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3787 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3788 "pshufd $0x44,%%xmm4,%%xmm4 \n"
3789 "pxor %%xmm5,%%xmm5 \n"
3790 "pcmpeqb %%xmm6,%%xmm6 \n"
3791 "pslld $0x18,%%xmm6 \n"
3792
3793 // 4 pixel loop.
3794 LABELALIGN
3795 "1: \n"
3796 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3797 "punpcklbw %%xmm5,%%xmm0 \n"
3798 "pmulhuw %%xmm2,%%xmm0 \n"
3799 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3800 "punpckhbw %%xmm5,%%xmm1 \n"
3801 "pmulhuw %%xmm2,%%xmm1 \n"
3802 "pmullw %%xmm3,%%xmm0 \n"
3803 "movdqu " MEMACCESS(0) ",%%xmm7 \n"
3804 "pmullw %%xmm3,%%xmm1 \n"
3805 "pand %%xmm6,%%xmm7 \n"
3806 "paddw %%xmm4,%%xmm0 \n"
3807 "paddw %%xmm4,%%xmm1 \n"
3808 "packuswb %%xmm1,%%xmm0 \n"
3809 "por %%xmm7,%%xmm0 \n"
3810 "movdqu %%xmm0," MEMACCESS(0) " \n"
3811 "lea " MEMLEA(0x10,0) ",%0 \n"
3812 "sub $0x4,%1 \n"
3813 "jg 1b \n"
3814 : "+r"(dst_argb), // %0
3815 "+r"(width) // %1
3816 : "r"(scale), // %2
3817 "r"(interval_size), // %3
3818 "r"(interval_offset) // %4
3819 : "memory", "cc"
3820 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3821 );
3822 }
3823 #endif // HAS_ARGBQUANTIZEROW_SSE2
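Quantization above maps each color channel into discrete intervals: pmulhuw scales by scale >> 16, pmullw multiplies by interval_size, paddw adds interval_offset, and the 0xff000000 mask keeps alpha intact. A scalar sketch (hypothetical helper; callers typically pass scale near 65536 / interval_size, though that is an assumption here):

static void ARGBQuantizeRow_C_sketch(uint8* dst_argb, int scale,
                                     int interval_size, int interval_offset,
                                     int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {
      int v = (dst_argb[c] * scale) >> 16;  // pmulhuw
      dst_argb[c] = (uint8)(v * interval_size + interval_offset);
    }
    // dst_argb[3] (alpha) is untouched.
    dst_argb += 4;
  }
}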
3824
3825 #ifdef HAS_ARGBSHADEROW_SSE2
3826 // Shade 4 pixels at a time by specified value.
3827 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3828 uint32 value) {
3829 asm volatile (
3830 "movd %3,%%xmm2 \n"
3831 "punpcklbw %%xmm2,%%xmm2 \n"
3832 "punpcklqdq %%xmm2,%%xmm2 \n"
3833
3834 // 4 pixel loop.
3835 LABELALIGN
3836 "1: \n"
3837 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3838 "lea " MEMLEA(0x10,0) ",%0 \n"
3839 "movdqa %%xmm0,%%xmm1 \n"
3840 "punpcklbw %%xmm0,%%xmm0 \n"
3841 "punpckhbw %%xmm1,%%xmm1 \n"
3842 "pmulhuw %%xmm2,%%xmm0 \n"
3843 "pmulhuw %%xmm2,%%xmm1 \n"
3844 "psrlw $0x8,%%xmm0 \n"
3845 "psrlw $0x8,%%xmm1 \n"
3846 "packuswb %%xmm1,%%xmm0 \n"
3847 "movdqu %%xmm0," MEMACCESS(1) " \n"
3848 "lea " MEMLEA(0x10,1) ",%1 \n"
3849 "sub $0x4,%2 \n"
3850 "jg 1b \n"
3851 : "+r"(src_argb), // %0
3852 "+r"(dst_argb), // %1
3853 "+r"(width) // %2
3854 : "r"(value) // %3
3855 : "memory", "cc"
3856 , "xmm0", "xmm1", "xmm2"
3857 );
3858 }
3859 #endif // HAS_ARGBSHADEROW_SSE2
3860
3861 #ifdef HAS_ARGBMULTIPLYROW_SSE2
3862 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
3863 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3864 uint8* dst_argb, int width) {
3865 asm volatile (
3866 "pxor %%xmm5,%%xmm5 \n"
3867
3868 // 4 pixel loop.
3869 LABELALIGN
3870 "1: \n"
3871 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3872 "lea " MEMLEA(0x10,0) ",%0 \n"
3873 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3874 "lea " MEMLEA(0x10,1) ",%1 \n"
3875 "movdqu %%xmm0,%%xmm1 \n"
3876 "movdqu %%xmm2,%%xmm3 \n"
3877 "punpcklbw %%xmm0,%%xmm0 \n"
3878 "punpckhbw %%xmm1,%%xmm1 \n"
3879 "punpcklbw %%xmm5,%%xmm2 \n"
3880 "punpckhbw %%xmm5,%%xmm3 \n"
3881 "pmulhuw %%xmm2,%%xmm0 \n"
3882 "pmulhuw %%xmm3,%%xmm1 \n"
3883 "packuswb %%xmm1,%%xmm0 \n"
3884 "movdqu %%xmm0," MEMACCESS(2) " \n"
3885 "lea " MEMLEA(0x10,2) ",%2 \n"
3886 "sub $0x4,%3 \n"
3887 "jg 1b \n"
3888 : "+r"(src_argb0), // %0
3889 "+r"(src_argb1), // %1
3890 "+r"(dst_argb), // %2
3891 "+r"(width) // %3
3892 :
3893 : "memory", "cc"
3894 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3895 );
3896 }
3897 #endif // HAS_ARGBMULTIPLYROW_SSE2
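The multiply row relies on a pmulhuw trick: punpcklbw of a register with itself turns each byte v into the 16-bit value v * 257, so (v * 257 * u) >> 16 closely approximates v * u / 255. A scalar sketch (hypothetical helper):

static void ARGBMultiplyRow_C_sketch(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {
    dst_argb[i] = (uint8)((src_argb0[i] * 257 * src_argb1[i]) >> 16);
  }
}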
3898
3899 #ifdef HAS_ARGBMULTIPLYROW_AVX2
3900 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
3901 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
3902 uint8* dst_argb, int width) {
3903 asm volatile (
3904 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
3905
3906 // 8 pixel loop.
3907 LABELALIGN
3908 "1: \n"
3909 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
3910 "lea " MEMLEA(0x20,0) ",%0 \n"
3911 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
3912 "lea " MEMLEA(0x20,1) ",%1 \n"
3913 "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
3914 "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
3915 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
3916 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
3917 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3918 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
3919 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3920 "vmovdqu %%ymm0," MEMACCESS(2) " \n"
3921 "lea " MEMLEA(0x20,2) ",%2 \n"
3922 "sub $0x8,%3 \n"
3923 "jg 1b \n"
3924 "vzeroupper \n"
3925 : "+r"(src_argb0), // %0
3926 "+r"(src_argb1), // %1
3927 "+r"(dst_argb), // %2
3928 "+r"(width) // %3
3929 :
3930 : "memory", "cc"
3931 #if defined(__AVX2__)
3932 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3933 #endif
3934 );
3935 }
3936 #endif // HAS_ARGBMULTIPLYROW_AVX2
3937
3938 #ifdef HAS_ARGBADDROW_SSE2
3939 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
3940 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3941 uint8* dst_argb, int width) {
3942 asm volatile (
3943 // 4 pixel loop.
3944 LABELALIGN
3945 "1: \n"
3946 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3947 "lea " MEMLEA(0x10,0) ",%0 \n"
3948 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3949 "lea " MEMLEA(0x10,1) ",%1 \n"
3950 "paddusb %%xmm1,%%xmm0 \n"
3951 "movdqu %%xmm0," MEMACCESS(2) " \n"
3952 "lea " MEMLEA(0x10,2) ",%2 \n"
3953 "sub $0x4,%3 \n"
3954 "jg 1b \n"
3955 : "+r"(src_argb0), // %0
3956 "+r"(src_argb1), // %1
3957 "+r"(dst_argb), // %2
3958 "+r"(width) // %3
3959 :
3960 : "memory", "cc"
3961 , "xmm0", "xmm1"
3962 );
3963 }
3964 #endif // HAS_ARGBADDROW_SSE2
3965
3966 #ifdef HAS_ARGBADDROW_AVX2
3967 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
3968 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
3969 uint8* dst_argb, int width) {
3970 asm volatile (
3971 // 8 pixel loop.
3972 LABELALIGN
3973 "1: \n"
3974 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3975 "lea " MEMLEA(0x20,0) ",%0 \n"
3976 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
3977 "lea " MEMLEA(0x20,1) ",%1 \n"
3978 "vmovdqu %%ymm0," MEMACCESS(2) " \n"
3979 "lea " MEMLEA(0x20,2) ",%2 \n"
3980 "sub $0x8,%3 \n"
3981 "jg 1b \n"
3982 "vzeroupper \n"
3983 : "+r"(src_argb0), // %0
3984 "+r"(src_argb1), // %1
3985 "+r"(dst_argb), // %2
3986 "+r"(width) // %3
3987 :
3988 : "memory", "cc"
3989 , "xmm0"
3990 );
3991 }
3992 #endif // HAS_ARGBADDROW_AVX2
3993
3994 #ifdef HAS_ARGBSUBTRACTROW_SSE2
3995 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
3996 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3997 uint8* dst_argb, int width) {
3998 asm volatile (
3999 // 4 pixel loop.
4000 LABELALIGN
4001 "1: \n"
4002 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4003 "lea " MEMLEA(0x10,0) ",%0 \n"
4004 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4005 "lea " MEMLEA(0x10,1) ",%1 \n"
4006 "psubusb %%xmm1,%%xmm0 \n"
4007 "movdqu %%xmm0," MEMACCESS(2) " \n"
4008 "lea " MEMLEA(0x10,2) ",%2 \n"
4009 "sub $0x4,%3 \n"
4010 "jg 1b \n"
4011 : "+r"(src_argb0), // %0
4012 "+r"(src_argb1), // %1
4013 "+r"(dst_argb), // %2
4014 "+r"(width) // %3
4015 :
4016 : "memory", "cc"
4017 , "xmm0", "xmm1"
4018 );
4019 }
4020 #endif // HAS_ARGBSUBTRACTROW_SSE2
4021
4022 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4023 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
4024 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4025 uint8* dst_argb, int width) {
4026 asm volatile (
4027 // 8 pixel loop.
4028 LABELALIGN
4029 "1: \n"
4030 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
4031 "lea " MEMLEA(0x20,0) ",%0 \n"
4032 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4033 "lea " MEMLEA(0x20,1) ",%1 \n"
4034 "vmovdqu %%ymm0," MEMACCESS(2) " \n"
4035 "lea " MEMLEA(0x20,2) ",%2 \n"
4036 "sub $0x8,%3 \n"
4037 "jg 1b \n"
4038 "vzeroupper \n"
4039 : "+r"(src_argb0), // %0
4040 "+r"(src_argb1), // %1
4041 "+r"(dst_argb), // %2
4042 "+r"(width) // %3
4043 :
4044 : "memory", "cc"
4045 , "xmm0"
4046 );
4047 }
4048 #endif // HAS_ARGBSUBTRACTROW_AVX2
4049
4050 #ifdef HAS_SOBELXROW_SSE2
4051 // SobelX as a matrix is
4052 // -1 0 1
4053 // -2 0 2
4054 // -1 0 1
4055 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4056 const uint8* src_y2, uint8* dst_sobelx, int width) {
4057 asm volatile (
4058 "sub %0,%1 \n"
4059 "sub %0,%2 \n"
4060 "sub %0,%3 \n"
4061 "pxor %%xmm5,%%xmm5 \n"
4062
4063 // 8 pixel loop.
4064 LABELALIGN
4065 "1: \n"
4066 "movq " MEMACCESS(0) ",%%xmm0 \n"
4067 "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
4068 "punpcklbw %%xmm5,%%xmm0 \n"
4069 "punpcklbw %%xmm5,%%xmm1 \n"
4070 "psubw %%xmm1,%%xmm0 \n"
4071 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
4072 MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
4073 "punpcklbw %%xmm5,%%xmm1 \n"
4074 "punpcklbw %%xmm5,%%xmm2 \n"
4075 "psubw %%xmm2,%%xmm1 \n"
4076 MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
4077 MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
4078 "punpcklbw %%xmm5,%%xmm2 \n"
4079 "punpcklbw %%xmm5,%%xmm3 \n"
4080 "psubw %%xmm3,%%xmm2 \n"
4081 "paddw %%xmm2,%%xmm0 \n"
4082 "paddw %%xmm1,%%xmm0 \n"
4083 "paddw %%xmm1,%%xmm0 \n"
4084 "pxor %%xmm1,%%xmm1 \n"
4085 "psubw %%xmm0,%%xmm1 \n"
4086 "pmaxsw %%xmm1,%%xmm0 \n"
4087 "packuswb %%xmm0,%%xmm0 \n"
4088 MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
4089 "lea " MEMLEA(0x8,0) ",%0 \n"
4090 "sub $0x8,%4 \n"
4091 "jg 1b \n"
4092 : "+r"(src_y0), // %0
4093 "+r"(src_y1), // %1
4094 "+r"(src_y2), // %2
4095 "+r"(dst_sobelx), // %3
4096 "+r"(width) // %4
4097 :
4098 : "memory", "cc", NACL_R14
4099 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4100 );
4101 }
4102 #endif // HAS_SOBELXROW_SSE2
4103
4104 #ifdef HAS_SOBELYROW_SSE2
4105 // SobelY as a matrix is
4106 // -1 -2 -1
4107 // 0 0 0
4108 // 1 2 1
4109 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4110 uint8* dst_sobely, int width) {
4111 asm volatile (
4112 "sub %0,%1 \n"
4113 "sub %0,%2 \n"
4114 "pxor %%xmm5,%%xmm5 \n"
4115
4116 // 8 pixel loop.
4117 LABELALIGN
4118 "1: \n"
4119 "movq " MEMACCESS(0) ",%%xmm0 \n"
4120 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
4121 "punpcklbw %%xmm5,%%xmm0 \n"
4122 "punpcklbw %%xmm5,%%xmm1 \n"
4123 "psubw %%xmm1,%%xmm0 \n"
4124 "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
4125 MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
4126 "punpcklbw %%xmm5,%%xmm1 \n"
4127 "punpcklbw %%xmm5,%%xmm2 \n"
4128 "psubw %%xmm2,%%xmm1 \n"
4129 "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
4130 MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
4131 "punpcklbw %%xmm5,%%xmm2 \n"
4132 "punpcklbw %%xmm5,%%xmm3 \n"
4133 "psubw %%xmm3,%%xmm2 \n"
4134 "paddw %%xmm2,%%xmm0 \n"
4135 "paddw %%xmm1,%%xmm0 \n"
4136 "paddw %%xmm1,%%xmm0 \n"
4137 "pxor %%xmm1,%%xmm1 \n"
4138 "psubw %%xmm0,%%xmm1 \n"
4139 "pmaxsw %%xmm1,%%xmm0 \n"
4140 "packuswb %%xmm0,%%xmm0 \n"
4141 MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
4142 "lea " MEMLEA(0x8,0) ",%0 \n"
4143 "sub $0x8,%3 \n"
4144 "jg 1b \n"
4145 : "+r"(src_y0), // %0
4146 "+r"(src_y1), // %1
4147 "+r"(dst_sobely), // %2
4148 "+r"(width) // %3
4149 :
4150 : "memory", "cc", NACL_R14
4151 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4152 );
4153 }
4154 #endif // HAS_SOBELYROW_SSE2
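Both Sobel rows reduce to three byte differences, a doubled center tap, an absolute value (psubw from zero plus pmaxsw) and saturation (packuswb). A scalar sketch of each (hypothetical helpers):

static void SobelXRow_C_sketch(const uint8* src_y0, const uint8* src_y1,
                               const uint8* src_y2, uint8* dst_sobelx,
                               int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int s = a + b * 2 + c;
    if (s < 0) s = -s;
    dst_sobelx[i] = s > 255 ? 255 : (uint8)s;
  }
}

static void SobelYRow_C_sketch(const uint8* src_y0, const uint8* src_y1,
                               uint8* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y1[i];
    int b = src_y0[i + 1] - src_y1[i + 1];
    int c = src_y0[i + 2] - src_y1[i + 2];
    int s = a + b * 2 + c;
    if (s < 0) s = -s;
    dst_sobely[i] = s > 255 ? 255 : (uint8)s;
  }
}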
4155
4156 #ifdef HAS_SOBELROW_SSE2
4157 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
4158 // A = 255
4159 // R = Sobel
4160 // G = Sobel
4161 // B = Sobel
4162 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4163 uint8* dst_argb, int width) {
4164 asm volatile (
4165 "sub %0,%1 \n"
4166 "pcmpeqb %%xmm5,%%xmm5 \n"
4167 "pslld $0x18,%%xmm5 \n"
4168
4169 // 16 pixel loop.
4170 LABELALIGN
4171 "1: \n"
4172 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4173 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
4174 "lea " MEMLEA(0x10,0) ",%0 \n"
4175 "paddusb %%xmm1,%%xmm0 \n"
4176 "movdqa %%xmm0,%%xmm2 \n"
4177 "punpcklbw %%xmm0,%%xmm2 \n"
4178 "punpckhbw %%xmm0,%%xmm0 \n"
4179 "movdqa %%xmm2,%%xmm1 \n"
4180 "punpcklwd %%xmm2,%%xmm1 \n"
4181 "punpckhwd %%xmm2,%%xmm2 \n"
4182 "por %%xmm5,%%xmm1 \n"
4183 "por %%xmm5,%%xmm2 \n"
4184 "movdqa %%xmm0,%%xmm3 \n"
4185 "punpcklwd %%xmm0,%%xmm3 \n"
4186 "punpckhwd %%xmm0,%%xmm0 \n"
4187 "por %%xmm5,%%xmm3 \n"
4188 "por %%xmm5,%%xmm0 \n"
4189 "movdqu %%xmm1," MEMACCESS(2) " \n"
4190 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
4191 "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
4192 "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
4193 "lea " MEMLEA(0x40,2) ",%2 \n"
4194 "sub $0x10,%3 \n"
4195 "jg 1b \n"
4196 : "+r"(src_sobelx), // %0
4197 "+r"(src_sobely), // %1
4198 "+r"(dst_argb), // %2
4199 "+r"(width) // %3
4200 :
4201 : "memory", "cc", NACL_R14
4202 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4203 );
4204 }
4205 #endif // HAS_SOBELROW_SSE2
4206
4207 #ifdef HAS_SOBELTOPLANEROW_SSE2
4208 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
4209 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4210 uint8* dst_y, int width) {
4211 asm volatile (
4212 "sub %0,%1 \n"
4213 "pcmpeqb %%xmm5,%%xmm5 \n"
4214 "pslld $0x18,%%xmm5 \n"
4215
4216 // 16 pixel loop.
4217 LABELALIGN
4218 "1: \n"
4219 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4220 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
4221 "lea " MEMLEA(0x10,0) ",%0 \n"
4222 "paddusb %%xmm1,%%xmm0 \n"
4223 "movdqu %%xmm0," MEMACCESS(2) " \n"
4224 "lea " MEMLEA(0x10,2) ",%2 \n"
4225 "sub $0x10,%3 \n"
4226 "jg 1b \n"
4227 : "+r"(src_sobelx), // %0
4228 "+r"(src_sobely), // %1
4229 "+r"(dst_y), // %2
4230 "+r"(width) // %3
4231 :
4232 : "memory", "cc", NACL_R14
4233 "xmm0", "xmm1"
4234 );
4235 }
4236 #endif // HAS_SOBELTOPLANEROW_SSE2
4237
4238 #ifdef HAS_SOBELXYROW_SSE2
4239 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
4240 // A = 255
4241 // R = Sobel X
4242 // G = Sobel
4243 // B = Sobel Y
4244 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4245 uint8* dst_argb, int width) {
4246 asm volatile (
4247 "sub %0,%1 \n"
4248 "pcmpeqb %%xmm5,%%xmm5 \n"
4249
4250 // 16 pixel loop.
4251 LABELALIGN
4252 "1: \n"
4253 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4254 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
4255 "lea " MEMLEA(0x10,0) ",%0 \n"
4256 "movdqa %%xmm0,%%xmm2 \n"
4257 "paddusb %%xmm1,%%xmm2 \n"
4258 "movdqa %%xmm0,%%xmm3 \n"
4259 "punpcklbw %%xmm5,%%xmm3 \n"
4260 "punpckhbw %%xmm5,%%xmm0 \n"
4261 "movdqa %%xmm1,%%xmm4 \n"
4262 "punpcklbw %%xmm2,%%xmm4 \n"
4263 "punpckhbw %%xmm2,%%xmm1 \n"
4264 "movdqa %%xmm4,%%xmm6 \n"
4265 "punpcklwd %%xmm3,%%xmm6 \n"
4266 "punpckhwd %%xmm3,%%xmm4 \n"
4267 "movdqa %%xmm1,%%xmm7 \n"
4268 "punpcklwd %%xmm0,%%xmm7 \n"
4269 "punpckhwd %%xmm0,%%xmm1 \n"
4270 "movdqu %%xmm6," MEMACCESS(2) " \n"
4271 "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
4272 "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
4273 "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
4274 "lea " MEMLEA(0x40,2) ",%2 \n"
4275 "sub $0x10,%3 \n"
4276 "jg 1b \n"
4277 : "+r"(src_sobelx), // %0
4278 "+r"(src_sobely), // %1
4279 "+r"(dst_argb), // %2
4280 "+r"(width) // %3
4281 :
4282 : "memory", "cc", NACL_R14
4283 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4284 );
4285 }
4286 #endif // HAS_SOBELXYROW_SSE2
4287
4288 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
4289 // Creates a table of cumulative sums where each value is a sum of all values
4290 // above and to the left of the value, inclusive of the value.
4291 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
4292 const int32* previous_cumsum, int width) {
4293 asm volatile (
4294 "pxor %%xmm0,%%xmm0 \n"
4295 "pxor %%xmm1,%%xmm1 \n"
4296 "sub $0x4,%3 \n"
4297 "jl 49f \n"
4298 "test $0xf,%1 \n"
4299 "jne 49f \n"
4300
4301 // 4 pixel loop.
4302 LABELALIGN
4303 "40: \n"
4304 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
4305 "lea " MEMLEA(0x10,0) ",%0 \n"
4306 "movdqa %%xmm2,%%xmm4 \n"
4307 "punpcklbw %%xmm1,%%xmm2 \n"
4308 "movdqa %%xmm2,%%xmm3 \n"
4309 "punpcklwd %%xmm1,%%xmm2 \n"
4310 "punpckhwd %%xmm1,%%xmm3 \n"
4311 "punpckhbw %%xmm1,%%xmm4 \n"
4312 "movdqa %%xmm4,%%xmm5 \n"
4313 "punpcklwd %%xmm1,%%xmm4 \n"
4314 "punpckhwd %%xmm1,%%xmm5 \n"
4315 "paddd %%xmm2,%%xmm0 \n"
4316 "movdqu " MEMACCESS(2) ",%%xmm2 \n"
4317 "paddd %%xmm0,%%xmm2 \n"
4318 "paddd %%xmm3,%%xmm0 \n"
4319 "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
4320 "paddd %%xmm0,%%xmm3 \n"
4321 "paddd %%xmm4,%%xmm0 \n"
4322 "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
4323 "paddd %%xmm0,%%xmm4 \n"
4324 "paddd %%xmm5,%%xmm0 \n"
4325 "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
4326 "lea " MEMLEA(0x40,2) ",%2 \n"
4327 "paddd %%xmm0,%%xmm5 \n"
4328 "movdqu %%xmm2," MEMACCESS(1) " \n"
4329 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
4330 "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
4331 "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
4332 "lea " MEMLEA(0x40,1) ",%1 \n"
4333 "sub $0x4,%3 \n"
4334 "jge 40b \n"
4335
4336 "49: \n"
4337 "add $0x3,%3 \n"
4338 "jl 19f \n"
4339
4340 // 1 pixel loop.
4341 LABELALIGN
4342 "10: \n"
4343 "movd " MEMACCESS(0) ",%%xmm2 \n"
4344 "lea " MEMLEA(0x4,0) ",%0 \n"
4345 "punpcklbw %%xmm1,%%xmm2 \n"
4346 "punpcklwd %%xmm1,%%xmm2 \n"
4347 "paddd %%xmm2,%%xmm0 \n"
4348 "movdqu " MEMACCESS(2) ",%%xmm2 \n"
4349 "lea " MEMLEA(0x10,2) ",%2 \n"
4350 "paddd %%xmm0,%%xmm2 \n"
4351 "movdqu %%xmm2," MEMACCESS(1) " \n"
4352 "lea " MEMLEA(0x10,1) ",%1 \n"
4353 "sub $0x1,%3 \n"
4354 "jge 10b \n"
4355
4356 "19: \n"
4357 : "+r"(row), // %0
4358 "+r"(cumsum), // %1
4359 "+r"(previous_cumsum), // %2
4360 "+r"(width) // %3
4361 :
4362 : "memory", "cc"
4363 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4364 );
4365 }
4366 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
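A scalar sketch of the cumulative-sum row (hypothetical helper): a per-channel running sum along the row is added to the row above, producing an integral image that the blur code can difference later.

static void ComputeCumulativeSumRow_C_sketch(const uint8* row, int32* cumsum,
                                             const int32* previous_cumsum,
                                             int width) {
  int32 sum[4] = {0, 0, 0, 0};  // running totals for B, G, R, A
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}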
4367
4368 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4369 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
4370 int width, int area, uint8* dst,
4371 int count) {
4372 asm volatile (
4373 "movd %5,%%xmm5 \n"
4374 "cvtdq2ps %%xmm5,%%xmm5 \n"
4375 "rcpss %%xmm5,%%xmm4 \n"
4376 "pshufd $0x0,%%xmm4,%%xmm4 \n"
4377 "sub $0x4,%3 \n"
4378 "jl 49f \n"
4379 "cmpl $0x80,%5 \n"
4380 "ja 40f \n"
4381
4382 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4383 "pcmpeqb %%xmm6,%%xmm6 \n"
4384 "psrld $0x10,%%xmm6 \n"
4385 "cvtdq2ps %%xmm6,%%xmm6 \n"
4386 "addps %%xmm6,%%xmm5 \n"
4387 "mulps %%xmm4,%%xmm5 \n"
4388 "cvtps2dq %%xmm5,%%xmm5 \n"
4389 "packssdw %%xmm5,%%xmm5 \n"
4390
4391 // 4 pixel small loop.
4392 LABELALIGN
4393 "4: \n"
4394 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4395 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4396 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
4397 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
4398 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
4399 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
4400 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
4401 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
4402 "lea " MEMLEA(0x40,0) ",%0 \n"
4403 "psubd " MEMACCESS(1) ",%%xmm0 \n"
4404 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
4405 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
4406 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
4407 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
4408 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
4409 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
4410 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
4411 "lea " MEMLEA(0x40,1) ",%1 \n"
4412 "packssdw %%xmm1,%%xmm0 \n"
4413 "packssdw %%xmm3,%%xmm2 \n"
4414 "pmulhuw %%xmm5,%%xmm0 \n"
4415 "pmulhuw %%xmm5,%%xmm2 \n"
4416 "packuswb %%xmm2,%%xmm0 \n"
4417 "movdqu %%xmm0," MEMACCESS(2) " \n"
4418 "lea " MEMLEA(0x10,2) ",%2 \n"
4419 "sub $0x4,%3 \n"
4420 "jge 4b \n"
4421 "jmp 49f \n"
4422
4423 // 4 pixel loop.
4424 LABELALIGN
4425 "40: \n"
4426 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4427 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4428 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
4429 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
4430 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
4431 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
4432 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
4433 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
4434 "lea " MEMLEA(0x40,0) ",%0 \n"
4435 "psubd " MEMACCESS(1) ",%%xmm0 \n"
4436 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
4437 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
4438 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
4439 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
4440 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
4441 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
4442 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
4443 "lea " MEMLEA(0x40,1) ",%1 \n"
4444 "cvtdq2ps %%xmm0,%%xmm0 \n"
4445 "cvtdq2ps %%xmm1,%%xmm1 \n"
4446 "mulps %%xmm4,%%xmm0 \n"
4447 "mulps %%xmm4,%%xmm1 \n"
4448 "cvtdq2ps %%xmm2,%%xmm2 \n"
4449 "cvtdq2ps %%xmm3,%%xmm3 \n"
4450 "mulps %%xmm4,%%xmm2 \n"
4451 "mulps %%xmm4,%%xmm3 \n"
4452 "cvtps2dq %%xmm0,%%xmm0 \n"
4453 "cvtps2dq %%xmm1,%%xmm1 \n"
4454 "cvtps2dq %%xmm2,%%xmm2 \n"
4455 "cvtps2dq %%xmm3,%%xmm3 \n"
4456 "packssdw %%xmm1,%%xmm0 \n"
4457 "packssdw %%xmm3,%%xmm2 \n"
4458 "packuswb %%xmm2,%%xmm0 \n"
4459 "movdqu %%xmm0," MEMACCESS(2) " \n"
4460 "lea " MEMLEA(0x10,2) ",%2 \n"
4461 "sub $0x4,%3 \n"
4462 "jge 40b \n"
4463
4464 "49: \n"
4465 "add $0x3,%3 \n"
4466 "jl 19f \n"
4467
4468 // 1 pixel loop.
4469 LABELALIGN
4470 "10: \n"
4471 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4472 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
4473 "lea " MEMLEA(0x10,0) ",%0 \n"
4474 "psubd " MEMACCESS(1) ",%%xmm0 \n"
4475 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
4476 "lea " MEMLEA(0x10,1) ",%1 \n"
4477 "cvtdq2ps %%xmm0,%%xmm0 \n"
4478 "mulps %%xmm4,%%xmm0 \n"
4479 "cvtps2dq %%xmm0,%%xmm0 \n"
4480 "packssdw %%xmm0,%%xmm0 \n"
4481 "packuswb %%xmm0,%%xmm0 \n"
4482 "movd %%xmm0," MEMACCESS(2) " \n"
4483 "lea " MEMLEA(0x4,2) ",%2 \n"
4484 "sub $0x1,%3 \n"
4485 "jge 10b \n"
4486 "19: \n"
4487 : "+r"(topleft), // %0
4488 "+r"(botleft), // %1
4489 "+r"(dst), // %2
4490 "+rm"(count) // %3
4491 : "r"((intptr_t)(width)), // %4
4492 "rm"(area) // %5
4493 : "memory", "cc", NACL_R14
4494 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4495 );
4496 }
4497 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
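The averaging row recovers each box sum from the integral image with four taps (TL - TR - BL + BR) and scales by the reciprocal of the area; the small-area path folds that scale into a 16-bit pmulhuw constant. A scalar sketch (hypothetical helper; width here is the box offset in int32 lanes, matching operand %4 above):

static void CumulativeSumToAverageRow_C_sketch(const int32* topleft,
                                               const int32* botleft,
                                               int width, int area,
                                               uint8* dst, int count) {
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int32 sum = topleft[c] - topleft[width + c] -
                  botleft[c] + botleft[width + c];
      dst[c] = (uint8)(sum / area);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}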
4498
4499 #ifdef HAS_ARGBAFFINEROW_SSE2
4500 // Copy ARGB pixels from source image with slope to a row of destination.
4501 LIBYUV_API
4502 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
4503 uint8* dst_argb, const float* src_dudv, int width) {
4504 intptr_t src_argb_stride_temp = src_argb_stride;
4505 intptr_t temp = 0;
4506 asm volatile (
4507 "movq " MEMACCESS(3) ",%%xmm2 \n"
4508 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
4509 "shl $0x10,%1 \n"
4510 "add $0x4,%1 \n"
4511 "movd %1,%%xmm5 \n"
4512 "sub $0x4,%4 \n"
4513 "jl 49f \n"
4514
4515 "pshufd $0x44,%%xmm7,%%xmm7 \n"
4516 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4517 "movdqa %%xmm2,%%xmm0 \n"
4518 "addps %%xmm7,%%xmm0 \n"
4519 "movlhps %%xmm0,%%xmm2 \n"
4520 "movdqa %%xmm7,%%xmm4 \n"
4521 "addps %%xmm4,%%xmm4 \n"
4522 "movdqa %%xmm2,%%xmm3 \n"
4523 "addps %%xmm4,%%xmm3 \n"
4524 "addps %%xmm4,%%xmm4 \n"
4525
4526 // 4 pixel loop.
4527 LABELALIGN
4528 "40: \n"
4529 "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
4530 "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
4531 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
4532 "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
4533 "movd %%xmm0,%k1 \n"
4534 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4535 "movd %%xmm0,%k5 \n"
4536 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4537 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
4538 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
4539 "punpckldq %%xmm6,%%xmm1 \n"
4540 "addps %%xmm4,%%xmm2 \n"
4541 "movq %%xmm1," MEMACCESS(2) " \n"
4542 "movd %%xmm0,%k1 \n"
4543 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4544 "movd %%xmm0,%k5 \n"
4545 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
4546 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
4547 "punpckldq %%xmm6,%%xmm0 \n"
4548 "addps %%xmm4,%%xmm3 \n"
4549 "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
4550 "lea " MEMLEA(0x10,2) ",%2 \n"
4551 "sub $0x4,%4 \n"
4552 "jge 40b \n"
4553
4554 "49: \n"
4555 "add $0x3,%4 \n"
4556 "jl 19f \n"
4557
4558 // 1 pixel loop.
4559 LABELALIGN
4560 "10: \n"
4561 "cvttps2dq %%xmm2,%%xmm0 \n"
4562 "packssdw %%xmm0,%%xmm0 \n"
4563 "pmaddwd %%xmm5,%%xmm0 \n"
4564 "addps %%xmm7,%%xmm2 \n"
4565 "movd %%xmm0,%k1 \n"
4566 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
4567 "movd %%xmm0," MEMACCESS(2) " \n"
4568 "lea " MEMLEA(0x04,2) ",%2 \n"
4569 "sub $0x1,%4 \n"
4570 "jge 10b \n"
4571 "19: \n"
4572 : "+r"(src_argb), // %0
4573 "+r"(src_argb_stride_temp), // %1
4574 "+r"(dst_argb), // %2
4575 "+r"(src_dudv), // %3
4576 "+rm"(width), // %4
4577 "+r"(temp) // %5
4578 :
4579 : "memory", "cc", NACL_R14
4580 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4581 );
4582 }
4583 #endif // HAS_ARGBAFFINEROW_SSE2
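A scalar sketch of the affine row (hypothetical helper): src_dudv packs the starting texture coordinate (u, v) followed by the per-pixel step (du, dv), and the cvttps2dq truncation corresponds to the (int) casts below.

static void ARGBAffineRow_C_sketch(const uint8* src_argb, int src_argb_stride,
                                   uint8* dst_argb, const float* src_dudv,
                                   int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;
    int y = (int)v;
    *(uint32*)(dst_argb + i * 4) =
        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
    u += src_dudv[2];  // du
    v += src_dudv[3];  // dv
  }
}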
4584
4585 #ifdef HAS_INTERPOLATEROW_SSSE3
4586 // Bilinear filter 16x2 -> 16x1
4587 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4588 ptrdiff_t src_stride, int dst_width,
4589 int source_y_fraction) {
4590 asm volatile (
4591 "sub %1,%0 \n"
4592 "shr %3 \n"
4593 "cmp $0x0,%3 \n"
4594 "je 100f \n"
4595 "cmp $0x20,%3 \n"
4596 "je 75f \n"
4597 "cmp $0x40,%3 \n"
4598 "je 50f \n"
4599 "cmp $0x60,%3 \n"
4600 "je 25f \n"
4601
4602 "movd %3,%%xmm0 \n"
4603 "neg %3 \n"
4604 "add $0x80,%3 \n"
4605 "movd %3,%%xmm5 \n"
4606 "punpcklbw %%xmm0,%%xmm5 \n"
4607 "punpcklwd %%xmm5,%%xmm5 \n"
4608 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4609
4610 // General purpose row blend.
4611 LABELALIGN
4612 "1: \n"
4613 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4614 MEMOPREG(movdqu,0x00,1,4,1,xmm2)
4615 "movdqa %%xmm0,%%xmm1 \n"
4616 "punpcklbw %%xmm2,%%xmm0 \n"
4617 "punpckhbw %%xmm2,%%xmm1 \n"
4618 "pmaddubsw %%xmm5,%%xmm0 \n"
4619 "pmaddubsw %%xmm5,%%xmm1 \n"
4620 "psrlw $0x7,%%xmm0 \n"
4621 "psrlw $0x7,%%xmm1 \n"
4622 "packuswb %%xmm1,%%xmm0 \n"
4623 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4624 "lea " MEMLEA(0x10,1) ",%1 \n"
4625 "sub $0x10,%2 \n"
4626 "jg 1b \n"
4627 "jmp 99f \n"
4628
4629 // Blend 25 / 75.
4630 LABELALIGN
4631 "25: \n"
4632 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4633 MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4634 "pavgb %%xmm1,%%xmm0 \n"
4635 "pavgb %%xmm1,%%xmm0 \n"
4636 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4637 "lea " MEMLEA(0x10,1) ",%1 \n"
4638 "sub $0x10,%2 \n"
4639 "jg 25b \n"
4640 "jmp 99f \n"
4641
4642 // Blend 50 / 50.
4643 LABELALIGN
4644 "50: \n"
4645 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4646 MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4647 "pavgb %%xmm1,%%xmm0 \n"
4648 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4649 "lea " MEMLEA(0x10,1) ",%1 \n"
4650 "sub $0x10,%2 \n"
4651 "jg 50b \n"
4652 "jmp 99f \n"
4653
4654 // Blend 75 / 25.
4655 LABELALIGN
4656 "75: \n"
4657 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4658 MEMOPREG(movdqu,0x00,1,4,1,xmm0)
4659 "pavgb %%xmm1,%%xmm0 \n"
4660 "pavgb %%xmm1,%%xmm0 \n"
4661 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4662 "lea " MEMLEA(0x10,1) ",%1 \n"
4663 "sub $0x10,%2 \n"
4664 "jg 75b \n"
4665 "jmp 99f \n"
4666
4667 // Blend 100 / 0 - Copy row unchanged.
4668 LABELALIGN
4669 "100: \n"
4670 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4671 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4672 "lea " MEMLEA(0x10,1) ",%1 \n"
4673 "sub $0x10,%2 \n"
4674 "jg 100b \n"
4675
4676 "99: \n"
4677 : "+r"(dst_ptr), // %0
4678 "+r"(src_ptr), // %1
4679 "+r"(dst_width), // %2
4680 "+r"(source_y_fraction) // %3
4681 : "r"((intptr_t)(src_stride)) // %4
4682 : "memory", "cc", NACL_R14
4683 "xmm0", "xmm1", "xmm2", "xmm5"
4684 );
4685 }
4686 #endif // HAS_INTERPOLATEROW_SSSE3
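The interpolate rows halve source_y_fraction to a 7-bit blend factor and special-case the 0, 25, 50 and 75 percent blends with pavgb. The general path is equivalent to this scalar sketch (hypothetical helper; note the plain psrlw $7 with no rounding term):

static void InterpolateRow_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int width,
                                    int source_y_fraction) {
  int f = source_y_fraction >> 1;  // 0..128
  const uint8* src1 = src_ptr + src_stride;
  for (int i = 0; i < width; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[i] * (128 - f) + src1[i] * f) >> 7);
  }
}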
4687
4688 #ifdef HAS_INTERPOLATEROW_AVX2
4689 // Bilinear filter 32x2 -> 32x1
4690 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
4691 ptrdiff_t src_stride, int dst_width,
4692 int source_y_fraction) {
4693 asm volatile (
4694 "shr %3 \n"
4695 "cmp $0x0,%3 \n"
4696 "je 100f \n"
4697 "sub %1,%0 \n"
4698 "cmp $0x20,%3 \n"
4699 "je 75f \n"
4700 "cmp $0x40,%3 \n"
4701 "je 50f \n"
4702 "cmp $0x60,%3 \n"
4703 "je 25f \n"
4704
4705 "vmovd %3,%%xmm0 \n"
4706 "neg %3 \n"
4707 "add $0x80,%3 \n"
4708 "vmovd %3,%%xmm5 \n"
4709 "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
4710 "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
4711 "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
4712 "vpermd %%ymm5,%%ymm0,%%ymm5 \n"
4713
4714 // General purpose row blend.
4715 LABELALIGN
4716 "1: \n"
4717 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
4718 MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
4719 "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
4720 "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
4721 "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n"
4722 "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n"
4723 "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
4724 "vpsrlw $0x7,%%ymm1,%%ymm1 \n"
4725 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4726 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4727 "lea " MEMLEA(0x20,1) ",%1 \n"
4728 "sub $0x20,%2 \n"
4729 "jg 1b \n"
4730 "jmp 99f \n"
4731
4732 // Blend 25 / 75.
4733 LABELALIGN
4734 "25: \n"
4735 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
4736 MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
4737 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
4738 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
4739 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4740 "lea " MEMLEA(0x20,1) ",%1 \n"
4741 "sub $0x20,%2 \n"
4742 "jg 25b \n"
4743 "jmp 99f \n"
4744
4745 // Blend 50 / 50.
4746 LABELALIGN
4747 "50: \n"
4748 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
4749 VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
4750 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4751 "lea " MEMLEA(0x20,1) ",%1 \n"
4752 "sub $0x20,%2 \n"
4753 "jg 50b \n"
4754 "jmp 99f \n"
4755
4756 // Blend 75 / 25.
4757 LABELALIGN
4758 "75: \n"
4759 "vmovdqu " MEMACCESS(1) ",%%ymm1 \n"
4760 MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
4761 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
4762 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
4763 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4764 "lea " MEMLEA(0x20,1) ",%1 \n"
4765 "sub $0x20,%2 \n"
4766 "jg 75b \n"
4767 "jmp 99f \n"
4768
4769 // Blend 100 / 0 - Copy row unchanged.
4770 LABELALIGN
4771 "100: \n"
4772 "rep movsb " MEMMOVESTRING(1,0) " \n"
4773 "jmp 999f \n"
4774
4775 "99: \n"
4776 "vzeroupper \n"
4777 "999: \n"
4778 : "+D"(dst_ptr), // %0
4779 "+S"(src_ptr), // %1
4780 "+c"(dst_width), // %2
4781 "+r"(source_y_fraction) // %3
4782 : "r"((intptr_t)(src_stride)) // %4
4783 : "memory", "cc", NACL_R14
4784 "xmm0", "xmm1", "xmm2", "xmm5"
4785 );
4786 }
4787 #endif // HAS_INTERPOLATEROW_AVX2
4788
4789 #ifdef HAS_INTERPOLATEROW_SSE2
4790 // Bilinear filter 16x2 -> 16x1
4791 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
4792 ptrdiff_t src_stride, int dst_width,
4793 int source_y_fraction) {
4794 asm volatile (
4795 "sub %1,%0 \n"
4796 "shr %3 \n"
4797 "cmp $0x0,%3 \n"
4798 "je 100f \n"
4799 "cmp $0x20,%3 \n"
4800 "je 75f \n"
4801 "cmp $0x40,%3 \n"
4802 "je 50f \n"
4803 "cmp $0x60,%3 \n"
4804 "je 25f \n"
4805
4806 "movd %3,%%xmm0 \n"
4807 "neg %3 \n"
4808 "add $0x80,%3 \n"
4809 "movd %3,%%xmm5 \n"
4810 "punpcklbw %%xmm0,%%xmm5 \n"
4811 "punpcklwd %%xmm5,%%xmm5 \n"
4812 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4813 "pxor %%xmm4,%%xmm4 \n"
4814
4815 // General purpose row blend.
4816 LABELALIGN
4817 "1: \n"
4818 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4819 MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
4820 "movdqa %%xmm0,%%xmm1 \n"
4821 "movdqa %%xmm2,%%xmm3 \n"
4822 "punpcklbw %%xmm4,%%xmm2 \n"
4823 "punpckhbw %%xmm4,%%xmm3 \n"
4824 "punpcklbw %%xmm4,%%xmm0 \n"
4825 "punpckhbw %%xmm4,%%xmm1 \n"
4826 "psubw %%xmm0,%%xmm2 \n"
4827 "psubw %%xmm1,%%xmm3 \n"
4828 "paddw %%xmm2,%%xmm2 \n"
4829 "paddw %%xmm3,%%xmm3 \n"
4830 "pmulhw %%xmm5,%%xmm2 \n"
4831 "pmulhw %%xmm5,%%xmm3 \n"
4832 "paddw %%xmm2,%%xmm0 \n"
4833 "paddw %%xmm3,%%xmm1 \n"
4834 "packuswb %%xmm1,%%xmm0 \n"
4835 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4836 "lea " MEMLEA(0x10,1) ",%1 \n"
4837 "sub $0x10,%2 \n"
4838 "jg 1b \n"
4839 "jmp 99f \n"
4840
4841 // Blend 25 / 75.
4842 LABELALIGN
4843 "25: \n"
4844 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4845 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
4846 "pavgb %%xmm1,%%xmm0 \n"
4847 "pavgb %%xmm1,%%xmm0 \n"
4848 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4849 "lea " MEMLEA(0x10,1) ",%1 \n"
4850 "sub $0x10,%2 \n"
4851 "jg 25b \n"
4852 "jmp 99f \n"
4853
4854 // Blend 50 / 50.
4855 LABELALIGN
4856 "50: \n"
4857 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4858 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
4859 "pavgb %%xmm1,%%xmm0 \n"
4860 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4861 "lea " MEMLEA(0x10,1) ",%1 \n"
4862 "sub $0x10,%2 \n"
4863 "jg 50b \n"
4864 "jmp 99f \n"
4865
4866 // Blend 75 / 25.
4867 LABELALIGN
4868 "75: \n"
4869 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4870 MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
4871 "pavgb %%xmm1,%%xmm0 \n"
4872 "pavgb %%xmm1,%%xmm0 \n"
4873 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4874 "lea " MEMLEA(0x10,1) ",%1 \n"
4875 "sub $0x10,%2 \n"
4876 "jg 75b \n"
4877 "jmp 99f \n"
4878
4879 // Blend 100 / 0 - Copy row unchanged.
4880 LABELALIGN
4881 "100: \n"
4882 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4883 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4884 "lea " MEMLEA(0x10,1) ",%1 \n"
4885 "sub $0x10,%2 \n"
4886 "jg 100b \n"
4887
4888 "99: \n"
4889 : "+r"(dst_ptr), // %0
4890 "+r"(src_ptr), // %1
4891 "+r"(dst_width), // %2
4892 "+r"(source_y_fraction) // %3
4893 : "r"((intptr_t)(src_stride)) // %4
4894 : "memory", "cc", NACL_R14
4895 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4896 );
4897 }
4898 #endif // HAS_INTERPOLATEROW_SSE2
4899
4900 #ifdef HAS_ARGBTOBAYERGGROW_SSE2
4901 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
4902 uint32 selector, int pix) {
4903 asm volatile (
4904 "pcmpeqb %%xmm5,%%xmm5 \n"
4905 "psrld $0x18,%%xmm5 \n"
4906 LABELALIGN
4907 "1: \n"
4908 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4909 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4910 "lea " MEMLEA(0x20,0) ",%0 \n"
4911 "psrld $0x8,%%xmm0 \n"
4912 "psrld $0x8,%%xmm1 \n"
4913 "pand %%xmm5,%%xmm0 \n"
4914 "pand %%xmm5,%%xmm1 \n"
4915 "packssdw %%xmm1,%%xmm0 \n"
4916 "packuswb %%xmm1,%%xmm0 \n"
4917 "movq %%xmm0," MEMACCESS(1) " \n"
4918 "lea " MEMLEA(0x8,1) ",%1 \n"
4919 "sub $0x8,%2 \n"
4920 "jg 1b \n"
4921 : "+r"(src_argb), // %0
4922 "+r"(dst_bayer), // %1
4923 "+r"(pix) // %2
4924 :
4925 : "memory", "cc"
4926 , "xmm0", "xmm1", "xmm5"
4927 );
4928 }
4929 #endif // HAS_ARGBTOBAYERGGROW_SSE2
4930
4931 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
4932 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
4933 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4934 const uint8* shuffler, int pix) {
4935 asm volatile (
4936 "movdqu " MEMACCESS(3) ",%%xmm5 \n"
4937 LABELALIGN
4938 "1: \n"
4939 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4940 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4941 "lea " MEMLEA(0x20,0) ",%0 \n"
4942 "pshufb %%xmm5,%%xmm0 \n"
4943 "pshufb %%xmm5,%%xmm1 \n"
4944 "movdqu %%xmm0," MEMACCESS(1) " \n"
4945 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
4946 "lea " MEMLEA(0x20,1) ",%1 \n"
4947 "sub $0x8,%2 \n"
4948 "jg 1b \n"
4949 : "+r"(src_argb), // %0
4950 "+r"(dst_argb), // %1
4951 "+r"(pix) // %2
4952 : "r"(shuffler) // %3
4953 : "memory", "cc"
4954 , "xmm0", "xmm1", "xmm5"
4955 );
4956 }
4957 #endif // HAS_ARGBSHUFFLEROW_SSSE3
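All three shuffle variants implement the same byte permutation: for each output byte of a pixel, shuffler gives the source byte index to copy. A scalar sketch (hypothetical helper):

static void ARGBShuffleRow_C_sketch(const uint8* src_argb, uint8* dst_argb,
                                    const uint8* shuffler, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}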
4958
4959 #ifdef HAS_ARGBSHUFFLEROW_AVX2
4960 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
4961 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4962 const uint8* shuffler, int pix) {
4963 asm volatile (
4964 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
4965 LABELALIGN
4966 "1: \n"
4967 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
4968 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
4969 "lea " MEMLEA(0x40,0) ",%0 \n"
4970 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
4971 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
4972 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
4973 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
4974 "lea " MEMLEA(0x40,1) ",%1 \n"
4975 "sub $0x10,%2 \n"
4976 "jg 1b \n"
4977 "vzeroupper \n"
4978 : "+r"(src_argb), // %0
4979 "+r"(dst_argb), // %1
4980 "+r"(pix) // %2
4981 : "r"(shuffler) // %3
4982 : "memory", "cc"
4983 , "xmm0", "xmm1", "xmm5"
4984 );
4985 }
4986 #endif // HAS_ARGBSHUFFLEROW_AVX2
4987
4988 #ifdef HAS_ARGBSHUFFLEROW_SSE2
4989 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
4990 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4991 const uint8* shuffler, int pix) {
4992 uintptr_t pixel_temp = 0u;
4993 asm volatile (
4994 "pxor %%xmm5,%%xmm5 \n"
4995 "mov " MEMACCESS(4) ",%k2 \n"
4996 "cmp $0x3000102,%k2 \n"
4997 "je 3012f \n"
4998 "cmp $0x10203,%k2 \n"
4999 "je 123f \n"
5000 "cmp $0x30201,%k2 \n"
5001 "je 321f \n"
5002 "cmp $0x2010003,%k2 \n"
5003 "je 2103f \n"
5004
5005 LABELALIGN
5006 "1: \n"
5007 "movzb " MEMACCESS(4) ",%2 \n"
5008 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5009 "mov %b2," MEMACCESS(1) " \n"
5010 "movzb " MEMACCESS2(0x1,4) ",%2 \n"
5011 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5012 "mov %b2," MEMACCESS2(0x1,1) " \n"
5013 "movzb " MEMACCESS2(0x2,4) ",%2 \n"
5014 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5015 "mov %b2," MEMACCESS2(0x2,1) " \n"
5016 "movzb " MEMACCESS2(0x3,4) ",%2 \n"
5017 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5018 "mov %b2," MEMACCESS2(0x3,1) " \n"
5019 "lea " MEMLEA(0x4,0) ",%0 \n"
5020 "lea " MEMLEA(0x4,1) ",%1 \n"
5021 "sub $0x1,%3 \n"
5022 "jg 1b \n"
5023 "jmp 99f \n"
5024
5025 LABELALIGN
5026 "123: \n"
5027 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5028 "lea " MEMLEA(0x10,0) ",%0 \n"
5029 "movdqa %%xmm0,%%xmm1 \n"
5030 "punpcklbw %%xmm5,%%xmm0 \n"
5031 "punpckhbw %%xmm5,%%xmm1 \n"
5032 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
5033 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
5034 "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
5035 "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
5036 "packuswb %%xmm1,%%xmm0 \n"
5037 "movdqu %%xmm0," MEMACCESS(1) " \n"
5038 "lea " MEMLEA(0x10,1) ",%1 \n"
5039 "sub $0x4,%3 \n"
5040 "jg 123b \n"
5041 "jmp 99f \n"
5042
5043 LABELALIGN
5044 "321: \n"
5045 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5046 "lea " MEMLEA(0x10,0) ",%0 \n"
5047 "movdqa %%xmm0,%%xmm1 \n"
5048 "punpcklbw %%xmm5,%%xmm0 \n"
5049 "punpckhbw %%xmm5,%%xmm1 \n"
5050 "pshufhw $0x39,%%xmm0,%%xmm0 \n"
5051 "pshuflw $0x39,%%xmm0,%%xmm0 \n"
5052 "pshufhw $0x39,%%xmm1,%%xmm1 \n"
5053 "pshuflw $0x39,%%xmm1,%%xmm1 \n"
5054 "packuswb %%xmm1,%%xmm0 \n"
5055 "movdqu %%xmm0," MEMACCESS(1) " \n"
5056 "lea " MEMLEA(0x10,1) ",%1 \n"
5057 "sub $0x4,%3 \n"
5058 "jg 321b \n"
5059 "jmp 99f \n"
5060
5061 LABELALIGN
5062 "2103: \n"
5063 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5064 "lea " MEMLEA(0x10,0) ",%0 \n"
5065 "movdqa %%xmm0,%%xmm1 \n"
5066 "punpcklbw %%xmm5,%%xmm0 \n"
5067 "punpckhbw %%xmm5,%%xmm1 \n"
5068 "pshufhw $0x93,%%xmm0,%%xmm0 \n"
5069 "pshuflw $0x93,%%xmm0,%%xmm0 \n"
5070 "pshufhw $0x93,%%xmm1,%%xmm1 \n"
5071 "pshuflw $0x93,%%xmm1,%%xmm1 \n"
5072 "packuswb %%xmm1,%%xmm0 \n"
5073 "movdqu %%xmm0," MEMACCESS(1) " \n"
5074 "lea " MEMLEA(0x10,1) ",%1 \n"
5075 "sub $0x4,%3 \n"
5076 "jg 2103b \n"
5077 "jmp 99f \n"
5078
5079 LABELALIGN
5080 "3012: \n"
5081 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5082 "lea " MEMLEA(0x10,0) ",%0 \n"
5083 "movdqa %%xmm0,%%xmm1 \n"
5084 "punpcklbw %%xmm5,%%xmm0 \n"
5085 "punpckhbw %%xmm5,%%xmm1 \n"
5086 "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
5087 "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
5088 "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
5089 "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
5090 "packuswb %%xmm1,%%xmm0 \n"
5091 "movdqu %%xmm0," MEMACCESS(1) " \n"
5092 "lea " MEMLEA(0x10,1) ",%1 \n"
5093 "sub $0x4,%3 \n"
5094 "jg 3012b \n"
5095
5096 "99: \n"
5097 : "+r"(src_argb), // %0
5098 "+r"(dst_argb), // %1
5099 "+d"(pixel_temp), // %2
5100 "+r"(pix) // %3
5101 : "r"(shuffler) // %4
5102 : "memory", "cc", NACL_R14
5103 "xmm0", "xmm1", "xmm5"
5104 );
5105 }
5106 #endif // HAS_ARGBSHUFFLEROW_SSE2
5107
5108 #ifdef HAS_I422TOYUY2ROW_SSE2
5109 void I422ToYUY2Row_SSE2(const uint8* src_y,
5110 const uint8* src_u,
5111 const uint8* src_v,
5112 uint8* dst_frame, int width) {
5113 asm volatile (
5114 "sub %1,%2 \n"
5115 LABELALIGN
5116 "1: \n"
5117 "movq " MEMACCESS(1) ",%%xmm2 \n"
5118 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
5119 "lea " MEMLEA(0x8,1) ",%1 \n"
5120 "punpcklbw %%xmm3,%%xmm2 \n"
5121 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5122 "lea " MEMLEA(0x10,0) ",%0 \n"
5123 "movdqa %%xmm0,%%xmm1 \n"
5124 "punpcklbw %%xmm2,%%xmm0 \n"
5125 "punpckhbw %%xmm2,%%xmm1 \n"
5126 "movdqu %%xmm0," MEMACCESS(3) " \n"
5127 "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
5128 "lea " MEMLEA(0x20,3) ",%3 \n"
5129 "sub $0x10,%4 \n"
5130 "jg 1b \n"
5131 : "+r"(src_y), // %0
5132 "+r"(src_u), // %1
5133 "+r"(src_v), // %2
5134 "+r"(dst_frame), // %3
5135 "+rm"(width) // %4
5136 :
5137 : "memory", "cc", NACL_R14
5138 "xmm0", "xmm1", "xmm2", "xmm3"
5139 );
5140 }
5141 #endif // HAS_I422TOYUY2ROW_SSE2
5142
5143 #ifdef HAS_I422TOUYVYROW_SSE2
5144 void I422ToUYVYRow_SSE2(const uint8* src_y,
5145 const uint8* src_u,
5146 const uint8* src_v,
5147 uint8* dst_frame, int width) {
5148 asm volatile (
5149 "sub %1,%2 \n"
5150 LABELALIGN
5151 "1: \n"
5152 "movq " MEMACCESS(1) ",%%xmm2 \n"
5153 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
5154 "lea " MEMLEA(0x8,1) ",%1 \n"
5155 "punpcklbw %%xmm3,%%xmm2 \n"
5156 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5157 "movdqa %%xmm2,%%xmm1 \n"
5158 "lea " MEMLEA(0x10,0) ",%0 \n"
5159 "punpcklbw %%xmm0,%%xmm1 \n"
5160 "punpckhbw %%xmm0,%%xmm2 \n"
5161 "movdqu %%xmm1," MEMACCESS(3) " \n"
5162 "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
5163 "lea " MEMLEA(0x20,3) ",%3 \n"
5164 "sub $0x10,%4 \n"
5165 "jg 1b \n"
5166 : "+r"(src_y), // %0
5167 "+r"(src_u), // %1
5168 "+r"(src_v), // %2
5169 "+r"(dst_frame), // %3
5170 "+rm"(width) // %4
5171 :
5172 : "memory", "cc", NACL_R14
5173 "xmm0", "xmm1", "xmm2", "xmm3"
5174 );
5175 }
5176 #endif // HAS_I422TOUYVYROW_SSE2
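Both packers interleave one U/V pair with two Y samples: YUY2 orders bytes Y0 U Y1 V, while UYVY orders them U Y0 V Y1. A scalar sketch of the YUY2 case for even widths (hypothetical helper):

static void I422ToYUY2Row_C_sketch(const uint8* src_y, const uint8* src_u,
                                   const uint8* src_v, uint8* dst_frame,
                                   int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_frame += 4;
  }
}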
5177
5178 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
5179 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
5180 uint8* dst_argb, const float* poly,
5181 int width) {
5182 asm volatile (
5183 "pxor %%xmm3,%%xmm3 \n"
5184
5185 // 2 pixel loop.
5186 LABELALIGN
5187 "1: \n"
5188 "movq " MEMACCESS(0) ",%%xmm0 \n"
5189 "lea " MEMLEA(0x8,0) ",%0 \n"
5190 "punpcklbw %%xmm3,%%xmm0 \n"
5191 "movdqa %%xmm0,%%xmm4 \n"
5192 "punpcklwd %%xmm3,%%xmm0 \n"
5193 "punpckhwd %%xmm3,%%xmm4 \n"
5194 "cvtdq2ps %%xmm0,%%xmm0 \n"
5195 "cvtdq2ps %%xmm4,%%xmm4 \n"
5196 "movdqa %%xmm0,%%xmm1 \n"
5197 "movdqa %%xmm4,%%xmm5 \n"
5198 "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
5199 "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
5200 "addps " MEMACCESS(3) ",%%xmm0 \n"
5201 "addps " MEMACCESS(3) ",%%xmm4 \n"
5202 "movdqa %%xmm1,%%xmm2 \n"
5203 "movdqa %%xmm5,%%xmm6 \n"
5204 "mulps %%xmm1,%%xmm2 \n"
5205 "mulps %%xmm5,%%xmm6 \n"
5206 "mulps %%xmm2,%%xmm1 \n"
5207 "mulps %%xmm6,%%xmm5 \n"
5208 "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
5209 "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
5210 "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
5211 "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
5212 "addps %%xmm2,%%xmm0 \n"
5213 "addps %%xmm6,%%xmm4 \n"
5214 "addps %%xmm1,%%xmm0 \n"
5215 "addps %%xmm5,%%xmm4 \n"
5216 "cvttps2dq %%xmm0,%%xmm0 \n"
5217 "cvttps2dq %%xmm4,%%xmm4 \n"
5218 "packuswb %%xmm4,%%xmm0 \n"
5219 "packuswb %%xmm0,%%xmm0 \n"
5220 "movq %%xmm0," MEMACCESS(1) " \n"
5221 "lea " MEMLEA(0x8,1) ",%1 \n"
5222 "sub $0x2,%2 \n"
5223 "jg 1b \n"
5224 : "+r"(src_argb), // %0
5225 "+r"(dst_argb), // %1
5226 "+r"(width) // %2
5227 : "r"(poly) // %3
5228 : "memory", "cc"
5229 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
5230 );
5231 }
5232 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
5233
5234 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
5235 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
5236 uint8* dst_argb, const float* poly,
5237 int width) {
5238 asm volatile (
5239 "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
5240 "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
5241 "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
5242 "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
5243
5244 // 2 pixel loop.
5245 LABELALIGN
5246 "1: \n"
5247 "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
5248 "lea " MEMLEA(0x8,0) ",%0 \n"
5249 "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
5250 "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
5251 "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
5252 "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
5253 "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
5254 "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
5255 "vcvttps2dq %%ymm0,%%ymm0 \n"
5256 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
5257 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
5258 "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
5259 "vmovq %%xmm0," MEMACCESS(1) " \n"
5260 "lea " MEMLEA(0x8,1) ",%1 \n"
5261 "sub $0x2,%2 \n"
5262 "jg 1b \n"
5263 "vzeroupper \n"
5264 : "+r"(src_argb), // %0
5265 "+r"(dst_argb), // %1
5266 "+r"(width) // %2
5267 : "r"(poly) // %3
5268 : "memory", "cc",
5269 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
5270 );
5271 }
5272 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
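Both polynomial rows evaluate a per-channel cubic; poly stores four coefficients per channel, grouped as C0 in poly[0..3], C1 in poly[4..7], C2 in poly[8..11] and C3 in poly[12..15]. A scalar sketch (hypothetical helper; cvttps2dq truncation and packuswb clamping written out explicitly):

static void ARGBPolynomialRow_C_sketch(const uint8* src_argb, uint8* dst_argb,
                                       const float* poly, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int c = i & 3;  // channel index within the pixel
    float x = (float)src_argb[i];
    float r = poly[c] + poly[4 + c] * x + poly[8 + c] * x * x +
              poly[12 + c] * x * x * x;
    dst_argb[i] = r < 0.f ? 0 : (r > 255.f ? 255 : (uint8)r);
  }
}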
5273
5274 #ifdef HAS_ARGBCOLORTABLEROW_X86
5275 // Transform ARGB pixels with color table.
5276 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
5277 int width) {
5278 uintptr_t pixel_temp = 0u;
5279 asm volatile (
5280 // 1 pixel loop.
5281 LABELALIGN
5282 "1: \n"
5283 "movzb " MEMACCESS(0) ",%1 \n"
5284 "lea " MEMLEA(0x4,0) ",%0 \n"
5285 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
5286 "mov %b1," MEMACCESS2(-0x4,0) " \n"
5287 "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
5288 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
5289 "mov %b1," MEMACCESS2(-0x3,0) " \n"
5290 "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
5291 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
5292 "mov %b1," MEMACCESS2(-0x2,0) " \n"
5293 "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
5294 MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
5295 "mov %b1," MEMACCESS2(-0x1,0) " \n"
5296 "dec %2 \n"
5297 "jg 1b \n"
5298 : "+r"(dst_argb), // %0
5299 "+d"(pixel_temp), // %1
5300 "+r"(width) // %2
5301 : "r"(table_argb) // %3
5302 : "memory", "cc");
5303 }
5304 #endif // HAS_ARGBCOLORTABLEROW_X86
5305
5306 #ifdef HAS_RGBCOLORTABLEROW_X86
5307 // Transform RGB pixels with color table.
5308 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
5309 uintptr_t pixel_temp = 0u;
5310 asm volatile (
5311 // 1 pixel loop.
5312 LABELALIGN
5313 "1: \n"
5314 "movzb " MEMACCESS(0) ",%1 \n"
5315 "lea " MEMLEA(0x4,0) ",%0 \n"
5316 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
5317 "mov %b1," MEMACCESS2(-0x4,0) " \n"
5318 "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
5319 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
5320 "mov %b1," MEMACCESS2(-0x3,0) " \n"
5321 "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
5322 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
5323 "mov %b1," MEMACCESS2(-0x2,0) " \n"
5324 "dec %2 \n"
5325 "jg 1b \n"
5326 : "+r"(dst_argb), // %0
5327 "+d"(pixel_temp), // %1
5328 "+r"(width) // %2
5329 : "r"(table_argb) // %3
5330 : "memory", "cc");
5331 }
5332 #endif // HAS_RGBCOLORTABLEROW_X86
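Both table rows replace each channel in place through an interleaved table with 256 entries per channel; the RGB variant simply skips the alpha byte. A scalar sketch of the ARGB case (hypothetical helper):

static void ARGBColorTableRow_C_sketch(uint8* dst_argb,
                                       const uint8* table_argb, int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];
    dst_argb += 4;
  }
}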
5333
5334 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
5335 // Transform ARGB pixels with luma table.
5336 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5337 int width,
5338 const uint8* luma, uint32 lumacoeff) {
5339 uintptr_t pixel_temp = 0u;
5340 uintptr_t table_temp = 0u;
5341 asm volatile (
5342 "movd %6,%%xmm3 \n"
5343 "pshufd $0x0,%%xmm3,%%xmm3 \n"
5344 "pcmpeqb %%xmm4,%%xmm4 \n"
5345 "psllw $0x8,%%xmm4 \n"
5346 "pxor %%xmm5,%%xmm5 \n"
5347
5348 // 4 pixel loop.
5349 LABELALIGN
5350 "1: \n"
5351 "movdqu " MEMACCESS(2) ",%%xmm0 \n"
5352 "pmaddubsw %%xmm3,%%xmm0 \n"
5353 "phaddw %%xmm0,%%xmm0 \n"
5354 "pand %%xmm4,%%xmm0 \n"
5355 "punpcklwd %%xmm5,%%xmm0 \n"
5356 "movd %%xmm0,%k1 \n" // 32 bit offset
5357 "add %5,%1 \n"
5358 "pshufd $0x39,%%xmm0,%%xmm0 \n"
5359
5360 "movzb " MEMACCESS(2) ",%0 \n"
5361 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5362 "mov %b0," MEMACCESS(3) " \n"
5363 "movzb " MEMACCESS2(0x1,2) ",%0 \n"
5364 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5365 "mov %b0," MEMACCESS2(0x1,3) " \n"
5366 "movzb " MEMACCESS2(0x2,2) ",%0 \n"
5367 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5368 "mov %b0," MEMACCESS2(0x2,3) " \n"
5369 "movzb " MEMACCESS2(0x3,2) ",%0 \n"
5370 "mov %b0," MEMACCESS2(0x3,3) " \n"
5371
5372 "movd %%xmm0,%k1 \n" // 32 bit offset
5373 "add %5,%1 \n"
5374 "pshufd $0x39,%%xmm0,%%xmm0 \n"
5375
5376 "movzb " MEMACCESS2(0x4,2) ",%0 \n"
5377 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5378 "mov %b0," MEMACCESS2(0x4,3) " \n"
5379 "movzb " MEMACCESS2(0x5,2) ",%0 \n"
5380 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5381 "mov %b0," MEMACCESS2(0x5,3) " \n"
5382 "movzb " MEMACCESS2(0x6,2) ",%0 \n"
5383 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5384 "mov %b0," MEMACCESS2(0x6,3) " \n"
5385 "movzb " MEMACCESS2(0x7,2) ",%0 \n"
5386 "mov %b0," MEMACCESS2(0x7,3) " \n"
5387
5388 "movd %%xmm0,%k1 \n" // 32 bit offset
5389 "add %5,%1 \n"
5390 "pshufd $0x39,%%xmm0,%%xmm0 \n"
5391
5392 "movzb " MEMACCESS2(0x8,2) ",%0 \n"
5393 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5394 "mov %b0," MEMACCESS2(0x8,3) " \n"
5395 "movzb " MEMACCESS2(0x9,2) ",%0 \n"
5396 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5397 "mov %b0," MEMACCESS2(0x9,3) " \n"
5398 "movzb " MEMACCESS2(0xa,2) ",%0 \n"
5399 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5400 "mov %b0," MEMACCESS2(0xa,3) " \n"
5401 "movzb " MEMACCESS2(0xb,2) ",%0 \n"
5402 "mov %b0," MEMACCESS2(0xb,3) " \n"
5403
5404 "movd %%xmm0,%k1 \n" // 32 bit offset
5405 "add %5,%1 \n"
5406
5407 "movzb " MEMACCESS2(0xc,2) ",%0 \n"
5408 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5409 "mov %b0," MEMACCESS2(0xc,3) " \n"
5410 "movzb " MEMACCESS2(0xd,2) ",%0 \n"
5411 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5412 "mov %b0," MEMACCESS2(0xd,3) " \n"
5413 "movzb " MEMACCESS2(0xe,2) ",%0 \n"
5414 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
5415 "mov %b0," MEMACCESS2(0xe,3) " \n"
5416 "movzb " MEMACCESS2(0xf,2) ",%0 \n"
5417 "mov %b0," MEMACCESS2(0xf,3) " \n"
5418 "lea " MEMLEA(0x10,2) ",%2 \n"
5419 "lea " MEMLEA(0x10,3) ",%3 \n"
5420 "sub $0x4,%4 \n"
5421 "jg 1b \n"
5422 : "+d"(pixel_temp), // %0
5423 "+a"(table_temp), // %1
5424 "+r"(src_argb), // %2
5425 "+r"(dst_argb), // %3
5426 "+rm"(width) // %4
5427 : "r"(luma), // %5
5428 "rm"(lumacoeff) // %6
5429 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
5430 );
5431 }
5432 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
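The luma table row computes a weighted luma per pixel (pmaddubsw/phaddw against lumacoeff), masks it to 0xff00 to select one 256-byte row of the luma table, then remaps B, G and R through that row while copying alpha. A scalar sketch (hypothetical helper; it assumes lumacoeff packs the B, G, R weights in its low three bytes):

static void ARGBLumaColorTableRow_C_sketch(const uint8* src_argb,
                                           uint8* dst_argb, int width,
                                           const uint8* luma,
                                           uint32 lumacoeff) {
  const uint32 bc = lumacoeff & 0xff;
  const uint32 gc = (lumacoeff >> 8) & 0xff;
  const uint32 rc = (lumacoeff >> 16) & 0xff;
  for (int i = 0; i < width; ++i) {
    // Mask mirrors the pand with 0xff00 word lanes above.
    const uint8* table =
        luma + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) &
                0xff00);
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha copied through
    src_argb += 4;
    dst_argb += 4;
  }
}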
5433
5434 #endif // defined(__x86_64__) || defined(__i386__)
5435
5436 #ifdef __cplusplus
5437 } // extern "C"
5438 } // namespace libyuv
5439 #endif