1 // VERSION 2 | |
2 /* | |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | |
4 * | |
5 * Use of this source code is governed by a BSD-style license | |
6 * that can be found in the LICENSE file in the root of the source | |
7 * tree. An additional intellectual property rights grant can be found | |
8 * in the file PATENTS. All contributing project authors may | |
9 * be found in the AUTHORS file in the root of the source tree. | |
10 */ | |
11 | |
12 #include "libyuv/row.h" | |
13 | |
14 #ifdef __cplusplus | |
15 namespace libyuv { | |
16 extern "C" { | |
17 #endif | |
18 | |
19 // This module is for GCC x86 and x64. | |
20 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) | |
21 | |
22 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) | |
23 | |
24 // Constants for ARGB | |
25 static vec8 kARGBToY = { | |
26 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 | |
27 }; | |
28 | |
// JPEG full range coefficients.
30 static vec8 kARGBToYJ = { | |
31 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 | |
32 }; | |
33 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) | |
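
// Illustrative scalar reference for the Y path (an addition for clarity, not
// code from the original file; the helper name is a placeholder). pmaddubsw
// applies kARGBToY to the B,G,R,A byte order of little-endian ARGB, so one
// pixel reduces to this 7-bit fixed-point sum plus the kAddY16 bias:
static __inline int RGBToYReference(int r, int g, int b) {
  return ((13 * b + 65 * g + 33 * r) >> 7) + 16;
}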
34 | |
35 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) | |
36 | |
37 static vec8 kARGBToU = { | |
38 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 | |
39 }; | |
40 | |
41 static vec8 kARGBToUJ = { | |
42 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 | |
43 }; | |
44 | |
45 static vec8 kARGBToV = { | |
46 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, | |
47 }; | |
48 | |
49 static vec8 kARGBToVJ = { | |
50 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 | |
51 }; | |
52 | |
53 // Constants for BGRA | |
54 static vec8 kBGRAToY = { | |
55 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 | |
56 }; | |
57 | |
58 static vec8 kBGRAToU = { | |
59 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 | |
60 }; | |
61 | |
62 static vec8 kBGRAToV = { | |
63 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 | |
64 }; | |
65 | |
66 // Constants for ABGR | |
67 static vec8 kABGRToY = { | |
68 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 | |
69 }; | |
70 | |
71 static vec8 kABGRToU = { | |
72 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 | |
73 }; | |
74 | |
75 static vec8 kABGRToV = { | |
76 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 | |
77 }; | |
78 | |
79 // Constants for RGBA. | |
80 static vec8 kRGBAToY = { | |
81 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 | |
82 }; | |
83 | |
84 static vec8 kRGBAToU = { | |
85 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 | |
86 }; | |
87 | |
88 static vec8 kRGBAToV = { | |
89 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 | |
90 }; | |
91 | |
92 static uvec8 kAddY16 = { | |
93 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u | |
94 }; | |
95 | |
// 64 = 0.5 in 7-bit fixed point, used to round the JPEG Y values.
97 static vec16 kAddYJ64 = { | |
98 64, 64, 64, 64, 64, 64, 64, 64 | |
99 }; | |
100 | |
101 static uvec8 kAddUV128 = { | |
102 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, | |
103 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u | |
104 }; | |
105 | |
106 static uvec16 kAddUVJ128 = { | |
107 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u | |
108 }; | |
109 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) | |
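
// Illustrative scalar reference for the U and V paths (an addition for
// clarity; names are placeholders, not original code). The SSSE3 rows apply
// kARGBToU/kARGBToV with pmaddubsw + phaddw, shift right by 8 (psraw, an
// arithmetic shift) and re-bias by 128 (kAddUV128), which per pixel is:
static __inline int RGBToUReference(int r, int g, int b) {
  return ((112 * b - 74 * g - 38 * r) >> 8) + 128;  // >> is arithmetic here.
}
static __inline int RGBToVReference(int r, int g, int b) {
  return ((112 * r - 94 * g - 18 * b) >> 8) + 128;
}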
110 | |
111 #ifdef HAS_RGB24TOARGBROW_SSSE3 | |
112 | |
113 // Shuffle table for converting RGB24 to ARGB. | |
114 static uvec8 kShuffleMaskRGB24ToARGB = { | |
115 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u | |
116 }; | |
117 | |
118 // Shuffle table for converting RAW to ARGB. | |
119 static uvec8 kShuffleMaskRAWToARGB = { | |
120 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u | |
121 }; | |
122 | |
123 // Shuffle table for converting ARGB to RGB24. | |
124 static uvec8 kShuffleMaskARGBToRGB24 = { | |
125 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u | |
126 }; | |
127 | |
128 // Shuffle table for converting ARGB to RAW. | |
129 static uvec8 kShuffleMaskARGBToRAW = { | |
130 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u | |
131 }; | |
132 | |
133 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 | |
134 static uvec8 kShuffleMaskARGBToRGB24_0 = { | |
135 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u | |
136 }; | |
137 | |
// Shuffle table for converting ARGBToRAW for I422ToRAW. First 8 + next 4
139 static uvec8 kShuffleMaskARGBToRAW_0 = { | |
140 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u | |
141 }; | |
142 #endif // HAS_RGB24TOARGBROW_SSSE3 | |
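
// How the pshufb tables above are used (an illustrative note and sketch, not
// part of the original file): each mask byte selects the source byte for that
// destination lane, and a byte with the high bit set (the tables use 128)
// clears the lane. Scalar equivalent of one 16-byte pshufb:
static __inline void PShufBReference(const uint8* src, const uint8* mask,
                                     uint8* dst) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0u : src[mask[i] & 0x0f];
  }
}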
143 | |
144 #if defined(TESTING) && defined(__x86_64__) | |
145 void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | |
146 asm volatile ( | |
147 ".p2align 5 \n" | |
148 "mov %%eax,%%eax \n" | |
149 "mov %%ebx,%%ebx \n" | |
150 "mov %%ecx,%%ecx \n" | |
151 "mov %%edx,%%edx \n" | |
152 "mov %%esi,%%esi \n" | |
153 "mov %%edi,%%edi \n" | |
154 "mov %%ebp,%%ebp \n" | |
155 "mov %%esp,%%esp \n" | |
156 ".p2align 5 \n" | |
157 "mov %%r8d,%%r8d \n" | |
158 "mov %%r9d,%%r9d \n" | |
159 "mov %%r10d,%%r10d \n" | |
160 "mov %%r11d,%%r11d \n" | |
161 "mov %%r12d,%%r12d \n" | |
162 "mov %%r13d,%%r13d \n" | |
163 "mov %%r14d,%%r14d \n" | |
164 "mov %%r15d,%%r15d \n" | |
165 ".p2align 5 \n" | |
166 "lea (%%rax),%%eax \n" | |
167 "lea (%%rbx),%%ebx \n" | |
168 "lea (%%rcx),%%ecx \n" | |
169 "lea (%%rdx),%%edx \n" | |
170 "lea (%%rsi),%%esi \n" | |
171 "lea (%%rdi),%%edi \n" | |
172 "lea (%%rbp),%%ebp \n" | |
173 "lea (%%rsp),%%esp \n" | |
174 ".p2align 5 \n" | |
175 "lea (%%r8),%%r8d \n" | |
176 "lea (%%r9),%%r9d \n" | |
177 "lea (%%r10),%%r10d \n" | |
178 "lea (%%r11),%%r11d \n" | |
179 "lea (%%r12),%%r12d \n" | |
180 "lea (%%r13),%%r13d \n" | |
181 "lea (%%r14),%%r14d \n" | |
182 "lea (%%r15),%%r15d \n" | |
183 | |
184 ".p2align 5 \n" | |
185 "lea 0x10(%%rax),%%eax \n" | |
186 "lea 0x10(%%rbx),%%ebx \n" | |
187 "lea 0x10(%%rcx),%%ecx \n" | |
188 "lea 0x10(%%rdx),%%edx \n" | |
189 "lea 0x10(%%rsi),%%esi \n" | |
190 "lea 0x10(%%rdi),%%edi \n" | |
191 "lea 0x10(%%rbp),%%ebp \n" | |
192 "lea 0x10(%%rsp),%%esp \n" | |
193 ".p2align 5 \n" | |
194 "lea 0x10(%%r8),%%r8d \n" | |
195 "lea 0x10(%%r9),%%r9d \n" | |
196 "lea 0x10(%%r10),%%r10d \n" | |
197 "lea 0x10(%%r11),%%r11d \n" | |
198 "lea 0x10(%%r12),%%r12d \n" | |
199 "lea 0x10(%%r13),%%r13d \n" | |
200 "lea 0x10(%%r14),%%r14d \n" | |
201 "lea 0x10(%%r15),%%r15d \n" | |
202 | |
203 ".p2align 5 \n" | |
204 "add 0x10,%%eax \n" | |
205 "add 0x10,%%ebx \n" | |
206 "add 0x10,%%ecx \n" | |
207 "add 0x10,%%edx \n" | |
208 "add 0x10,%%esi \n" | |
209 "add 0x10,%%edi \n" | |
210 "add 0x10,%%ebp \n" | |
211 "add 0x10,%%esp \n" | |
212 ".p2align 5 \n" | |
213 "add 0x10,%%r8d \n" | |
214 "add 0x10,%%r9d \n" | |
215 "add 0x10,%%r10d \n" | |
216 "add 0x10,%%r11d \n" | |
217 "add 0x10,%%r12d \n" | |
218 "add 0x10,%%r13d \n" | |
219 "add 0x10,%%r14d \n" | |
220 "add 0x10,%%r15d \n" | |
221 | |
222 ".p2align 2 \n" | |
223 "1: \n" | |
224 "movq " MEMACCESS(0) ",%%xmm0 \n" | |
225 "lea " MEMLEA(0x8,0) ",%0 \n" | |
226 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
227 "lea " MEMLEA(0x20,1) ",%1 \n" | |
228 "sub $0x8,%2 \n" | |
229 "jg 1b \n" | |
230 : "+r"(src_y), // %0 | |
231 "+r"(dst_argb), // %1 | |
232 "+r"(pix) // %2 | |
233 : | |
234 : "memory", "cc", "xmm0", "xmm1", "xmm5" | |
235 ); | |
236 } | |
237 #endif // TESTING | |
238 | |
239 #ifdef HAS_I400TOARGBROW_SSE2 | |
240 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | |
241 asm volatile ( | |
242 "pcmpeqb %%xmm5,%%xmm5 \n" | |
243 "pslld $0x18,%%xmm5 \n" | |
244 LABELALIGN | |
245 "1: \n" | |
246 "movq " MEMACCESS(0) ",%%xmm0 \n" | |
247 "lea " MEMLEA(0x8,0) ",%0 \n" | |
248 "punpcklbw %%xmm0,%%xmm0 \n" | |
249 "movdqa %%xmm0,%%xmm1 \n" | |
250 "punpcklwd %%xmm0,%%xmm0 \n" | |
251 "punpckhwd %%xmm1,%%xmm1 \n" | |
252 "por %%xmm5,%%xmm0 \n" | |
253 "por %%xmm5,%%xmm1 \n" | |
254 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
255 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | |
256 "lea " MEMLEA(0x20,1) ",%1 \n" | |
257 "sub $0x8,%2 \n" | |
258 "jg 1b \n" | |
259 : "+r"(src_y), // %0 | |
260 "+r"(dst_argb), // %1 | |
261 "+r"(pix) // %2 | |
262 :: "memory", "cc", "xmm0", "xmm1", "xmm5" | |
263 ); | |
264 } | |
265 #endif // HAS_I400TOARGBROW_SSE2 | |
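
// Scalar outline of I400ToARGBRow_SSE2 above (an illustrative sketch; the
// helper name is a placeholder): each Y byte is replicated into B, G and R,
// and alpha is forced to 0xff by the mask built in xmm5.
static __inline void I400ToARGBReference(const uint8* src_y, uint8* dst_argb,
                                         int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_argb[0] = dst_argb[1] = dst_argb[2] = src_y[x];
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}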
266 | |
267 #ifdef HAS_RGB24TOARGBROW_SSSE3 | |
268 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { | |
269 asm volatile ( | |
270 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 | |
271 "pslld $0x18,%%xmm5 \n" | |
272 "movdqa %3,%%xmm4 \n" | |
273 LABELALIGN | |
274 "1: \n" | |
275 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
276 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
277 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" | |
278 "lea " MEMLEA(0x30,0) ",%0 \n" | |
279 "movdqa %%xmm3,%%xmm2 \n" | |
280 "palignr $0x8,%%xmm1,%%xmm2 \n" | |
281 "pshufb %%xmm4,%%xmm2 \n" | |
282 "por %%xmm5,%%xmm2 \n" | |
283 "palignr $0xc,%%xmm0,%%xmm1 \n" | |
284 "pshufb %%xmm4,%%xmm0 \n" | |
285 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" | |
286 "por %%xmm5,%%xmm0 \n" | |
287 "pshufb %%xmm4,%%xmm1 \n" | |
288 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
289 "por %%xmm5,%%xmm1 \n" | |
290 "palignr $0x4,%%xmm3,%%xmm3 \n" | |
291 "pshufb %%xmm4,%%xmm3 \n" | |
292 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | |
293 "por %%xmm5,%%xmm3 \n" | |
294 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" | |
295 "lea " MEMLEA(0x40,1) ",%1 \n" | |
296 "sub $0x10,%2 \n" | |
297 "jg 1b \n" | |
298 : "+r"(src_rgb24), // %0 | |
299 "+r"(dst_argb), // %1 | |
300 "+r"(pix) // %2 | |
301 : "m"(kShuffleMaskRGB24ToARGB) // %3 | |
302 : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
303 ); | |
304 } | |
305 | |
306 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { | |
307 asm volatile ( | |
308 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 | |
309 "pslld $0x18,%%xmm5 \n" | |
310 "movdqa %3,%%xmm4 \n" | |
311 LABELALIGN | |
312 "1: \n" | |
313 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
314 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
315 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" | |
316 "lea " MEMLEA(0x30,0) ",%0 \n" | |
317 "movdqa %%xmm3,%%xmm2 \n" | |
318 "palignr $0x8,%%xmm1,%%xmm2 \n" | |
319 "pshufb %%xmm4,%%xmm2 \n" | |
320 "por %%xmm5,%%xmm2 \n" | |
321 "palignr $0xc,%%xmm0,%%xmm1 \n" | |
322 "pshufb %%xmm4,%%xmm0 \n" | |
323 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" | |
324 "por %%xmm5,%%xmm0 \n" | |
325 "pshufb %%xmm4,%%xmm1 \n" | |
326 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
327 "por %%xmm5,%%xmm1 \n" | |
328 "palignr $0x4,%%xmm3,%%xmm3 \n" | |
329 "pshufb %%xmm4,%%xmm3 \n" | |
330 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | |
331 "por %%xmm5,%%xmm3 \n" | |
332 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" | |
333 "lea " MEMLEA(0x40,1) ",%1 \n" | |
334 "sub $0x10,%2 \n" | |
335 "jg 1b \n" | |
336 : "+r"(src_raw), // %0 | |
337 "+r"(dst_argb), // %1 | |
338 "+r"(pix) // %2 | |
339 : "m"(kShuffleMaskRAWToARGB) // %3 | |
340 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
341 ); | |
342 } | |
343 | |
344 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { | |
345 asm volatile ( | |
346 "mov $0x1080108,%%eax \n" | |
347 "movd %%eax,%%xmm5 \n" | |
348 "pshufd $0x0,%%xmm5,%%xmm5 \n" | |
349 "mov $0x20802080,%%eax \n" | |
350 "movd %%eax,%%xmm6 \n" | |
351 "pshufd $0x0,%%xmm6,%%xmm6 \n" | |
352 "pcmpeqb %%xmm3,%%xmm3 \n" | |
353 "psllw $0xb,%%xmm3 \n" | |
354 "pcmpeqb %%xmm4,%%xmm4 \n" | |
355 "psllw $0xa,%%xmm4 \n" | |
356 "psrlw $0x5,%%xmm4 \n" | |
357 "pcmpeqb %%xmm7,%%xmm7 \n" | |
358 "psllw $0x8,%%xmm7 \n" | |
359 "sub %0,%1 \n" | |
360 "sub %0,%1 \n" | |
361 LABELALIGN | |
362 "1: \n" | |
363 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
364 "movdqa %%xmm0,%%xmm1 \n" | |
365 "movdqa %%xmm0,%%xmm2 \n" | |
366 "pand %%xmm3,%%xmm1 \n" | |
367 "psllw $0xb,%%xmm2 \n" | |
368 "pmulhuw %%xmm5,%%xmm1 \n" | |
369 "pmulhuw %%xmm5,%%xmm2 \n" | |
370 "psllw $0x8,%%xmm1 \n" | |
371 "por %%xmm2,%%xmm1 \n" | |
372 "pand %%xmm4,%%xmm0 \n" | |
373 "pmulhuw %%xmm6,%%xmm0 \n" | |
374 "por %%xmm7,%%xmm0 \n" | |
375 "movdqa %%xmm1,%%xmm2 \n" | |
376 "punpcklbw %%xmm0,%%xmm1 \n" | |
377 "punpckhbw %%xmm0,%%xmm2 \n" | |
378 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) | |
379 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) | |
380 "lea " MEMLEA(0x10,0) ",%0 \n" | |
381 "sub $0x8,%2 \n" | |
382 "jg 1b \n" | |
383 : "+r"(src), // %0 | |
384 "+r"(dst), // %1 | |
385 "+r"(pix) // %2 | |
386 : | |
387 : "memory", "cc", "eax", NACL_R14 | |
388 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
389 ); | |
390 } | |
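
// Bit-exact scalar model of the RGB565 expansion above (an illustrative
// sketch, not original code): the 5- and 6-bit fields are widened to 8 bits
// by replicating their top bits, which is what the pmulhuw multiplies by the
// 0x0108 and 0x2080 constants achieve.
static __inline void RGB565ToARGBReference(uint16 rgb565, uint8* dst_argb) {
  int b = rgb565 & 0x1f;
  int g = (rgb565 >> 5) & 0x3f;
  int r = (rgb565 >> 11) & 0x1f;
  dst_argb[0] = (uint8)((b << 3) | (b >> 2));
  dst_argb[1] = (uint8)((g << 2) | (g >> 4));
  dst_argb[2] = (uint8)((r << 3) | (r >> 2));
  dst_argb[3] = 255u;
}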
391 | |
392 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { | |
393 asm volatile ( | |
394 "mov $0x1080108,%%eax \n" | |
395 "movd %%eax,%%xmm5 \n" | |
396 "pshufd $0x0,%%xmm5,%%xmm5 \n" | |
397 "mov $0x42004200,%%eax \n" | |
398 "movd %%eax,%%xmm6 \n" | |
399 "pshufd $0x0,%%xmm6,%%xmm6 \n" | |
400 "pcmpeqb %%xmm3,%%xmm3 \n" | |
401 "psllw $0xb,%%xmm3 \n" | |
402 "movdqa %%xmm3,%%xmm4 \n" | |
403 "psrlw $0x6,%%xmm4 \n" | |
404 "pcmpeqb %%xmm7,%%xmm7 \n" | |
405 "psllw $0x8,%%xmm7 \n" | |
406 "sub %0,%1 \n" | |
407 "sub %0,%1 \n" | |
408 LABELALIGN | |
409 "1: \n" | |
410 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
411 "movdqa %%xmm0,%%xmm1 \n" | |
412 "movdqa %%xmm0,%%xmm2 \n" | |
413 "psllw $0x1,%%xmm1 \n" | |
414 "psllw $0xb,%%xmm2 \n" | |
415 "pand %%xmm3,%%xmm1 \n" | |
416 "pmulhuw %%xmm5,%%xmm2 \n" | |
417 "pmulhuw %%xmm5,%%xmm1 \n" | |
418 "psllw $0x8,%%xmm1 \n" | |
419 "por %%xmm2,%%xmm1 \n" | |
420 "movdqa %%xmm0,%%xmm2 \n" | |
421 "pand %%xmm4,%%xmm0 \n" | |
422 "psraw $0x8,%%xmm2 \n" | |
423 "pmulhuw %%xmm6,%%xmm0 \n" | |
424 "pand %%xmm7,%%xmm2 \n" | |
425 "por %%xmm2,%%xmm0 \n" | |
426 "movdqa %%xmm1,%%xmm2 \n" | |
427 "punpcklbw %%xmm0,%%xmm1 \n" | |
428 "punpckhbw %%xmm0,%%xmm2 \n" | |
429 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) | |
430 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) | |
431 "lea " MEMLEA(0x10,0) ",%0 \n" | |
432 "sub $0x8,%2 \n" | |
433 "jg 1b \n" | |
434 : "+r"(src), // %0 | |
435 "+r"(dst), // %1 | |
436 "+r"(pix) // %2 | |
437 : | |
438 : "memory", "cc", "eax", NACL_R14 | |
439 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
440 ); | |
441 } | |
442 | |
443 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { | |
444 asm volatile ( | |
445 "mov $0xf0f0f0f,%%eax \n" | |
446 "movd %%eax,%%xmm4 \n" | |
447 "pshufd $0x0,%%xmm4,%%xmm4 \n" | |
448 "movdqa %%xmm4,%%xmm5 \n" | |
449 "pslld $0x4,%%xmm5 \n" | |
450 "sub %0,%1 \n" | |
451 "sub %0,%1 \n" | |
452 LABELALIGN | |
453 "1: \n" | |
454 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
455 "movdqa %%xmm0,%%xmm2 \n" | |
456 "pand %%xmm4,%%xmm0 \n" | |
457 "pand %%xmm5,%%xmm2 \n" | |
458 "movdqa %%xmm0,%%xmm1 \n" | |
459 "movdqa %%xmm2,%%xmm3 \n" | |
460 "psllw $0x4,%%xmm1 \n" | |
461 "psrlw $0x4,%%xmm3 \n" | |
462 "por %%xmm1,%%xmm0 \n" | |
463 "por %%xmm3,%%xmm2 \n" | |
464 "movdqa %%xmm0,%%xmm1 \n" | |
465 "punpcklbw %%xmm2,%%xmm0 \n" | |
466 "punpckhbw %%xmm2,%%xmm1 \n" | |
467 MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) | |
468 MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) | |
469 "lea " MEMLEA(0x10,0) ",%0 \n" | |
470 "sub $0x8,%2 \n" | |
471 "jg 1b \n" | |
472 : "+r"(src), // %0 | |
473 "+r"(dst), // %1 | |
474 "+r"(pix) // %2 | |
475 : | |
476 : "memory", "cc", "eax", NACL_R14 | |
477 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
478 ); | |
479 } | |
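
// Scalar model of the 4444 widening above (illustrative; not original code):
// each 4-bit channel becomes 8 bits by repeating the nibble, e.g. 0x7 -> 0x77.
static __inline uint8 Expand4To8Reference(int nibble) {
  return (uint8)((nibble << 4) | nibble);
}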
480 | |
481 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { | |
482 asm volatile ( | |
483 "movdqa %3,%%xmm6 \n" | |
484 LABELALIGN | |
485 "1: \n" | |
486 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
487 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
488 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
489 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | |
490 "lea " MEMLEA(0x40,0) ",%0 \n" | |
491 "pshufb %%xmm6,%%xmm0 \n" | |
492 "pshufb %%xmm6,%%xmm1 \n" | |
493 "pshufb %%xmm6,%%xmm2 \n" | |
494 "pshufb %%xmm6,%%xmm3 \n" | |
495 "movdqa %%xmm1,%%xmm4 \n" | |
496 "psrldq $0x4,%%xmm1 \n" | |
497 "pslldq $0xc,%%xmm4 \n" | |
498 "movdqa %%xmm2,%%xmm5 \n" | |
499 "por %%xmm4,%%xmm0 \n" | |
500 "pslldq $0x8,%%xmm5 \n" | |
501 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
502 "por %%xmm5,%%xmm1 \n" | |
503 "psrldq $0x8,%%xmm2 \n" | |
504 "pslldq $0x4,%%xmm3 \n" | |
505 "por %%xmm3,%%xmm2 \n" | |
506 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | |
507 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" | |
508 "lea " MEMLEA(0x30,1) ",%1 \n" | |
509 "sub $0x10,%2 \n" | |
510 "jg 1b \n" | |
511 : "+r"(src), // %0 | |
512 "+r"(dst), // %1 | |
513 "+r"(pix) // %2 | |
514 : "m"(kShuffleMaskARGBToRGB24) // %3 | |
515 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | |
516 ); | |
517 } | |
518 | |
519 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { | |
520 asm volatile ( | |
521 "movdqa %3,%%xmm6 \n" | |
522 LABELALIGN | |
523 "1: \n" | |
524 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
525 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
526 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
527 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | |
528 "lea " MEMLEA(0x40,0) ",%0 \n" | |
529 "pshufb %%xmm6,%%xmm0 \n" | |
530 "pshufb %%xmm6,%%xmm1 \n" | |
531 "pshufb %%xmm6,%%xmm2 \n" | |
532 "pshufb %%xmm6,%%xmm3 \n" | |
533 "movdqa %%xmm1,%%xmm4 \n" | |
534 "psrldq $0x4,%%xmm1 \n" | |
535 "pslldq $0xc,%%xmm4 \n" | |
536 "movdqa %%xmm2,%%xmm5 \n" | |
537 "por %%xmm4,%%xmm0 \n" | |
538 "pslldq $0x8,%%xmm5 \n" | |
539 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
540 "por %%xmm5,%%xmm1 \n" | |
541 "psrldq $0x8,%%xmm2 \n" | |
542 "pslldq $0x4,%%xmm3 \n" | |
543 "por %%xmm3,%%xmm2 \n" | |
544 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | |
545 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" | |
546 "lea " MEMLEA(0x30,1) ",%1 \n" | |
547 "sub $0x10,%2 \n" | |
548 "jg 1b \n" | |
549 : "+r"(src), // %0 | |
550 "+r"(dst), // %1 | |
551 "+r"(pix) // %2 | |
552 : "m"(kShuffleMaskARGBToRAW) // %3 | |
553 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | |
554 ); | |
555 } | |
556 | |
557 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { | |
558 asm volatile ( | |
559 "pcmpeqb %%xmm3,%%xmm3 \n" | |
560 "psrld $0x1b,%%xmm3 \n" | |
561 "pcmpeqb %%xmm4,%%xmm4 \n" | |
562 "psrld $0x1a,%%xmm4 \n" | |
563 "pslld $0x5,%%xmm4 \n" | |
564 "pcmpeqb %%xmm5,%%xmm5 \n" | |
565 "pslld $0xb,%%xmm5 \n" | |
566 LABELALIGN | |
567 "1: \n" | |
568 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
569 "movdqa %%xmm0,%%xmm1 \n" | |
570 "movdqa %%xmm0,%%xmm2 \n" | |
571 "pslld $0x8,%%xmm0 \n" | |
572 "psrld $0x3,%%xmm1 \n" | |
573 "psrld $0x5,%%xmm2 \n" | |
574 "psrad $0x10,%%xmm0 \n" | |
575 "pand %%xmm3,%%xmm1 \n" | |
576 "pand %%xmm4,%%xmm2 \n" | |
577 "pand %%xmm5,%%xmm0 \n" | |
578 "por %%xmm2,%%xmm1 \n" | |
579 "por %%xmm1,%%xmm0 \n" | |
580 "packssdw %%xmm0,%%xmm0 \n" | |
581 "lea " MEMLEA(0x10,0) ",%0 \n" | |
582 "movq %%xmm0," MEMACCESS(1) " \n" | |
583 "lea " MEMLEA(0x8,1) ",%1 \n" | |
584 "sub $0x4,%2 \n" | |
585 "jg 1b \n" | |
586 : "+r"(src), // %0 | |
587 "+r"(dst), // %1 | |
588 "+r"(pix) // %2 | |
589 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
590 ); | |
591 } | |
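
// Scalar equivalent of the RGB565 packing above (an illustrative sketch; the
// helper name is a placeholder). Input is one ARGB pixel in memory order
// B, G, R, A:
static __inline uint16 ARGBToRGB565Reference(const uint8* src_argb) {
  return (uint16)((src_argb[0] >> 3) |
                  ((src_argb[1] >> 2) << 5) |
                  ((src_argb[2] >> 3) << 11));
}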
592 | |
593 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { | |
594 asm volatile ( | |
595 "pcmpeqb %%xmm4,%%xmm4 \n" | |
596 "psrld $0x1b,%%xmm4 \n" | |
597 "movdqa %%xmm4,%%xmm5 \n" | |
598 "pslld $0x5,%%xmm5 \n" | |
599 "movdqa %%xmm4,%%xmm6 \n" | |
600 "pslld $0xa,%%xmm6 \n" | |
601 "pcmpeqb %%xmm7,%%xmm7 \n" | |
602 "pslld $0xf,%%xmm7 \n" | |
603 LABELALIGN | |
604 "1: \n" | |
605 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
606 "movdqa %%xmm0,%%xmm1 \n" | |
607 "movdqa %%xmm0,%%xmm2 \n" | |
608 "movdqa %%xmm0,%%xmm3 \n" | |
609 "psrad $0x10,%%xmm0 \n" | |
610 "psrld $0x3,%%xmm1 \n" | |
611 "psrld $0x6,%%xmm2 \n" | |
612 "psrld $0x9,%%xmm3 \n" | |
613 "pand %%xmm7,%%xmm0 \n" | |
614 "pand %%xmm4,%%xmm1 \n" | |
615 "pand %%xmm5,%%xmm2 \n" | |
616 "pand %%xmm6,%%xmm3 \n" | |
617 "por %%xmm1,%%xmm0 \n" | |
618 "por %%xmm3,%%xmm2 \n" | |
619 "por %%xmm2,%%xmm0 \n" | |
620 "packssdw %%xmm0,%%xmm0 \n" | |
621 "lea " MEMLEA(0x10,0) ",%0 \n" | |
622 "movq %%xmm0," MEMACCESS(1) " \n" | |
623 "lea " MEMLEA(0x8,1) ",%1 \n" | |
624 "sub $0x4,%2 \n" | |
625 "jg 1b \n" | |
626 : "+r"(src), // %0 | |
627 "+r"(dst), // %1 | |
628 "+r"(pix) // %2 | |
629 :: "memory", "cc", | |
630 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
631 ); | |
632 } | |
633 | |
634 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { | |
635 asm volatile ( | |
636 "pcmpeqb %%xmm4,%%xmm4 \n" | |
637 "psllw $0xc,%%xmm4 \n" | |
638 "movdqa %%xmm4,%%xmm3 \n" | |
639 "psrlw $0x8,%%xmm3 \n" | |
640 LABELALIGN | |
641 "1: \n" | |
642 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
643 "movdqa %%xmm0,%%xmm1 \n" | |
644 "pand %%xmm3,%%xmm0 \n" | |
645 "pand %%xmm4,%%xmm1 \n" | |
646 "psrlq $0x4,%%xmm0 \n" | |
647 "psrlq $0x8,%%xmm1 \n" | |
648 "por %%xmm1,%%xmm0 \n" | |
649 "packuswb %%xmm0,%%xmm0 \n" | |
650 "lea " MEMLEA(0x10,0) ",%0 \n" | |
651 "movq %%xmm0," MEMACCESS(1) " \n" | |
652 "lea " MEMLEA(0x8,1) ",%1 \n" | |
653 "sub $0x4,%2 \n" | |
654 "jg 1b \n" | |
655 : "+r"(src), // %0 | |
656 "+r"(dst), // %1 | |
657 "+r"(pix) // %2 | |
658 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | |
659 ); | |
660 } | |
661 #endif // HAS_RGB24TOARGBROW_SSSE3 | |
662 | |
663 #ifdef HAS_ARGBTOYROW_SSSE3 | |
664 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. | |
665 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | |
666 asm volatile ( | |
667 "movdqa %3,%%xmm4 \n" | |
668 "movdqa %4,%%xmm5 \n" | |
669 LABELALIGN | |
670 "1: \n" | |
671 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
672 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
673 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
674 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | |
675 "pmaddubsw %%xmm4,%%xmm0 \n" | |
676 "pmaddubsw %%xmm4,%%xmm1 \n" | |
677 "pmaddubsw %%xmm4,%%xmm2 \n" | |
678 "pmaddubsw %%xmm4,%%xmm3 \n" | |
679 "lea " MEMLEA(0x40,0) ",%0 \n" | |
680 "phaddw %%xmm1,%%xmm0 \n" | |
681 "phaddw %%xmm3,%%xmm2 \n" | |
682 "psrlw $0x7,%%xmm0 \n" | |
683 "psrlw $0x7,%%xmm2 \n" | |
684 "packuswb %%xmm2,%%xmm0 \n" | |
685 "paddb %%xmm5,%%xmm0 \n" | |
686 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
687 "lea " MEMLEA(0x10,1) ",%1 \n" | |
688 "sub $0x10,%2 \n" | |
689 "jg 1b \n" | |
690 : "+r"(src_argb), // %0 | |
691 "+r"(dst_y), // %1 | |
692 "+r"(pix) // %2 | |
693 : "m"(kARGBToY), // %3 | |
694 "m"(kAddY16) // %4 | |
695 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
696 ); | |
697 } | |
698 #endif // HAS_ARGBTOYROW_SSSE3 | |
699 | |
700 #ifdef HAS_ARGBTOYJROW_SSSE3 | |
701 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. | |
// Same as ARGBToYRow but with different coefficients, no +16 bias, and rounding (+64 before the shift).
703 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { | |
704 asm volatile ( | |
705 "movdqa %3,%%xmm4 \n" | |
706 "movdqa %4,%%xmm5 \n" | |
707 LABELALIGN | |
708 "1: \n" | |
709 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
710 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
711 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
712 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | |
713 "pmaddubsw %%xmm4,%%xmm0 \n" | |
714 "pmaddubsw %%xmm4,%%xmm1 \n" | |
715 "pmaddubsw %%xmm4,%%xmm2 \n" | |
716 "pmaddubsw %%xmm4,%%xmm3 \n" | |
717 "lea " MEMLEA(0x40,0) ",%0 \n" | |
718 "phaddw %%xmm1,%%xmm0 \n" | |
719 "phaddw %%xmm3,%%xmm2 \n" | |
720 "paddw %%xmm5,%%xmm0 \n" | |
721 "paddw %%xmm5,%%xmm2 \n" | |
722 "psrlw $0x7,%%xmm0 \n" | |
723 "psrlw $0x7,%%xmm2 \n" | |
724 "packuswb %%xmm2,%%xmm0 \n" | |
725 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
726 "lea " MEMLEA(0x10,1) ",%1 \n" | |
727 "sub $0x10,%2 \n" | |
728 "jg 1b \n" | |
729 : "+r"(src_argb), // %0 | |
730 "+r"(dst_y), // %1 | |
731 "+r"(pix) // %2 | |
732 : "m"(kARGBToYJ), // %3 | |
733 "m"(kAddYJ64) // %4 | |
734 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
735 ); | |
736 } | |
737 #endif // HAS_ARGBTOYJROW_SSSE3 | |
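
// For contrast with ARGBToYRow, an illustrative scalar form of the JPEG-range
// Y computation above (a sketch, not original code): rounding via +64 before
// the shift and no +16 bias.
static __inline int RGBToYJReference(int r, int g, int b) {
  return (15 * b + 75 * g + 38 * r + 64) >> 7;
}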
738 | |
739 #ifdef HAS_ARGBTOYROW_AVX2 | |
// vpermd permutation to undo the lane interleaving caused by vphaddw + vpackuswb.
741 static const lvec32 kPermdARGBToY_AVX = { | |
742 0, 4, 1, 5, 2, 6, 3, 7 | |
743 }; | |
744 | |
745 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | |
746 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | |
747 asm volatile ( | |
748 "vbroadcastf128 %3,%%ymm4 \n" | |
749 "vbroadcastf128 %4,%%ymm5 \n" | |
750 "vmovdqu %5,%%ymm6 \n" | |
751 LABELALIGN | |
752 "1: \n" | |
753 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
754 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
755 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" | |
756 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" | |
757 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" | |
758 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" | |
759 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" | |
760 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" | |
761 "lea " MEMLEA(0x80,0) ",%0 \n" | |
762 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. | |
763 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" | |
764 "vpsrlw $0x7,%%ymm0,%%ymm0 \n" | |
765 "vpsrlw $0x7,%%ymm2,%%ymm2 \n" | |
766 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. | |
767 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. | |
768 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y | |
769 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
770 "lea " MEMLEA(0x20,1) ",%1 \n" | |
771 "sub $0x20,%2 \n" | |
772 "jg 1b \n" | |
773 "vzeroupper \n" | |
774 : "+r"(src_argb), // %0 | |
775 "+r"(dst_y), // %1 | |
776 "+r"(pix) // %2 | |
777 : "m"(kARGBToY), // %3 | |
778 "m"(kAddY16), // %4 | |
779 "m"(kPermdARGBToY_AVX) // %5 | |
780 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | |
781 ); | |
782 } | |
783 #endif // HAS_ARGBTOYROW_AVX2 | |
784 | |
785 #ifdef HAS_ARGBTOYJROW_AVX2 | |
786 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | |
787 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { | |
788 asm volatile ( | |
789 "vbroadcastf128 %3,%%ymm4 \n" | |
790 "vbroadcastf128 %4,%%ymm5 \n" | |
791 "vmovdqu %5,%%ymm6 \n" | |
792 LABELALIGN | |
793 "1: \n" | |
794 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
795 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
796 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" | |
797 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" | |
798 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" | |
799 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" | |
800 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" | |
801 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" | |
802 "lea " MEMLEA(0x80,0) ",%0 \n" | |
803 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. | |
804 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" | |
805 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. | |
806 "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" | |
807 "vpsrlw $0x7,%%ymm0,%%ymm0 \n" | |
808 "vpsrlw $0x7,%%ymm2,%%ymm2 \n" | |
809 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. | |
810 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. | |
811 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
812 "lea " MEMLEA(0x20,1) ",%1 \n" | |
813 "sub $0x20,%2 \n" | |
814 "jg 1b \n" | |
815 "vzeroupper \n" | |
816 : "+r"(src_argb), // %0 | |
817 "+r"(dst_y), // %1 | |
818 "+r"(pix) // %2 | |
819 : "m"(kARGBToYJ), // %3 | |
820 "m"(kAddYJ64), // %4 | |
821 "m"(kPermdARGBToY_AVX) // %5 | |
822 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | |
823 ); | |
824 } | |
825 #endif // HAS_ARGBTOYJROW_AVX2 | |
826 | |
827 #ifdef HAS_ARGBTOUVROW_SSSE3 | |
828 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | |
829 uint8* dst_u, uint8* dst_v, int width) { | |
830 asm volatile ( | |
831 "movdqa %5,%%xmm3 \n" | |
832 "movdqa %6,%%xmm4 \n" | |
833 "movdqa %7,%%xmm5 \n" | |
834 "sub %1,%2 \n" | |
835 LABELALIGN | |
836 "1: \n" | |
837 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
838 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | |
839 "pavgb %%xmm7,%%xmm0 \n" | |
840 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
841 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 | |
842 "pavgb %%xmm7,%%xmm1 \n" | |
843 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
844 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 | |
845 "pavgb %%xmm7,%%xmm2 \n" | |
846 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" | |
847 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 | |
848 "pavgb %%xmm7,%%xmm6 \n" | |
849 | |
850 "lea " MEMLEA(0x40,0) ",%0 \n" | |
851 "movdqa %%xmm0,%%xmm7 \n" | |
852 "shufps $0x88,%%xmm1,%%xmm0 \n" | |
853 "shufps $0xdd,%%xmm1,%%xmm7 \n" | |
854 "pavgb %%xmm7,%%xmm0 \n" | |
855 "movdqa %%xmm2,%%xmm7 \n" | |
856 "shufps $0x88,%%xmm6,%%xmm2 \n" | |
857 "shufps $0xdd,%%xmm6,%%xmm7 \n" | |
858 "pavgb %%xmm7,%%xmm2 \n" | |
859 "movdqa %%xmm0,%%xmm1 \n" | |
860 "movdqa %%xmm2,%%xmm6 \n" | |
861 "pmaddubsw %%xmm4,%%xmm0 \n" | |
862 "pmaddubsw %%xmm4,%%xmm2 \n" | |
863 "pmaddubsw %%xmm3,%%xmm1 \n" | |
864 "pmaddubsw %%xmm3,%%xmm6 \n" | |
865 "phaddw %%xmm2,%%xmm0 \n" | |
866 "phaddw %%xmm6,%%xmm1 \n" | |
867 "psraw $0x8,%%xmm0 \n" | |
868 "psraw $0x8,%%xmm1 \n" | |
869 "packsswb %%xmm1,%%xmm0 \n" | |
870 "paddb %%xmm5,%%xmm0 \n" | |
871 "movlps %%xmm0," MEMACCESS(1) " \n" | |
872 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) | |
873 "lea " MEMLEA(0x8,1) ",%1 \n" | |
874 "sub $0x10,%3 \n" | |
875 "jg 1b \n" | |
876 : "+r"(src_argb0), // %0 | |
877 "+r"(dst_u), // %1 | |
878 "+r"(dst_v), // %2 | |
879 "+rm"(width) // %3 | |
880 : "r"((intptr_t)(src_stride_argb)), // %4 | |
881 "m"(kARGBToV), // %5 | |
882 "m"(kARGBToU), // %6 | |
883 "m"(kAddUV128) // %7 | |
884 : "memory", "cc", NACL_R14 | |
885 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | |
886 ); | |
887 } | |
888 #endif // HAS_ARGBTOUVROW_SSSE3 | |
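
// Scalar picture of the 2x2 sampling that ARGBToUVRow_SSSE3 above (and the
// AVX2 version below) performs before applying kARGBToU/kARGBToV. This is an
// illustrative sketch, not original code: rows are averaged with pavgb, then
// horizontally adjacent pixels are averaged the same way.
static __inline uint8 AvgRoundReference(uint8 a, uint8 b) {
  return (uint8)((a + b + 1) >> 1);  // pavgb semantics.
}
static __inline void AverageARGB2x2Reference(const uint8* row0,
                                             const uint8* row1,
                                             uint8* dst_argb) {
  int i;
  for (i = 0; i < 4; ++i) {  // B, G, R, A channels of a 2x2 block.
    dst_argb[i] = AvgRoundReference(
        AvgRoundReference(row0[i], row1[i]),
        AvgRoundReference(row0[i + 4], row1[i + 4]));
  }
}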
889 | |
890 #ifdef HAS_ARGBTOUVROW_AVX2 | |
891 // vpshufb for vphaddw + vpackuswb packed to shorts. | |
892 static const lvec8 kShufARGBToUV_AVX = { | |
893 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, | |
894 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 | |
895 }; | |
896 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, | |
897 uint8* dst_u, uint8* dst_v, int width) { | |
898 asm volatile ( | |
899 "vbroadcastf128 %5,%%ymm5 \n" | |
900 "vbroadcastf128 %6,%%ymm6 \n" | |
901 "vbroadcastf128 %7,%%ymm7 \n" | |
902 "sub %1,%2 \n" | |
903 LABELALIGN | |
904 "1: \n" | |
905 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
906 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
907 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" | |
908 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" | |
909 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 | |
910 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) | |
911 VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) | |
912 VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) | |
913 "lea " MEMLEA(0x80,0) ",%0 \n" | |
914 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" | |
915 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" | |
916 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" | |
917 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" | |
918 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" | |
919 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" | |
920 | |
921 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" | |
922 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" | |
923 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" | |
924 "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" | |
925 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" | |
926 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" | |
927 "vpsraw $0x8,%%ymm1,%%ymm1 \n" | |
928 "vpsraw $0x8,%%ymm0,%%ymm0 \n" | |
929 "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" | |
930 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
931 "vpshufb %8,%%ymm0,%%ymm0 \n" | |
932 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" | |
933 | |
934 "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" | |
935 VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) | |
936 "lea " MEMLEA(0x10,1) ",%1 \n" | |
937 "sub $0x20,%3 \n" | |
938 "jg 1b \n" | |
939 "vzeroupper \n" | |
940 : "+r"(src_argb0), // %0 | |
941 "+r"(dst_u), // %1 | |
942 "+r"(dst_v), // %2 | |
943 "+rm"(width) // %3 | |
944 : "r"((intptr_t)(src_stride_argb)), // %4 | |
945 "m"(kAddUV128), // %5 | |
946 "m"(kARGBToV), // %6 | |
947 "m"(kARGBToU), // %7 | |
948 "m"(kShufARGBToUV_AVX) // %8 | |
949 : "memory", "cc", NACL_R14 | |
950 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
951 ); | |
952 } | |
953 #endif // HAS_ARGBTOUVROW_AVX2 | |
954 | |
955 #ifdef HAS_ARGBTOUVJROW_SSSE3 | |
956 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. | |
957 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | |
958 uint8* dst_u, uint8* dst_v, int width) { | |
959 asm volatile ( | |
960 "movdqa %5,%%xmm3 \n" | |
961 "movdqa %6,%%xmm4 \n" | |
962 "movdqa %7,%%xmm5 \n" | |
963 "sub %1,%2 \n" | |
964 LABELALIGN | |
965 "1: \n" | |
966 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
967 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | |
968 "pavgb %%xmm7,%%xmm0 \n" | |
969 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
970 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 | |
971 "pavgb %%xmm7,%%xmm1 \n" | |
972 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
973 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 | |
974 "pavgb %%xmm7,%%xmm2 \n" | |
975 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" | |
976 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 | |
977 "pavgb %%xmm7,%%xmm6 \n" | |
978 | |
979 "lea " MEMLEA(0x40,0) ",%0 \n" | |
980 "movdqa %%xmm0,%%xmm7 \n" | |
981 "shufps $0x88,%%xmm1,%%xmm0 \n" | |
982 "shufps $0xdd,%%xmm1,%%xmm7 \n" | |
983 "pavgb %%xmm7,%%xmm0 \n" | |
984 "movdqa %%xmm2,%%xmm7 \n" | |
985 "shufps $0x88,%%xmm6,%%xmm2 \n" | |
986 "shufps $0xdd,%%xmm6,%%xmm7 \n" | |
987 "pavgb %%xmm7,%%xmm2 \n" | |
988 "movdqa %%xmm0,%%xmm1 \n" | |
989 "movdqa %%xmm2,%%xmm6 \n" | |
990 "pmaddubsw %%xmm4,%%xmm0 \n" | |
991 "pmaddubsw %%xmm4,%%xmm2 \n" | |
992 "pmaddubsw %%xmm3,%%xmm1 \n" | |
993 "pmaddubsw %%xmm3,%%xmm6 \n" | |
994 "phaddw %%xmm2,%%xmm0 \n" | |
995 "phaddw %%xmm6,%%xmm1 \n" | |
996 "paddw %%xmm5,%%xmm0 \n" | |
997 "paddw %%xmm5,%%xmm1 \n" | |
998 "psraw $0x8,%%xmm0 \n" | |
999 "psraw $0x8,%%xmm1 \n" | |
1000 "packsswb %%xmm1,%%xmm0 \n" | |
1001 "movlps %%xmm0," MEMACCESS(1) " \n" | |
1002 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) | |
1003 "lea " MEMLEA(0x8,1) ",%1 \n" | |
1004 "sub $0x10,%3 \n" | |
1005 "jg 1b \n" | |
1006 : "+r"(src_argb0), // %0 | |
1007 "+r"(dst_u), // %1 | |
1008 "+r"(dst_v), // %2 | |
1009 "+rm"(width) // %3 | |
1010 : "r"((intptr_t)(src_stride_argb)), // %4 | |
1011 "m"(kARGBToVJ), // %5 | |
1012 "m"(kARGBToUJ), // %6 | |
1013 "m"(kAddUVJ128) // %7 | |
1014 : "memory", "cc", NACL_R14 | |
1015 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | |
1016 ); | |
1017 } | |
1018 #endif // HAS_ARGBTOUVJROW_SSSE3 | |
1019 | |
1020 #ifdef HAS_ARGBTOUV444ROW_SSSE3 | |
1021 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | |
1022 int width) { | |
1023 asm volatile ( | |
1024 "movdqa %4,%%xmm3 \n" | |
1025 "movdqa %5,%%xmm4 \n" | |
1026 "movdqa %6,%%xmm5 \n" | |
1027 "sub %1,%2 \n" | |
1028 LABELALIGN | |
1029 "1: \n" | |
1030 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
1031 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
1032 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
1033 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" | |
1034 "pmaddubsw %%xmm4,%%xmm0 \n" | |
1035 "pmaddubsw %%xmm4,%%xmm1 \n" | |
1036 "pmaddubsw %%xmm4,%%xmm2 \n" | |
1037 "pmaddubsw %%xmm4,%%xmm6 \n" | |
1038 "phaddw %%xmm1,%%xmm0 \n" | |
1039 "phaddw %%xmm6,%%xmm2 \n" | |
1040 "psraw $0x8,%%xmm0 \n" | |
1041 "psraw $0x8,%%xmm2 \n" | |
1042 "packsswb %%xmm2,%%xmm0 \n" | |
1043 "paddb %%xmm5,%%xmm0 \n" | |
1044 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
1045 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
1046 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
1047 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
1048 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" | |
1049 "pmaddubsw %%xmm3,%%xmm0 \n" | |
1050 "pmaddubsw %%xmm3,%%xmm1 \n" | |
1051 "pmaddubsw %%xmm3,%%xmm2 \n" | |
1052 "pmaddubsw %%xmm3,%%xmm6 \n" | |
1053 "phaddw %%xmm1,%%xmm0 \n" | |
1054 "phaddw %%xmm6,%%xmm2 \n" | |
1055 "psraw $0x8,%%xmm0 \n" | |
1056 "psraw $0x8,%%xmm2 \n" | |
1057 "packsswb %%xmm2,%%xmm0 \n" | |
1058 "paddb %%xmm5,%%xmm0 \n" | |
1059 "lea " MEMLEA(0x40,0) ",%0 \n" | |
1060 MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) | |
1061 "lea " MEMLEA(0x10,1) ",%1 \n" | |
1062 "sub $0x10,%3 \n" | |
1063 "jg 1b \n" | |
1064 : "+r"(src_argb), // %0 | |
1065 "+r"(dst_u), // %1 | |
1066 "+r"(dst_v), // %2 | |
1067 "+rm"(width) // %3 | |
1068 : "m"(kARGBToV), // %4 | |
1069 "m"(kARGBToU), // %5 | |
1070 "m"(kAddUV128) // %6 | |
1071 : "memory", "cc", NACL_R14 | |
1072 "xmm0", "xmm1", "xmm2", "xmm6" | |
1073 ); | |
1074 } | |
1075 #endif // HAS_ARGBTOUV444ROW_SSSE3 | |
1076 | |
1077 #ifdef HAS_ARGBTOUV422ROW_SSSE3 | |
1078 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, | |
1079 uint8* dst_u, uint8* dst_v, int width) { | |
1080 asm volatile ( | |
1081 "movdqa %4,%%xmm3 \n" | |
1082 "movdqa %5,%%xmm4 \n" | |
1083 "movdqa %6,%%xmm5 \n" | |
1084 "sub %1,%2 \n" | |
1085 LABELALIGN | |
1086 "1: \n" | |
1087 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
1088 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
1089 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
1090 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" | |
1091 "lea " MEMLEA(0x40,0) ",%0 \n" | |
1092 "movdqa %%xmm0,%%xmm7 \n" | |
1093 "shufps $0x88,%%xmm1,%%xmm0 \n" | |
1094 "shufps $0xdd,%%xmm1,%%xmm7 \n" | |
1095 "pavgb %%xmm7,%%xmm0 \n" | |
1096 "movdqa %%xmm2,%%xmm7 \n" | |
1097 "shufps $0x88,%%xmm6,%%xmm2 \n" | |
1098 "shufps $0xdd,%%xmm6,%%xmm7 \n" | |
1099 "pavgb %%xmm7,%%xmm2 \n" | |
1100 "movdqa %%xmm0,%%xmm1 \n" | |
1101 "movdqa %%xmm2,%%xmm6 \n" | |
1102 "pmaddubsw %%xmm4,%%xmm0 \n" | |
1103 "pmaddubsw %%xmm4,%%xmm2 \n" | |
1104 "pmaddubsw %%xmm3,%%xmm1 \n" | |
1105 "pmaddubsw %%xmm3,%%xmm6 \n" | |
1106 "phaddw %%xmm2,%%xmm0 \n" | |
1107 "phaddw %%xmm6,%%xmm1 \n" | |
1108 "psraw $0x8,%%xmm0 \n" | |
1109 "psraw $0x8,%%xmm1 \n" | |
1110 "packsswb %%xmm1,%%xmm0 \n" | |
1111 "paddb %%xmm5,%%xmm0 \n" | |
1112 "movlps %%xmm0," MEMACCESS(1) " \n" | |
1113 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) | |
1114 "lea " MEMLEA(0x8,1) ",%1 \n" | |
1115 "sub $0x10,%3 \n" | |
1116 "jg 1b \n" | |
1117 : "+r"(src_argb0), // %0 | |
1118 "+r"(dst_u), // %1 | |
1119 "+r"(dst_v), // %2 | |
1120 "+rm"(width) // %3 | |
1121 : "m"(kARGBToV), // %4 | |
1122 "m"(kARGBToU), // %5 | |
1123 "m"(kAddUV128) // %6 | |
1124 : "memory", "cc", NACL_R14 | |
1125 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | |
1126 ); | |
1127 } | |
1128 #endif // HAS_ARGBTOUV422ROW_SSSE3 | |
1129 | |
1130 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { | |
1131 asm volatile ( | |
1132 "movdqa %4,%%xmm5 \n" | |
1133 "movdqa %3,%%xmm4 \n" | |
1134 LABELALIGN | |
1135 "1: \n" | |
1136 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
1137 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
1138 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
1139 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | |
1140 "pmaddubsw %%xmm4,%%xmm0 \n" | |
1141 "pmaddubsw %%xmm4,%%xmm1 \n" | |
1142 "pmaddubsw %%xmm4,%%xmm2 \n" | |
1143 "pmaddubsw %%xmm4,%%xmm3 \n" | |
1144 "lea " MEMLEA(0x40,0) ",%0 \n" | |
1145 "phaddw %%xmm1,%%xmm0 \n" | |
1146 "phaddw %%xmm3,%%xmm2 \n" | |
1147 "psrlw $0x7,%%xmm0 \n" | |
1148 "psrlw $0x7,%%xmm2 \n" | |
1149 "packuswb %%xmm2,%%xmm0 \n" | |
1150 "paddb %%xmm5,%%xmm0 \n" | |
1151 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
1152 "lea " MEMLEA(0x10,1) ",%1 \n" | |
1153 "sub $0x10,%2 \n" | |
1154 "jg 1b \n" | |
1155 : "+r"(src_bgra), // %0 | |
1156 "+r"(dst_y), // %1 | |
1157 "+r"(pix) // %2 | |
1158 : "m"(kBGRAToY), // %3 | |
1159 "m"(kAddY16) // %4 | |
1160 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
1161 ); | |
1162 } | |
1163 | |
1164 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, | |
1165 uint8* dst_u, uint8* dst_v, int width) { | |
1166 asm volatile ( | |
1167 "movdqa %5,%%xmm3 \n" | |
1168 "movdqa %6,%%xmm4 \n" | |
1169 "movdqa %7,%%xmm5 \n" | |
1170 "sub %1,%2 \n" | |
1171 LABELALIGN | |
1172 "1: \n" | |
1173 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
1174 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | |
1175 "pavgb %%xmm7,%%xmm0 \n" | |
1176 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
1177 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 | |
1178 "pavgb %%xmm7,%%xmm1 \n" | |
1179 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
1180 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 | |
1181 "pavgb %%xmm7,%%xmm2 \n" | |
1182 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" | |
1183 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 | |
1184 "pavgb %%xmm7,%%xmm6 \n" | |
1185 | |
1186 "lea " MEMLEA(0x40,0) ",%0 \n" | |
1187 "movdqa %%xmm0,%%xmm7 \n" | |
1188 "shufps $0x88,%%xmm1,%%xmm0 \n" | |
1189 "shufps $0xdd,%%xmm1,%%xmm7 \n" | |
1190 "pavgb %%xmm7,%%xmm0 \n" | |
1191 "movdqa %%xmm2,%%xmm7 \n" | |
1192 "shufps $0x88,%%xmm6,%%xmm2 \n" | |
1193 "shufps $0xdd,%%xmm6,%%xmm7 \n" | |
1194 "pavgb %%xmm7,%%xmm2 \n" | |
1195 "movdqa %%xmm0,%%xmm1 \n" | |
1196 "movdqa %%xmm2,%%xmm6 \n" | |
1197 "pmaddubsw %%xmm4,%%xmm0 \n" | |
1198 "pmaddubsw %%xmm4,%%xmm2 \n" | |
1199 "pmaddubsw %%xmm3,%%xmm1 \n" | |
1200 "pmaddubsw %%xmm3,%%xmm6 \n" | |
1201 "phaddw %%xmm2,%%xmm0 \n" | |
1202 "phaddw %%xmm6,%%xmm1 \n" | |
1203 "psraw $0x8,%%xmm0 \n" | |
1204 "psraw $0x8,%%xmm1 \n" | |
1205 "packsswb %%xmm1,%%xmm0 \n" | |
1206 "paddb %%xmm5,%%xmm0 \n" | |
1207 "movlps %%xmm0," MEMACCESS(1) " \n" | |
1208 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) | |
1209 "lea " MEMLEA(0x8,1) ",%1 \n" | |
1210 "sub $0x10,%3 \n" | |
1211 "jg 1b \n" | |
1212 : "+r"(src_bgra0), // %0 | |
1213 "+r"(dst_u), // %1 | |
1214 "+r"(dst_v), // %2 | |
1215 "+rm"(width) // %3 | |
1216 : "r"((intptr_t)(src_stride_bgra)), // %4 | |
1217 "m"(kBGRAToV), // %5 | |
1218 "m"(kBGRAToU), // %6 | |
1219 "m"(kAddUV128) // %7 | |
1220 : "memory", "cc", NACL_R14 | |
1221 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | |
1222 ); | |
1223 } | |
1224 | |
1225 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { | |
1226 asm volatile ( | |
1227 "movdqa %4,%%xmm5 \n" | |
1228 "movdqa %3,%%xmm4 \n" | |
1229 LABELALIGN | |
1230 "1: \n" | |
1231 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
1232 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
1233 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
1234 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | |
1235 "pmaddubsw %%xmm4,%%xmm0 \n" | |
1236 "pmaddubsw %%xmm4,%%xmm1 \n" | |
1237 "pmaddubsw %%xmm4,%%xmm2 \n" | |
1238 "pmaddubsw %%xmm4,%%xmm3 \n" | |
1239 "lea " MEMLEA(0x40,0) ",%0 \n" | |
1240 "phaddw %%xmm1,%%xmm0 \n" | |
1241 "phaddw %%xmm3,%%xmm2 \n" | |
1242 "psrlw $0x7,%%xmm0 \n" | |
1243 "psrlw $0x7,%%xmm2 \n" | |
1244 "packuswb %%xmm2,%%xmm0 \n" | |
1245 "paddb %%xmm5,%%xmm0 \n" | |
1246 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
1247 "lea " MEMLEA(0x10,1) ",%1 \n" | |
1248 "sub $0x10,%2 \n" | |
1249 "jg 1b \n" | |
1250 : "+r"(src_abgr), // %0 | |
1251 "+r"(dst_y), // %1 | |
1252 "+r"(pix) // %2 | |
1253 : "m"(kABGRToY), // %3 | |
1254 "m"(kAddY16) // %4 | |
1255 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
1256 ); | |
1257 } | |
1258 | |
1259 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { | |
1260 asm volatile ( | |
1261 "movdqa %4,%%xmm5 \n" | |
1262 "movdqa %3,%%xmm4 \n" | |
1263 LABELALIGN | |
1264 "1: \n" | |
1265 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
1266 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
1267 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
1268 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | |
1269 "pmaddubsw %%xmm4,%%xmm0 \n" | |
1270 "pmaddubsw %%xmm4,%%xmm1 \n" | |
1271 "pmaddubsw %%xmm4,%%xmm2 \n" | |
1272 "pmaddubsw %%xmm4,%%xmm3 \n" | |
1273 "lea " MEMLEA(0x40,0) ",%0 \n" | |
1274 "phaddw %%xmm1,%%xmm0 \n" | |
1275 "phaddw %%xmm3,%%xmm2 \n" | |
1276 "psrlw $0x7,%%xmm0 \n" | |
1277 "psrlw $0x7,%%xmm2 \n" | |
1278 "packuswb %%xmm2,%%xmm0 \n" | |
1279 "paddb %%xmm5,%%xmm0 \n" | |
1280 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
1281 "lea " MEMLEA(0x10,1) ",%1 \n" | |
1282 "sub $0x10,%2 \n" | |
1283 "jg 1b \n" | |
1284 : "+r"(src_rgba), // %0 | |
1285 "+r"(dst_y), // %1 | |
1286 "+r"(pix) // %2 | |
1287 : "m"(kRGBAToY), // %3 | |
1288 "m"(kAddY16) // %4 | |
1289 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
1290 ); | |
1291 } | |
1292 | |
1293 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, | |
1294 uint8* dst_u, uint8* dst_v, int width) { | |
1295 asm volatile ( | |
1296 "movdqa %5,%%xmm3 \n" | |
1297 "movdqa %6,%%xmm4 \n" | |
1298 "movdqa %7,%%xmm5 \n" | |
1299 "sub %1,%2 \n" | |
1300 LABELALIGN | |
1301 "1: \n" | |
1302 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
1303 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | |
1304 "pavgb %%xmm7,%%xmm0 \n" | |
1305 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
1306 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 | |
1307 "pavgb %%xmm7,%%xmm1 \n" | |
1308 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
1309 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 | |
1310 "pavgb %%xmm7,%%xmm2 \n" | |
1311 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" | |
1312 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 | |
1313 "pavgb %%xmm7,%%xmm6 \n" | |
1314 | |
1315 "lea " MEMLEA(0x40,0) ",%0 \n" | |
1316 "movdqa %%xmm0,%%xmm7 \n" | |
1317 "shufps $0x88,%%xmm1,%%xmm0 \n" | |
1318 "shufps $0xdd,%%xmm1,%%xmm7 \n" | |
1319 "pavgb %%xmm7,%%xmm0 \n" | |
1320 "movdqa %%xmm2,%%xmm7 \n" | |
1321 "shufps $0x88,%%xmm6,%%xmm2 \n" | |
1322 "shufps $0xdd,%%xmm6,%%xmm7 \n" | |
1323 "pavgb %%xmm7,%%xmm2 \n" | |
1324 "movdqa %%xmm0,%%xmm1 \n" | |
1325 "movdqa %%xmm2,%%xmm6 \n" | |
1326 "pmaddubsw %%xmm4,%%xmm0 \n" | |
1327 "pmaddubsw %%xmm4,%%xmm2 \n" | |
1328 "pmaddubsw %%xmm3,%%xmm1 \n" | |
1329 "pmaddubsw %%xmm3,%%xmm6 \n" | |
1330 "phaddw %%xmm2,%%xmm0 \n" | |
1331 "phaddw %%xmm6,%%xmm1 \n" | |
1332 "psraw $0x8,%%xmm0 \n" | |
1333 "psraw $0x8,%%xmm1 \n" | |
1334 "packsswb %%xmm1,%%xmm0 \n" | |
1335 "paddb %%xmm5,%%xmm0 \n" | |
1336 "movlps %%xmm0," MEMACCESS(1) " \n" | |
1337 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) | |
1338 "lea " MEMLEA(0x8,1) ",%1 \n" | |
1339 "sub $0x10,%3 \n" | |
1340 "jg 1b \n" | |
1341 : "+r"(src_abgr0), // %0 | |
1342 "+r"(dst_u), // %1 | |
1343 "+r"(dst_v), // %2 | |
1344 "+rm"(width) // %3 | |
1345 : "r"((intptr_t)(src_stride_abgr)), // %4 | |
1346 "m"(kABGRToV), // %5 | |
1347 "m"(kABGRToU), // %6 | |
1348 "m"(kAddUV128) // %7 | |
1349 : "memory", "cc", NACL_R14 | |
1350 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | |
1351 ); | |
1352 } | |
1353 | |
1354 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, | |
1355 uint8* dst_u, uint8* dst_v, int width) { | |
1356 asm volatile ( | |
1357 "movdqa %5,%%xmm3 \n" | |
1358 "movdqa %6,%%xmm4 \n" | |
1359 "movdqa %7,%%xmm5 \n" | |
1360 "sub %1,%2 \n" | |
1361 LABELALIGN | |
1362 "1: \n" | |
1363 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
1364 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | |
1365 "pavgb %%xmm7,%%xmm0 \n" | |
1366 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
1367 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 | |
1368 "pavgb %%xmm7,%%xmm1 \n" | |
1369 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
1370 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 | |
1371 "pavgb %%xmm7,%%xmm2 \n" | |
1372 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" | |
1373 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 | |
1374 "pavgb %%xmm7,%%xmm6 \n" | |
1375 | |
1376 "lea " MEMLEA(0x40,0) ",%0 \n" | |
1377 "movdqa %%xmm0,%%xmm7 \n" | |
1378 "shufps $0x88,%%xmm1,%%xmm0 \n" | |
1379 "shufps $0xdd,%%xmm1,%%xmm7 \n" | |
1380 "pavgb %%xmm7,%%xmm0 \n" | |
1381 "movdqa %%xmm2,%%xmm7 \n" | |
1382 "shufps $0x88,%%xmm6,%%xmm2 \n" | |
1383 "shufps $0xdd,%%xmm6,%%xmm7 \n" | |
1384 "pavgb %%xmm7,%%xmm2 \n" | |
1385 "movdqa %%xmm0,%%xmm1 \n" | |
1386 "movdqa %%xmm2,%%xmm6 \n" | |
1387 "pmaddubsw %%xmm4,%%xmm0 \n" | |
1388 "pmaddubsw %%xmm4,%%xmm2 \n" | |
1389 "pmaddubsw %%xmm3,%%xmm1 \n" | |
1390 "pmaddubsw %%xmm3,%%xmm6 \n" | |
1391 "phaddw %%xmm2,%%xmm0 \n" | |
1392 "phaddw %%xmm6,%%xmm1 \n" | |
1393 "psraw $0x8,%%xmm0 \n" | |
1394 "psraw $0x8,%%xmm1 \n" | |
1395 "packsswb %%xmm1,%%xmm0 \n" | |
1396 "paddb %%xmm5,%%xmm0 \n" | |
1397 "movlps %%xmm0," MEMACCESS(1) " \n" | |
1398 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) | |
1399 "lea " MEMLEA(0x8,1) ",%1 \n" | |
1400 "sub $0x10,%3 \n" | |
1401 "jg 1b \n" | |
1402 : "+r"(src_rgba0), // %0 | |
1403 "+r"(dst_u), // %1 | |
1404 "+r"(dst_v), // %2 | |
1405 "+rm"(width) // %3 | |
1406 : "r"((intptr_t)(src_stride_rgba)), // %4 | |
1407 "m"(kRGBAToV), // %5 | |
1408 "m"(kRGBAToU), // %6 | |
1409 "m"(kAddUV128) // %7 | |
1410 : "memory", "cc", NACL_R14 | |
1411 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | |
1412 ); | |
1413 } | |
1414 | |
1415 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) | |
1416 | |
1417 // YUV to RGB conversion constants. | |
1418 // Y contribution to R,G,B. Scale and bias. | |
1419 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ | |
1420 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ | |
1421 | |
1422 // U and V contributions to R,G,B. | |
1423 #define UB -128 /* -min(128, round(2.018 * 64)) */ | |
1424 #define UG 25 /* -round(-0.391 * 64) */ | |
1425 #define VG 52 /* -round(-0.813 * 64) */ | |
1426 #define VR -102 /* -round(1.596 * 64) */ | |
1427 | |
1428 // Bias values to subtract 16 from Y and 128 from U and V. | |
1429 #define BB (UB * 128 - YGB) | |
1430 #define BG (UG * 128 + VG * 128 - YGB) | |
1431 #define BR (VR * 128 - YGB) | |
1432 | |
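// Scalar model of the fixed-point math the YUVTORGB macro below performs,
// written with the constants above (an illustrative sketch that ignores the
// int16 saturation of paddsw; helper names are placeholders, not original
// code).
static __inline uint8 ClampReference(int32 v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static __inline void YuvPixelReference(uint8 y, uint8 u, uint8 v,
                                       uint8* b, uint8* g, uint8* r) {
  int32 y1 = (int32)(y * 0x0101 * YG) >> 16;  // luma term, scaled by 64.
  *b = ClampReference((y1 + BB - UB * u) >> 6);
  *g = ClampReference((y1 + BG - UG * u - VG * v) >> 6);
  *r = ClampReference((y1 + BR - VR * v) >> 6);
}
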
1433 struct YuvConstants { | |
1434 lvec8 kUVToB; // 0 | |
1435 lvec8 kUVToG; // 32 | |
1436 lvec8 kUVToR; // 64 | |
1437 lvec16 kUVBiasB; // 96 | |
1438 lvec16 kUVBiasG; // 128 | |
1439 lvec16 kUVBiasR; // 160 | |
1440 lvec16 kYToRgb; // 192 | |
1441 }; | |
1442 | |
1443 // BT601 constants for YUV to RGB. | |
1444 static YuvConstants SIMD_ALIGNED(kYuvConstants) = { | |
1445 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, | |
1446 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, | |
1447 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, | |
1448 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, | |
1449 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, | |
1450 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, | |
1451 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | |
1452 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | |
1453 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | |
1454 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | |
1455 }; | |
1456 | |
1457 // BT601 constants for NV21 where chroma plane is VU instead of UV. | |
1458 static YuvConstants SIMD_ALIGNED(kYvuConstants) = { | |
1459 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, | |
1460 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, | |
1461 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, | |
1462 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, | |
1463 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, | |
1464 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, | |
1465 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | |
1466 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | |
1467 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | |
1468 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | |
1469 }; | |
1470 | |
// Read 8 UV from 444
1472 #define READYUV444 \ | |
1473 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | |
1474 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | |
1475 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | |
1476 "punpcklbw %%xmm1,%%xmm0 \n" | |
1477 | |
1478 // Read 4 UV from 422, upsample to 8 UV | |
1479 #define READYUV422 \ | |
1480 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | |
1481 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ | |
1482 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ | |
1483 "punpcklbw %%xmm1,%%xmm0 \n" \ | |
1484 "punpcklwd %%xmm0,%%xmm0 \n" | |
1485 | |
1486 // Read 2 UV from 411, upsample to 8 UV | |
1487 #define READYUV411 \ | |
1488 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | |
1489 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ | |
1490 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ | |
1491 "punpcklbw %%xmm1,%%xmm0 \n" \ | |
1492 "punpcklwd %%xmm0,%%xmm0 \n" \ | |
1493 "punpckldq %%xmm0,%%xmm0 \n" | |
1494 | |
1495 // Read 4 UV from NV12, upsample to 8 UV | |
1496 #define READNV12 \ | |
1497 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | |
1498 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ | |
1499 "punpcklwd %%xmm0,%%xmm0 \n" | |
1500 | |
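// The READ* macros above widen chroma to one UV pair per pixel: 444 needs no
// widening, 422 duplicates each pair (punpcklwd), 411 duplicates it four
// times, and NV12 is already interleaved. Illustrative scalar picture of the
// 422 case (an added sketch, not original code):
static __inline void UpsampleUV422Reference(const uint8* u, const uint8* v,
                                            uint8* dst_uv /* 8 UV pairs */) {
  int i;
  for (i = 0; i < 4; ++i) {
    dst_uv[i * 4 + 0] = dst_uv[i * 4 + 2] = u[i];
    dst_uv[i * 4 + 1] = dst_uv[i * 4 + 3] = v[i];
  }
}
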
1501 // Convert 8 pixels: 8 UV and 8 Y | |
1502 #define YUVTORGB(YuvConstants) \ | |
1503 "movdqa %%xmm0,%%xmm1 \n" \ | |
1504 "movdqa %%xmm0,%%xmm2 \n" \ | |
1505 "movdqa %%xmm0,%%xmm3 \n" \ | |
1506 "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \ | |
1507 "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \ | |
1508 "psubw %%xmm1,%%xmm0 \n" \ | |
1509 "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \ | |
1510 "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \ | |
1511 "psubw %%xmm2,%%xmm1 \n" \ | |
1512 "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \ | |
1513 "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \ | |
1514 "psubw %%xmm3,%%xmm2 \n" \ | |
1515 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ | |
1516 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ | |
1517 "punpcklbw %%xmm3,%%xmm3 \n" \ | |
1518 "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \ | |
1519 "paddsw %%xmm3,%%xmm0 \n" \ | |
1520 "paddsw %%xmm3,%%xmm1 \n" \ | |
1521 "paddsw %%xmm3,%%xmm2 \n" \ | |
1522 "psraw $0x6,%%xmm0 \n" \ | |
1523 "psraw $0x6,%%xmm1 \n" \ | |
1524 "psraw $0x6,%%xmm2 \n" \ | |
1525 "packuswb %%xmm0,%%xmm0 \n" \ | |
1526 "packuswb %%xmm1,%%xmm1 \n" \ | |
1527 "packuswb %%xmm2,%%xmm2 \n" | |
1528 | |
// Store 8 ARGB values. Assumes XMM5 holds the 0xff alpha bytes (set with pcmpeqb by the caller).
1530 #define STOREARGB \ | |
1531 "punpcklbw %%xmm1,%%xmm0 \n" \ | |
1532 "punpcklbw %%xmm5,%%xmm2 \n" \ | |
1533 "movdqa %%xmm0,%%xmm1 \n" \ | |
1534 "punpcklwd %%xmm2,%%xmm0 \n" \ | |
1535 "punpckhwd %%xmm2,%%xmm1 \n" \ | |
1536 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ | |
1537 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \ | |
1538 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" | |
1539 | |
// Store 8 BGRA values. Sets XMM5 to 0xff for the alpha bytes.
1541 #define STOREBGRA \ | |
1542 "pcmpeqb %%xmm5,%%xmm5 \n" \ | |
1543 "punpcklbw %%xmm0,%%xmm1 \n" \ | |
1544 "punpcklbw %%xmm2,%%xmm5 \n" \ | |
1545 "movdqa %%xmm5,%%xmm0 \n" \ | |
1546 "punpcklwd %%xmm1,%%xmm5 \n" \ | |
1547 "punpckhwd %%xmm1,%%xmm0 \n" \ | |
1548 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ | |
1549 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \ | |
1550 "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" | |
1551 | |
// Store 8 ABGR values. Assumes XMM5 holds the 0xff alpha bytes.
1553 #define STOREABGR \ | |
1554 "punpcklbw %%xmm1,%%xmm2 \n" \ | |
1555 "punpcklbw %%xmm5,%%xmm0 \n" \ | |
1556 "movdqa %%xmm2,%%xmm1 \n" \ | |
1557 "punpcklwd %%xmm0,%%xmm2 \n" \ | |
1558 "punpckhwd %%xmm0,%%xmm1 \n" \ | |
1559 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ | |
1560 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \ | |
1561 "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" | |
1562 | |
// Store 8 RGBA values. Sets XMM5 to 0xff for the alpha bytes.
1564 #define STORERGBA \ | |
1565 "pcmpeqb %%xmm5,%%xmm5 \n" \ | |
1566 "punpcklbw %%xmm2,%%xmm1 \n" \ | |
1567 "punpcklbw %%xmm0,%%xmm5 \n" \ | |
1568 "movdqa %%xmm5,%%xmm0 \n" \ | |
1569 "punpcklwd %%xmm1,%%xmm5 \n" \ | |
1570 "punpckhwd %%xmm1,%%xmm0 \n" \ | |
1571 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ | |
1572 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \ | |
1573 "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" | |
1574 | |
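// Memory layout produced by the STORE* macros above, per pixel, using
// STOREARGB as the example (an illustrative summary; the helper is a
// placeholder, not original code). The punpck sequence interleaves the packed
// B (xmm0), G (xmm1) and R (xmm2) results with the alpha bytes so ARGB lands
// in memory as B, G, R, A:
static __inline void StoreARGBPixelReference(uint8 b, uint8 g, uint8 r,
                                             uint8* dst_argb) {
  dst_argb[0] = b;
  dst_argb[1] = g;
  dst_argb[2] = r;
  dst_argb[3] = 255u;  // alpha comes from the 0xff register.
}
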
1575 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, | |
1576 const uint8* u_buf, | |
1577 const uint8* v_buf, | |
1578 uint8* dst_argb, | |
1579 int width) { | |
1580 asm volatile ( | |
1581 "sub %[u_buf],%[v_buf] \n" | |
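  // v_buf now holds v_buf - u_buf, so the READYUV* macros can address the V
  // plane as an index off u_buf and advance both planes with a single lea.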
1582 "pcmpeqb %%xmm5,%%xmm5 \n" | |
1583 LABELALIGN | |
1584 "1: \n" | |
1585 READYUV444 | |
1586 YUVTORGB(kYuvConstants) | |
1587 STOREARGB | |
1588 "sub $0x8,%[width] \n" | |
1589 "jg 1b \n" | |
1590 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1591 [u_buf]"+r"(u_buf), // %[u_buf] | |
1592 [v_buf]"+r"(v_buf), // %[v_buf] | |
1593 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
1594 [width]"+rm"(width) // %[width] | |
1595 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
1596 : "memory", "cc", NACL_R14 | |
1597 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
1598 ); | |
1599 } | |
1600 | |
1601 // TODO(fbarchard): Consider putting masks into constants. | |
1602 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, | |
1603 const uint8* u_buf, | |
1604 const uint8* v_buf, | |
1605 uint8* dst_rgb24, | |
1606 int width) { | |
1607 asm volatile ( | |
1608 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" | |
1609 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" | |
1610 "sub %[u_buf],%[v_buf] \n" | |
1611 LABELALIGN | |
1612 "1: \n" | |
1613 READYUV422 | |
1614 YUVTORGB(kYuvConstants) | |
1615 "punpcklbw %%xmm1,%%xmm0 \n" | |
1616 "punpcklbw %%xmm2,%%xmm2 \n" | |
1617 "movdqa %%xmm0,%%xmm1 \n" | |
1618 "punpcklwd %%xmm2,%%xmm0 \n" | |
1619 "punpckhwd %%xmm2,%%xmm1 \n" | |
1620 "pshufb %%xmm5,%%xmm0 \n" | |
1621 "pshufb %%xmm6,%%xmm1 \n" | |
1622 "palignr $0xc,%%xmm0,%%xmm1 \n" | |
1623 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" | |
1624 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" | |
1625 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" | |
1626 "subl $0x8,%[width] \n" | |
1627 "jg 1b \n" | |
1628 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1629 [u_buf]"+r"(u_buf), // %[u_buf] | |
1630 [v_buf]"+r"(v_buf), // %[v_buf] | |
1631 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] | |
1632 // TODO(fbarchard): Make width a register for 32 bit. | |
1633 #if defined(__i386__) && defined(__pic__) | |
1634 [width]"+m"(width) // %[width] | |
1635 #else | |
1636 [width]"+rm"(width) // %[width] | |
1637 #endif | |
1638 : [kYuvConstants]"r"(&kYuvConstants.kUVToB), | |
1639 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), | |
1640 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) | |
1641 : "memory", "cc", NACL_R14 | |
1642 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" | |
1643 ); | |
1644 } | |
1645 | |
1646 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, | |
1647 const uint8* u_buf, | |
1648 const uint8* v_buf, | |
1649 uint8* dst_raw, | |
1650 int width) { | |
1651 asm volatile ( | |
1652 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" | |
1653 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" | |
1654 "sub %[u_buf],%[v_buf] \n" | |
1655 LABELALIGN | |
1656 "1: \n" | |
1657 READYUV422 | |
1658 YUVTORGB(kYuvConstants) | |
1659 "punpcklbw %%xmm1,%%xmm0 \n" | |
1660 "punpcklbw %%xmm2,%%xmm2 \n" | |
1661 "movdqa %%xmm0,%%xmm1 \n" | |
1662 "punpcklwd %%xmm2,%%xmm0 \n" | |
1663 "punpckhwd %%xmm2,%%xmm1 \n" | |
1664 "pshufb %%xmm5,%%xmm0 \n" | |
1665 "pshufb %%xmm6,%%xmm1 \n" | |
1666 "palignr $0xc,%%xmm0,%%xmm1 \n" | |
1667 "movq %%xmm0," MEMACCESS([dst_raw]) " \n" | |
1668 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" | |
1669 "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" | |
1670 "subl $0x8,%[width] \n" | |
1671 "jg 1b \n" | |
1672 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1673 [u_buf]"+r"(u_buf), // %[u_buf] | |
1674 [v_buf]"+r"(v_buf), // %[v_buf] | |
1675 [dst_raw]"+r"(dst_raw), // %[dst_raw] | |
1676 // TODO(fbarchard): Make width a register for 32 bit. | |
1677 #if defined(__i386__) && defined(__pic__) | |
1678 [width]"+m"(width) // %[width] | |
1679 #else | |
1680 [width]"+rm"(width) // %[width] | |
1681 #endif | |
1682 : [kYuvConstants]"r"(&kYuvConstants.kUVToB), | |
1683 [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), | |
1684 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) | |
1685 : "memory", "cc", NACL_R14 | |
1686 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" | |
1687 ); | |
1688 } | |
1689 | |
1690 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, | |
1691 const uint8* u_buf, | |
1692 const uint8* v_buf, | |
1693 uint8* dst_argb, | |
1694 int width) { | |
1695 asm volatile ( | |
1696 "sub %[u_buf],%[v_buf] \n" | |
1697 "pcmpeqb %%xmm5,%%xmm5 \n" | |
1698 LABELALIGN | |
1699 "1: \n" | |
1700 READYUV422 | |
1701 YUVTORGB(kYuvConstants) | |
1702 STOREARGB | |
1703 "sub $0x8,%[width] \n" | |
1704 "jg 1b \n" | |
1705 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1706 [u_buf]"+r"(u_buf), // %[u_buf] | |
1707 [v_buf]"+r"(v_buf), // %[v_buf] | |
1708 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
1709 [width]"+rm"(width) // %[width] | |
1710 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
1711 : "memory", "cc", NACL_R14 | |
1712 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
1713 ); | |
1714 } | |
1715 | |
1716 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | |
1717 const uint8* u_buf, | |
1718 const uint8* v_buf, | |
1719 uint8* dst_argb, | |
1720 int width) { | |
1721 asm volatile ( | |
1722 "sub %[u_buf],%[v_buf] \n" | |
1723 "pcmpeqb %%xmm5,%%xmm5 \n" | |
1724 LABELALIGN | |
1725 "1: \n" | |
1726 READYUV411 | |
1727 YUVTORGB(kYuvConstants) | |
1728 STOREARGB | |
1729 "sub $0x8,%[width] \n" | |
1730 "jg 1b \n" | |
1731 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1732 [u_buf]"+r"(u_buf), // %[u_buf] | |
1733 [v_buf]"+r"(v_buf), // %[v_buf] | |
1734 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
1735 [width]"+rm"(width) // %[width] | |
1736 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
1737 : "memory", "cc", NACL_R14 | |
1738 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
1739 ); | |
1740 } | |
1741 | |
1742 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, | |
1743 const uint8* uv_buf, | |
1744 uint8* dst_argb, | |
1745 int width) { | |
1746 asm volatile ( | |
1747 "pcmpeqb %%xmm5,%%xmm5 \n" | |
1748 LABELALIGN | |
1749 "1: \n" | |
1750 READNV12 | |
1751 YUVTORGB(kYuvConstants) | |
1752 STOREARGB | |
1753 "sub $0x8,%[width] \n" | |
1754 "jg 1b \n" | |
1755 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1756 [uv_buf]"+r"(uv_buf), // %[uv_buf] | |
1757 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
1758 [width]"+rm"(width) // %[width] | |
1759 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
1760 // Does not use r14. | |
1761 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
1762 ); | |
1763 } | |
1764 | |
1765 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, | |
1766 const uint8* uv_buf, | |
1767 uint8* dst_argb, | |
1768 int width) { | |
1769 asm volatile ( | |
1770 "pcmpeqb %%xmm5,%%xmm5 \n" | |
1771 LABELALIGN | |
1772 "1: \n" | |
1773 READNV12 | |
1774 YUVTORGB(kYuvConstants) | |
1775 STOREARGB | |
1776 "sub $0x8,%[width] \n" | |
1777 "jg 1b \n" | |
1778 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1779 [uv_buf]"+r"(uv_buf), // %[uv_buf] | |
1780 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
1781 [width]"+rm"(width) // %[width] | |
1782 : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants] | |
1783 // Does not use r14. | |
1784 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
1785 ); | |
1786 } | |
1787 | |
1788 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, | |
1789 const uint8* u_buf, | |
1790 const uint8* v_buf, | |
1791 uint8* dst_bgra, | |
1792 int width) { | |
1793 asm volatile ( | |
1794 "sub %[u_buf],%[v_buf] \n" | |
1795 "pcmpeqb %%xmm5,%%xmm5 \n" | |
1796 LABELALIGN | |
1797 "1: \n" | |
1798 READYUV422 | |
1799 YUVTORGB(kYuvConstants) | |
1800 STOREBGRA | |
1801 "sub $0x8,%[width] \n" | |
1802 "jg 1b \n" | |
1803 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1804 [u_buf]"+r"(u_buf), // %[u_buf] | |
1805 [v_buf]"+r"(v_buf), // %[v_buf] | |
1806 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] | |
1807 [width]"+rm"(width) // %[width] | |
1808 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
1809 : "memory", "cc", NACL_R14 | |
1810 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
1811 ); | |
1812 } | |
1813 | |
1814 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, | |
1815 const uint8* u_buf, | |
1816 const uint8* v_buf, | |
1817 uint8* dst_abgr, | |
1818 int width) { | |
1819 asm volatile ( | |
1820 "sub %[u_buf],%[v_buf] \n" | |
1821 "pcmpeqb %%xmm5,%%xmm5 \n" | |
1822 LABELALIGN | |
1823 "1: \n" | |
1824 READYUV422 | |
1825 YUVTORGB(kYuvConstants) | |
1826 STOREABGR | |
1827 "sub $0x8,%[width] \n" | |
1828 "jg 1b \n" | |
1829 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1830 [u_buf]"+r"(u_buf), // %[u_buf] | |
1831 [v_buf]"+r"(v_buf), // %[v_buf] | |
1832 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] | |
1833 [width]"+rm"(width) // %[width] | |
1834 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
1835 : "memory", "cc", NACL_R14 | |
1836 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
1837 ); | |
1838 } | |
1839 | |
1840 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, | |
1841 const uint8* u_buf, | |
1842 const uint8* v_buf, | |
1843 uint8* dst_rgba, | |
1844 int width) { | |
1845 asm volatile ( | |
1846 "sub %[u_buf],%[v_buf] \n" | |
1847 "pcmpeqb %%xmm5,%%xmm5 \n" | |
1848 LABELALIGN | |
1849 "1: \n" | |
1850 READYUV422 | |
1851 YUVTORGB(kYuvConstants) | |
1852 STORERGBA | |
1853 "sub $0x8,%[width] \n" | |
1854 "jg 1b \n" | |
1855 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1856 [u_buf]"+r"(u_buf), // %[u_buf] | |
1857 [v_buf]"+r"(v_buf), // %[v_buf] | |
1858 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] | |
1859 [width]"+rm"(width) // %[width] | |
1860 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
1861 : "memory", "cc", NACL_R14 | |
1862 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
1863 ); | |
1864 } | |
1865 | |
1866 #endif // HAS_I422TOARGBROW_SSSE3 | |
1867 | |
1868 // Read 8 UV from 422, upsample to 16 UV. | |
1869 #define READYUV422_AVX2 \ | |
1870 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | |
1871 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | |
1872 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | |
1873 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | |
1874 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | |
1875 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" | |
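// vpermq $0xd8 splits the 8 UV pairs across the two 128-bit lanes so that the
// per-lane vpunpcklwd duplicates each pair, yielding UV for 16 pixels in order.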
1876 | |
1877 // Convert 16 pixels: 16 UV and 16 Y. | |
1878 #define YUVTORGB_AVX2(YuvConstants) \ | |
1879 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ | |
1880 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ | |
1881 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ | |
1882 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ | |
1883 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ | |
1884 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ | |
1885 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ | |
1886 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ | |
1887 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ | |
1888 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ | |
1889 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ | |
1890 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ | |
1891 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ | |
1892 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ | |
1893 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ | |
1894 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ | |
1895 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ | |
1896 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | |
1897 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | |
1898 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ | |
1899 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ | |
1900 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | |
1901 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | |
1902 | |
1903 #if defined(HAS_I422TOBGRAROW_AVX2) | |
1904 // 16 pixels | |
1905 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | |
1906 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, | |
1907 const uint8* u_buf, | |
1908 const uint8* v_buf, | |
1909 uint8* dst_bgra, | |
1910 int width) { | |
1911 asm volatile ( | |
1912 "sub %[u_buf],%[v_buf] \n" | |
1913 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
1914 LABELALIGN | |
1915 "1: \n" | |
1916 READYUV422_AVX2 | |
1917 YUVTORGB_AVX2(kYuvConstants) | |
1918 | |
1919 // Step 3: Weave into BGRA | |
1920 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB | |
1921 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
1922 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR | |
1923 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
1924 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels | |
1925 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels | |
1926 | |
1927 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" | |
1928 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" | |
1929 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" | |
1930 "sub $0x10,%[width] \n" | |
1931 "jg 1b \n" | |
1932 "vzeroupper \n" | |
1933 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1934 [u_buf]"+r"(u_buf), // %[u_buf] | |
1935 [v_buf]"+r"(v_buf), // %[v_buf] | |
1936 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] | |
1937 [width]"+rm"(width) // %[width] | |
1938 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
1939 : "memory", "cc", NACL_R14 | |
1940 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
1941 ); | |
1942 } | |
1943 #endif // HAS_I422TOBGRAROW_AVX2 | |
1944 | |
1945 #if defined(HAS_I422TOARGBROW_AVX2) | |
1946 // 16 pixels | |
1947 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | |
1948 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, | |
1949 const uint8* u_buf, | |
1950 const uint8* v_buf, | |
1951 uint8* dst_argb, | |
1952 int width) { | |
1953 asm volatile ( | |
1954 "sub %[u_buf],%[v_buf] \n" | |
1955 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
1956 LABELALIGN | |
1957 "1: \n" | |
1958 READYUV422_AVX2 | |
1959 YUVTORGB_AVX2(kYuvConstants) | |
1960 | |
1961 // Step 3: Weave into ARGB | |
1962 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG | |
1963 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
1964 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA | |
1965 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
1966 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels | |
1967 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels | |
1968 | |
1969 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" | |
1970 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" | |
1971 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | |
1972 "sub $0x10,%[width] \n" | |
1973 "jg 1b \n" | |
1974 "vzeroupper \n" | |
1975 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1976 [u_buf]"+r"(u_buf), // %[u_buf] | |
1977 [v_buf]"+r"(v_buf), // %[v_buf] | |
1978 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
1979 [width]"+rm"(width) // %[width] | |
1980 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
1981 : "memory", "cc", NACL_R14 | |
1982 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
1983 ); | |
1984 } | |
1985 #endif // HAS_I422TOARGBROW_AVX2 | |
1986 | |
1987 #if defined(HAS_I422TOABGRROW_AVX2) | |
1988 // 16 pixels | |
1989 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | |
1990 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, | |
1991 const uint8* u_buf, | |
1992 const uint8* v_buf, | |
1993 uint8* dst_argb, | |
1994 int width) { | |
1995 asm volatile ( | |
1996 "sub %[u_buf],%[v_buf] \n" | |
1997 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
1998 LABELALIGN | |
1999 "1: \n" | |
2000 READYUV422_AVX2 | |
2001 YUVTORGB_AVX2(kYuvConstants) | |
2002 | |
2003 // Step 3: Weave into ABGR | |
2004 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG | |
2005 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
2006 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA | |
2007 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
2008 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels | |
2009 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels | |
2010 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" | |
2011 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" | |
2012 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | |
2013 "sub $0x10,%[width] \n" | |
2014 "jg 1b \n" | |
2015 "vzeroupper \n" | |
2016 : [y_buf]"+r"(y_buf), // %[y_buf] | |
2017 [u_buf]"+r"(u_buf), // %[u_buf] | |
2018 [v_buf]"+r"(v_buf), // %[v_buf] | |
2019 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
2020 [width]"+rm"(width) // %[width] | |
2021 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
2022 : "memory", "cc", NACL_R14 | |
2023 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
2024 ); | |
2025 } | |
2026 #endif // HAS_I422TOABGRROW_AVX2 | |
2027 | |
2028 #if defined(HAS_I422TORGBAROW_AVX2) | |
2029 // 16 pixels | |
2030 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | |
2031 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, | |
2032 const uint8* u_buf, | |
2033 const uint8* v_buf, | |
2034 uint8* dst_argb, | |
2035 int width) { | |
2036 asm volatile ( | |
2037 "sub %[u_buf],%[v_buf] \n" | |
2038 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
2039 LABELALIGN | |
2040 "1: \n" | |
2041 READYUV422_AVX2 | |
2042 YUVTORGB_AVX2(kYuvConstants) | |
2043 | |
2044 // Step 3: Weave into RGBA | |
2045 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" | |
2046 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
2047 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" | |
2048 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
2049 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" | |
2050 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" | |
2051 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" | |
2052 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" | |
2053 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | |
2054 "sub $0x10,%[width] \n" | |
2055 "jg 1b \n" | |
2056 "vzeroupper \n" | |
2057 : [y_buf]"+r"(y_buf), // %[y_buf] | |
2058 [u_buf]"+r"(u_buf), // %[u_buf] | |
2059 [v_buf]"+r"(v_buf), // %[v_buf] | |
2060 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
2061 [width]"+rm"(width) // %[width] | |
2062 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | |
2063 : "memory", "cc", NACL_R14 | |
2064 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
2065 ); | |
2066 } | |
2067 #endif // HAS_I422TORGBAROW_AVX2 | |
2068 | |
2069 #ifdef HAS_YTOARGBROW_SSE2 | |
2070 void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { | |
2071 asm volatile ( | |
2072 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | |
2073 "movd %%eax,%%xmm2 \n" | |
2074 "pshufd $0x0,%%xmm2,%%xmm2 \n" | |
2075 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 | |
2076 "movd %%eax,%%xmm3 \n" | |
2077 "pshufd $0x0,%%xmm3,%%xmm3 \n" | |
2078 "pcmpeqb %%xmm4,%%xmm4 \n" | |
2079 "pslld $0x18,%%xmm4 \n" | |
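  // xmm4 = 0xff000000 in every dword; OR'd in below to force opaque alpha.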
2080 LABELALIGN | |
2081 "1: \n" | |
2082 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 | |
2083 "movq " MEMACCESS(0) ",%%xmm0 \n" | |
2084 "lea " MEMLEA(0x8,0) ",%0 \n" | |
2085 "punpcklbw %%xmm0,%%xmm0 \n" | |
2086 "pmulhuw %%xmm2,%%xmm0 \n" | |
2087 "psubusw %%xmm3,%%xmm0 \n" | |
2088 "psrlw $6, %%xmm0 \n" | |
2089 "packuswb %%xmm0,%%xmm0 \n" | |
2090 | |
2091 // Step 2: Weave into ARGB | |
2092 "punpcklbw %%xmm0,%%xmm0 \n" | |
2093 "movdqa %%xmm0,%%xmm1 \n" | |
2094 "punpcklwd %%xmm0,%%xmm0 \n" | |
2095 "punpckhwd %%xmm1,%%xmm1 \n" | |
2096 "por %%xmm4,%%xmm0 \n" | |
2097 "por %%xmm4,%%xmm1 \n" | |
2098 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
2099 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | |
2100 "lea " MEMLEA(0x20,1) ",%1 \n" | |
2101 | |
2102 "sub $0x8,%2 \n" | |
2103 "jg 1b \n" | |
2104 : "+r"(y_buf), // %0 | |
2105 "+r"(dst_argb), // %1 | |
2106 "+rm"(width) // %2 | |
2107 : | |
2108 : "memory", "cc", "eax" | |
2109 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | |
2110 ); | |
2111 } | |
2112 #endif // HAS_YTOARGBROW_SSE2 | |
2113 | |
2114 #ifdef HAS_YTOARGBROW_AVX2 | |
2115 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). | |
2116 // Note: vpunpcklbw interleaves within 128-bit lanes (reorders pixels); the later vpackuswb, also per-lane, restores the order. | |
2117 void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { | |
2118 asm volatile ( | |
2119 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | |
2120 "vmovd %%eax,%%xmm2 \n" | |
2121 "vbroadcastss %%xmm2,%%ymm2 \n" | |
2122 "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 | |
2123 "vmovd %%eax,%%xmm3 \n" | |
2124 "vbroadcastss %%xmm3,%%ymm3 \n" | |
2125 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" | |
2126 "vpslld $0x18,%%ymm4,%%ymm4 \n" | |
2127 | |
2128 LABELALIGN | |
2129 "1: \n" | |
2130 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 | |
2131 "vmovdqu " MEMACCESS(0) ",%%xmm0 \n" | |
2132 "lea " MEMLEA(0x10,0) ",%0 \n" | |
2133 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
2134 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" | |
2135 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" | |
2136 "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" | |
2137 "vpsrlw $0x6,%%ymm0,%%ymm0 \n" | |
2138 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | |
2139 "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" | |
2140 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
2141 "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" | |
2142 "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" | |
2143 "vpor %%ymm4,%%ymm0,%%ymm0 \n" | |
2144 "vpor %%ymm4,%%ymm1,%%ymm1 \n" | |
2145 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
2146 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" | |
2147 "lea " MEMLEA(0x40,1) ",%1 \n" | |
2148 "sub $0x10,%2 \n" | |
2149 "jg 1b \n" | |
2150 "vzeroupper \n" | |
2151 : "+r"(y_buf), // %0 | |
2152 "+r"(dst_argb), // %1 | |
2153 "+rm"(width) // %2 | |
2154 : | |
2155 : "memory", "cc", "eax" | |
2156 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | |
2157 ); | |
2158 } | |
2159 #endif // HAS_YTOARGBROW_AVX2 | |
2160 | |
2161 #ifdef HAS_MIRRORROW_SSSE3 | |
2162 // Shuffle table for reversing the bytes. | |
2163 static uvec8 kShuffleMirror = { | |
2164 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | |
2165 }; | |
2166 | |
2167 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { | |
2168 intptr_t temp_width = (intptr_t)(width); | |
2169 asm volatile ( | |
2170 "movdqa %3,%%xmm5 \n" | |
2171 LABELALIGN | |
2172 "1: \n" | |
2173 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 | |
2174 "pshufb %%xmm5,%%xmm0 \n" | |
2175 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
2176 "lea " MEMLEA(0x10,1) ",%1 \n" | |
2177 "sub $0x10,%2 \n" | |
2178 "jg 1b \n" | |
2179 : "+r"(src), // %0 | |
2180 "+r"(dst), // %1 | |
2181 "+r"(temp_width) // %2 | |
2182 : "m"(kShuffleMirror) // %3 | |
2183 : "memory", "cc", NACL_R14 | |
2184 "xmm0", "xmm5" | |
2185 ); | |
2186 } | |
2187 #endif // HAS_MIRRORROW_SSSE3 | |
2188 | |
2189 #ifdef HAS_MIRRORROW_AVX2 | |
2190 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | |
2191 intptr_t temp_width = (intptr_t)(width); | |
2192 asm volatile ( | |
2193 "vbroadcastf128 %3,%%ymm5 \n" | |
2194 LABELALIGN | |
2195 "1: \n" | |
2196 MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0 | |
2197 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" | |
2198 "vpermq $0x4e,%%ymm0,%%ymm0 \n" | |
2199 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
2200 "lea " MEMLEA(0x20,1) ",%1 \n" | |
2201 "sub $0x20,%2 \n" | |
2202 "jg 1b \n" | |
2203 "vzeroupper \n" | |
2204 : "+r"(src), // %0 | |
2205 "+r"(dst), // %1 | |
2206 "+r"(temp_width) // %2 | |
2207 : "m"(kShuffleMirror) // %3 | |
2208 : "memory", "cc", NACL_R14 | |
2209 "xmm0", "xmm5" | |
2210 ); | |
2211 } | |
2212 #endif // HAS_MIRRORROW_AVX2 | |
2213 | |
2214 #ifdef HAS_MIRRORROW_SSE2 | |
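// SSE2 mirror without pshufb: swap the bytes within each 16-bit word, reverse
// the words in each half (pshuflw/pshufhw $0x1b), then swap the two halves
// (pshufd $0x4e).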
2215 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | |
2216 intptr_t temp_width = (intptr_t)(width); | |
2217 asm volatile ( | |
2218 LABELALIGN | |
2219 "1: \n" | |
2220 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 | |
2221 "movdqa %%xmm0,%%xmm1 \n" | |
2222 "psllw $0x8,%%xmm0 \n" | |
2223 "psrlw $0x8,%%xmm1 \n" | |
2224 "por %%xmm1,%%xmm0 \n" | |
2225 "pshuflw $0x1b,%%xmm0,%%xmm0 \n" | |
2226 "pshufhw $0x1b,%%xmm0,%%xmm0 \n" | |
2227 "pshufd $0x4e,%%xmm0,%%xmm0 \n" | |
2228 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
2229 "lea " MEMLEA(0x10,1)",%1 \n" | |
2230 "sub $0x10,%2 \n" | |
2231 "jg 1b \n" | |
2232 : "+r"(src), // %0 | |
2233 "+r"(dst), // %1 | |
2234 "+r"(temp_width) // %2 | |
2235 : | |
2236 : "memory", "cc", NACL_R14 | |
2237 "xmm0", "xmm1" | |
2238 ); | |
2239 } | |
2240 #endif // HAS_MIRRORROW_SSE2 | |
2241 | |
2242 #ifdef HAS_MIRRORROW_UV_SSSE3 | |
2243 // Shuffle table for reversing the bytes of UV channels. | |
2244 static uvec8 kShuffleMirrorUV = { | |
2245 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u | |
2246 }; | |
2247 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, | |
2248 int width) { | |
2249 intptr_t temp_width = (intptr_t)(width); | |
2250 asm volatile ( | |
2251 "movdqa %4,%%xmm1 \n" | |
2252 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" | |
2253 "sub %1,%2 \n" | |
2254 LABELALIGN | |
2255 "1: \n" | |
2256 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
2257 "lea " MEMLEA(-0x10,0) ",%0 \n" | |
2258 "pshufb %%xmm1,%%xmm0 \n" | |
2259 "movlpd %%xmm0," MEMACCESS(1) " \n" | |
2260 MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) | |
2261 "lea " MEMLEA(0x8,1) ",%1 \n" | |
2262 "sub $8,%3 \n" | |
2263 "jg 1b \n" | |
2264 : "+r"(src), // %0 | |
2265 "+r"(dst_u), // %1 | |
2266 "+r"(dst_v), // %2 | |
2267 "+r"(temp_width) // %3 | |
2268 : "m"(kShuffleMirrorUV) // %4 | |
2269 : "memory", "cc", NACL_R14 | |
2270 "xmm0", "xmm1" | |
2271 ); | |
2272 } | |
2273 #endif // HAS_MIRRORROW_UV_SSSE3 | |
2274 | |
2275 #ifdef HAS_ARGBMIRRORROW_SSE2 | |
2276 | |
2277 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | |
2278 intptr_t temp_width = (intptr_t)(width); | |
2279 asm volatile ( | |
2280 "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" | |
2281 LABELALIGN | |
2282 "1: \n" | |
2283 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
2284 "pshufd $0x1b,%%xmm0,%%xmm0 \n" | |
2285 "lea " MEMLEA(-0x10,0) ",%0 \n" | |
2286 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
2287 "lea " MEMLEA(0x10,1) ",%1 \n" | |
2288 "sub $0x4,%2 \n" | |
2289 "jg 1b \n" | |
2290 : "+r"(src), // %0 | |
2291 "+r"(dst), // %1 | |
2292 "+r"(temp_width) // %2 | |
2293 : | |
2294 : "memory", "cc" | |
2295 , "xmm0" | |
2296 ); | |
2297 } | |
2298 #endif // HAS_ARGBMIRRORROW_SSE2 | |
2299 | |
2300 #ifdef HAS_ARGBMIRRORROW_AVX2 | |
2301 // Shuffle table for reversing the bytes. | |
2302 static const ulvec32 kARGBShuffleMirror_AVX2 = { | |
2303 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | |
2304 }; | |
2305 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | |
2306 intptr_t temp_width = (intptr_t)(width); | |
2307 asm volatile ( | |
2308 "vmovdqu %3,%%ymm5 \n" | |
2309 LABELALIGN | |
2310 "1: \n" | |
2311 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 | |
2312 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
2313 "lea " MEMLEA(0x20,1) ",%1 \n" | |
2314 "sub $0x8,%2 \n" | |
2315 "jg 1b \n" | |
2316 "vzeroupper \n" | |
2317 : "+r"(src), // %0 | |
2318 "+r"(dst), // %1 | |
2319 "+r"(temp_width) // %2 | |
2320 : "m"(kARGBShuffleMirror_AVX2) // %3 | |
2321 : "memory", "cc", NACL_R14 | |
2322 "xmm0", "xmm5" | |
2323 ); | |
2324 } | |
2325 #endif // HAS_ARGBMIRRORROW_AVX2 | |
2326 | |
2327 #ifdef HAS_SPLITUVROW_AVX2 | |
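// De-interleaves UV: the 0x00ff mask keeps the even (U) bytes and vpsrlw $0x8
// keeps the odd (V) bytes; vpermq $0xd8 restores order after the per-lane pack.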
2328 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | |
2329 asm volatile ( | |
2330 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
2331 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | |
2332 "sub %1,%2 \n" | |
2333 LABELALIGN | |
2334 "1: \n" | |
2335 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
2336 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
2337 "lea " MEMLEA(0x40,0) ",%0 \n" | |
2338 "vpsrlw $0x8,%%ymm0,%%ymm2 \n" | |
2339 "vpsrlw $0x8,%%ymm1,%%ymm3 \n" | |
2340 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | |
2341 "vpand %%ymm5,%%ymm1,%%ymm1 \n" | |
2342 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
2343 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" | |
2344 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
2345 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
2346 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
2347 MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) | |
2348 "lea " MEMLEA(0x20,1) ",%1 \n" | |
2349 "sub $0x20,%3 \n" | |
2350 "jg 1b \n" | |
2351 "vzeroupper \n" | |
2352 : "+r"(src_uv), // %0 | |
2353 "+r"(dst_u), // %1 | |
2354 "+r"(dst_v), // %2 | |
2355 "+r"(pix) // %3 | |
2356 : | |
2357 : "memory", "cc", NACL_R14 | |
2358 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
2359 ); | |
2360 } | |
2361 #endif // HAS_SPLITUVROW_AVX2 | |
2362 | |
2363 #ifdef HAS_SPLITUVROW_SSE2 | |
2364 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { | |
2365 asm volatile ( | |
2366 "pcmpeqb %%xmm5,%%xmm5 \n" | |
2367 "psrlw $0x8,%%xmm5 \n" | |
2368 "sub %1,%2 \n" | |
2369 LABELALIGN | |
2370 "1: \n" | |
2371 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
2372 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
2373 "lea " MEMLEA(0x20,0) ",%0 \n" | |
2374 "movdqa %%xmm0,%%xmm2 \n" | |
2375 "movdqa %%xmm1,%%xmm3 \n" | |
2376 "pand %%xmm5,%%xmm0 \n" | |
2377 "pand %%xmm5,%%xmm1 \n" | |
2378 "packuswb %%xmm1,%%xmm0 \n" | |
2379 "psrlw $0x8,%%xmm2 \n" | |
2380 "psrlw $0x8,%%xmm3 \n" | |
2381 "packuswb %%xmm3,%%xmm2 \n" | |
2382 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
2383 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) | |
2384 "lea " MEMLEA(0x10,1) ",%1 \n" | |
2385 "sub $0x10,%3 \n" | |
2386 "jg 1b \n" | |
2387 : "+r"(src_uv), // %0 | |
2388 "+r"(dst_u), // %1 | |
2389 "+r"(dst_v), // %2 | |
2390 "+r"(pix) // %3 | |
2391 : | |
2392 : "memory", "cc", NACL_R14 | |
2393 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
2394 ); | |
2395 } | |
2396 #endif // HAS_SPLITUVROW_SSE2 | |
2397 | |
2398 #ifdef HAS_MERGEUVROW_AVX2 | |
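// vpunpck interleaves within 128-bit lanes, so the four 16-byte halves are
// written with vextractf128 in lane order to keep the UV output sequential.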
2399 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | |
2400 int width) { | |
2401 asm volatile ( | |
2402 "sub %0,%1 \n" | |
2403 LABELALIGN | |
2404 "1: \n" | |
2405 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
2406 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 | |
2407 "lea " MEMLEA(0x20,0) ",%0 \n" | |
2408 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" | |
2409 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" | |
2410 "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" | |
2411 "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" | |
2412 "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" | |
2413 "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" | |
2414 "lea " MEMLEA(0x40,2) ",%2 \n" | |
2415 "sub $0x20,%3 \n" | |
2416 "jg 1b \n" | |
2417 "vzeroupper \n" | |
2418 : "+r"(src_u), // %0 | |
2419 "+r"(src_v), // %1 | |
2420 "+r"(dst_uv), // %2 | |
2421 "+r"(width) // %3 | |
2422 : | |
2423 : "memory", "cc", NACL_R14 | |
2424 "xmm0", "xmm1", "xmm2" | |
2425 ); | |
2426 } | |
2427 #endif // HAS_MERGEUVROW_AVX2 | |
2428 | |
2429 #ifdef HAS_MERGEUVROW_SSE2 | |
2430 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | |
2431 int width) { | |
2432 asm volatile ( | |
2433 "sub %0,%1 \n" | |
2434 LABELALIGN | |
2435 "1: \n" | |
2436 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
2437 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | |
2438 "lea " MEMLEA(0x10,0) ",%0 \n" | |
2439 "movdqa %%xmm0,%%xmm2 \n" | |
2440 "punpcklbw %%xmm1,%%xmm0 \n" | |
2441 "punpckhbw %%xmm1,%%xmm2 \n" | |
2442 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
2443 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" | |
2444 "lea " MEMLEA(0x20,2) ",%2 \n" | |
2445 "sub $0x10,%3 \n" | |
2446 "jg 1b \n" | |
2447 : "+r"(src_u), // %0 | |
2448 "+r"(src_v), // %1 | |
2449 "+r"(dst_uv), // %2 | |
2450 "+r"(width) // %3 | |
2451 : | |
2452 : "memory", "cc", NACL_R14 | |
2453 "xmm0", "xmm1", "xmm2" | |
2454 ); | |
2455 } | |
2456 #endif // HAS_MERGEUVROW_SSE2 | |
2457 | |
2458 #ifdef HAS_COPYROW_SSE2 | |
2459 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { | |
2460 asm volatile ( | |
2461 LABELALIGN | |
2462 "1: \n" | |
2463 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
2464 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
2465 "lea " MEMLEA(0x20,0) ",%0 \n" | |
2466 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
2467 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | |
2468 "lea " MEMLEA(0x20,1) ",%1 \n" | |
2469 "sub $0x20,%2 \n" | |
2470 "jg 1b \n" | |
2471 : "+r"(src), // %0 | |
2472 "+r"(dst), // %1 | |
2473 "+r"(count) // %2 | |
2474 : | |
2475 : "memory", "cc" | |
2476 , "xmm0", "xmm1" | |
2477 ); | |
2478 } | |
2479 #endif // HAS_COPYROW_SSE2 | |
2480 | |
2481 #ifdef HAS_COPYROW_AVX | |
2482 void CopyRow_AVX(const uint8* src, uint8* dst, int count) { | |
2483 asm volatile ( | |
2484 LABELALIGN | |
2485 "1: \n" | |
2486 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
2487 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
2488 "lea " MEMLEA(0x40,0) ",%0 \n" | |
2489 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
2490 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" | |
2491 "lea " MEMLEA(0x40,1) ",%1 \n" | |
2492 "sub $0x40,%2 \n" | |
2493 "jg 1b \n" | |
2494 : "+r"(src), // %0 | |
2495 "+r"(dst), // %1 | |
2496 "+r"(count) // %2 | |
2497 : | |
2498 : "memory", "cc" | |
2499 , "xmm0", "xmm1" | |
2500 ); | |
2501 } | |
2502 #endif // HAS_COPYROW_AVX | |
2503 | |
2504 #ifdef HAS_COPYROW_ERMS | |
2505 // Multiple of 1. | |
2506 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { | |
2507 size_t width_tmp = (size_t)(width); | |
2508 asm volatile ( | |
2509 "rep movsb " MEMMOVESTRING(0,1) " \n" | |
2510 : "+S"(src), // %0 | |
2511 "+D"(dst), // %1 | |
2512 "+c"(width_tmp) // %2 | |
2513 : | |
2514 : "memory", "cc" | |
2515 ); | |
2516 } | |
2517 #endif // HAS_COPYROW_ERMS | |
2518 | |
2519 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 | |
2520 // width in pixels | |
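// Copies only the alpha channel: the source is masked with 0xff000000, the
// destination with 0x00ffffff, and the two are OR'd back together.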
2521 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | |
2522 asm volatile ( | |
2523 "pcmpeqb %%xmm0,%%xmm0 \n" | |
2524 "pslld $0x18,%%xmm0 \n" | |
2525 "pcmpeqb %%xmm1,%%xmm1 \n" | |
2526 "psrld $0x8,%%xmm1 \n" | |
2527 LABELALIGN | |
2528 "1: \n" | |
2529 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
2530 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" | |
2531 "lea " MEMLEA(0x20,0) ",%0 \n" | |
2532 "movdqu " MEMACCESS(1) ",%%xmm4 \n" | |
2533 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" | |
2534 "pand %%xmm0,%%xmm2 \n" | |
2535 "pand %%xmm0,%%xmm3 \n" | |
2536 "pand %%xmm1,%%xmm4 \n" | |
2537 "pand %%xmm1,%%xmm5 \n" | |
2538 "por %%xmm4,%%xmm2 \n" | |
2539 "por %%xmm5,%%xmm3 \n" | |
2540 "movdqu %%xmm2," MEMACCESS(1) " \n" | |
2541 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" | |
2542 "lea " MEMLEA(0x20,1) ",%1 \n" | |
2543 "sub $0x8,%2 \n" | |
2544 "jg 1b \n" | |
2545 : "+r"(src), // %0 | |
2546 "+r"(dst), // %1 | |
2547 "+r"(width) // %2 | |
2548 : | |
2549 : "memory", "cc" | |
2550 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
2551 ); | |
2552 } | |
2553 #endif // HAS_ARGBCOPYALPHAROW_SSE2 | |
2554 | |
2555 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 | |
2556 // width in pixels | |
2557 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { | |
2558 asm volatile ( | |
2559 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" | |
2560 "vpsrld $0x8,%%ymm0,%%ymm0 \n" | |
2561 LABELALIGN | |
2562 "1: \n" | |
2563 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" | |
2564 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" | |
2565 "lea " MEMLEA(0x40,0) ",%0 \n" | |
2566 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" | |
2567 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" | |
2568 "vmovdqu %%ymm1," MEMACCESS(1) " \n" | |
2569 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" | |
2570 "lea " MEMLEA(0x40,1) ",%1 \n" | |
2571 "sub $0x10,%2 \n" | |
2572 "jg 1b \n" | |
2573 "vzeroupper \n" | |
2574 : "+r"(src), // %0 | |
2575 "+r"(dst), // %1 | |
2576 "+r"(width) // %2 | |
2577 : | |
2578 : "memory", "cc" | |
2579 , "xmm0", "xmm1", "xmm2" | |
2580 ); | |
2581 } | |
2582 #endif // HAS_ARGBCOPYALPHAROW_AVX2 | |
2583 | |
2584 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 | |
2585 // width in pixels | |
2586 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | |
2587 asm volatile ( | |
2588 "pcmpeqb %%xmm0,%%xmm0 \n" | |
2589 "pslld $0x18,%%xmm0 \n" | |
2590 "pcmpeqb %%xmm1,%%xmm1 \n" | |
2591 "psrld $0x8,%%xmm1 \n" | |
2592 LABELALIGN | |
2593 "1: \n" | |
2594 "movq " MEMACCESS(0) ",%%xmm2 \n" | |
2595 "lea " MEMLEA(0x8,0) ",%0 \n" | |
2596 "punpcklbw %%xmm2,%%xmm2 \n" | |
2597 "punpckhwd %%xmm2,%%xmm3 \n" | |
2598 "punpcklwd %%xmm2,%%xmm2 \n" | |
2599 "movdqu " MEMACCESS(1) ",%%xmm4 \n" | |
2600 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" | |
2601 "pand %%xmm0,%%xmm2 \n" | |
2602 "pand %%xmm0,%%xmm3 \n" | |
2603 "pand %%xmm1,%%xmm4 \n" | |
2604 "pand %%xmm1,%%xmm5 \n" | |
2605 "por %%xmm4,%%xmm2 \n" | |
2606 "por %%xmm5,%%xmm3 \n" | |
2607 "movdqu %%xmm2," MEMACCESS(1) " \n" | |
2608 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" | |
2609 "lea " MEMLEA(0x20,1) ",%1 \n" | |
2610 "sub $0x8,%2 \n" | |
2611 "jg 1b \n" | |
2612 : "+r"(src), // %0 | |
2613 "+r"(dst), // %1 | |
2614 "+r"(width) // %2 | |
2615 : | |
2616 : "memory", "cc" | |
2617 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
2618 ); | |
2619 } | |
2620 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 | |
2621 | |
2622 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 | |
2623 // width in pixels | |
2624 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { | |
2625 asm volatile ( | |
2626 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" | |
2627 "vpsrld $0x8,%%ymm0,%%ymm0 \n" | |
2628 LABELALIGN | |
2629 "1: \n" | |
2630 "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" | |
2631 "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" | |
2632 "lea " MEMLEA(0x10,0) ",%0 \n" | |
2633 "vpslld $0x18,%%ymm1,%%ymm1 \n" | |
2634 "vpslld $0x18,%%ymm2,%%ymm2 \n" | |
2635 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" | |
2636 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" | |
2637 "vmovdqu %%ymm1," MEMACCESS(1) " \n" | |
2638 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" | |
2639 "lea " MEMLEA(0x40,1) ",%1 \n" | |
2640 "sub $0x10,%2 \n" | |
2641 "jg 1b \n" | |
2642 "vzeroupper \n" | |
2643 : "+r"(src), // %0 | |
2644 "+r"(dst), // %1 | |
2645 "+r"(width) // %2 | |
2646 : | |
2647 : "memory", "cc" | |
2648 , "xmm0", "xmm1", "xmm2" | |
2649 ); | |
2650 } | |
2651 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 | |
2652 | |
2653 #ifdef HAS_SETROW_X86 | |
2654 void SetRow_X86(uint8* dst, uint8 v8, int width) { | |
2655 size_t width_tmp = (size_t)(width >> 2); | |
2656 const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes. | |
2657 asm volatile ( | |
2658 "rep stosl " MEMSTORESTRING(eax,0) " \n" | |
2659 : "+D"(dst), // %0 | |
2660 "+c"(width_tmp) // %1 | |
2661 : "a"(v32) // %2 | |
2662 : "memory", "cc"); | |
2663 } | |
2664 | |
2665 void SetRow_ERMS(uint8* dst, uint8 v8, int width) { | |
2666 size_t width_tmp = (size_t)(width); | |
2667 asm volatile ( | |
2668 "rep stosb " MEMSTORESTRING(al,0) " \n" | |
2669 : "+D"(dst), // %0 | |
2670 "+c"(width_tmp) // %1 | |
2671 : "a"(v8) // %2 | |
2672 : "memory", "cc"); | |
2673 } | |
2674 | |
2675 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { | |
2676 size_t width_tmp = (size_t)(width); | |
2677 asm volatile ( | |
2678 "rep stosl " MEMSTORESTRING(eax,0) " \n" | |
2679 : "+D"(dst_argb), // %0 | |
2680 "+c"(width_tmp) // %1 | |
2681 : "a"(v32) // %2 | |
2682 : "memory", "cc"); | |
2683 } | |
2684 #endif // HAS_SETROW_X86 | |
2685 | |
2686 #ifdef HAS_YUY2TOYROW_SSE2 | |
2687 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { | |
2688 asm volatile ( | |
2689 "pcmpeqb %%xmm5,%%xmm5 \n" | |
2690 "psrlw $0x8,%%xmm5 \n" | |
2691 LABELALIGN | |
2692 "1: \n" | |
2693 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
2694 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
2695 "lea " MEMLEA(0x20,0) ",%0 \n" | |
2696 "pand %%xmm5,%%xmm0 \n" | |
2697 "pand %%xmm5,%%xmm1 \n" | |
2698 "packuswb %%xmm1,%%xmm0 \n" | |
2699 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
2700 "lea " MEMLEA(0x10,1) ",%1 \n" | |
2701 "sub $0x10,%2 \n" | |
2702 "jg 1b \n" | |
2703 : "+r"(src_yuy2), // %0 | |
2704 "+r"(dst_y), // %1 | |
2705 "+r"(pix) // %2 | |
2706 : | |
2707 : "memory", "cc" | |
2708 , "xmm0", "xmm1", "xmm5" | |
2709 ); | |
2710 } | |
2711 | |
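// YUY2 is Y0 U Y1 V. Two rows are averaged with pavgb, the Y bytes are shifted
// out (psrlw $0x8) leaving interleaved UV, which is then split into U (even
// bytes, via the 0x00ff mask) and V (odd bytes, via another psrlw $0x8).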
2712 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, | |
2713 uint8* dst_u, uint8* dst_v, int pix) { | |
2714 asm volatile ( | |
2715 "pcmpeqb %%xmm5,%%xmm5 \n" | |
2716 "psrlw $0x8,%%xmm5 \n" | |
2717 "sub %1,%2 \n" | |
2718 LABELALIGN | |
2719 "1: \n" | |
2720 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
2721 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
2722 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 | |
2723 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 | |
2724 "lea " MEMLEA(0x20,0) ",%0 \n" | |
2725 "pavgb %%xmm2,%%xmm0 \n" | |
2726 "pavgb %%xmm3,%%xmm1 \n" | |
2727 "psrlw $0x8,%%xmm0 \n" | |
2728 "psrlw $0x8,%%xmm1 \n" | |
2729 "packuswb %%xmm1,%%xmm0 \n" | |
2730 "movdqa %%xmm0,%%xmm1 \n" | |
2731 "pand %%xmm5,%%xmm0 \n" | |
2732 "packuswb %%xmm0,%%xmm0 \n" | |
2733 "psrlw $0x8,%%xmm1 \n" | |
2734 "packuswb %%xmm1,%%xmm1 \n" | |
2735 "movq %%xmm0," MEMACCESS(1) " \n" | |
2736 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) | |
2737 "lea " MEMLEA(0x8,1) ",%1 \n" | |
2738 "sub $0x10,%3 \n" | |
2739 "jg 1b \n" | |
2740 : "+r"(src_yuy2), // %0 | |
2741 "+r"(dst_u), // %1 | |
2742 "+r"(dst_v), // %2 | |
2743 "+r"(pix) // %3 | |
2744 : "r"((intptr_t)(stride_yuy2)) // %4 | |
2745 : "memory", "cc", NACL_R14 | |
2746 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
2747 ); | |
2748 } | |
2749 | |
2750 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, | |
2751 uint8* dst_u, uint8* dst_v, int pix) { | |
2752 asm volatile ( | |
2753 "pcmpeqb %%xmm5,%%xmm5 \n" | |
2754 "psrlw $0x8,%%xmm5 \n" | |
2755 "sub %1,%2 \n" | |
2756 LABELALIGN | |
2757 "1: \n" | |
2758 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
2759 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
2760 "lea " MEMLEA(0x20,0) ",%0 \n" | |
2761 "psrlw $0x8,%%xmm0 \n" | |
2762 "psrlw $0x8,%%xmm1 \n" | |
2763 "packuswb %%xmm1,%%xmm0 \n" | |
2764 "movdqa %%xmm0,%%xmm1 \n" | |
2765 "pand %%xmm5,%%xmm0 \n" | |
2766 "packuswb %%xmm0,%%xmm0 \n" | |
2767 "psrlw $0x8,%%xmm1 \n" | |
2768 "packuswb %%xmm1,%%xmm1 \n" | |
2769 "movq %%xmm0," MEMACCESS(1) " \n" | |
2770 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) | |
2771 "lea " MEMLEA(0x8,1) ",%1 \n" | |
2772 "sub $0x10,%3 \n" | |
2773 "jg 1b \n" | |
2774 : "+r"(src_yuy2), // %0 | |
2775 "+r"(dst_u), // %1 | |
2776 "+r"(dst_v), // %2 | |
2777 "+r"(pix) // %3 | |
2778 : | |
2779 : "memory", "cc", NACL_R14 | |
2780 "xmm0", "xmm1", "xmm5" | |
2781 ); | |
2782 } | |
2783 | |
2784 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { | |
2785 asm volatile ( | |
2786 LABELALIGN | |
2787 "1: \n" | |
2788 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
2789 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
2790 "lea " MEMLEA(0x20,0) ",%0 \n" | |
2791 "psrlw $0x8,%%xmm0 \n" | |
2792 "psrlw $0x8,%%xmm1 \n" | |
2793 "packuswb %%xmm1,%%xmm0 \n" | |
2794 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
2795 "lea " MEMLEA(0x10,1) ",%1 \n" | |
2796 "sub $0x10,%2 \n" | |
2797 "jg 1b \n" | |
2798 : "+r"(src_uyvy), // %0 | |
2799 "+r"(dst_y), // %1 | |
2800 "+r"(pix) // %2 | |
2801 : | |
2802 : "memory", "cc" | |
2803 , "xmm0", "xmm1" | |
2804 ); | |
2805 } | |
2806 | |
2807 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, | |
2808 uint8* dst_u, uint8* dst_v, int pix) { | |
2809 asm volatile ( | |
2810 "pcmpeqb %%xmm5,%%xmm5 \n" | |
2811 "psrlw $0x8,%%xmm5 \n" | |
2812 "sub %1,%2 \n" | |
2813 LABELALIGN | |
2814 "1: \n" | |
2815 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
2816 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
2817 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 | |
2818 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 | |
2819 "lea " MEMLEA(0x20,0) ",%0 \n" | |
2820 "pavgb %%xmm2,%%xmm0 \n" | |
2821 "pavgb %%xmm3,%%xmm1 \n" | |
2822 "pand %%xmm5,%%xmm0 \n" | |
2823 "pand %%xmm5,%%xmm1 \n" | |
2824 "packuswb %%xmm1,%%xmm0 \n" | |
2825 "movdqa %%xmm0,%%xmm1 \n" | |
2826 "pand %%xmm5,%%xmm0 \n" | |
2827 "packuswb %%xmm0,%%xmm0 \n" | |
2828 "psrlw $0x8,%%xmm1 \n" | |
2829 "packuswb %%xmm1,%%xmm1 \n" | |
2830 "movq %%xmm0," MEMACCESS(1) " \n" | |
2831 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) | |
2832 "lea " MEMLEA(0x8,1) ",%1 \n" | |
2833 "sub $0x10,%3 \n" | |
2834 "jg 1b \n" | |
2835 : "+r"(src_uyvy), // %0 | |
2836 "+r"(dst_u), // %1 | |
2837 "+r"(dst_v), // %2 | |
2838 "+r"(pix) // %3 | |
2839 : "r"((intptr_t)(stride_uyvy)) // %4 | |
2840 : "memory", "cc", NACL_R14 | |
2841 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
2842 ); | |
2843 } | |
2844 | |
2845 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, | |
2846 uint8* dst_u, uint8* dst_v, int pix) { | |
2847 asm volatile ( | |
2848 "pcmpeqb %%xmm5,%%xmm5 \n" | |
2849 "psrlw $0x8,%%xmm5 \n" | |
2850 "sub %1,%2 \n" | |
2851 LABELALIGN | |
2852 "1: \n" | |
2853 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
2854 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
2855 "lea " MEMLEA(0x20,0) ",%0 \n" | |
2856 "pand %%xmm5,%%xmm0 \n" | |
2857 "pand %%xmm5,%%xmm1 \n" | |
2858 "packuswb %%xmm1,%%xmm0 \n" | |
2859 "movdqa %%xmm0,%%xmm1 \n" | |
2860 "pand %%xmm5,%%xmm0 \n" | |
2861 "packuswb %%xmm0,%%xmm0 \n" | |
2862 "psrlw $0x8,%%xmm1 \n" | |
2863 "packuswb %%xmm1,%%xmm1 \n" | |
2864 "movq %%xmm0," MEMACCESS(1) " \n" | |
2865 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) | |
2866 "lea " MEMLEA(0x8,1) ",%1 \n" | |
2867 "sub $0x10,%3 \n" | |
2868 "jg 1b \n" | |
2869 : "+r"(src_uyvy), // %0 | |
2870 "+r"(dst_u), // %1 | |
2871 "+r"(dst_v), // %2 | |
2872 "+r"(pix) // %3 | |
2873 : | |
2874 : "memory", "cc", NACL_R14 | |
2875 "xmm0", "xmm1", "xmm5" | |
2876 ); | |
2877 } | |
2878 #endif // HAS_YUY2TOYROW_SSE2 | |
2879 | |
2880 #ifdef HAS_YUY2TOYROW_AVX2 | |
2881 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { | |
2882 asm volatile ( | |
2883 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
2884 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | |
2885 LABELALIGN | |
2886 "1: \n" | |
2887 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
2888 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
2889 "lea " MEMLEA(0x40,0) ",%0 \n" | |
2890 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | |
2891 "vpand %%ymm5,%%ymm1,%%ymm1 \n" | |
2892 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
2893 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
2894 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
2895 "lea " MEMLEA(0x20,1) ",%1 \n" | |
2896 "sub $0x20,%2 \n" | |
2897 "jg 1b \n" | |
2898 "vzeroupper \n" | |
2899 : "+r"(src_yuy2), // %0 | |
2900 "+r"(dst_y), // %1 | |
2901 "+r"(pix) // %2 | |
2902 : | |
2903 : "memory", "cc" | |
2904 , "xmm0", "xmm1", "xmm5" | |
2905 ); | |
2906 } | |
2907 | |
2908 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, | |
2909 uint8* dst_u, uint8* dst_v, int pix) { | |
2910 asm volatile ( | |
2911 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
2912 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | |
2913 "sub %1,%2 \n" | |
2914 LABELALIGN | |
2915 "1: \n" | |
2916 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
2917 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
2918 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 | |
2919 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) | |
2920 "lea " MEMLEA(0x40,0) ",%0 \n" | |
2921 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
2922 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" | |
2923 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
2924 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
2925 "vpand %%ymm5,%%ymm0,%%ymm1 \n" | |
2926 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
2927 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" | |
2928 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | |
2929 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
2930 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
2931 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" | |
2932     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) | |
2933 "lea " MEMLEA(0x10,1) ",%1 \n" | |
2934 "sub $0x20,%3 \n" | |
2935 "jg 1b \n" | |
2936 "vzeroupper \n" | |
2937 : "+r"(src_yuy2), // %0 | |
2938 "+r"(dst_u), // %1 | |
2939 "+r"(dst_v), // %2 | |
2940 "+r"(pix) // %3 | |
2941 : "r"((intptr_t)(stride_yuy2)) // %4 | |
2942 : "memory", "cc", NACL_R14 | |
2943 "xmm0", "xmm1", "xmm5" | |
2944 ); | |
2945 } | |
2946 | |
2947 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, | |
2948 uint8* dst_u, uint8* dst_v, int pix) { | |
2949 asm volatile ( | |
2950 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
2951 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | |
2952 "sub %1,%2 \n" | |
2953 LABELALIGN | |
2954 "1: \n" | |
2955 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
2956 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
2957 "lea " MEMLEA(0x40,0) ",%0 \n" | |
2958 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
2959 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" | |
2960 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
2961 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
2962 "vpand %%ymm5,%%ymm0,%%ymm1 \n" | |
2963 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
2964 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" | |
2965 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | |
2966 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
2967 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
2968 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" | |
2969     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) | |
2970 "lea " MEMLEA(0x10,1) ",%1 \n" | |
2971 "sub $0x20,%3 \n" | |
2972 "jg 1b \n" | |
2973 "vzeroupper \n" | |
2974 : "+r"(src_yuy2), // %0 | |
2975 "+r"(dst_u), // %1 | |
2976 "+r"(dst_v), // %2 | |
2977 "+r"(pix) // %3 | |
2978 : | |
2979 : "memory", "cc", NACL_R14 | |
2980 "xmm0", "xmm1", "xmm5" | |
2981 ); | |
2982 } | |
2983 | |
2984 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) { | |
2985 asm volatile ( | |
2986 LABELALIGN | |
2987 "1: \n" | |
2988 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
2989 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
2990 "lea " MEMLEA(0x40,0) ",%0 \n" | |
2991 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
2992 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" | |
2993 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
2994 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
2995 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | |
2996 "lea " MEMLEA(0x20,1) ",%1 \n" | |
2997 "sub $0x20,%2 \n" | |
2998 "jg 1b \n" | |
2999 "vzeroupper \n" | |
3000 : "+r"(src_uyvy), // %0 | |
3001 "+r"(dst_y), // %1 | |
3002 "+r"(pix) // %2 | |
3003 : | |
3004 : "memory", "cc" | |
3005 , "xmm0", "xmm1", "xmm5" | |
3006 ); | |
3007 } | |
3008 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, | |
3009 uint8* dst_u, uint8* dst_v, int pix) { | |
3010 asm volatile ( | |
3011 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
3012 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | |
3013 "sub %1,%2 \n" | |
3014 | |
3015 LABELALIGN | |
3016 "1: \n" | |
3017 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
3018 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
3019 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 | |
3020 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) | |
3021 "lea " MEMLEA(0x40,0) ",%0 \n" | |
3022 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | |
3023 "vpand %%ymm5,%%ymm1,%%ymm1 \n" | |
3024 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
3025 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
3026 "vpand %%ymm5,%%ymm0,%%ymm1 \n" | |
3027 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
3028 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" | |
3029 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | |
3030 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
3031 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
3032 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" | |
3033     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) | |
3034 "lea " MEMLEA(0x10,1) ",%1 \n" | |
3035 "sub $0x20,%3 \n" | |
3036 "jg 1b \n" | |
3037 "vzeroupper \n" | |
3038 : "+r"(src_uyvy), // %0 | |
3039 "+r"(dst_u), // %1 | |
3040 "+r"(dst_v), // %2 | |
3041 "+r"(pix) // %3 | |
3042 : "r"((intptr_t)(stride_uyvy)) // %4 | |
3043 : "memory", "cc", NACL_R14 | |
3044 "xmm0", "xmm1", "xmm5" | |
3045 ); | |
3046 } | |
3047 | |
3048 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, | |
3049 uint8* dst_u, uint8* dst_v, int pix) { | |
3050 asm volatile ( | |
3051 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
3052 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | |
3053 "sub %1,%2 \n" | |
3054 LABELALIGN | |
3055 "1: \n" | |
3056 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
3057 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | |
3058 "lea " MEMLEA(0x40,0) ",%0 \n" | |
3059 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | |
3060 "vpand %%ymm5,%%ymm1,%%ymm1 \n" | |
3061 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
3062 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
3063 "vpand %%ymm5,%%ymm0,%%ymm1 \n" | |
3064 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
3065 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" | |
3066 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | |
3067 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
3068 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | |
3069 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" | |
3070     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) | |
3071 "lea " MEMLEA(0x10,1) ",%1 \n" | |
3072 "sub $0x20,%3 \n" | |
3073 "jg 1b \n" | |
3074 "vzeroupper \n" | |
3075 : "+r"(src_uyvy), // %0 | |
3076 "+r"(dst_u), // %1 | |
3077 "+r"(dst_v), // %2 | |
3078 "+r"(pix) // %3 | |
3079 : | |
3080 : "memory", "cc", NACL_R14 | |
3081 "xmm0", "xmm1", "xmm5" | |
3082 ); | |
3083 } | |
3084 #endif // HAS_YUY2TOYROW_AVX2 | |
3085 | |
3086 #ifdef HAS_ARGBBLENDROW_SSE2 | |
3087 // Blend 8 pixels at a time. | |
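// Per pixel: result = src0 + ((src1 * (256 - src0_alpha)) >> 8), with the
// result alpha forced to opaque (0xff).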
3088 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | |
3089 uint8* dst_argb, int width) { | |
3090 asm volatile ( | |
3091 "pcmpeqb %%xmm7,%%xmm7 \n" | |
3092 "psrlw $0xf,%%xmm7 \n" | |
3093 "pcmpeqb %%xmm6,%%xmm6 \n" | |
3094 "psrlw $0x8,%%xmm6 \n" | |
3095 "pcmpeqb %%xmm5,%%xmm5 \n" | |
3096 "psllw $0x8,%%xmm5 \n" | |
3097 "pcmpeqb %%xmm4,%%xmm4 \n" | |
3098 "pslld $0x18,%%xmm4 \n" | |
3099 "sub $0x1,%3 \n" | |
3100 "je 91f \n" | |
3101 "jl 99f \n" | |
3102 | |
3103 // 1 pixel loop until destination pointer is aligned. | |
3104 "10: \n" | |
3105 "test $0xf,%2 \n" | |
3106 "je 19f \n" | |
3107 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
3108 "lea " MEMLEA(0x4,0) ",%0 \n" | |
3109 "movdqa %%xmm3,%%xmm0 \n" | |
3110 "pxor %%xmm4,%%xmm3 \n" | |
3111 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
3112 "psrlw $0x8,%%xmm3 \n" | |
3113 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" | |
3114 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" | |
3115 "pand %%xmm6,%%xmm2 \n" | |
3116 "paddw %%xmm7,%%xmm3 \n" | |
3117 "pmullw %%xmm3,%%xmm2 \n" | |
3118 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
3119 "lea " MEMLEA(0x4,1) ",%1 \n" | |
3120 "psrlw $0x8,%%xmm1 \n" | |
3121 "por %%xmm4,%%xmm0 \n" | |
3122 "pmullw %%xmm3,%%xmm1 \n" | |
3123 "psrlw $0x8,%%xmm2 \n" | |
3124 "paddusb %%xmm2,%%xmm0 \n" | |
3125 "pand %%xmm5,%%xmm1 \n" | |
3126 "paddusb %%xmm1,%%xmm0 \n" | |
3127 "movd %%xmm0," MEMACCESS(2) " \n" | |
3128 "lea " MEMLEA(0x4,2) ",%2 \n" | |
3129 "sub $0x1,%3 \n" | |
3130 "jge 10b \n" | |
3131 | |
3132 "19: \n" | |
3133 "add $1-4,%3 \n" | |
3134 "jl 49f \n" | |
3135 | |
3136 // 4 pixel loop. | |
3137 LABELALIGN | |
3138 "41: \n" | |
3139 "movdqu " MEMACCESS(0) ",%%xmm3 \n" | |
3140 "lea " MEMLEA(0x10,0) ",%0 \n" | |
3141 "movdqa %%xmm3,%%xmm0 \n" | |
3142 "pxor %%xmm4,%%xmm3 \n" | |
3143 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | |
3144 "psrlw $0x8,%%xmm3 \n" | |
3145 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" | |
3146 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" | |
3147 "pand %%xmm6,%%xmm2 \n" | |
3148 "paddw %%xmm7,%%xmm3 \n" | |
3149 "pmullw %%xmm3,%%xmm2 \n" | |
3150 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | |
3151 "lea " MEMLEA(0x10,1) ",%1 \n" | |
3152 "psrlw $0x8,%%xmm1 \n" | |
3153 "por %%xmm4,%%xmm0 \n" | |
3154 "pmullw %%xmm3,%%xmm1 \n" | |
3155 "psrlw $0x8,%%xmm2 \n" | |
3156 "paddusb %%xmm2,%%xmm0 \n" | |
3157 "pand %%xmm5,%%xmm1 \n" | |
3158 "paddusb %%xmm1,%%xmm0 \n" | |
3159 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
3160 "lea " MEMLEA(0x10,2) ",%2 \n" | |
3161 "sub $0x4,%3 \n" | |
3162 "jge 41b \n" | |
3163 | |
3164 "49: \n" | |
3165 "add $0x3,%3 \n" | |
3166 "jl 99f \n" | |
3167 | |
3168 // 1 pixel loop. | |
3169 "91: \n" | |
3170 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
3171 "lea " MEMLEA(0x4,0) ",%0 \n" | |
3172 "movdqa %%xmm3,%%xmm0 \n" | |
3173 "pxor %%xmm4,%%xmm3 \n" | |
3174 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
3175 "psrlw $0x8,%%xmm3 \n" | |
3176 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" | |
3177 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" | |
3178 "pand %%xmm6,%%xmm2 \n" | |
3179 "paddw %%xmm7,%%xmm3 \n" | |
3180 "pmullw %%xmm3,%%xmm2 \n" | |
3181 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
3182 "lea " MEMLEA(0x4,1) ",%1 \n" | |
3183 "psrlw $0x8,%%xmm1 \n" | |
3184 "por %%xmm4,%%xmm0 \n" | |
3185 "pmullw %%xmm3,%%xmm1 \n" | |
3186 "psrlw $0x8,%%xmm2 \n" | |
3187 "paddusb %%xmm2,%%xmm0 \n" | |
3188 "pand %%xmm5,%%xmm1 \n" | |
3189 "paddusb %%xmm1,%%xmm0 \n" | |
3190 "movd %%xmm0," MEMACCESS(2) " \n" | |
3191 "lea " MEMLEA(0x4,2) ",%2 \n" | |
3192 "sub $0x1,%3 \n" | |
3193 "jge 91b \n" | |
3194 "99: \n" | |
3195 : "+r"(src_argb0), // %0 | |
3196 "+r"(src_argb1), // %1 | |
3197 "+r"(dst_argb), // %2 | |
3198 "+r"(width) // %3 | |
3199 : | |
3200 : "memory", "cc" | |
3201 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
3202 ); | |
3203 } | |
3204 #endif // HAS_ARGBBLENDROW_SSE2 | |
3205 | |
3206 #ifdef HAS_ARGBBLENDROW_SSSE3 | |
3207 // Shuffle table for isolating alpha. | |
3208 static uvec8 kShuffleAlpha = { | |
3209 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, | |
3210 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | |
3211 }; | |
3212 | |
3213 // Blend 8 pixels at a time | |
3215 | |
3216 // Same as SSE2, but replaces | |
3217 // psrlw xmm3, 8 // alpha | |
3218 // pshufhw xmm3, xmm3,0F5h // 8 alpha words | |
3219 // pshuflw xmm3, xmm3,0F5h | |
3220 // with.. | |
3221 // pshufb xmm3, kShuffleAlpha // alpha | |
3222 | |
3223 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | |
3224 uint8* dst_argb, int width) { | |
3225 asm volatile ( | |
3226 "pcmpeqb %%xmm7,%%xmm7 \n" | |
3227 "psrlw $0xf,%%xmm7 \n" | |
3228 "pcmpeqb %%xmm6,%%xmm6 \n" | |
3229 "psrlw $0x8,%%xmm6 \n" | |
3230 "pcmpeqb %%xmm5,%%xmm5 \n" | |
3231 "psllw $0x8,%%xmm5 \n" | |
3232 "pcmpeqb %%xmm4,%%xmm4 \n" | |
3233 "pslld $0x18,%%xmm4 \n" | |
3234 "sub $0x1,%3 \n" | |
3235 "je 91f \n" | |
3236 "jl 99f \n" | |
3237 | |
3238 // 1 pixel loop until destination pointer is aligned. | |
3239 "10: \n" | |
3240 "test $0xf,%2 \n" | |
3241 "je 19f \n" | |
3242 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
3243 "lea " MEMLEA(0x4,0) ",%0 \n" | |
3244 "movdqa %%xmm3,%%xmm0 \n" | |
3245 "pxor %%xmm4,%%xmm3 \n" | |
3246 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
3247 "pshufb %4,%%xmm3 \n" | |
3248 "pand %%xmm6,%%xmm2 \n" | |
3249 "paddw %%xmm7,%%xmm3 \n" | |
3250 "pmullw %%xmm3,%%xmm2 \n" | |
3251 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
3252 "lea " MEMLEA(0x4,1) ",%1 \n" | |
3253 "psrlw $0x8,%%xmm1 \n" | |
3254 "por %%xmm4,%%xmm0 \n" | |
3255 "pmullw %%xmm3,%%xmm1 \n" | |
3256 "psrlw $0x8,%%xmm2 \n" | |
3257 "paddusb %%xmm2,%%xmm0 \n" | |
3258 "pand %%xmm5,%%xmm1 \n" | |
3259 "paddusb %%xmm1,%%xmm0 \n" | |
3260 "movd %%xmm0," MEMACCESS(2) " \n" | |
3261 "lea " MEMLEA(0x4,2) ",%2 \n" | |
3262 "sub $0x1,%3 \n" | |
3263 "jge 10b \n" | |
3264 | |
3265 "19: \n" | |
3266 "add $1-4,%3 \n" | |
3267 "jl 49f \n" | |
3268 | |
3269 // 4 pixel loop. | |
3270 LABELALIGN | |
3271 "40: \n" | |
3272 "movdqu " MEMACCESS(0) ",%%xmm3 \n" | |
3273 "lea " MEMLEA(0x10,0) ",%0 \n" | |
3274 "movdqa %%xmm3,%%xmm0 \n" | |
3275 "pxor %%xmm4,%%xmm3 \n" | |
3276 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | |
3277 "pshufb %4,%%xmm3 \n" | |
3278 "pand %%xmm6,%%xmm2 \n" | |
3279 "paddw %%xmm7,%%xmm3 \n" | |
3280 "pmullw %%xmm3,%%xmm2 \n" | |
3281 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | |
3282 "lea " MEMLEA(0x10,1) ",%1 \n" | |
3283 "psrlw $0x8,%%xmm1 \n" | |
3284 "por %%xmm4,%%xmm0 \n" | |
3285 "pmullw %%xmm3,%%xmm1 \n" | |
3286 "psrlw $0x8,%%xmm2 \n" | |
3287 "paddusb %%xmm2,%%xmm0 \n" | |
3288 "pand %%xmm5,%%xmm1 \n" | |
3289 "paddusb %%xmm1,%%xmm0 \n" | |
3290 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
3291 "lea " MEMLEA(0x10,2) ",%2 \n" | |
3292 "sub $0x4,%3 \n" | |
3293 "jge 40b \n" | |
3294 | |
3295 "49: \n" | |
3296 "add $0x3,%3 \n" | |
3297 "jl 99f \n" | |
3298 | |
3299 // 1 pixel loop. | |
3300 "91: \n" | |
3301 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
3302 "lea " MEMLEA(0x4,0) ",%0 \n" | |
3303 "movdqa %%xmm3,%%xmm0 \n" | |
3304 "pxor %%xmm4,%%xmm3 \n" | |
3305 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
3306 "pshufb %4,%%xmm3 \n" | |
3307 "pand %%xmm6,%%xmm2 \n" | |
3308 "paddw %%xmm7,%%xmm3 \n" | |
3309 "pmullw %%xmm3,%%xmm2 \n" | |
3310 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
3311 "lea " MEMLEA(0x4,1) ",%1 \n" | |
3312 "psrlw $0x8,%%xmm1 \n" | |
3313 "por %%xmm4,%%xmm0 \n" | |
3314 "pmullw %%xmm3,%%xmm1 \n" | |
3315 "psrlw $0x8,%%xmm2 \n" | |
3316 "paddusb %%xmm2,%%xmm0 \n" | |
3317 "pand %%xmm5,%%xmm1 \n" | |
3318 "paddusb %%xmm1,%%xmm0 \n" | |
3319 "movd %%xmm0," MEMACCESS(2) " \n" | |
3320 "lea " MEMLEA(0x4,2) ",%2 \n" | |
3321 "sub $0x1,%3 \n" | |
3322 "jge 91b \n" | |
3323 "99: \n" | |
3324 : "+r"(src_argb0), // %0 | |
3325 "+r"(src_argb1), // %1 | |
3326 "+r"(dst_argb), // %2 | |
3327 "+r"(width) // %3 | |
3328 : "m"(kShuffleAlpha) // %4 | |
3329 : "memory", "cc" | |
3330 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
3331 ); | |
3332 } | |
3333 #endif // HAS_ARGBBLENDROW_SSSE3 | |
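
// A minimal scalar sketch of what the two blend rows above compute, derived
// from the assembly; the helper name is illustrative and not part of libyuv.
// Per channel: dst = src0 + ((src1 * (256 - src0_alpha)) >> 8), saturated,
// with the destination alpha forced opaque, i.e. an "over" blend of a
// premultiplied src0 onto src1.
static void ARGBBlendRow_Sketch(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 na = 256 - src_argb0[3];  // (255 - alpha) + 1, as the paddw of 1 does.
    int i;
    for (i = 0; i < 3; ++i) {  // B, G, R
      uint32 c = src_argb0[i] + ((src_argb1[i] * na) >> 8);
      dst_argb[i] = (uint8)(c > 255 ? 255 : c);  // paddusb saturates.
    }
    dst_argb[3] = 255;  // por with the alpha mask makes the result opaque.
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}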
3334 | |
3335 #ifdef HAS_ARGBATTENUATEROW_SSE2 | |
3336 // Attenuate 4 pixels at a time. | |
3337 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { | |
3338 asm volatile ( | |
3339 "pcmpeqb %%xmm4,%%xmm4 \n" | |
3340 "pslld $0x18,%%xmm4 \n" | |
3341 "pcmpeqb %%xmm5,%%xmm5 \n" | |
3342 "psrld $0x8,%%xmm5 \n" | |
3343 | |
3344 // 4 pixel loop. | |
3345 LABELALIGN | |
3346 "1: \n" | |
3347 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
3348 "punpcklbw %%xmm0,%%xmm0 \n" | |
3349 "pshufhw $0xff,%%xmm0,%%xmm2 \n" | |
3350 "pshuflw $0xff,%%xmm2,%%xmm2 \n" | |
3351 "pmulhuw %%xmm2,%%xmm0 \n" | |
3352 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
3353 "punpckhbw %%xmm1,%%xmm1 \n" | |
3354 "pshufhw $0xff,%%xmm1,%%xmm2 \n" | |
3355 "pshuflw $0xff,%%xmm2,%%xmm2 \n" | |
3356 "pmulhuw %%xmm2,%%xmm1 \n" | |
3357 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
3358 "lea " MEMLEA(0x10,0) ",%0 \n" | |
3359 "psrlw $0x8,%%xmm0 \n" | |
3360 "pand %%xmm4,%%xmm2 \n" | |
3361 "psrlw $0x8,%%xmm1 \n" | |
3362 "packuswb %%xmm1,%%xmm0 \n" | |
3363 "pand %%xmm5,%%xmm0 \n" | |
3364 "por %%xmm2,%%xmm0 \n" | |
3365 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
3366 "lea " MEMLEA(0x10,1) ",%1 \n" | |
3367 "sub $0x4,%2 \n" | |
3368 "jg 1b \n" | |
3369 : "+r"(src_argb), // %0 | |
3370 "+r"(dst_argb), // %1 | |
3371 "+r"(width) // %2 | |
3372 : | |
3373 : "memory", "cc" | |
3374 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
3375 ); | |
3376 } | |
3377 #endif // HAS_ARGBATTENUATEROW_SSE2 | |
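
// A scalar sketch of attenuation (alpha premultiplication) as performed by
// the ARGBAttenuateRow variants in this file; the helper name is illustrative.
// Each color channel becomes roughly c * a / 255; the SIMD rows use 16-bit
// fixed point and truncate, so individual results may differ by about 1.
static void ARGBAttenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 a = src_argb[3];
    dst_argb[0] = (uint8)(src_argb[0] * a / 255);  // B
    dst_argb[1] = (uint8)(src_argb[1] * a / 255);  // G
    dst_argb[2] = (uint8)(src_argb[2] * a / 255);  // R
    dst_argb[3] = (uint8)a;                        // alpha is kept as-is.
    src_argb += 4;
    dst_argb += 4;
  }
}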
3378 | |
3379 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | |
3380 // Shuffle table duplicating alpha | |
3381 static uvec8 kShuffleAlpha0 = { | |
3382 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u | |
3383 }; | |
3384 static uvec8 kShuffleAlpha1 = { | |
3385 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | |
3386 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u | |
3387 }; | |
3388 // Attenuate 4 pixels at a time. | |
3389 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | |
3390 asm volatile ( | |
3391 "pcmpeqb %%xmm3,%%xmm3 \n" | |
3392 "pslld $0x18,%%xmm3 \n" | |
3393 "movdqa %3,%%xmm4 \n" | |
3394 "movdqa %4,%%xmm5 \n" | |
3395 | |
3396 // 4 pixel loop. | |
3397 LABELALIGN | |
3398 "1: \n" | |
3399 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
3400 "pshufb %%xmm4,%%xmm0 \n" | |
3401 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
3402 "punpcklbw %%xmm1,%%xmm1 \n" | |
3403 "pmulhuw %%xmm1,%%xmm0 \n" | |
3404 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
3405 "pshufb %%xmm5,%%xmm1 \n" | |
3406 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
3407 "punpckhbw %%xmm2,%%xmm2 \n" | |
3408 "pmulhuw %%xmm2,%%xmm1 \n" | |
3409 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
3410 "lea " MEMLEA(0x10,0) ",%0 \n" | |
3411 "pand %%xmm3,%%xmm2 \n" | |
3412 "psrlw $0x8,%%xmm0 \n" | |
3413 "psrlw $0x8,%%xmm1 \n" | |
3414 "packuswb %%xmm1,%%xmm0 \n" | |
3415 "por %%xmm2,%%xmm0 \n" | |
3416 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
3417 "lea " MEMLEA(0x10,1) ",%1 \n" | |
3418 "sub $0x4,%2 \n" | |
3419 "jg 1b \n" | |
3420 : "+r"(src_argb), // %0 | |
3421 "+r"(dst_argb), // %1 | |
3422 "+r"(width) // %2 | |
3423 : "m"(kShuffleAlpha0), // %3 | |
3424 "m"(kShuffleAlpha1) // %4 | |
3425 : "memory", "cc" | |
3426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
3427 ); | |
3428 } | |
3429 #endif // HAS_ARGBATTENUATEROW_SSSE3 | |
3430 | |
3431 #ifdef HAS_ARGBATTENUATEROW_AVX2 | |
3432 // Shuffle table duplicating alpha. | |
3433 static const uvec8 kShuffleAlpha_AVX2 = { | |
3434 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u | |
3435 }; | |
3436 // Attenuate 8 pixels at a time. | |
3437 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { | |
3438 asm volatile ( | |
3439 "vbroadcastf128 %3,%%ymm4 \n" | |
3440 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
3441 "vpslld $0x18,%%ymm5,%%ymm5 \n" | |
3442 "sub %0,%1 \n" | |
3443 | |
3444 // 8 pixel loop. | |
3445 LABELALIGN | |
3446 "1: \n" | |
3447 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" | |
3448 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" | |
3449 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" | |
3450 "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" | |
3451 "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" | |
3452 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" | |
3453 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" | |
3454 "vpand %%ymm5,%%ymm6,%%ymm6 \n" | |
3455 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | |
3456 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" | |
3457 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
3458 "vpor %%ymm6,%%ymm0,%%ymm0 \n" | |
3459 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) | |
3460 "lea " MEMLEA(0x20,0) ",%0 \n" | |
3461 "sub $0x8,%2 \n" | |
3462 "jg 1b \n" | |
3463 "vzeroupper \n" | |
3464 : "+r"(src_argb), // %0 | |
3465 "+r"(dst_argb), // %1 | |
3466 "+r"(width) // %2 | |
3467 : "m"(kShuffleAlpha_AVX2) // %3 | |
3468 : "memory", "cc" | |
3469 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | |
3470 ); | |
3471 } | |
3472 #endif // HAS_ARGBATTENUATEROW_AVX2 | |
3473 | |
3474 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 | |
3475 // Unattenuate 4 pixels at a time. | |
3476 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, | |
3477 int width) { | |
3478 uintptr_t alpha = 0; | |
3479 asm volatile ( | |
3480 // 4 pixel loop. | |
3481 LABELALIGN | |
3482 "1: \n" | |
3483 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
3484 "movzb " MEMACCESS2(0x03,0) ",%3 \n" | |
3485 "punpcklbw %%xmm0,%%xmm0 \n" | |
3486 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 | |
3487 "movzb " MEMACCESS2(0x07,0) ",%3 \n" | |
3488 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 | |
3489 "pshuflw $0x40,%%xmm2,%%xmm2 \n" | |
3490 "pshuflw $0x40,%%xmm3,%%xmm3 \n" | |
3491 "movlhps %%xmm3,%%xmm2 \n" | |
3492 "pmulhuw %%xmm2,%%xmm0 \n" | |
3493 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
3494 "movzb " MEMACCESS2(0x0b,0) ",%3 \n" | |
3495 "punpckhbw %%xmm1,%%xmm1 \n" | |
3496 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 | |
3497 "movzb " MEMACCESS2(0x0f,0) ",%3 \n" | |
3498 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 | |
3499 "pshuflw $0x40,%%xmm2,%%xmm2 \n" | |
3500 "pshuflw $0x40,%%xmm3,%%xmm3 \n" | |
3501 "movlhps %%xmm3,%%xmm2 \n" | |
3502 "pmulhuw %%xmm2,%%xmm1 \n" | |
3503 "lea " MEMLEA(0x10,0) ",%0 \n" | |
3504 "packuswb %%xmm1,%%xmm0 \n" | |
3505 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
3506 "lea " MEMLEA(0x10,1) ",%1 \n" | |
3507 "sub $0x4,%2 \n" | |
3508 "jg 1b \n" | |
3509 : "+r"(src_argb), // %0 | |
3510 "+r"(dst_argb), // %1 | |
3511 "+r"(width), // %2 | |
3512 "+r"(alpha) // %3 | |
3513 : "r"(fixed_invtbl8) // %4 | |
3514 : "memory", "cc", NACL_R14 | |
3515 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
3516 ); | |
3517 } | |
3518 #endif // HAS_ARGBUNATTENUATEROW_SSE2 | |
3519 | |
3520 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 | |
3521 // Shuffle table duplicating alpha. | |
3522 static const uvec8 kUnattenShuffleAlpha_AVX2 = { | |
3523 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u | |
3524 }; | |
3525 // Unattenuate 8 pixels at a time. | |
3526 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | |
3527 int width) { | |
3528 uintptr_t alpha = 0; | |
3529 asm volatile ( | |
3530 "sub %0,%1 \n" | |
3531 "vbroadcastf128 %5,%%ymm5 \n" | |
3532 | |
3533 // 8 pixel loop. | |
3534 LABELALIGN | |
3535 "1: \n" | |
3536 // replace VPGATHER | |
3537 "movzb " MEMACCESS2(0x03,0) ",%3 \n" | |
3538 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 | |
3539 "movzb " MEMACCESS2(0x07,0) ",%3 \n" | |
3540 MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 | |
3541 "movzb " MEMACCESS2(0x0b,0) ",%3 \n" | |
3542 "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" | |
3543 MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 | |
3544 "movzb " MEMACCESS2(0x0f,0) ",%3 \n" | |
3545 MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 | |
3546 "movzb " MEMACCESS2(0x13,0) ",%3 \n" | |
3547 "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" | |
3548 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 | |
3549 "movzb " MEMACCESS2(0x17,0) ",%3 \n" | |
3550 MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 | |
3551 "movzb " MEMACCESS2(0x1b,0) ",%3 \n" | |
3552 "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" | |
3553 MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 | |
3554 "movzb " MEMACCESS2(0x1f,0) ",%3 \n" | |
3555 MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 | |
3556 "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" | |
3557 "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" | |
3558 "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" | |
3559 "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" | |
3560 // end of VPGATHER | |
3561 | |
3562 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" | |
3563 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" | |
3564 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" | |
3565 "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" | |
3566 "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" | |
3567 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" | |
3568 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" | |
3569 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" | |
3570 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" | |
3571 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
3572 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) | |
3573 "lea " MEMLEA(0x20,0) ",%0 \n" | |
3574 "sub $0x8,%2 \n" | |
3575 "jg 1b \n" | |
3576 "vzeroupper \n" | |
3577 : "+r"(src_argb), // %0 | |
3578 "+r"(dst_argb), // %1 | |
3579 "+r"(width), // %2 | |
3580 "+r"(alpha) // %3 | |
3581 : "r"(fixed_invtbl8), // %4 | |
3582 "m"(kUnattenShuffleAlpha_AVX2) // %5 | |
3583 : "memory", "cc", NACL_R14 | |
3584 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
3585 ); | |
3586 } | |
3587 #endif // HAS_ARGBUNATTENUATEROW_AVX2 | |
3588 | |
3589 #ifdef HAS_ARGBGRAYROW_SSSE3 | |
3590 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels. | |
3591 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | |
3592 asm volatile ( | |
3593 "movdqa %3,%%xmm4 \n" | |
3594 "movdqa %4,%%xmm5 \n" | |
3595 | |
3596 // 8 pixel loop. | |
3597 LABELALIGN | |
3598 "1: \n" | |
3599 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
3600 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
3601 "pmaddubsw %%xmm4,%%xmm0 \n" | |
3602 "pmaddubsw %%xmm4,%%xmm1 \n" | |
3603 "phaddw %%xmm1,%%xmm0 \n" | |
3604 "paddw %%xmm5,%%xmm0 \n" | |
3605 "psrlw $0x7,%%xmm0 \n" | |
3606 "packuswb %%xmm0,%%xmm0 \n" | |
3607 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
3608 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" | |
3609 "lea " MEMLEA(0x20,0) ",%0 \n" | |
3610 "psrld $0x18,%%xmm2 \n" | |
3611 "psrld $0x18,%%xmm3 \n" | |
3612 "packuswb %%xmm3,%%xmm2 \n" | |
3613 "packuswb %%xmm2,%%xmm2 \n" | |
3614 "movdqa %%xmm0,%%xmm3 \n" | |
3615 "punpcklbw %%xmm0,%%xmm0 \n" | |
3616 "punpcklbw %%xmm2,%%xmm3 \n" | |
3617 "movdqa %%xmm0,%%xmm1 \n" | |
3618 "punpcklwd %%xmm3,%%xmm0 \n" | |
3619 "punpckhwd %%xmm3,%%xmm1 \n" | |
3620 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
3621 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | |
3622 "lea " MEMLEA(0x20,1) ",%1 \n" | |
3623 "sub $0x8,%2 \n" | |
3624 "jg 1b \n" | |
3625 : "+r"(src_argb), // %0 | |
3626 "+r"(dst_argb), // %1 | |
3627 "+r"(width) // %2 | |
3628 : "m"(kARGBToYJ), // %3 | |
3629 "m"(kAddYJ64) // %4 | |
3630 : "memory", "cc" | |
3631 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
3632 ); | |
3633 } | |
3634 #endif // HAS_ARGBGRAYROW_SSSE3 | |
3635 | |
3636 #ifdef HAS_ARGBSEPIAROW_SSSE3 | |
3637 // b = (r * 35 + g * 68 + b * 17) >> 7 | |
3638 // g = (r * 45 + g * 88 + b * 22) >> 7 | |
3639 // r = (r * 50 + g * 98 + b * 24) >> 7 | |
3640 // Constant for ARGB color to sepia tone | |
3641 static vec8 kARGBToSepiaB = { | |
3642 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 | |
3643 }; | |
3644 | |
3645 static vec8 kARGBToSepiaG = { | |
3646 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 | |
3647 }; | |
3648 | |
3649 static vec8 kARGBToSepiaR = { | |
3650 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 | |
3651 }; | |
3652 | |
3653 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. | |
3654 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { | |
3655 asm volatile ( | |
3656 "movdqa %2,%%xmm2 \n" | |
3657 "movdqa %3,%%xmm3 \n" | |
3658 "movdqa %4,%%xmm4 \n" | |
3659 | |
3660 // 8 pixel loop. | |
3661 LABELALIGN | |
3662 "1: \n" | |
3663 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
3664 "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" | |
3665 "pmaddubsw %%xmm2,%%xmm0 \n" | |
3666 "pmaddubsw %%xmm2,%%xmm6 \n" | |
3667 "phaddw %%xmm6,%%xmm0 \n" | |
3668 "psrlw $0x7,%%xmm0 \n" | |
3669 "packuswb %%xmm0,%%xmm0 \n" | |
3670 "movdqu " MEMACCESS(0) ",%%xmm5 \n" | |
3671 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
3672 "pmaddubsw %%xmm3,%%xmm5 \n" | |
3673 "pmaddubsw %%xmm3,%%xmm1 \n" | |
3674 "phaddw %%xmm1,%%xmm5 \n" | |
3675 "psrlw $0x7,%%xmm5 \n" | |
3676 "packuswb %%xmm5,%%xmm5 \n" | |
3677 "punpcklbw %%xmm5,%%xmm0 \n" | |
3678 "movdqu " MEMACCESS(0) ",%%xmm5 \n" | |
3679 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
3680 "pmaddubsw %%xmm4,%%xmm5 \n" | |
3681 "pmaddubsw %%xmm4,%%xmm1 \n" | |
3682 "phaddw %%xmm1,%%xmm5 \n" | |
3683 "psrlw $0x7,%%xmm5 \n" | |
3684 "packuswb %%xmm5,%%xmm5 \n" | |
3685 "movdqu " MEMACCESS(0) ",%%xmm6 \n" | |
3686 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
3687 "psrld $0x18,%%xmm6 \n" | |
3688 "psrld $0x18,%%xmm1 \n" | |
3689 "packuswb %%xmm1,%%xmm6 \n" | |
3690 "packuswb %%xmm6,%%xmm6 \n" | |
3691 "punpcklbw %%xmm6,%%xmm5 \n" | |
3692 "movdqa %%xmm0,%%xmm1 \n" | |
3693 "punpcklwd %%xmm5,%%xmm0 \n" | |
3694 "punpckhwd %%xmm5,%%xmm1 \n" | |
3695 "movdqu %%xmm0," MEMACCESS(0) " \n" | |
3696 "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" | |
3697 "lea " MEMLEA(0x20,0) ",%0 \n" | |
3698 "sub $0x8,%1 \n" | |
3699 "jg 1b \n" | |
3700 : "+r"(dst_argb), // %0 | |
3701 "+r"(width) // %1 | |
3702 : "m"(kARGBToSepiaB), // %2 | |
3703 "m"(kARGBToSepiaG), // %3 | |
3704 "m"(kARGBToSepiaR) // %4 | |
3705 : "memory", "cc" | |
3706 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | |
3707 ); | |
3708 } | |
3709 #endif // HAS_ARGBSEPIAROW_SSSE3 | |
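
// A scalar sketch of the sepia mapping encoded by the constants above; the
// helper name is illustrative. pmaddubsw applies the B,G,R weights, phaddw
// completes the dot product, and the >> 7 plus packuswb give a saturated
// 8-bit result. Alpha is read back from the source and preserved.
static void ARGBSepiaRow_Sketch(uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int b = dst_argb[0];
    int g = dst_argb[1];
    int r = dst_argb[2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (uint8)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8)(sr > 255 ? 255 : sr);
    dst_argb += 4;  // dst_argb[3] (alpha) is untouched.
  }
}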
3710 | |
3711 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 | |
3712 // Transform 8 ARGB pixels (32 bytes) with color matrix. | |
3713 // Same as Sepia except matrix is provided. | |
3714 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | |
3715 const int8* matrix_argb, int width) { | |
3716 asm volatile ( | |
3717 "movdqu " MEMACCESS(3) ",%%xmm5 \n" | |
3718 "pshufd $0x00,%%xmm5,%%xmm2 \n" | |
3719 "pshufd $0x55,%%xmm5,%%xmm3 \n" | |
3720 "pshufd $0xaa,%%xmm5,%%xmm4 \n" | |
3721 "pshufd $0xff,%%xmm5,%%xmm5 \n" | |
3722 | |
3723 // 8 pixel loop. | |
3724 LABELALIGN | |
3725 "1: \n" | |
3726 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
3727 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" | |
3728 "pmaddubsw %%xmm2,%%xmm0 \n" | |
3729 "pmaddubsw %%xmm2,%%xmm7 \n" | |
3730 "movdqu " MEMACCESS(0) ",%%xmm6 \n" | |
3731 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
3732 "pmaddubsw %%xmm3,%%xmm6 \n" | |
3733 "pmaddubsw %%xmm3,%%xmm1 \n" | |
3734 "phaddsw %%xmm7,%%xmm0 \n" | |
3735 "phaddsw %%xmm1,%%xmm6 \n" | |
3736 "psraw $0x6,%%xmm0 \n" | |
3737 "psraw $0x6,%%xmm6 \n" | |
3738 "packuswb %%xmm0,%%xmm0 \n" | |
3739 "packuswb %%xmm6,%%xmm6 \n" | |
3740 "punpcklbw %%xmm6,%%xmm0 \n" | |
3741 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
3742 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" | |
3743 "pmaddubsw %%xmm4,%%xmm1 \n" | |
3744 "pmaddubsw %%xmm4,%%xmm7 \n" | |
3745 "phaddsw %%xmm7,%%xmm1 \n" | |
3746 "movdqu " MEMACCESS(0) ",%%xmm6 \n" | |
3747 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" | |
3748 "pmaddubsw %%xmm5,%%xmm6 \n" | |
3749 "pmaddubsw %%xmm5,%%xmm7 \n" | |
3750 "phaddsw %%xmm7,%%xmm6 \n" | |
3751 "psraw $0x6,%%xmm1 \n" | |
3752 "psraw $0x6,%%xmm6 \n" | |
3753 "packuswb %%xmm1,%%xmm1 \n" | |
3754 "packuswb %%xmm6,%%xmm6 \n" | |
3755 "punpcklbw %%xmm6,%%xmm1 \n" | |
3756 "movdqa %%xmm0,%%xmm6 \n" | |
3757 "punpcklwd %%xmm1,%%xmm0 \n" | |
3758 "punpckhwd %%xmm1,%%xmm6 \n" | |
3759 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
3760 "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" | |
3761 "lea " MEMLEA(0x20,0) ",%0 \n" | |
3762 "lea " MEMLEA(0x20,1) ",%1 \n" | |
3763 "sub $0x8,%2 \n" | |
3764 "jg 1b \n" | |
3765 : "+r"(src_argb), // %0 | |
3766 "+r"(dst_argb), // %1 | |
3767 "+r"(width) // %2 | |
3768 : "r"(matrix_argb) // %3 | |
3769 : "memory", "cc" | |
3770 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
3771 ); | |
3772 } | |
3773 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 | |
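
// A scalar sketch of the color-matrix transform above (helper name is
// illustrative). matrix_argb holds four rows of four signed coefficients,
// one row per output channel; the >> 6 treats them as 2.6 fixed point. Each
// output is the B,G,R,A dot product, shifted and clamped to 0..255.
static void ARGBColorMatrixRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      const int8* matrix_argb, int width) {
  int x, i;
  for (x = 0; x < width; ++x) {
    for (i = 0; i < 4; ++i) {
      const int8* m = matrix_argb + i * 4;
      int v = (src_argb[0] * m[0] + src_argb[1] * m[1] + src_argb[2] * m[2] +
               src_argb[3] * m[3]) >> 6;
      dst_argb[i] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}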
3774 | |
3775 #ifdef HAS_ARGBQUANTIZEROW_SSE2 | |
3776 // Quantize 4 ARGB pixels (16 bytes). | |
3777 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, | |
3778 int interval_offset, int width) { | |
3779 asm volatile ( | |
3780 "movd %2,%%xmm2 \n" | |
3781 "movd %3,%%xmm3 \n" | |
3782 "movd %4,%%xmm4 \n" | |
3783 "pshuflw $0x40,%%xmm2,%%xmm2 \n" | |
3784 "pshufd $0x44,%%xmm2,%%xmm2 \n" | |
3785 "pshuflw $0x40,%%xmm3,%%xmm3 \n" | |
3786 "pshufd $0x44,%%xmm3,%%xmm3 \n" | |
3787 "pshuflw $0x40,%%xmm4,%%xmm4 \n" | |
3788 "pshufd $0x44,%%xmm4,%%xmm4 \n" | |
3789 "pxor %%xmm5,%%xmm5 \n" | |
3790 "pcmpeqb %%xmm6,%%xmm6 \n" | |
3791 "pslld $0x18,%%xmm6 \n" | |
3792 | |
3793 // 4 pixel loop. | |
3794 LABELALIGN | |
3795 "1: \n" | |
3796 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
3797 "punpcklbw %%xmm5,%%xmm0 \n" | |
3798 "pmulhuw %%xmm2,%%xmm0 \n" | |
3799 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
3800 "punpckhbw %%xmm5,%%xmm1 \n" | |
3801 "pmulhuw %%xmm2,%%xmm1 \n" | |
3802 "pmullw %%xmm3,%%xmm0 \n" | |
3803 "movdqu " MEMACCESS(0) ",%%xmm7 \n" | |
3804 "pmullw %%xmm3,%%xmm1 \n" | |
3805 "pand %%xmm6,%%xmm7 \n" | |
3806 "paddw %%xmm4,%%xmm0 \n" | |
3807 "paddw %%xmm4,%%xmm1 \n" | |
3808 "packuswb %%xmm1,%%xmm0 \n" | |
3809 "por %%xmm7,%%xmm0 \n" | |
3810 "movdqu %%xmm0," MEMACCESS(0) " \n" | |
3811 "lea " MEMLEA(0x10,0) ",%0 \n" | |
3812 "sub $0x4,%1 \n" | |
3813 "jg 1b \n" | |
3814 : "+r"(dst_argb), // %0 | |
3815 "+r"(width) // %1 | |
3816 : "r"(scale), // %2 | |
3817 "r"(interval_size), // %3 | |
3818 "r"(interval_offset) // %4 | |
3819 : "memory", "cc" | |
3820 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
3821 ); | |
3822 } | |
3823 #endif // HAS_ARGBQUANTIZEROW_SSE2 | |
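
// A scalar sketch of the quantization above (helper name is illustrative).
// Each color channel is posterized as
//   dst = ((c * scale) >> 16) * interval_size + interval_offset
// while the alpha channel is passed through unchanged.
static void ARGBQuantizeRow_Sketch(uint8* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  int x, i;
  for (x = 0; x < width; ++x) {
    for (i = 0; i < 3; ++i) {  // B, G, R; dst_argb[3] is left untouched.
      int v = ((dst_argb[i] * scale) >> 16) * interval_size + interval_offset;
      dst_argb[i] = (uint8)(v > 255 ? 255 : v);
    }
    dst_argb += 4;
  }
}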
3824 | |
3825 #ifdef HAS_ARGBSHADEROW_SSE2 | |
3826 // Shade 4 pixels at a time by specified value. | |
3827 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, | |
3828 uint32 value) { | |
3829 asm volatile ( | |
3830 "movd %3,%%xmm2 \n" | |
3831 "punpcklbw %%xmm2,%%xmm2 \n" | |
3832 "punpcklqdq %%xmm2,%%xmm2 \n" | |
3833 | |
3834 // 4 pixel loop. | |
3835 LABELALIGN | |
3836 "1: \n" | |
3837 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
3838 "lea " MEMLEA(0x10,0) ",%0 \n" | |
3839 "movdqa %%xmm0,%%xmm1 \n" | |
3840 "punpcklbw %%xmm0,%%xmm0 \n" | |
3841 "punpckhbw %%xmm1,%%xmm1 \n" | |
3842 "pmulhuw %%xmm2,%%xmm0 \n" | |
3843 "pmulhuw %%xmm2,%%xmm1 \n" | |
3844 "psrlw $0x8,%%xmm0 \n" | |
3845 "psrlw $0x8,%%xmm1 \n" | |
3846 "packuswb %%xmm1,%%xmm0 \n" | |
3847 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
3848 "lea " MEMLEA(0x10,1) ",%1 \n" | |
3849 "sub $0x4,%2 \n" | |
3850 "jg 1b \n" | |
3851 : "+r"(src_argb), // %0 | |
3852 "+r"(dst_argb), // %1 | |
3853 "+r"(width) // %2 | |
3854 : "r"(value) // %3 | |
3855 : "memory", "cc" | |
3856 , "xmm0", "xmm1", "xmm2" | |
3857 ); | |
3858 } | |
3859 #endif // HAS_ARGBSHADEROW_SSE2 | |
3860 | |
3861 #ifdef HAS_ARGBMULTIPLYROW_SSE2 | |
3862 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. | |
3863 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | |
3864 uint8* dst_argb, int width) { | |
3865 asm volatile ( | |
3866 "pxor %%xmm5,%%xmm5 \n" | |
3867 | |
3868 // 4 pixel loop. | |
3869 LABELALIGN | |
3870 "1: \n" | |
3871 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
3872 "lea " MEMLEA(0x10,0) ",%0 \n" | |
3873 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | |
3874 "lea " MEMLEA(0x10,1) ",%1 \n" | |
3875 "movdqu %%xmm0,%%xmm1 \n" | |
3876 "movdqu %%xmm2,%%xmm3 \n" | |
3877 "punpcklbw %%xmm0,%%xmm0 \n" | |
3878 "punpckhbw %%xmm1,%%xmm1 \n" | |
3879 "punpcklbw %%xmm5,%%xmm2 \n" | |
3880 "punpckhbw %%xmm5,%%xmm3 \n" | |
3881 "pmulhuw %%xmm2,%%xmm0 \n" | |
3882 "pmulhuw %%xmm3,%%xmm1 \n" | |
3883 "packuswb %%xmm1,%%xmm0 \n" | |
3884 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
3885 "lea " MEMLEA(0x10,2) ",%2 \n" | |
3886 "sub $0x4,%3 \n" | |
3887 "jg 1b \n" | |
3888 : "+r"(src_argb0), // %0 | |
3889 "+r"(src_argb1), // %1 | |
3890 "+r"(dst_argb), // %2 | |
3891 "+r"(width) // %3 | |
3892 : | |
3893 : "memory", "cc" | |
3894 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
3895 ); | |
3896 } | |
3897 #endif // HAS_ARGBMULTIPLYROW_SSE2 | |
3898 | |
3899 #ifdef HAS_ARGBMULTIPLYROW_AVX2 | |
3900 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | |
3901 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | |
3902 uint8* dst_argb, int width) { | |
3903 asm volatile ( | |
3904 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" | |
3905 | |
3906   // 8 pixel loop. | |
3907 LABELALIGN | |
3908 "1: \n" | |
3909 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" | |
3910 "lea " MEMLEA(0x20,0) ",%0 \n" | |
3911 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" | |
3912 "lea " MEMLEA(0x20,1) ",%1 \n" | |
3913 "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" | |
3914 "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" | |
3915 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" | |
3916 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" | |
3917 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" | |
3918 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" | |
3919 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
3920 "vmovdqu %%ymm0," MEMACCESS(2) " \n" | |
3921 "lea " MEMLEA(0x20,2) ",%2 \n" | |
3922 "sub $0x8,%3 \n" | |
3923 "jg 1b \n" | |
3924 "vzeroupper \n" | |
3925 : "+r"(src_argb0), // %0 | |
3926 "+r"(src_argb1), // %1 | |
3927 "+r"(dst_argb), // %2 | |
3928 "+r"(width) // %3 | |
3929 : | |
3930 : "memory", "cc" | |
3931 #if defined(__AVX2__) | |
3932 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
3933 #endif | |
3934 ); | |
3935 } | |
3936 #endif // HAS_ARGBMULTIPLYROW_AVX2 | |
3937 | |
3938 #ifdef HAS_ARGBADDROW_SSE2 | |
3939 // Add 2 rows of ARGB pixels together, 4 pixels at a time. | |
3940 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | |
3941 uint8* dst_argb, int width) { | |
3942 asm volatile ( | |
3943 // 4 pixel loop. | |
3944 LABELALIGN | |
3945 "1: \n" | |
3946 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
3947 "lea " MEMLEA(0x10,0) ",%0 \n" | |
3948 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | |
3949 "lea " MEMLEA(0x10,1) ",%1 \n" | |
3950 "paddusb %%xmm1,%%xmm0 \n" | |
3951 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
3952 "lea " MEMLEA(0x10,2) ",%2 \n" | |
3953 "sub $0x4,%3 \n" | |
3954 "jg 1b \n" | |
3955 : "+r"(src_argb0), // %0 | |
3956 "+r"(src_argb1), // %1 | |
3957 "+r"(dst_argb), // %2 | |
3958 "+r"(width) // %3 | |
3959 : | |
3960 : "memory", "cc" | |
3961 , "xmm0", "xmm1" | |
3962 ); | |
3963 } | |
3964 #endif // HAS_ARGBADDROW_SSE2 | |
3965 | |
3966 #ifdef HAS_ARGBADDROW_AVX2 | |
3967 // Add 2 rows of ARGB pixels together, 8 pixels at a time. | |
3968 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | |
3969 uint8* dst_argb, int width) { | |
3970 asm volatile ( | |
3971   // 8 pixel loop. | |
3972 LABELALIGN | |
3973 "1: \n" | |
3974 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
3975 "lea " MEMLEA(0x20,0) ",%0 \n" | |
3976 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" | |
3977 "lea " MEMLEA(0x20,1) ",%1 \n" | |
3978 "vmovdqu %%ymm0," MEMACCESS(2) " \n" | |
3979 "lea " MEMLEA(0x20,2) ",%2 \n" | |
3980 "sub $0x8,%3 \n" | |
3981 "jg 1b \n" | |
3982 "vzeroupper \n" | |
3983 : "+r"(src_argb0), // %0 | |
3984 "+r"(src_argb1), // %1 | |
3985 "+r"(dst_argb), // %2 | |
3986 "+r"(width) // %3 | |
3987 : | |
3988 : "memory", "cc" | |
3989 , "xmm0" | |
3990 ); | |
3991 } | |
3992 #endif // HAS_ARGBADDROW_AVX2 | |
3993 | |
3994 #ifdef HAS_ARGBSUBTRACTROW_SSE2 | |
3995 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. | |
3996 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | |
3997 uint8* dst_argb, int width) { | |
3998 asm volatile ( | |
3999 // 4 pixel loop. | |
4000 LABELALIGN | |
4001 "1: \n" | |
4002 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
4003 "lea " MEMLEA(0x10,0) ",%0 \n" | |
4004 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | |
4005 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4006 "psubusb %%xmm1,%%xmm0 \n" | |
4007 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
4008 "lea " MEMLEA(0x10,2) ",%2 \n" | |
4009 "sub $0x4,%3 \n" | |
4010 "jg 1b \n" | |
4011 : "+r"(src_argb0), // %0 | |
4012 "+r"(src_argb1), // %1 | |
4013 "+r"(dst_argb), // %2 | |
4014 "+r"(width) // %3 | |
4015 : | |
4016 : "memory", "cc" | |
4017 , "xmm0", "xmm1" | |
4018 ); | |
4019 } | |
4020 #endif // HAS_ARGBSUBTRACTROW_SSE2 | |
4021 | |
4022 #ifdef HAS_ARGBSUBTRACTROW_AVX2 | |
4023 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. | |
4024 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | |
4025 uint8* dst_argb, int width) { | |
4026 asm volatile ( | |
4027   // 8 pixel loop. | |
4028 LABELALIGN | |
4029 "1: \n" | |
4030 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | |
4031 "lea " MEMLEA(0x20,0) ",%0 \n" | |
4032 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" | |
4033 "lea " MEMLEA(0x20,1) ",%1 \n" | |
4034 "vmovdqu %%ymm0," MEMACCESS(2) " \n" | |
4035 "lea " MEMLEA(0x20,2) ",%2 \n" | |
4036 "sub $0x8,%3 \n" | |
4037 "jg 1b \n" | |
4038 "vzeroupper \n" | |
4039 : "+r"(src_argb0), // %0 | |
4040 "+r"(src_argb1), // %1 | |
4041 "+r"(dst_argb), // %2 | |
4042 "+r"(width) // %3 | |
4043 : | |
4044 : "memory", "cc" | |
4045 , "xmm0" | |
4046 ); | |
4047 } | |
4048 #endif // HAS_ARGBSUBTRACTROW_AVX2 | |
4049 | |
4050 #ifdef HAS_SOBELXROW_SSE2 | |
4051 // SobelX as a matrix is | |
4052 // -1 0 1 | |
4053 // -2 0 2 | |
4054 // -1 0 1 | |
4055 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, | |
4056 const uint8* src_y2, uint8* dst_sobelx, int width) { | |
4057 asm volatile ( | |
4058 "sub %0,%1 \n" | |
4059 "sub %0,%2 \n" | |
4060 "sub %0,%3 \n" | |
4061 "pxor %%xmm5,%%xmm5 \n" | |
4062 | |
4063 // 8 pixel loop. | |
4064 LABELALIGN | |
4065 "1: \n" | |
4066 "movq " MEMACCESS(0) ",%%xmm0 \n" | |
4067 "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" | |
4068 "punpcklbw %%xmm5,%%xmm0 \n" | |
4069 "punpcklbw %%xmm5,%%xmm1 \n" | |
4070 "psubw %%xmm1,%%xmm0 \n" | |
4071 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 | |
4072 MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 | |
4073 "punpcklbw %%xmm5,%%xmm1 \n" | |
4074 "punpcklbw %%xmm5,%%xmm2 \n" | |
4075 "psubw %%xmm2,%%xmm1 \n" | |
4076 MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 | |
4077 MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 | |
4078 "punpcklbw %%xmm5,%%xmm2 \n" | |
4079 "punpcklbw %%xmm5,%%xmm3 \n" | |
4080 "psubw %%xmm3,%%xmm2 \n" | |
4081 "paddw %%xmm2,%%xmm0 \n" | |
4082 "paddw %%xmm1,%%xmm0 \n" | |
4083 "paddw %%xmm1,%%xmm0 \n" | |
4084 "pxor %%xmm1,%%xmm1 \n" | |
4085 "psubw %%xmm0,%%xmm1 \n" | |
4086 "pmaxsw %%xmm1,%%xmm0 \n" | |
4087 "packuswb %%xmm0,%%xmm0 \n" | |
4088 MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) | |
4089 "lea " MEMLEA(0x8,0) ",%0 \n" | |
4090 "sub $0x8,%4 \n" | |
4091 "jg 1b \n" | |
4092 : "+r"(src_y0), // %0 | |
4093 "+r"(src_y1), // %1 | |
4094 "+r"(src_y2), // %2 | |
4095 "+r"(dst_sobelx), // %3 | |
4096 "+r"(width) // %4 | |
4097 : | |
4098 : "memory", "cc", NACL_R14 | |
4099 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
4100 ); | |
4101 } | |
4102 #endif // HAS_SOBELXROW_SSE2 | |
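
// A scalar sketch of the SobelX stencil above (helper name is illustrative,
// not libyuv API). src_y0/src_y1/src_y2 are three consecutive rows and the
// output is the saturated absolute value of the filter response.
static void SobelXRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             const uint8* src_y2, uint8* dst_sobelx,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + b * 2 + c;
    if (sobel < 0) sobel = -sobel;
    dst_sobelx[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}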
4103 | |
4104 #ifdef HAS_SOBELYROW_SSE2 | |
4105 // SobelY as a matrix is | |
4106 // -1 -2 -1 | |
4107 // 0 0 0 | |
4108 // 1 2 1 | |
4109 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, | |
4110 uint8* dst_sobely, int width) { | |
4111 asm volatile ( | |
4112 "sub %0,%1 \n" | |
4113 "sub %0,%2 \n" | |
4114 "pxor %%xmm5,%%xmm5 \n" | |
4115 | |
4116 // 8 pixel loop. | |
4117 LABELALIGN | |
4118 "1: \n" | |
4119 "movq " MEMACCESS(0) ",%%xmm0 \n" | |
4120 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 | |
4121 "punpcklbw %%xmm5,%%xmm0 \n" | |
4122 "punpcklbw %%xmm5,%%xmm1 \n" | |
4123 "psubw %%xmm1,%%xmm0 \n" | |
4124 "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" | |
4125 MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 | |
4126 "punpcklbw %%xmm5,%%xmm1 \n" | |
4127 "punpcklbw %%xmm5,%%xmm2 \n" | |
4128 "psubw %%xmm2,%%xmm1 \n" | |
4129 "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" | |
4130 MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 | |
4131 "punpcklbw %%xmm5,%%xmm2 \n" | |
4132 "punpcklbw %%xmm5,%%xmm3 \n" | |
4133 "psubw %%xmm3,%%xmm2 \n" | |
4134 "paddw %%xmm2,%%xmm0 \n" | |
4135 "paddw %%xmm1,%%xmm0 \n" | |
4136 "paddw %%xmm1,%%xmm0 \n" | |
4137 "pxor %%xmm1,%%xmm1 \n" | |
4138 "psubw %%xmm0,%%xmm1 \n" | |
4139 "pmaxsw %%xmm1,%%xmm0 \n" | |
4140 "packuswb %%xmm0,%%xmm0 \n" | |
4141 MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) | |
4142 "lea " MEMLEA(0x8,0) ",%0 \n" | |
4143 "sub $0x8,%3 \n" | |
4144 "jg 1b \n" | |
4145 : "+r"(src_y0), // %0 | |
4146 "+r"(src_y1), // %1 | |
4147 "+r"(dst_sobely), // %2 | |
4148 "+r"(width) // %3 | |
4149 : | |
4150 : "memory", "cc", NACL_R14 | |
4151 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
4152 ); | |
4153 } | |
4154 #endif // HAS_SOBELYROW_SSE2 | |
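
// The matching scalar sketch for the SobelY stencil above (illustrative).
// Only two rows are needed because the middle row of the 3x3 kernel is zero.
static void SobelYRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             uint8* dst_sobely, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i + 0] - src_y1[i + 0];
    int b = src_y0[i + 1] - src_y1[i + 1];
    int c = src_y0[i + 2] - src_y1[i + 2];
    int sobel = a + b * 2 + c;
    if (sobel < 0) sobel = -sobel;
    dst_sobely[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}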
4155 | |
4156 #ifdef HAS_SOBELROW_SSE2 | |
4157 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. | |
4158 // A = 255 | |
4159 // R = Sobel | |
4160 // G = Sobel | |
4161 // B = Sobel | |
4162 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | |
4163 uint8* dst_argb, int width) { | |
4164 asm volatile ( | |
4165 "sub %0,%1 \n" | |
4166 "pcmpeqb %%xmm5,%%xmm5 \n" | |
4167 "pslld $0x18,%%xmm5 \n" | |
4168 | |
4169   // 16 pixel loop. | |
4170 LABELALIGN | |
4171 "1: \n" | |
4172 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
4173 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | |
4174 "lea " MEMLEA(0x10,0) ",%0 \n" | |
4175 "paddusb %%xmm1,%%xmm0 \n" | |
4176 "movdqa %%xmm0,%%xmm2 \n" | |
4177 "punpcklbw %%xmm0,%%xmm2 \n" | |
4178 "punpckhbw %%xmm0,%%xmm0 \n" | |
4179 "movdqa %%xmm2,%%xmm1 \n" | |
4180 "punpcklwd %%xmm2,%%xmm1 \n" | |
4181 "punpckhwd %%xmm2,%%xmm2 \n" | |
4182 "por %%xmm5,%%xmm1 \n" | |
4183 "por %%xmm5,%%xmm2 \n" | |
4184 "movdqa %%xmm0,%%xmm3 \n" | |
4185 "punpcklwd %%xmm0,%%xmm3 \n" | |
4186 "punpckhwd %%xmm0,%%xmm0 \n" | |
4187 "por %%xmm5,%%xmm3 \n" | |
4188 "por %%xmm5,%%xmm0 \n" | |
4189 "movdqu %%xmm1," MEMACCESS(2) " \n" | |
4190 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" | |
4191 "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" | |
4192 "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" | |
4193 "lea " MEMLEA(0x40,2) ",%2 \n" | |
4194 "sub $0x10,%3 \n" | |
4195 "jg 1b \n" | |
4196 : "+r"(src_sobelx), // %0 | |
4197 "+r"(src_sobely), // %1 | |
4198 "+r"(dst_argb), // %2 | |
4199 "+r"(width) // %3 | |
4200 : | |
4201 : "memory", "cc", NACL_R14 | |
4202 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | |
4203 ); | |
4204 } | |
4205 #endif // HAS_SOBELROW_SSE2 | |
4206 | |
4207 #ifdef HAS_SOBELTOPLANEROW_SSE2 | |
4208 // Adds Sobel X and Sobel Y and stores Sobel into a plane. | |
4209 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | |
4210 uint8* dst_y, int width) { | |
4211 asm volatile ( | |
4212 "sub %0,%1 \n" | |
4213 "pcmpeqb %%xmm5,%%xmm5 \n" | |
4214 "pslld $0x18,%%xmm5 \n" | |
4215 | |
4216   // 16 pixel loop. | |
4217 LABELALIGN | |
4218 "1: \n" | |
4219 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
4220 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | |
4221 "lea " MEMLEA(0x10,0) ",%0 \n" | |
4222 "paddusb %%xmm1,%%xmm0 \n" | |
4223 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
4224 "lea " MEMLEA(0x10,2) ",%2 \n" | |
4225 "sub $0x10,%3 \n" | |
4226 "jg 1b \n" | |
4227 : "+r"(src_sobelx), // %0 | |
4228 "+r"(src_sobely), // %1 | |
4229 "+r"(dst_y), // %2 | |
4230 "+r"(width) // %3 | |
4231 : | |
4232 : "memory", "cc", NACL_R14 | |
4233     "xmm0", "xmm1", "xmm5" | |
4234 ); | |
4235 } | |
4236 #endif // HAS_SOBELTOPLANEROW_SSE2 | |
4237 | |
4238 #ifdef HAS_SOBELXYROW_SSE2 | |
4239 // Mixes Sobel X, Sobel Y and Sobel into ARGB. | |
4240 // A = 255 | |
4241 // R = Sobel X | |
4242 // G = Sobel | |
4243 // B = Sobel Y | |
4244 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | |
4245 uint8* dst_argb, int width) { | |
4246 asm volatile ( | |
4247 "sub %0,%1 \n" | |
4248 "pcmpeqb %%xmm5,%%xmm5 \n" | |
4249 | |
4250   // 16 pixel loop. | |
4251 LABELALIGN | |
4252 "1: \n" | |
4253 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
4254 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | |
4255 "lea " MEMLEA(0x10,0) ",%0 \n" | |
4256 "movdqa %%xmm0,%%xmm2 \n" | |
4257 "paddusb %%xmm1,%%xmm2 \n" | |
4258 "movdqa %%xmm0,%%xmm3 \n" | |
4259 "punpcklbw %%xmm5,%%xmm3 \n" | |
4260 "punpckhbw %%xmm5,%%xmm0 \n" | |
4261 "movdqa %%xmm1,%%xmm4 \n" | |
4262 "punpcklbw %%xmm2,%%xmm4 \n" | |
4263 "punpckhbw %%xmm2,%%xmm1 \n" | |
4264 "movdqa %%xmm4,%%xmm6 \n" | |
4265 "punpcklwd %%xmm3,%%xmm6 \n" | |
4266 "punpckhwd %%xmm3,%%xmm4 \n" | |
4267 "movdqa %%xmm1,%%xmm7 \n" | |
4268 "punpcklwd %%xmm0,%%xmm7 \n" | |
4269 "punpckhwd %%xmm0,%%xmm1 \n" | |
4270 "movdqu %%xmm6," MEMACCESS(2) " \n" | |
4271 "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" | |
4272 "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" | |
4273 "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" | |
4274 "lea " MEMLEA(0x40,2) ",%2 \n" | |
4275 "sub $0x10,%3 \n" | |
4276 "jg 1b \n" | |
4277 : "+r"(src_sobelx), // %0 | |
4278 "+r"(src_sobely), // %1 | |
4279 "+r"(dst_argb), // %2 | |
4280 "+r"(width) // %3 | |
4281 : | |
4282 : "memory", "cc", NACL_R14 | |
4283 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
4284 ); | |
4285 } | |
4286 #endif // HAS_SOBELXYROW_SSE2 | |
4287 | |
4288 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 | |
4289 // Creates a table of cumulative sums where each value is a sum of all values | |
4290 // above and to the left of the value, inclusive of the value. | |
4291 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, | |
4292 const int32* previous_cumsum, int width) { | |
4293 asm volatile ( | |
4294 "pxor %%xmm0,%%xmm0 \n" | |
4295 "pxor %%xmm1,%%xmm1 \n" | |
4296 "sub $0x4,%3 \n" | |
4297 "jl 49f \n" | |
4298 "test $0xf,%1 \n" | |
4299 "jne 49f \n" | |
4300 | |
4301   // 4 pixel loop. | |
4302 LABELALIGN | |
4303 "40: \n" | |
4304 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
4305 "lea " MEMLEA(0x10,0) ",%0 \n" | |
4306 "movdqa %%xmm2,%%xmm4 \n" | |
4307 "punpcklbw %%xmm1,%%xmm2 \n" | |
4308 "movdqa %%xmm2,%%xmm3 \n" | |
4309 "punpcklwd %%xmm1,%%xmm2 \n" | |
4310 "punpckhwd %%xmm1,%%xmm3 \n" | |
4311 "punpckhbw %%xmm1,%%xmm4 \n" | |
4312 "movdqa %%xmm4,%%xmm5 \n" | |
4313 "punpcklwd %%xmm1,%%xmm4 \n" | |
4314 "punpckhwd %%xmm1,%%xmm5 \n" | |
4315 "paddd %%xmm2,%%xmm0 \n" | |
4316 "movdqu " MEMACCESS(2) ",%%xmm2 \n" | |
4317 "paddd %%xmm0,%%xmm2 \n" | |
4318 "paddd %%xmm3,%%xmm0 \n" | |
4319 "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n" | |
4320 "paddd %%xmm0,%%xmm3 \n" | |
4321 "paddd %%xmm4,%%xmm0 \n" | |
4322 "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n" | |
4323 "paddd %%xmm0,%%xmm4 \n" | |
4324 "paddd %%xmm5,%%xmm0 \n" | |
4325 "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n" | |
4326 "lea " MEMLEA(0x40,2) ",%2 \n" | |
4327 "paddd %%xmm0,%%xmm5 \n" | |
4328 "movdqu %%xmm2," MEMACCESS(1) " \n" | |
4329 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" | |
4330 "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n" | |
4331 "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n" | |
4332 "lea " MEMLEA(0x40,1) ",%1 \n" | |
4333 "sub $0x4,%3 \n" | |
4334 "jge 40b \n" | |
4335 | |
4336 "49: \n" | |
4337 "add $0x3,%3 \n" | |
4338 "jl 19f \n" | |
4339 | |
4340   // 1 pixel loop. | |
4341 LABELALIGN | |
4342 "10: \n" | |
4343 "movd " MEMACCESS(0) ",%%xmm2 \n" | |
4344 "lea " MEMLEA(0x4,0) ",%0 \n" | |
4345 "punpcklbw %%xmm1,%%xmm2 \n" | |
4346 "punpcklwd %%xmm1,%%xmm2 \n" | |
4347 "paddd %%xmm2,%%xmm0 \n" | |
4348 "movdqu " MEMACCESS(2) ",%%xmm2 \n" | |
4349 "lea " MEMLEA(0x10,2) ",%2 \n" | |
4350 "paddd %%xmm0,%%xmm2 \n" | |
4351 "movdqu %%xmm2," MEMACCESS(1) " \n" | |
4352 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4353 "sub $0x1,%3 \n" | |
4354 "jge 10b \n" | |
4355 | |
4356 "19: \n" | |
4357 : "+r"(row), // %0 | |
4358 "+r"(cumsum), // %1 | |
4359 "+r"(previous_cumsum), // %2 | |
4360 "+r"(width) // %3 | |
4361 : | |
4362 : "memory", "cc" | |
4363 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
4364 ); | |
4365 } | |
4366 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 | |
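
// A scalar sketch of the row integral above (helper name is illustrative).
// A running per-channel sum across this row is added to the previous row's
// cumulative sums, building the integral image that the box-filter code
// below consumes.
static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
                                           const int32* previous_cumsum,
                                           int width) {
  int32 sum[4] = {0, 0, 0, 0};
  int x, i;
  for (x = 0; x < width; ++x) {
    for (i = 0; i < 4; ++i) {
      sum[i] += row[x * 4 + i];
      cumsum[x * 4 + i] = sum[i] + previous_cumsum[x * 4 + i];
    }
  }
}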
4367 | |
4368 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 | |
4369 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, | |
4370 int width, int area, uint8* dst, | |
4371 int count) { | |
4372 asm volatile ( | |
4373 "movd %5,%%xmm5 \n" | |
4374 "cvtdq2ps %%xmm5,%%xmm5 \n" | |
4375 "rcpss %%xmm5,%%xmm4 \n" | |
4376 "pshufd $0x0,%%xmm4,%%xmm4 \n" | |
4377 "sub $0x4,%3 \n" | |
4378 "jl 49f \n" | |
4379 "cmpl $0x80,%5 \n" | |
4380 "ja 40f \n" | |
4381 | |
4382 "pshufd $0x0,%%xmm5,%%xmm5 \n" | |
4383 "pcmpeqb %%xmm6,%%xmm6 \n" | |
4384 "psrld $0x10,%%xmm6 \n" | |
4385 "cvtdq2ps %%xmm6,%%xmm6 \n" | |
4386 "addps %%xmm6,%%xmm5 \n" | |
4387 "mulps %%xmm4,%%xmm5 \n" | |
4388 "cvtps2dq %%xmm5,%%xmm5 \n" | |
4389 "packssdw %%xmm5,%%xmm5 \n" | |
4390 | |
4391   // 4 pixel small loop. | |
4392 LABELALIGN | |
4393 "4: \n" | |
4394 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
4395 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
4396 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
4397 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | |
4398 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 | |
4399 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 | |
4400 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 | |
4401 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 | |
4402 "lea " MEMLEA(0x40,0) ",%0 \n" | |
4403 "psubd " MEMACCESS(1) ",%%xmm0 \n" | |
4404 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" | |
4405 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" | |
4406 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" | |
4407 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 | |
4408 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 | |
4409 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 | |
4410 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 | |
4411 "lea " MEMLEA(0x40,1) ",%1 \n" | |
4412 "packssdw %%xmm1,%%xmm0 \n" | |
4413 "packssdw %%xmm3,%%xmm2 \n" | |
4414 "pmulhuw %%xmm5,%%xmm0 \n" | |
4415 "pmulhuw %%xmm5,%%xmm2 \n" | |
4416 "packuswb %%xmm2,%%xmm0 \n" | |
4417 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
4418 "lea " MEMLEA(0x10,2) ",%2 \n" | |
4419 "sub $0x4,%3 \n" | |
4420 "jge 4b \n" | |
4421 "jmp 49f \n" | |
4422 | |
4423   // 4 pixel loop. | |
4424 LABELALIGN | |
4425 "40: \n" | |
4426 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
4427 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
4428 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" | |
4429 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" | |
4430 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 | |
4431 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 | |
4432 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 | |
4433 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 | |
4434 "lea " MEMLEA(0x40,0) ",%0 \n" | |
4435 "psubd " MEMACCESS(1) ",%%xmm0 \n" | |
4436 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" | |
4437 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" | |
4438 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" | |
4439 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 | |
4440 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 | |
4441 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 | |
4442 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 | |
4443 "lea " MEMLEA(0x40,1) ",%1 \n" | |
4444 "cvtdq2ps %%xmm0,%%xmm0 \n" | |
4445 "cvtdq2ps %%xmm1,%%xmm1 \n" | |
4446 "mulps %%xmm4,%%xmm0 \n" | |
4447 "mulps %%xmm4,%%xmm1 \n" | |
4448 "cvtdq2ps %%xmm2,%%xmm2 \n" | |
4449 "cvtdq2ps %%xmm3,%%xmm3 \n" | |
4450 "mulps %%xmm4,%%xmm2 \n" | |
4451 "mulps %%xmm4,%%xmm3 \n" | |
4452 "cvtps2dq %%xmm0,%%xmm0 \n" | |
4453 "cvtps2dq %%xmm1,%%xmm1 \n" | |
4454 "cvtps2dq %%xmm2,%%xmm2 \n" | |
4455 "cvtps2dq %%xmm3,%%xmm3 \n" | |
4456 "packssdw %%xmm1,%%xmm0 \n" | |
4457 "packssdw %%xmm3,%%xmm2 \n" | |
4458 "packuswb %%xmm2,%%xmm0 \n" | |
4459 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
4460 "lea " MEMLEA(0x10,2) ",%2 \n" | |
4461 "sub $0x4,%3 \n" | |
4462 "jge 40b \n" | |
4463 | |
4464 "49: \n" | |
4465 "add $0x3,%3 \n" | |
4466 "jl 19f \n" | |
4467 | |
4468   // 1 pixel loop. | |
4469 LABELALIGN | |
4470 "10: \n" | |
4471 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
4472 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 | |
4473 "lea " MEMLEA(0x10,0) ",%0 \n" | |
4474 "psubd " MEMACCESS(1) ",%%xmm0 \n" | |
4475 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 | |
4476 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4477 "cvtdq2ps %%xmm0,%%xmm0 \n" | |
4478 "mulps %%xmm4,%%xmm0 \n" | |
4479 "cvtps2dq %%xmm0,%%xmm0 \n" | |
4480 "packssdw %%xmm0,%%xmm0 \n" | |
4481 "packuswb %%xmm0,%%xmm0 \n" | |
4482 "movd %%xmm0," MEMACCESS(2) " \n" | |
4483 "lea " MEMLEA(0x4,2) ",%2 \n" | |
4484 "sub $0x1,%3 \n" | |
4485 "jge 10b \n" | |
4486 "19: \n" | |
4487 : "+r"(topleft), // %0 | |
4488 "+r"(botleft), // %1 | |
4489 "+r"(dst), // %2 | |
4490 "+rm"(count) // %3 | |
4491 : "r"((intptr_t)(width)), // %4 | |
4492 "rm"(area) // %5 | |
4493 : "memory", "cc", NACL_R14 | |
4494 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | |
4495 ); | |
4496 } | |
4497 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 | |
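
// A rough scalar sketch of the averaging above, read off the assembly, so
// treat it as an approximation (the SIMD code divides with a float
// reciprocal or a prescaled 16-bit multiplier rather than an exact integer
// divide). Each output pixel's box sum comes from the integral image as
// top-left - top-right - bottom-left + bottom-right, divided by area; note
// that width is used directly as an int32 offset to reach the right edge of
// the box, exactly as the (reg,width,4) addressing does above.
static void CumulativeSumToAverageRow_Sketch(const int32* topleft,
                                             const int32* botleft, int width,
                                             int area, uint8* dst, int count) {
  int x, i;
  for (x = 0; x < count; ++x) {
    for (i = 0; i < 4; ++i) {
      int32 sum = topleft[x * 4 + i] - topleft[x * 4 + width + i] -
                  botleft[x * 4 + i] + botleft[x * 4 + width + i];
      dst[x * 4 + i] = (uint8)(sum / area);
    }
  }
}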
4498 | |
4499 #ifdef HAS_ARGBAFFINEROW_SSE2 | |
4500 // Copy ARGB pixels from source image with slope to a row of destination. | |
4501 LIBYUV_API | |
4502 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, | |
4503 uint8* dst_argb, const float* src_dudv, int width) { | |
4504 intptr_t src_argb_stride_temp = src_argb_stride; | |
4505 intptr_t temp = 0; | |
4506 asm volatile ( | |
4507 "movq " MEMACCESS(3) ",%%xmm2 \n" | |
4508 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" | |
4509 "shl $0x10,%1 \n" | |
4510 "add $0x4,%1 \n" | |
4511 "movd %1,%%xmm5 \n" | |
4512 "sub $0x4,%4 \n" | |
4513 "jl 49f \n" | |
4514 | |
4515 "pshufd $0x44,%%xmm7,%%xmm7 \n" | |
4516 "pshufd $0x0,%%xmm5,%%xmm5 \n" | |
4517 "movdqa %%xmm2,%%xmm0 \n" | |
4518 "addps %%xmm7,%%xmm0 \n" | |
4519 "movlhps %%xmm0,%%xmm2 \n" | |
4520 "movdqa %%xmm7,%%xmm4 \n" | |
4521 "addps %%xmm4,%%xmm4 \n" | |
4522 "movdqa %%xmm2,%%xmm3 \n" | |
4523 "addps %%xmm4,%%xmm3 \n" | |
4524 "addps %%xmm4,%%xmm4 \n" | |
4525 | |
4526   // 4 pixel loop. | |
4527 LABELALIGN | |
4528 "40: \n" | |
4529 "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 | |
4530 "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 | |
4531 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts | |
4532 "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride | |
4533 "movd %%xmm0,%k1 \n" | |
4534 "pshufd $0x39,%%xmm0,%%xmm0 \n" | |
4535 "movd %%xmm0,%k5 \n" | |
4536 "pshufd $0x39,%%xmm0,%%xmm0 \n" | |
4537 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 | |
4538 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 | |
4539 "punpckldq %%xmm6,%%xmm1 \n" | |
4540 "addps %%xmm4,%%xmm2 \n" | |
4541 "movq %%xmm1," MEMACCESS(2) " \n" | |
4542 "movd %%xmm0,%k1 \n" | |
4543 "pshufd $0x39,%%xmm0,%%xmm0 \n" | |
4544 "movd %%xmm0,%k5 \n" | |
4545 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 | |
4546 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 | |
4547 "punpckldq %%xmm6,%%xmm0 \n" | |
4548 "addps %%xmm4,%%xmm3 \n" | |
4549 "movq %%xmm0," MEMACCESS2(0x08,2) " \n" | |
4550 "lea " MEMLEA(0x10,2) ",%2 \n" | |
4551 "sub $0x4,%4 \n" | |
4552 "jge 40b \n" | |
4553 | |
4554 "49: \n" | |
4555 "add $0x3,%4 \n" | |
4556 "jl 19f \n" | |
4557 | |
4558   // 1 pixel loop. | |
4559 LABELALIGN | |
4560 "10: \n" | |
4561 "cvttps2dq %%xmm2,%%xmm0 \n" | |
4562 "packssdw %%xmm0,%%xmm0 \n" | |
4563 "pmaddwd %%xmm5,%%xmm0 \n" | |
4564 "addps %%xmm7,%%xmm2 \n" | |
4565 "movd %%xmm0,%k1 \n" | |
4566 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 | |
4567 "movd %%xmm0," MEMACCESS(2) " \n" | |
4568 "lea " MEMLEA(0x04,2) ",%2 \n" | |
4569 "sub $0x1,%4 \n" | |
4570 "jge 10b \n" | |
4571 "19: \n" | |
4572 : "+r"(src_argb), // %0 | |
4573 "+r"(src_argb_stride_temp), // %1 | |
4574 "+r"(dst_argb), // %2 | |
4575 "+r"(src_dudv), // %3 | |
4576 "+rm"(width), // %4 | |
4577 "+r"(temp) // %5 | |
4578 : | |
4579 : "memory", "cc", NACL_R14 | |
4580 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
4581 ); | |
4582 } | |
4583 #endif // HAS_ARGBAFFINEROW_SSE2 | |
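
// A scalar sketch of the affine row copy above (helper name is illustrative).
// src_dudv holds the starting (u, v) source coordinate followed by the
// per-pixel (du, dv) step; each destination pixel is a nearest-pixel fetch.
static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* src_dudv,
                                 int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;  // truncation, matching cvttps2dq.
    int y = (int)v;
    *(uint32*)(dst_argb + i * 4) =
        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
    u += src_dudv[2];
    v += src_dudv[3];
  }
}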
4584 | |
4585 #ifdef HAS_INTERPOLATEROW_SSSE3 | |
4586 // Bilinear filter 16x2 -> 16x1 | |
4587 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | |
4588 ptrdiff_t src_stride, int dst_width, | |
4589 int source_y_fraction) { | |
4590 asm volatile ( | |
4591 "sub %1,%0 \n" | |
4592 "shr %3 \n" | |
4593 "cmp $0x0,%3 \n" | |
4594 "je 100f \n" | |
4595 "cmp $0x20,%3 \n" | |
4596 "je 75f \n" | |
4597 "cmp $0x40,%3 \n" | |
4598 "je 50f \n" | |
4599 "cmp $0x60,%3 \n" | |
4600 "je 25f \n" | |
4601 | |
4602 "movd %3,%%xmm0 \n" | |
4603 "neg %3 \n" | |
4604 "add $0x80,%3 \n" | |
4605 "movd %3,%%xmm5 \n" | |
4606 "punpcklbw %%xmm0,%%xmm5 \n" | |
4607 "punpcklwd %%xmm5,%%xmm5 \n" | |
4608 "pshufd $0x0,%%xmm5,%%xmm5 \n" | |
4609 | |
4610 // General purpose row blend. | |
4611 LABELALIGN | |
4612 "1: \n" | |
4613 "movdqu " MEMACCESS(1) ",%%xmm0 \n" | |
4614 MEMOPREG(movdqu,0x00,1,4,1,xmm2) | |
4615 "movdqa %%xmm0,%%xmm1 \n" | |
4616 "punpcklbw %%xmm2,%%xmm0 \n" | |
4617 "punpckhbw %%xmm2,%%xmm1 \n" | |
4618 "pmaddubsw %%xmm5,%%xmm0 \n" | |
4619 "pmaddubsw %%xmm5,%%xmm1 \n" | |
4620 "psrlw $0x7,%%xmm0 \n" | |
4621 "psrlw $0x7,%%xmm1 \n" | |
4622 "packuswb %%xmm1,%%xmm0 \n" | |
4623 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) | |
4624 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4625 "sub $0x10,%2 \n" | |
4626 "jg 1b \n" | |
4627 "jmp 99f \n" | |
4628 | |
4629 // Blend 25 / 75. | |
4630 LABELALIGN | |
4631 "25: \n" | |
4632 "movdqu " MEMACCESS(1) ",%%xmm0 \n" | |
4633 MEMOPREG(movdqu,0x00,1,4,1,xmm1) | |
4634 "pavgb %%xmm1,%%xmm0 \n" | |
4635 "pavgb %%xmm1,%%xmm0 \n" | |
4636 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) | |
4637 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4638 "sub $0x10,%2 \n" | |
4639 "jg 25b \n" | |
4640 "jmp 99f \n" | |
4641 | |
4642 // Blend 50 / 50. | |
4643 LABELALIGN | |
4644 "50: \n" | |
4645 "movdqu " MEMACCESS(1) ",%%xmm0 \n" | |
4646 MEMOPREG(movdqu,0x00,1,4,1,xmm1) | |
4647 "pavgb %%xmm1,%%xmm0 \n" | |
4648 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) | |
4649 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4650 "sub $0x10,%2 \n" | |
4651 "jg 50b \n" | |
4652 "jmp 99f \n" | |
4653 | |
4654 // Blend 75 / 25. | |
4655 LABELALIGN | |
4656 "75: \n" | |
4657 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | |
4658 MEMOPREG(movdqu,0x00,1,4,1,xmm0) | |
4659 "pavgb %%xmm1,%%xmm0 \n" | |
4660 "pavgb %%xmm1,%%xmm0 \n" | |
4661 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) | |
4662 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4663 "sub $0x10,%2 \n" | |
4664 "jg 75b \n" | |
4665 "jmp 99f \n" | |
4666 | |
4667 // Blend 100 / 0 - Copy row unchanged. | |
4668 LABELALIGN | |
4669 "100: \n" | |
4670 "movdqu " MEMACCESS(1) ",%%xmm0 \n" | |
4671 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) | |
4672 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4673 "sub $0x10,%2 \n" | |
4674 "jg 100b \n" | |
4675 | |
4676 "99: \n" | |
4677 : "+r"(dst_ptr), // %0 | |
4678 "+r"(src_ptr), // %1 | |
4679 "+r"(dst_width), // %2 | |
4680 "+r"(source_y_fraction) // %3 | |
4681 : "r"((intptr_t)(src_stride)) // %4 | |
4682 : "memory", "cc", NACL_R14 | |
4683 "xmm0", "xmm1", "xmm2", "xmm5" | |
4684 ); | |
4685 } | |
4686 #endif // HAS_INTERPOLATEROW_SSSE3 | |
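
// A scalar sketch of the two-row blend above (helper name is illustrative).
// source_y_fraction is the weight of the second row (src_ptr + src_stride):
// 0 copies src_ptr, 256 takes the next row. The SIMD code halves the
// fraction and blends in 7-bit fixed point; the 0/25/50/75/100 percent
// special cases are just pavgb shortcuts for the same math.
static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  ptrdiff_t src_stride, int dst_width,
                                  int source_y_fraction) {
  int y1_fraction = source_y_fraction >> 1;  // 0..128
  int y0_fraction = 128 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0_fraction +
                          src_ptr1[x] * y1_fraction) >> 7);
  }
}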
4687 | |
4688 #ifdef HAS_INTERPOLATEROW_AVX2 | |
4689 // Bilinear filter 32x2 -> 32x1 | |
4690 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, | |
4691 ptrdiff_t src_stride, int dst_width, | |
4692 int source_y_fraction) { | |
4693 asm volatile ( | |
4694 "shr %3 \n" | |
4695 "cmp $0x0,%3 \n" | |
4696 "je 100f \n" | |
4697 "sub %1,%0 \n" | |
4698 "cmp $0x20,%3 \n" | |
4699 "je 75f \n" | |
4700 "cmp $0x40,%3 \n" | |
4701 "je 50f \n" | |
4702 "cmp $0x60,%3 \n" | |
4703 "je 25f \n" | |
4704 | |
4705 "vmovd %3,%%xmm0 \n" | |
4706 "neg %3 \n" | |
4707 "add $0x80,%3 \n" | |
4708 "vmovd %3,%%xmm5 \n" | |
4709 "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" | |
4710 "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" | |
4711 "vpxor %%ymm0,%%ymm0,%%ymm0 \n" | |
4712 "vpermd %%ymm5,%%ymm0,%%ymm5 \n" | |
4713 | |
4714 // General purpose row blend. | |
4715 LABELALIGN | |
4716 "1: \n" | |
4717 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" | |
4718 MEMOPREG(vmovdqu,0x00,1,4,1,ymm2) | |
4719 "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" | |
4720 "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" | |
4721 "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n" | |
4722 "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n" | |
4723 "vpsrlw $0x7,%%ymm0,%%ymm0 \n" | |
4724 "vpsrlw $0x7,%%ymm1,%%ymm1 \n" | |
4725 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | |
4726 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) | |
4727 "lea " MEMLEA(0x20,1) ",%1 \n" | |
4728 "sub $0x20,%2 \n" | |
4729 "jg 1b \n" | |
4730 "jmp 99f \n" | |
4731 | |
4732 // Blend 25 / 75. | |
4733 LABELALIGN | |
4734 "25: \n" | |
4735 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" | |
4736 MEMOPREG(vmovdqu,0x00,1,4,1,ymm1) | |
4737 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" | |
4738 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" | |
4739 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) | |
4740 "lea " MEMLEA(0x20,1) ",%1 \n" | |
4741 "sub $0x20,%2 \n" | |
4742 "jg 25b \n" | |
4743 "jmp 99f \n" | |
4744 | |
4745 // Blend 50 / 50. | |
4746 LABELALIGN | |
4747 "50: \n" | |
4748 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" | |
4749 VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0 | |
4750 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) | |
4751 "lea " MEMLEA(0x20,1) ",%1 \n" | |
4752 "sub $0x20,%2 \n" | |
4753 "jg 50b \n" | |
4754 "jmp 99f \n" | |
4755 | |
4756 // Blend 75 / 25. | |
4757 LABELALIGN | |
4758 "75: \n" | |
4759 "vmovdqu " MEMACCESS(1) ",%%ymm1 \n" | |
4760 MEMOPREG(vmovdqu,0x00,1,4,1,ymm0) | |
4761 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" | |
4762 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" | |
4763 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) | |
4764 "lea " MEMLEA(0x20,1) ",%1 \n" | |
4765 "sub $0x20,%2 \n" | |
4766 "jg 75b \n" | |
4767 "jmp 99f \n" | |
4768 | |
4769 // Blend 100 / 0 - Copy row unchanged. | |
4770 LABELALIGN | |
4771 "100: \n" | |
4772 "rep movsb " MEMMOVESTRING(1,0) " \n" | |
4773 "jmp 999f \n" | |
4774 | |
4775 "99: \n" | |
4776 "vzeroupper \n" | |
4777 "999: \n" | |
4778 : "+D"(dst_ptr), // %0 | |
4779 "+S"(src_ptr), // %1 | |
4780 "+c"(dst_width), // %2 | |
4781 "+r"(source_y_fraction) // %3 | |
4782 : "r"((intptr_t)(src_stride)) // %4 | |
4783 : "memory", "cc", NACL_R14 | |
4784 "xmm0", "xmm1", "xmm2", "xmm5" | |
4785 ); | |
4786 } | |
4787 #endif // HAS_INTERPOLATEROW_AVX2 | |
4788 | |
4789 #ifdef HAS_INTERPOLATEROW_SSE2 | |
4790 // Bilinear filter 16x2 -> 16x1 | |
4791 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, | |
4792 ptrdiff_t src_stride, int dst_width, | |
4793 int source_y_fraction) { | |
4794 asm volatile ( | |
4795 "sub %1,%0 \n" | |
4796 "shr %3 \n" | |
4797 "cmp $0x0,%3 \n" | |
4798 "je 100f \n" | |
4799 "cmp $0x20,%3 \n" | |
4800 "je 75f \n" | |
4801 "cmp $0x40,%3 \n" | |
4802 "je 50f \n" | |
4803 "cmp $0x60,%3 \n" | |
4804 "je 25f \n" | |
4805 | |
4806 "movd %3,%%xmm0 \n" | |
4807 "neg %3 \n" | |
4808 "add $0x80,%3 \n" | |
4809 "movd %3,%%xmm5 \n" | |
4810 "punpcklbw %%xmm0,%%xmm5 \n" | |
4811 "punpcklwd %%xmm5,%%xmm5 \n" | |
4812 "pshufd $0x0,%%xmm5,%%xmm5 \n" | |
4813 "pxor %%xmm4,%%xmm4 \n" | |
4814 | |
4815 // General purpose row blend. | |
4816 LABELALIGN | |
4817 "1: \n" | |
4818 "movdqu " MEMACCESS(1) ",%%xmm0 \n" | |
4819 MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 | |
4820 "movdqa %%xmm0,%%xmm1 \n" | |
4821 "movdqa %%xmm2,%%xmm3 \n" | |
4822 "punpcklbw %%xmm4,%%xmm2 \n" | |
4823 "punpckhbw %%xmm4,%%xmm3 \n" | |
4824 "punpcklbw %%xmm4,%%xmm0 \n" | |
4825 "punpckhbw %%xmm4,%%xmm1 \n" | |
4826 "psubw %%xmm0,%%xmm2 \n" | |
4827 "psubw %%xmm1,%%xmm3 \n" | |
4828 "paddw %%xmm2,%%xmm2 \n" | |
4829 "paddw %%xmm3,%%xmm3 \n" | |
4830 "pmulhw %%xmm5,%%xmm2 \n" | |
4831 "pmulhw %%xmm5,%%xmm3 \n" | |
4832 "paddw %%xmm2,%%xmm0 \n" | |
4833 "paddw %%xmm3,%%xmm1 \n" | |
4834 "packuswb %%xmm1,%%xmm0 \n" | |
4835 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) | |
4836 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4837 "sub $0x10,%2 \n" | |
4838 "jg 1b \n" | |
4839 "jmp 99f \n" | |
4840 | |
4841 // Blend 25 / 75. | |
4842 LABELALIGN | |
4843 "25: \n" | |
4844 "movdqu " MEMACCESS(1) ",%%xmm0 \n" | |
4845 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 | |
4846 "pavgb %%xmm1,%%xmm0 \n" | |
4847 "pavgb %%xmm1,%%xmm0 \n" | |
4848 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) | |
4849 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4850 "sub $0x10,%2 \n" | |
4851 "jg 25b \n" | |
4852 "jmp 99f \n" | |
4853 | |
4854 // Blend 50 / 50. | |
4855 LABELALIGN | |
4856 "50: \n" | |
4857 "movdqu " MEMACCESS(1) ",%%xmm0 \n" | |
4858 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 | |
4859 "pavgb %%xmm1,%%xmm0 \n" | |
4860 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) | |
4861 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4862 "sub $0x10,%2 \n" | |
4863 "jg 50b \n" | |
4864 "jmp 99f \n" | |
4865 | |
4866 // Blend 75 / 25. | |
4867 LABELALIGN | |
4868 "75: \n" | |
4869 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | |
4870 MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 | |
4871 "pavgb %%xmm1,%%xmm0 \n" | |
4872 "pavgb %%xmm1,%%xmm0 \n" | |
4873 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) | |
4874 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4875 "sub $0x10,%2 \n" | |
4876 "jg 75b \n" | |
4877 "jmp 99f \n" | |
4878 | |
4879 // Blend 100 / 0 - Copy row unchanged. | |
4880 LABELALIGN | |
4881 "100: \n" | |
4882 "movdqu " MEMACCESS(1) ",%%xmm0 \n" | |
4883 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) | |
4884 "lea " MEMLEA(0x10,1) ",%1 \n" | |
4885 "sub $0x10,%2 \n" | |
4886 "jg 100b \n" | |
4887 | |
4888 "99: \n" | |
4889 : "+r"(dst_ptr), // %0 | |
4890 "+r"(src_ptr), // %1 | |
4891 "+r"(dst_width), // %2 | |
4892 "+r"(source_y_fraction) // %3 | |
4893 : "r"((intptr_t)(src_stride)) // %4 | |
4894 : "memory", "cc", NACL_R14 | |
4895 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
4896 ); | |
4897 } | |
4898 #endif // HAS_INTERPOLATEROW_SSE2 | |
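
// The interpolators above blend the current row with the next row
// (src_ptr + src_stride) using source_y_fraction in [0, 256), where 0 keeps
// the current row and 128 is a 50/50 average. A rough scalar sketch of the
// same math follows; it is illustrative only (InterpolateRowExample_C is not
// an upstream libyuv function, the block is not compiled, and the SIMD paths
// use 7-bit fixed point after halving the fraction, so rounding can differ
// by a small amount).
#if 0
static void InterpolateRowExample_C(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  int y1 = source_y_fraction;   // weight of the next row
  int y0 = 256 - y1;            // weight of the current row
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 + src_ptr1[x] * y1) >> 8);
  }
}
#endif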

#ifdef HAS_ARGBTOBAYERGGROW_SSE2
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrld $0x18,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrld $0x8,%%xmm0 \n"
    "psrld $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_bayer),  // %1
    "+r"(pix)  // %2
  :
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBTOBAYERGGROW_SSE2
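
// ARGBToBayerGGRow_SSE2 keeps only the G channel of each ARGB pixel: each
// 32-bit pixel is shifted right 8 bits, masked to the low byte, and packed.
// Scalar sketch of the same selection (illustrative only, not compiled;
// ARGBToBayerGGRowExample_C is not an upstream libyuv function):
#if 0
static void ARGBToBayerGGRowExample_C(const uint8* src_argb, uint8* dst_bayer,
                                      uint32 selector, int pix) {
  (void)selector;  // The GG variant ignores the selector and always takes G.
  for (int x = 0; x < pix; ++x) {
    dst_bayer[x] = src_argb[x * 4 + 1];  // Bytes are B,G,R,A; G is byte 1.
  }
}
#endif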

#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  asm volatile (
    "movdqu " MEMACCESS(3) ",%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)  // %2
  : "r"(shuffler)  // %3
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSSE3
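
// The shuffler is a 16-byte pshufb control: output byte i of every 16-byte
// (4 pixel) block comes from input byte shuffler[i] of the same block.
// Scalar sketch of that per-block behavior, assuming all control bytes are in
// the range 0..15 (illustrative only, not compiled; not an upstream function):
#if 0
static void ARGBShuffleRowExample_C(const uint8* src_argb, uint8* dst_argb,
                                    const uint8* shuffler, int pix) {
  for (int x = 0; x < pix; x += 4) {  // 4 ARGB pixels = 16 bytes per block.
    for (int i = 0; i < 16; ++i) {
      dst_argb[i] = src_argb[shuffler[i] & 15];
    }
    src_argb += 16;
    dst_argb += 16;
  }
}
#endif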

#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
    "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)  // %2
  : "r"(shuffler)  // %3
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"
    "mov " MEMACCESS(4) ",%k2 \n"
    "cmp $0x3000102,%k2 \n"
    "je 3012f \n"
    "cmp $0x10203,%k2 \n"
    "je 123f \n"
    "cmp $0x30201,%k2 \n"
    "je 321f \n"
    "cmp $0x2010003,%k2 \n"
    "je 2103f \n"

    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS(1) " \n"
    "movzb " MEMACCESS2(0x1,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x1,1) " \n"
    "movzb " MEMACCESS2(0x2,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x2,1) " \n"
    "movzb " MEMACCESS2(0x3,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x3,1) " \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jg 1b \n"
    "jmp 99f \n"

    LABELALIGN
    "123: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
    "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 123b \n"
    "jmp 99f \n"

    LABELALIGN
    "321: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x39,%%xmm0,%%xmm0 \n"
    "pshuflw $0x39,%%xmm0,%%xmm0 \n"
    "pshufhw $0x39,%%xmm1,%%xmm1 \n"
    "pshuflw $0x39,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 321b \n"
    "jmp 99f \n"

    LABELALIGN
    "2103: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x93,%%xmm0,%%xmm0 \n"
    "pshuflw $0x93,%%xmm0,%%xmm0 \n"
    "pshufhw $0x93,%%xmm1,%%xmm1 \n"
    "pshuflw $0x93,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 2103b \n"
    "jmp 99f \n"

    LABELALIGN
    "3012: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
    "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
    "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
    "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 3012b \n"

    "99: \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+d"(pixel_temp),  // %2
    "+r"(pix)  // %3
  : "r"(shuffler)  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSE2
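
// The SSE2 fallback only fast-paths four byte orders: the first 4 bytes of
// the shuffler are read as a little-endian uint32 and matched against the cmp
// constants above (e.g. a {3,2,1,0} shuffler loads as 0x00010203 and takes
// the "123" path, which reverses the 4 bytes of every pixel by expanding
// bytes to words and permuting them with pshuflw/pshufhw $0x1b). Any other
// shuffler falls through to the byte-at-a-time loop. Hypothetical caller
// sketch; kShuffleReverse is an assumed local constant, not an upstream
// libyuv name, and the block is not compiled:
#if 0
static const uint8 kShuffleReverse[16] = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
// ARGBShuffleRow_SSE2(src_row, dst_row, kShuffleReverse, width);
#endif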

#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)  // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(3) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_y),  // %0
    "+r"(src_u),  // %1
    "+r"(src_v),  // %2
    "+r"(dst_frame),  // %3
    "+rm"(width)  // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOYUY2ROW_SSE2
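
// YUY2 packs every pair of pixels as Y0,U,Y1,V; the kernel above builds the
// interleaved UV bytes with punpcklbw and then interleaves them with Y.
// Scalar sketch of that byte layout (illustrative only, not compiled;
// I422ToYUY2RowExample_C is not an upstream libyuv function):
#if 0
static void I422ToYUY2RowExample_C(const uint8* src_y, const uint8* src_u,
                                   const uint8* src_v, uint8* dst_frame,
                                   int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    src_y += 2;
    ++src_u;
    ++src_v;
    dst_frame += 4;
  }
}
#endif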

#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)  // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS(3) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_y),  // %0
    "+r"(src_u),  // %1
    "+r"(src_v),  // %2
    "+r"(dst_frame),  // %3
    "+rm"(width)  // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOUYVYROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "pxor %%xmm3,%%xmm3 \n"

    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm3,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
    "addps " MEMACCESS(3) ",%%xmm0 \n"
    "addps " MEMACCESS(3) ",%%xmm4 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm6 \n"
    "mulps %%xmm1,%%xmm2 \n"
    "mulps %%xmm5,%%xmm6 \n"
    "mulps %%xmm2,%%xmm1 \n"
    "mulps %%xmm6,%%xmm5 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
    "addps %%xmm2,%%xmm0 \n"
    "addps %%xmm6,%%xmm4 \n"
    "addps %%xmm1,%%xmm0 \n"
    "addps %%xmm5,%%xmm4 \n"
    "cvttps2dq %%xmm0,%%xmm0 \n"
    "cvttps2dq %%xmm4,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)  // %2
  : "r"(poly)  // %3
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
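
// poly points at 16 floats: C0, C1, C2 and C3, each stored as one coefficient
// per channel (B, G, R, A), and each channel value v is mapped to roughly
// C0 + C1*v + C2*v*v + C3*v*v*v, truncated and saturated to a byte. A scalar
// sketch of that evaluation (illustrative only, not compiled;
// ARGBPolynomialRowExample_C is not an upstream libyuv function):
#if 0
static void ARGBPolynomialRowExample_C(const uint8* src_argb, uint8* dst_argb,
                                       const float* poly, int width) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {  // B, G, R, A channels.
      float v = (float)src_argb[x * 4 + c];
      float r = poly[c] + poly[4 + c] * v + poly[8 + c] * v * v +
                poly[12 + c] * v * v * v;
      if (r < 0.f) r = 0.f;
      if (r > 255.f) r = 255.f;
      dst_argb[x * 4 + c] = (uint8)r;  // Truncation, like cvttps2dq.
    }
  }
}
#endif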

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"

    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n"  // 2 ARGB pixels
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "vcvtdq2ps %%ymm0,%%ymm0 \n"  // X 8 floats
    "vmulps %%ymm0,%%ymm0,%%ymm2 \n"  // X * X
    "vmulps %%ymm7,%%ymm0,%%ymm3 \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"  // result += C3 * X * X * X
    "vcvttps2dq %%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
    "vmovq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)  // %2
  : "r"(poly)  // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n"  // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n"  // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n"  // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
    MEMOPARG(movzb,0x03,3,1,4,1) " \n"  // movzb 0x3(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x1,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),  // %0
    "+d"(pixel_temp),  // %1
    "+r"(width)  // %2
  : "r"(table_argb)  // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
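
// table_argb is channel-interleaved: for a channel value v, the replacement
// for B is table_argb[v * 4 + 0], for G table_argb[v * 4 + 1], and so on, and
// the transform is done in place. Scalar sketch (illustrative only, not
// compiled; ARGBColorTableRowExample_C is not an upstream libyuv function):
#if 0
static void ARGBColorTableRowExample_C(uint8* dst_argb,
                                       const uint8* table_argb, int width) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {  // Each of B, G, R, A has its own 256 entries.
      uint8 v = dst_argb[x * 4 + c];
      dst_argb[x * 4 + c] = table_argb[v * 4 + c];
    }
  }
}
#endif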

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n"  // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n"  // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n"  // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),  // %0
    "+d"(pixel_temp),  // %1
    "+r"(width)  // %2
  : "r"(table_argb)  // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp = 0u;
  uintptr_t table_temp = 0u;
  asm volatile (
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0x8,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(2) ",%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"
    "phaddw %%xmm0,%%xmm0 \n"
    "pand %%xmm4,%%xmm0 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS(2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS(3) " \n"
    "movzb " MEMACCESS2(0x1,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x1,3) " \n"
    "movzb " MEMACCESS2(0x2,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x2,3) " \n"
    "movzb " MEMACCESS2(0x3,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x3,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS2(0x4,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x4,3) " \n"
    "movzb " MEMACCESS2(0x5,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x5,3) " \n"
    "movzb " MEMACCESS2(0x6,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x6,3) " \n"
    "movzb " MEMACCESS2(0x7,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x7,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS2(0x8,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x8,3) " \n"
    "movzb " MEMACCESS2(0x9,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x9,3) " \n"
    "movzb " MEMACCESS2(0xa,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xa,3) " \n"
    "movzb " MEMACCESS2(0xb,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xb,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"

    "movzb " MEMACCESS2(0xc,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xc,3) " \n"
    "movzb " MEMACCESS2(0xd,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xd,3) " \n"
    "movzb " MEMACCESS2(0xe,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xe,3) " \n"
    "movzb " MEMACCESS2(0xf,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xf,3) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "lea " MEMLEA(0x10,3) ",%3 \n"
    "sub $0x4,%4 \n"
    "jg 1b \n"
  : "+d"(pixel_temp),  // %0
    "+a"(table_temp),  // %1
    "+r"(src_argb),  // %2
    "+r"(dst_argb),  // %3
    "+rm"(width)  // %4
  : "r"(luma),  // %5
    "rm"(lumacoeff)  // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
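
// Per pixel, the kernel computes a weighted luma from B, G and R (pmaddubsw
// with the packed lumacoeff weights), rounds it down to a multiple of 256 by
// masking, and uses it to pick a 256-byte sub-table inside 'luma'; B, G and R
// are then looked up through that sub-table while alpha is copied. A rough
// scalar sketch of that flow (illustrative only, not compiled, and not the
// upstream C fallback verbatim; it assumes the alpha byte of lumacoeff is 0):
#if 0
static void ARGBLumaColorTableRowExample_C(const uint8* src_argb,
                                           uint8* dst_argb, int width,
                                           const uint8* luma,
                                           uint32 lumacoeff) {
  const uint32 bc = lumacoeff & 0xff;
  const uint32 gc = (lumacoeff >> 8) & 0xff;
  const uint32 rc = (lumacoeff >> 16) & 0xff;
  for (int x = 0; x < width; ++x) {
    // Weighted luma, masked to a multiple of 256, selects one sub-table.
    uint32 l = (src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) &
               0xff00u;
    const uint8* table = luma + l;
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // Alpha is copied unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif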

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif