Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(211)

Side by Side Diff: source/row_gcc.cc

Issue 1398633002: change all pix parameters to width for consistency (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_common.cc ('k') | source/row_neon.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // VERSION 2 1 // VERSION 2
2 /* 2 /*
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
4 * 4 *
5 * Use of this source code is governed by a BSD-style license 5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source 6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found 7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may 8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree. 9 * be found in the AUTHORS file in the root of the source tree.
10 */ 10 */
(...skipping 155 matching lines...) Expand 10 before | Expand all | Expand 10 after
166 }; 166 };
167 167
168 // NV21 shuf 8 VU to 16 UV. 168 // NV21 shuf 8 VU to 16 UV.
169 static const lvec8 kShuffleNV21 = { 169 static const lvec8 kShuffleNV21 = {
170 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, 170 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
171 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, 171 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
172 }; 172 };
173 #endif // HAS_RGB24TOARGBROW_SSSE3 173 #endif // HAS_RGB24TOARGBROW_SSSE3
174 174
175 #ifdef HAS_J400TOARGBROW_SSE2 175 #ifdef HAS_J400TOARGBROW_SSE2
176 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { 176 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
177 asm volatile ( 177 asm volatile (
178 "pcmpeqb %%xmm5,%%xmm5 \n" 178 "pcmpeqb %%xmm5,%%xmm5 \n"
179 "pslld $0x18,%%xmm5 \n" 179 "pslld $0x18,%%xmm5 \n"
180 LABELALIGN 180 LABELALIGN
181 "1: \n" 181 "1: \n"
182 "movq " MEMACCESS(0) ",%%xmm0 \n" 182 "movq " MEMACCESS(0) ",%%xmm0 \n"
183 "lea " MEMLEA(0x8,0) ",%0 \n" 183 "lea " MEMLEA(0x8,0) ",%0 \n"
184 "punpcklbw %%xmm0,%%xmm0 \n" 184 "punpcklbw %%xmm0,%%xmm0 \n"
185 "movdqa %%xmm0,%%xmm1 \n" 185 "movdqa %%xmm0,%%xmm1 \n"
186 "punpcklwd %%xmm0,%%xmm0 \n" 186 "punpcklwd %%xmm0,%%xmm0 \n"
187 "punpckhwd %%xmm1,%%xmm1 \n" 187 "punpckhwd %%xmm1,%%xmm1 \n"
188 "por %%xmm5,%%xmm0 \n" 188 "por %%xmm5,%%xmm0 \n"
189 "por %%xmm5,%%xmm1 \n" 189 "por %%xmm5,%%xmm1 \n"
190 "movdqu %%xmm0," MEMACCESS(1) " \n" 190 "movdqu %%xmm0," MEMACCESS(1) " \n"
191 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 191 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
192 "lea " MEMLEA(0x20,1) ",%1 \n" 192 "lea " MEMLEA(0x20,1) ",%1 \n"
193 "sub $0x8,%2 \n" 193 "sub $0x8,%2 \n"
194 "jg 1b \n" 194 "jg 1b \n"
195 : "+r"(src_y), // %0 195 : "+r"(src_y), // %0
196 "+r"(dst_argb), // %1 196 "+r"(dst_argb), // %1
197 "+r"(pix) // %2 197 "+r"(width) // %2
198 :: "memory", "cc", "xmm0", "xmm1", "xmm5" 198 :: "memory", "cc", "xmm0", "xmm1", "xmm5"
199 ); 199 );
200 } 200 }
201 #endif // HAS_J400TOARGBROW_SSE2 201 #endif // HAS_J400TOARGBROW_SSE2
202 202
203 #ifdef HAS_RGB24TOARGBROW_SSSE3 203 #ifdef HAS_RGB24TOARGBROW_SSSE3
204 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { 204 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
205 asm volatile ( 205 asm volatile (
206 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 206 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
207 "pslld $0x18,%%xmm5 \n" 207 "pslld $0x18,%%xmm5 \n"
208 "movdqa %3,%%xmm4 \n" 208 "movdqa %3,%%xmm4 \n"
209 LABELALIGN 209 LABELALIGN
210 "1: \n" 210 "1: \n"
211 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 211 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
212 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 212 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
213 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" 213 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
214 "lea " MEMLEA(0x30,0) ",%0 \n" 214 "lea " MEMLEA(0x30,0) ",%0 \n"
(...skipping 11 matching lines...) Expand all
226 "palignr $0x4,%%xmm3,%%xmm3 \n" 226 "palignr $0x4,%%xmm3,%%xmm3 \n"
227 "pshufb %%xmm4,%%xmm3 \n" 227 "pshufb %%xmm4,%%xmm3 \n"
228 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 228 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
229 "por %%xmm5,%%xmm3 \n" 229 "por %%xmm5,%%xmm3 \n"
230 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" 230 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
231 "lea " MEMLEA(0x40,1) ",%1 \n" 231 "lea " MEMLEA(0x40,1) ",%1 \n"
232 "sub $0x10,%2 \n" 232 "sub $0x10,%2 \n"
233 "jg 1b \n" 233 "jg 1b \n"
234 : "+r"(src_rgb24), // %0 234 : "+r"(src_rgb24), // %0
235 "+r"(dst_argb), // %1 235 "+r"(dst_argb), // %1
236 "+r"(pix) // %2 236 "+r"(width) // %2
237 : "m"(kShuffleMaskRGB24ToARGB) // %3 237 : "m"(kShuffleMaskRGB24ToARGB) // %3
238 : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 238 : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
239 ); 239 );
240 } 240 }
241 241
242 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { 242 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
243 asm volatile ( 243 asm volatile (
244 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 244 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
245 "pslld $0x18,%%xmm5 \n" 245 "pslld $0x18,%%xmm5 \n"
246 "movdqa %3,%%xmm4 \n" 246 "movdqa %3,%%xmm4 \n"
247 LABELALIGN 247 LABELALIGN
248 "1: \n" 248 "1: \n"
249 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 249 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
250 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 250 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
251 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" 251 "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
252 "lea " MEMLEA(0x30,0) ",%0 \n" 252 "lea " MEMLEA(0x30,0) ",%0 \n"
(...skipping 11 matching lines...) Expand all
264 "palignr $0x4,%%xmm3,%%xmm3 \n" 264 "palignr $0x4,%%xmm3,%%xmm3 \n"
265 "pshufb %%xmm4,%%xmm3 \n" 265 "pshufb %%xmm4,%%xmm3 \n"
266 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 266 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
267 "por %%xmm5,%%xmm3 \n" 267 "por %%xmm5,%%xmm3 \n"
268 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" 268 "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
269 "lea " MEMLEA(0x40,1) ",%1 \n" 269 "lea " MEMLEA(0x40,1) ",%1 \n"
270 "sub $0x10,%2 \n" 270 "sub $0x10,%2 \n"
271 "jg 1b \n" 271 "jg 1b \n"
272 : "+r"(src_raw), // %0 272 : "+r"(src_raw), // %0
273 "+r"(dst_argb), // %1 273 "+r"(dst_argb), // %1
274 "+r"(pix) // %2 274 "+r"(width) // %2
275 : "m"(kShuffleMaskRAWToARGB) // %3 275 : "m"(kShuffleMaskRAWToARGB) // %3
276 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 276 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
277 ); 277 );
278 } 278 }
279 279
280 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { 280 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
281 asm volatile ( 281 asm volatile (
282 "mov $0x1080108,%%eax \n" 282 "mov $0x1080108,%%eax \n"
283 "movd %%eax,%%xmm5 \n" 283 "movd %%eax,%%xmm5 \n"
284 "pshufd $0x0,%%xmm5,%%xmm5 \n" 284 "pshufd $0x0,%%xmm5,%%xmm5 \n"
285 "mov $0x20802080,%%eax \n" 285 "mov $0x20802080,%%eax \n"
286 "movd %%eax,%%xmm6 \n" 286 "movd %%eax,%%xmm6 \n"
287 "pshufd $0x0,%%xmm6,%%xmm6 \n" 287 "pshufd $0x0,%%xmm6,%%xmm6 \n"
288 "pcmpeqb %%xmm3,%%xmm3 \n" 288 "pcmpeqb %%xmm3,%%xmm3 \n"
289 "psllw $0xb,%%xmm3 \n" 289 "psllw $0xb,%%xmm3 \n"
290 "pcmpeqb %%xmm4,%%xmm4 \n" 290 "pcmpeqb %%xmm4,%%xmm4 \n"
(...skipping 20 matching lines...) Expand all
311 "movdqa %%xmm1,%%xmm2 \n" 311 "movdqa %%xmm1,%%xmm2 \n"
312 "punpcklbw %%xmm0,%%xmm1 \n" 312 "punpcklbw %%xmm0,%%xmm1 \n"
313 "punpckhbw %%xmm0,%%xmm2 \n" 313 "punpckhbw %%xmm0,%%xmm2 \n"
314 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) 314 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
315 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) 315 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
316 "lea " MEMLEA(0x10,0) ",%0 \n" 316 "lea " MEMLEA(0x10,0) ",%0 \n"
317 "sub $0x8,%2 \n" 317 "sub $0x8,%2 \n"
318 "jg 1b \n" 318 "jg 1b \n"
319 : "+r"(src), // %0 319 : "+r"(src), // %0
320 "+r"(dst), // %1 320 "+r"(dst), // %1
321 "+r"(pix) // %2 321 "+r"(width) // %2
322 : 322 :
323 : "memory", "cc", "eax", NACL_R14 323 : "memory", "cc", "eax", NACL_R14
324 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 324 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
325 ); 325 );
326 } 326 }
327 327
328 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { 328 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
329 asm volatile ( 329 asm volatile (
330 "mov $0x1080108,%%eax \n" 330 "mov $0x1080108,%%eax \n"
331 "movd %%eax,%%xmm5 \n" 331 "movd %%eax,%%xmm5 \n"
332 "pshufd $0x0,%%xmm5,%%xmm5 \n" 332 "pshufd $0x0,%%xmm5,%%xmm5 \n"
333 "mov $0x42004200,%%eax \n" 333 "mov $0x42004200,%%eax \n"
334 "movd %%eax,%%xmm6 \n" 334 "movd %%eax,%%xmm6 \n"
335 "pshufd $0x0,%%xmm6,%%xmm6 \n" 335 "pshufd $0x0,%%xmm6,%%xmm6 \n"
336 "pcmpeqb %%xmm3,%%xmm3 \n" 336 "pcmpeqb %%xmm3,%%xmm3 \n"
337 "psllw $0xb,%%xmm3 \n" 337 "psllw $0xb,%%xmm3 \n"
338 "movdqa %%xmm3,%%xmm4 \n" 338 "movdqa %%xmm3,%%xmm4 \n"
(...skipping 23 matching lines...) Expand all
362 "movdqa %%xmm1,%%xmm2 \n" 362 "movdqa %%xmm1,%%xmm2 \n"
363 "punpcklbw %%xmm0,%%xmm1 \n" 363 "punpcklbw %%xmm0,%%xmm1 \n"
364 "punpckhbw %%xmm0,%%xmm2 \n" 364 "punpckhbw %%xmm0,%%xmm2 \n"
365 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) 365 MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
366 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) 366 MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
367 "lea " MEMLEA(0x10,0) ",%0 \n" 367 "lea " MEMLEA(0x10,0) ",%0 \n"
368 "sub $0x8,%2 \n" 368 "sub $0x8,%2 \n"
369 "jg 1b \n" 369 "jg 1b \n"
370 : "+r"(src), // %0 370 : "+r"(src), // %0
371 "+r"(dst), // %1 371 "+r"(dst), // %1
372 "+r"(pix) // %2 372 "+r"(width) // %2
373 : 373 :
374 : "memory", "cc", "eax", NACL_R14 374 : "memory", "cc", "eax", NACL_R14
375 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 375 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
376 ); 376 );
377 } 377 }
378 378
379 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { 379 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
380 asm volatile ( 380 asm volatile (
381 "mov $0xf0f0f0f,%%eax \n" 381 "mov $0xf0f0f0f,%%eax \n"
382 "movd %%eax,%%xmm4 \n" 382 "movd %%eax,%%xmm4 \n"
383 "pshufd $0x0,%%xmm4,%%xmm4 \n" 383 "pshufd $0x0,%%xmm4,%%xmm4 \n"
384 "movdqa %%xmm4,%%xmm5 \n" 384 "movdqa %%xmm4,%%xmm5 \n"
385 "pslld $0x4,%%xmm5 \n" 385 "pslld $0x4,%%xmm5 \n"
386 "sub %0,%1 \n" 386 "sub %0,%1 \n"
387 "sub %0,%1 \n" 387 "sub %0,%1 \n"
388 LABELALIGN 388 LABELALIGN
389 "1: \n" 389 "1: \n"
(...skipping 10 matching lines...) Expand all
400 "movdqa %%xmm0,%%xmm1 \n" 400 "movdqa %%xmm0,%%xmm1 \n"
401 "punpcklbw %%xmm2,%%xmm0 \n" 401 "punpcklbw %%xmm2,%%xmm0 \n"
402 "punpckhbw %%xmm2,%%xmm1 \n" 402 "punpckhbw %%xmm2,%%xmm1 \n"
403 MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) 403 MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
404 MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) 404 MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
405 "lea " MEMLEA(0x10,0) ",%0 \n" 405 "lea " MEMLEA(0x10,0) ",%0 \n"
406 "sub $0x8,%2 \n" 406 "sub $0x8,%2 \n"
407 "jg 1b \n" 407 "jg 1b \n"
408 : "+r"(src), // %0 408 : "+r"(src), // %0
409 "+r"(dst), // %1 409 "+r"(dst), // %1
410 "+r"(pix) // %2 410 "+r"(width) // %2
411 : 411 :
412 : "memory", "cc", "eax", NACL_R14 412 : "memory", "cc", "eax", NACL_R14
413 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 413 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
414 ); 414 );
415 } 415 }
416 416
417 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { 417 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
418 asm volatile ( 418 asm volatile (
419 "movdqa %3,%%xmm6 \n" 419 "movdqa %3,%%xmm6 \n"
420 LABELALIGN 420 LABELALIGN
421 "1: \n" 421 "1: \n"
422 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 422 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
423 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 423 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
424 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 424 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
425 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 425 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
426 "lea " MEMLEA(0x40,0) ",%0 \n" 426 "lea " MEMLEA(0x40,0) ",%0 \n"
427 "pshufb %%xmm6,%%xmm0 \n" 427 "pshufb %%xmm6,%%xmm0 \n"
(...skipping 11 matching lines...) Expand all
439 "psrldq $0x8,%%xmm2 \n" 439 "psrldq $0x8,%%xmm2 \n"
440 "pslldq $0x4,%%xmm3 \n" 440 "pslldq $0x4,%%xmm3 \n"
441 "por %%xmm3,%%xmm2 \n" 441 "por %%xmm3,%%xmm2 \n"
442 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 442 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
443 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" 443 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
444 "lea " MEMLEA(0x30,1) ",%1 \n" 444 "lea " MEMLEA(0x30,1) ",%1 \n"
445 "sub $0x10,%2 \n" 445 "sub $0x10,%2 \n"
446 "jg 1b \n" 446 "jg 1b \n"
447 : "+r"(src), // %0 447 : "+r"(src), // %0
448 "+r"(dst), // %1 448 "+r"(dst), // %1
449 "+r"(pix) // %2 449 "+r"(width) // %2
450 : "m"(kShuffleMaskARGBToRGB24) // %3 450 : "m"(kShuffleMaskARGBToRGB24) // %3
451 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 451 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
452 ); 452 );
453 } 453 }
454 454
455 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { 455 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
456 asm volatile ( 456 asm volatile (
457 "movdqa %3,%%xmm6 \n" 457 "movdqa %3,%%xmm6 \n"
458 LABELALIGN 458 LABELALIGN
459 "1: \n" 459 "1: \n"
460 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 460 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
461 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 461 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
462 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 462 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
463 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 463 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
464 "lea " MEMLEA(0x40,0) ",%0 \n" 464 "lea " MEMLEA(0x40,0) ",%0 \n"
465 "pshufb %%xmm6,%%xmm0 \n" 465 "pshufb %%xmm6,%%xmm0 \n"
(...skipping 11 matching lines...) Expand all
477 "psrldq $0x8,%%xmm2 \n" 477 "psrldq $0x8,%%xmm2 \n"
478 "pslldq $0x4,%%xmm3 \n" 478 "pslldq $0x4,%%xmm3 \n"
479 "por %%xmm3,%%xmm2 \n" 479 "por %%xmm3,%%xmm2 \n"
480 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 480 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
481 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" 481 "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
482 "lea " MEMLEA(0x30,1) ",%1 \n" 482 "lea " MEMLEA(0x30,1) ",%1 \n"
483 "sub $0x10,%2 \n" 483 "sub $0x10,%2 \n"
484 "jg 1b \n" 484 "jg 1b \n"
485 : "+r"(src), // %0 485 : "+r"(src), // %0
486 "+r"(dst), // %1 486 "+r"(dst), // %1
487 "+r"(pix) // %2 487 "+r"(width) // %2
488 : "m"(kShuffleMaskARGBToRAW) // %3 488 : "m"(kShuffleMaskARGBToRAW) // %3
489 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 489 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
490 ); 490 );
491 } 491 }
492 492
493 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { 493 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
494 asm volatile ( 494 asm volatile (
495 "pcmpeqb %%xmm3,%%xmm3 \n" 495 "pcmpeqb %%xmm3,%%xmm3 \n"
496 "psrld $0x1b,%%xmm3 \n" 496 "psrld $0x1b,%%xmm3 \n"
497 "pcmpeqb %%xmm4,%%xmm4 \n" 497 "pcmpeqb %%xmm4,%%xmm4 \n"
498 "psrld $0x1a,%%xmm4 \n" 498 "psrld $0x1a,%%xmm4 \n"
499 "pslld $0x5,%%xmm4 \n" 499 "pslld $0x5,%%xmm4 \n"
500 "pcmpeqb %%xmm5,%%xmm5 \n" 500 "pcmpeqb %%xmm5,%%xmm5 \n"
501 "pslld $0xb,%%xmm5 \n" 501 "pslld $0xb,%%xmm5 \n"
502 LABELALIGN 502 LABELALIGN
503 "1: \n" 503 "1: \n"
(...skipping 10 matching lines...) Expand all
514 "por %%xmm2,%%xmm1 \n" 514 "por %%xmm2,%%xmm1 \n"
515 "por %%xmm1,%%xmm0 \n" 515 "por %%xmm1,%%xmm0 \n"
516 "packssdw %%xmm0,%%xmm0 \n" 516 "packssdw %%xmm0,%%xmm0 \n"
517 "lea " MEMLEA(0x10,0) ",%0 \n" 517 "lea " MEMLEA(0x10,0) ",%0 \n"
518 "movq %%xmm0," MEMACCESS(1) " \n" 518 "movq %%xmm0," MEMACCESS(1) " \n"
519 "lea " MEMLEA(0x8,1) ",%1 \n" 519 "lea " MEMLEA(0x8,1) ",%1 \n"
520 "sub $0x4,%2 \n" 520 "sub $0x4,%2 \n"
521 "jg 1b \n" 521 "jg 1b \n"
522 : "+r"(src), // %0 522 : "+r"(src), // %0
523 "+r"(dst), // %1 523 "+r"(dst), // %1
524 "+r"(pix) // %2 524 "+r"(width) // %2
525 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 525 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
526 ); 526 );
527 } 527 }
528 528
529 void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst, 529 void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
530 const uint32 dither4, int pix) { 530 const uint32 dither4, int width) {
531 asm volatile ( 531 asm volatile (
532 "movd %3,%%xmm6 \n" 532 "movd %3,%%xmm6 \n"
533 "punpcklbw %%xmm6,%%xmm6 \n" 533 "punpcklbw %%xmm6,%%xmm6 \n"
534 "movdqa %%xmm6,%%xmm7 \n" 534 "movdqa %%xmm6,%%xmm7 \n"
535 "punpcklwd %%xmm6,%%xmm6 \n" 535 "punpcklwd %%xmm6,%%xmm6 \n"
536 "punpckhwd %%xmm7,%%xmm7 \n" 536 "punpckhwd %%xmm7,%%xmm7 \n"
537 "pcmpeqb %%xmm3,%%xmm3 \n" 537 "pcmpeqb %%xmm3,%%xmm3 \n"
538 "psrld $0x1b,%%xmm3 \n" 538 "psrld $0x1b,%%xmm3 \n"
539 "pcmpeqb %%xmm4,%%xmm4 \n" 539 "pcmpeqb %%xmm4,%%xmm4 \n"
540 "psrld $0x1a,%%xmm4 \n" 540 "psrld $0x1a,%%xmm4 \n"
(...skipping 17 matching lines...) Expand all
558 "por %%xmm2,%%xmm1 \n" 558 "por %%xmm2,%%xmm1 \n"
559 "por %%xmm1,%%xmm0 \n" 559 "por %%xmm1,%%xmm0 \n"
560 "packssdw %%xmm0,%%xmm0 \n" 560 "packssdw %%xmm0,%%xmm0 \n"
561 "lea 0x10(%0),%0 \n" 561 "lea 0x10(%0),%0 \n"
562 "movq %%xmm0,(%1) \n" 562 "movq %%xmm0,(%1) \n"
563 "lea 0x8(%1),%1 \n" 563 "lea 0x8(%1),%1 \n"
564 "sub $0x4,%2 \n" 564 "sub $0x4,%2 \n"
565 "jg 1b \n" 565 "jg 1b \n"
566 : "+r"(src), // %0 566 : "+r"(src), // %0
567 "+r"(dst), // %1 567 "+r"(dst), // %1
568 "+r"(pix) // %2 568 "+r"(width) // %2
569 : "m"(dither4) // %3 569 : "m"(dither4) // %3
570 : "memory", "cc", 570 : "memory", "cc",
571 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 571 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
572 ); 572 );
573 } 573 }
574 574
575 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 575 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
576 void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst, 576 void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
577 const uint32 dither4, int pix) { 577 const uint32 dither4, int width) {
578 asm volatile ( 578 asm volatile (
579 "vbroadcastss %3,%%xmm6 \n" 579 "vbroadcastss %3,%%xmm6 \n"
580 "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" 580 "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
581 "vpermq $0xd8,%%ymm6,%%ymm6 \n" 581 "vpermq $0xd8,%%ymm6,%%ymm6 \n"
582 "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" 582 "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
583 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" 583 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
584 "vpsrld $0x1b,%%ymm3,%%ymm3 \n" 584 "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
585 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" 585 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
586 "vpsrld $0x1a,%%ymm4,%%ymm4 \n" 586 "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
587 "vpslld $0x5,%%ymm4,%%ymm4 \n" 587 "vpslld $0x5,%%ymm4,%%ymm4 \n"
(...skipping 14 matching lines...) Expand all
602 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" 602 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
603 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 603 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
604 "lea 0x20(%0),%0 \n" 604 "lea 0x20(%0),%0 \n"
605 "vmovdqu %%xmm0,(%1) \n" 605 "vmovdqu %%xmm0,(%1) \n"
606 "lea 0x10(%1),%1 \n" 606 "lea 0x10(%1),%1 \n"
607 "sub $0x8,%2 \n" 607 "sub $0x8,%2 \n"
608 "jg 1b \n" 608 "jg 1b \n"
609 "vzeroupper \n" 609 "vzeroupper \n"
610 : "+r"(src), // %0 610 : "+r"(src), // %0
611 "+r"(dst), // %1 611 "+r"(dst), // %1
612 "+r"(pix) // %2 612 "+r"(width) // %2
613 : "m"(dither4) // %3 613 : "m"(dither4) // %3
614 : "memory", "cc", 614 : "memory", "cc",
615 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 615 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
616 ); 616 );
617 } 617 }
618 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 618 #endif // HAS_ARGBTORGB565DITHERROW_AVX2
619 619
620 620
621 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { 621 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
622 asm volatile ( 622 asm volatile (
623 "pcmpeqb %%xmm4,%%xmm4 \n" 623 "pcmpeqb %%xmm4,%%xmm4 \n"
624 "psrld $0x1b,%%xmm4 \n" 624 "psrld $0x1b,%%xmm4 \n"
625 "movdqa %%xmm4,%%xmm5 \n" 625 "movdqa %%xmm4,%%xmm5 \n"
626 "pslld $0x5,%%xmm5 \n" 626 "pslld $0x5,%%xmm5 \n"
627 "movdqa %%xmm4,%%xmm6 \n" 627 "movdqa %%xmm4,%%xmm6 \n"
628 "pslld $0xa,%%xmm6 \n" 628 "pslld $0xa,%%xmm6 \n"
629 "pcmpeqb %%xmm7,%%xmm7 \n" 629 "pcmpeqb %%xmm7,%%xmm7 \n"
630 "pslld $0xf,%%xmm7 \n" 630 "pslld $0xf,%%xmm7 \n"
631 LABELALIGN 631 LABELALIGN
(...skipping 14 matching lines...) Expand all
646 "por %%xmm3,%%xmm2 \n" 646 "por %%xmm3,%%xmm2 \n"
647 "por %%xmm2,%%xmm0 \n" 647 "por %%xmm2,%%xmm0 \n"
648 "packssdw %%xmm0,%%xmm0 \n" 648 "packssdw %%xmm0,%%xmm0 \n"
649 "lea " MEMLEA(0x10,0) ",%0 \n" 649 "lea " MEMLEA(0x10,0) ",%0 \n"
650 "movq %%xmm0," MEMACCESS(1) " \n" 650 "movq %%xmm0," MEMACCESS(1) " \n"
651 "lea " MEMLEA(0x8,1) ",%1 \n" 651 "lea " MEMLEA(0x8,1) ",%1 \n"
652 "sub $0x4,%2 \n" 652 "sub $0x4,%2 \n"
653 "jg 1b \n" 653 "jg 1b \n"
654 : "+r"(src), // %0 654 : "+r"(src), // %0
655 "+r"(dst), // %1 655 "+r"(dst), // %1
656 "+r"(pix) // %2 656 "+r"(width) // %2
657 :: "memory", "cc", 657 :: "memory", "cc",
658 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 658 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
659 ); 659 );
660 } 660 }
661 661
662 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { 662 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
663 asm volatile ( 663 asm volatile (
664 "pcmpeqb %%xmm4,%%xmm4 \n" 664 "pcmpeqb %%xmm4,%%xmm4 \n"
665 "psllw $0xc,%%xmm4 \n" 665 "psllw $0xc,%%xmm4 \n"
666 "movdqa %%xmm4,%%xmm3 \n" 666 "movdqa %%xmm4,%%xmm3 \n"
667 "psrlw $0x8,%%xmm3 \n" 667 "psrlw $0x8,%%xmm3 \n"
668 LABELALIGN 668 LABELALIGN
669 "1: \n" 669 "1: \n"
670 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 670 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
671 "movdqa %%xmm0,%%xmm1 \n" 671 "movdqa %%xmm0,%%xmm1 \n"
672 "pand %%xmm3,%%xmm0 \n" 672 "pand %%xmm3,%%xmm0 \n"
673 "pand %%xmm4,%%xmm1 \n" 673 "pand %%xmm4,%%xmm1 \n"
674 "psrlq $0x4,%%xmm0 \n" 674 "psrlq $0x4,%%xmm0 \n"
675 "psrlq $0x8,%%xmm1 \n" 675 "psrlq $0x8,%%xmm1 \n"
676 "por %%xmm1,%%xmm0 \n" 676 "por %%xmm1,%%xmm0 \n"
677 "packuswb %%xmm0,%%xmm0 \n" 677 "packuswb %%xmm0,%%xmm0 \n"
678 "lea " MEMLEA(0x10,0) ",%0 \n" 678 "lea " MEMLEA(0x10,0) ",%0 \n"
679 "movq %%xmm0," MEMACCESS(1) " \n" 679 "movq %%xmm0," MEMACCESS(1) " \n"
680 "lea " MEMLEA(0x8,1) ",%1 \n" 680 "lea " MEMLEA(0x8,1) ",%1 \n"
681 "sub $0x4,%2 \n" 681 "sub $0x4,%2 \n"
682 "jg 1b \n" 682 "jg 1b \n"
683 : "+r"(src), // %0 683 : "+r"(src), // %0
684 "+r"(dst), // %1 684 "+r"(dst), // %1
685 "+r"(pix) // %2 685 "+r"(width) // %2
686 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" 686 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
687 ); 687 );
688 } 688 }
689 #endif // HAS_RGB24TOARGBROW_SSSE3 689 #endif // HAS_RGB24TOARGBROW_SSSE3
690 690
691 #ifdef HAS_ARGBTOYROW_SSSE3 691 #ifdef HAS_ARGBTOYROW_SSSE3
692 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. 692 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
693 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 693 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
694 asm volatile ( 694 asm volatile (
695 "movdqa %3,%%xmm4 \n" 695 "movdqa %3,%%xmm4 \n"
696 "movdqa %4,%%xmm5 \n" 696 "movdqa %4,%%xmm5 \n"
697 LABELALIGN 697 LABELALIGN
698 "1: \n" 698 "1: \n"
699 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 699 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
700 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 700 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
701 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 701 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
702 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 702 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
703 "pmaddubsw %%xmm4,%%xmm0 \n" 703 "pmaddubsw %%xmm4,%%xmm0 \n"
704 "pmaddubsw %%xmm4,%%xmm1 \n" 704 "pmaddubsw %%xmm4,%%xmm1 \n"
705 "pmaddubsw %%xmm4,%%xmm2 \n" 705 "pmaddubsw %%xmm4,%%xmm2 \n"
706 "pmaddubsw %%xmm4,%%xmm3 \n" 706 "pmaddubsw %%xmm4,%%xmm3 \n"
707 "lea " MEMLEA(0x40,0) ",%0 \n" 707 "lea " MEMLEA(0x40,0) ",%0 \n"
708 "phaddw %%xmm1,%%xmm0 \n" 708 "phaddw %%xmm1,%%xmm0 \n"
709 "phaddw %%xmm3,%%xmm2 \n" 709 "phaddw %%xmm3,%%xmm2 \n"
710 "psrlw $0x7,%%xmm0 \n" 710 "psrlw $0x7,%%xmm0 \n"
711 "psrlw $0x7,%%xmm2 \n" 711 "psrlw $0x7,%%xmm2 \n"
712 "packuswb %%xmm2,%%xmm0 \n" 712 "packuswb %%xmm2,%%xmm0 \n"
713 "paddb %%xmm5,%%xmm0 \n" 713 "paddb %%xmm5,%%xmm0 \n"
714 "movdqu %%xmm0," MEMACCESS(1) " \n" 714 "movdqu %%xmm0," MEMACCESS(1) " \n"
715 "lea " MEMLEA(0x10,1) ",%1 \n" 715 "lea " MEMLEA(0x10,1) ",%1 \n"
716 "sub $0x10,%2 \n" 716 "sub $0x10,%2 \n"
717 "jg 1b \n" 717 "jg 1b \n"
718 : "+r"(src_argb), // %0 718 : "+r"(src_argb), // %0
719 "+r"(dst_y), // %1 719 "+r"(dst_y), // %1
720 "+r"(pix) // %2 720 "+r"(width) // %2
721 : "m"(kARGBToY), // %3 721 : "m"(kARGBToY), // %3
722 "m"(kAddY16) // %4 722 "m"(kAddY16) // %4
723 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 723 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
724 ); 724 );
725 } 725 }
726 #endif // HAS_ARGBTOYROW_SSSE3 726 #endif // HAS_ARGBTOYROW_SSSE3
727 727
728 #ifdef HAS_ARGBTOYJROW_SSSE3 728 #ifdef HAS_ARGBTOYJROW_SSSE3
729 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. 729 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
730 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 730 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
731 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 731 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
732 asm volatile ( 732 asm volatile (
733 "movdqa %3,%%xmm4 \n" 733 "movdqa %3,%%xmm4 \n"
734 "movdqa %4,%%xmm5 \n" 734 "movdqa %4,%%xmm5 \n"
735 LABELALIGN 735 LABELALIGN
736 "1: \n" 736 "1: \n"
737 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 737 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
738 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 738 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
739 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 739 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
740 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 740 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
741 "pmaddubsw %%xmm4,%%xmm0 \n" 741 "pmaddubsw %%xmm4,%%xmm0 \n"
742 "pmaddubsw %%xmm4,%%xmm1 \n" 742 "pmaddubsw %%xmm4,%%xmm1 \n"
743 "pmaddubsw %%xmm4,%%xmm2 \n" 743 "pmaddubsw %%xmm4,%%xmm2 \n"
744 "pmaddubsw %%xmm4,%%xmm3 \n" 744 "pmaddubsw %%xmm4,%%xmm3 \n"
745 "lea " MEMLEA(0x40,0) ",%0 \n" 745 "lea " MEMLEA(0x40,0) ",%0 \n"
746 "phaddw %%xmm1,%%xmm0 \n" 746 "phaddw %%xmm1,%%xmm0 \n"
747 "phaddw %%xmm3,%%xmm2 \n" 747 "phaddw %%xmm3,%%xmm2 \n"
748 "paddw %%xmm5,%%xmm0 \n" 748 "paddw %%xmm5,%%xmm0 \n"
749 "paddw %%xmm5,%%xmm2 \n" 749 "paddw %%xmm5,%%xmm2 \n"
750 "psrlw $0x7,%%xmm0 \n" 750 "psrlw $0x7,%%xmm0 \n"
751 "psrlw $0x7,%%xmm2 \n" 751 "psrlw $0x7,%%xmm2 \n"
752 "packuswb %%xmm2,%%xmm0 \n" 752 "packuswb %%xmm2,%%xmm0 \n"
753 "movdqu %%xmm0," MEMACCESS(1) " \n" 753 "movdqu %%xmm0," MEMACCESS(1) " \n"
754 "lea " MEMLEA(0x10,1) ",%1 \n" 754 "lea " MEMLEA(0x10,1) ",%1 \n"
755 "sub $0x10,%2 \n" 755 "sub $0x10,%2 \n"
756 "jg 1b \n" 756 "jg 1b \n"
757 : "+r"(src_argb), // %0 757 : "+r"(src_argb), // %0
758 "+r"(dst_y), // %1 758 "+r"(dst_y), // %1
759 "+r"(pix) // %2 759 "+r"(width) // %2
760 : "m"(kARGBToYJ), // %3 760 : "m"(kARGBToYJ), // %3
761 "m"(kAddYJ64) // %4 761 "m"(kAddYJ64) // %4
762 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 762 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
763 ); 763 );
764 } 764 }
765 #endif // HAS_ARGBTOYJROW_SSSE3 765 #endif // HAS_ARGBTOYJROW_SSSE3
766 766
767 #ifdef HAS_ARGBTOYROW_AVX2 767 #ifdef HAS_ARGBTOYROW_AVX2
768 // vpermd for vphaddw + vpackuswb vpermd. 768 // vpermd for vphaddw + vpackuswb vpermd.
769 static const lvec32 kPermdARGBToY_AVX = { 769 static const lvec32 kPermdARGBToY_AVX = {
770 0, 4, 1, 5, 2, 6, 3, 7 770 0, 4, 1, 5, 2, 6, 3, 7
771 }; 771 };
772 772
773 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 773 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
774 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { 774 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
775 asm volatile ( 775 asm volatile (
776 "vbroadcastf128 %3,%%ymm4 \n" 776 "vbroadcastf128 %3,%%ymm4 \n"
777 "vbroadcastf128 %4,%%ymm5 \n" 777 "vbroadcastf128 %4,%%ymm5 \n"
778 "vmovdqu %5,%%ymm6 \n" 778 "vmovdqu %5,%%ymm6 \n"
779 LABELALIGN 779 LABELALIGN
780 "1: \n" 780 "1: \n"
781 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 781 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
782 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 782 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
783 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" 783 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
784 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" 784 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
785 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" 785 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
786 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" 786 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
787 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" 787 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
788 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" 788 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
789 "lea " MEMLEA(0x80,0) ",%0 \n" 789 "lea " MEMLEA(0x80,0) ",%0 \n"
790 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. 790 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
791 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" 791 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
792 "vpsrlw $0x7,%%ymm0,%%ymm0 \n" 792 "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
793 "vpsrlw $0x7,%%ymm2,%%ymm2 \n" 793 "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
794 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. 794 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
795 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. 795 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
796 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y 796 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
797 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 797 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
798 "lea " MEMLEA(0x20,1) ",%1 \n" 798 "lea " MEMLEA(0x20,1) ",%1 \n"
799 "sub $0x20,%2 \n" 799 "sub $0x20,%2 \n"
800 "jg 1b \n" 800 "jg 1b \n"
801 "vzeroupper \n" 801 "vzeroupper \n"
802 : "+r"(src_argb), // %0 802 : "+r"(src_argb), // %0
803 "+r"(dst_y), // %1 803 "+r"(dst_y), // %1
804 "+r"(pix) // %2 804 "+r"(width) // %2
805 : "m"(kARGBToY), // %3 805 : "m"(kARGBToY), // %3
806 "m"(kAddY16), // %4 806 "m"(kAddY16), // %4
807 "m"(kPermdARGBToY_AVX) // %5 807 "m"(kPermdARGBToY_AVX) // %5
808 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 808 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
809 ); 809 );
810 } 810 }
811 #endif // HAS_ARGBTOYROW_AVX2 811 #endif // HAS_ARGBTOYROW_AVX2
812 812
813 #ifdef HAS_ARGBTOYJROW_AVX2 813 #ifdef HAS_ARGBTOYJROW_AVX2
814 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 814 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
815 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { 815 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
816 asm volatile ( 816 asm volatile (
817 "vbroadcastf128 %3,%%ymm4 \n" 817 "vbroadcastf128 %3,%%ymm4 \n"
818 "vbroadcastf128 %4,%%ymm5 \n" 818 "vbroadcastf128 %4,%%ymm5 \n"
819 "vmovdqu %5,%%ymm6 \n" 819 "vmovdqu %5,%%ymm6 \n"
820 LABELALIGN 820 LABELALIGN
821 "1: \n" 821 "1: \n"
822 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 822 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
823 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 823 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
824 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" 824 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
825 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" 825 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
(...skipping 10 matching lines...) Expand all
836 "vpsrlw $0x7,%%ymm2,%%ymm2 \n" 836 "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
837 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. 837 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
838 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. 838 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
839 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 839 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
840 "lea " MEMLEA(0x20,1) ",%1 \n" 840 "lea " MEMLEA(0x20,1) ",%1 \n"
841 "sub $0x20,%2 \n" 841 "sub $0x20,%2 \n"
842 "jg 1b \n" 842 "jg 1b \n"
843 "vzeroupper \n" 843 "vzeroupper \n"
844 : "+r"(src_argb), // %0 844 : "+r"(src_argb), // %0
845 "+r"(dst_y), // %1 845 "+r"(dst_y), // %1
846 "+r"(pix) // %2 846 "+r"(width) // %2
847 : "m"(kARGBToYJ), // %3 847 : "m"(kARGBToYJ), // %3
848 "m"(kAddYJ64), // %4 848 "m"(kAddYJ64), // %4
849 "m"(kPermdARGBToY_AVX) // %5 849 "m"(kPermdARGBToY_AVX) // %5
850 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 850 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
851 ); 851 );
852 } 852 }
853 #endif // HAS_ARGBTOYJROW_AVX2 853 #endif // HAS_ARGBTOYJROW_AVX2
854 854
855 #ifdef HAS_ARGBTOUVROW_SSSE3 855 #ifdef HAS_ARGBTOUVROW_SSSE3
856 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 856 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
(...skipping 290 matching lines...) Expand 10 before | Expand all | Expand 10 after
1147 "+rm"(width) // %3 1147 "+rm"(width) // %3
1148 : "m"(kARGBToV), // %4 1148 : "m"(kARGBToV), // %4
1149 "m"(kARGBToU), // %5 1149 "m"(kARGBToU), // %5
1150 "m"(kAddUV128) // %6 1150 "m"(kAddUV128) // %6
1151 : "memory", "cc", NACL_R14 1151 : "memory", "cc", NACL_R14
1152 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1152 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1153 ); 1153 );
1154 } 1154 }
1155 #endif // HAS_ARGBTOUV422ROW_SSSE3 1155 #endif // HAS_ARGBTOUV422ROW_SSSE3
1156 1156
1157 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { 1157 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
1158 asm volatile ( 1158 asm volatile (
1159 "movdqa %4,%%xmm5 \n" 1159 "movdqa %4,%%xmm5 \n"
1160 "movdqa %3,%%xmm4 \n" 1160 "movdqa %3,%%xmm4 \n"
1161 LABELALIGN 1161 LABELALIGN
1162 "1: \n" 1162 "1: \n"
1163 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1163 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1164 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1164 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1165 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1165 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1166 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1166 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1167 "pmaddubsw %%xmm4,%%xmm0 \n" 1167 "pmaddubsw %%xmm4,%%xmm0 \n"
1168 "pmaddubsw %%xmm4,%%xmm1 \n" 1168 "pmaddubsw %%xmm4,%%xmm1 \n"
1169 "pmaddubsw %%xmm4,%%xmm2 \n" 1169 "pmaddubsw %%xmm4,%%xmm2 \n"
1170 "pmaddubsw %%xmm4,%%xmm3 \n" 1170 "pmaddubsw %%xmm4,%%xmm3 \n"
1171 "lea " MEMLEA(0x40,0) ",%0 \n" 1171 "lea " MEMLEA(0x40,0) ",%0 \n"
1172 "phaddw %%xmm1,%%xmm0 \n" 1172 "phaddw %%xmm1,%%xmm0 \n"
1173 "phaddw %%xmm3,%%xmm2 \n" 1173 "phaddw %%xmm3,%%xmm2 \n"
1174 "psrlw $0x7,%%xmm0 \n" 1174 "psrlw $0x7,%%xmm0 \n"
1175 "psrlw $0x7,%%xmm2 \n" 1175 "psrlw $0x7,%%xmm2 \n"
1176 "packuswb %%xmm2,%%xmm0 \n" 1176 "packuswb %%xmm2,%%xmm0 \n"
1177 "paddb %%xmm5,%%xmm0 \n" 1177 "paddb %%xmm5,%%xmm0 \n"
1178 "movdqu %%xmm0," MEMACCESS(1) " \n" 1178 "movdqu %%xmm0," MEMACCESS(1) " \n"
1179 "lea " MEMLEA(0x10,1) ",%1 \n" 1179 "lea " MEMLEA(0x10,1) ",%1 \n"
1180 "sub $0x10,%2 \n" 1180 "sub $0x10,%2 \n"
1181 "jg 1b \n" 1181 "jg 1b \n"
1182 : "+r"(src_bgra), // %0 1182 : "+r"(src_bgra), // %0
1183 "+r"(dst_y), // %1 1183 "+r"(dst_y), // %1
1184 "+r"(pix) // %2 1184 "+r"(width) // %2
1185 : "m"(kBGRAToY), // %3 1185 : "m"(kBGRAToY), // %3
1186 "m"(kAddY16) // %4 1186 "m"(kAddY16) // %4
1187 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1187 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1188 ); 1188 );
1189 } 1189 }
1190 1190
1191 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, 1191 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1192 uint8* dst_u, uint8* dst_v, int width) { 1192 uint8* dst_u, uint8* dst_v, int width) {
1193 asm volatile ( 1193 asm volatile (
1194 "movdqa %5,%%xmm3 \n" 1194 "movdqa %5,%%xmm3 \n"
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
1242 "+rm"(width) // %3 1242 "+rm"(width) // %3
1243 : "r"((intptr_t)(src_stride_bgra)), // %4 1243 : "r"((intptr_t)(src_stride_bgra)), // %4
1244 "m"(kBGRAToV), // %5 1244 "m"(kBGRAToV), // %5
1245 "m"(kBGRAToU), // %6 1245 "m"(kBGRAToU), // %6
1246 "m"(kAddUV128) // %7 1246 "m"(kAddUV128) // %7
1247 : "memory", "cc", NACL_R14 1247 : "memory", "cc", NACL_R14
1248 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1248 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1249 ); 1249 );
1250 } 1250 }
1251 1251
1252 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { 1252 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
1253 asm volatile ( 1253 asm volatile (
1254 "movdqa %4,%%xmm5 \n" 1254 "movdqa %4,%%xmm5 \n"
1255 "movdqa %3,%%xmm4 \n" 1255 "movdqa %3,%%xmm4 \n"
1256 LABELALIGN 1256 LABELALIGN
1257 "1: \n" 1257 "1: \n"
1258 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1258 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1259 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1259 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1260 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1260 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1261 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1261 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1262 "pmaddubsw %%xmm4,%%xmm0 \n" 1262 "pmaddubsw %%xmm4,%%xmm0 \n"
1263 "pmaddubsw %%xmm4,%%xmm1 \n" 1263 "pmaddubsw %%xmm4,%%xmm1 \n"
1264 "pmaddubsw %%xmm4,%%xmm2 \n" 1264 "pmaddubsw %%xmm4,%%xmm2 \n"
1265 "pmaddubsw %%xmm4,%%xmm3 \n" 1265 "pmaddubsw %%xmm4,%%xmm3 \n"
1266 "lea " MEMLEA(0x40,0) ",%0 \n" 1266 "lea " MEMLEA(0x40,0) ",%0 \n"
1267 "phaddw %%xmm1,%%xmm0 \n" 1267 "phaddw %%xmm1,%%xmm0 \n"
1268 "phaddw %%xmm3,%%xmm2 \n" 1268 "phaddw %%xmm3,%%xmm2 \n"
1269 "psrlw $0x7,%%xmm0 \n" 1269 "psrlw $0x7,%%xmm0 \n"
1270 "psrlw $0x7,%%xmm2 \n" 1270 "psrlw $0x7,%%xmm2 \n"
1271 "packuswb %%xmm2,%%xmm0 \n" 1271 "packuswb %%xmm2,%%xmm0 \n"
1272 "paddb %%xmm5,%%xmm0 \n" 1272 "paddb %%xmm5,%%xmm0 \n"
1273 "movdqu %%xmm0," MEMACCESS(1) " \n" 1273 "movdqu %%xmm0," MEMACCESS(1) " \n"
1274 "lea " MEMLEA(0x10,1) ",%1 \n" 1274 "lea " MEMLEA(0x10,1) ",%1 \n"
1275 "sub $0x10,%2 \n" 1275 "sub $0x10,%2 \n"
1276 "jg 1b \n" 1276 "jg 1b \n"
1277 : "+r"(src_abgr), // %0 1277 : "+r"(src_abgr), // %0
1278 "+r"(dst_y), // %1 1278 "+r"(dst_y), // %1
1279 "+r"(pix) // %2 1279 "+r"(width) // %2
1280 : "m"(kABGRToY), // %3 1280 : "m"(kABGRToY), // %3
1281 "m"(kAddY16) // %4 1281 "m"(kAddY16) // %4
1282 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1282 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1283 ); 1283 );
1284 } 1284 }
1285 1285
1286 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { 1286 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
1287 asm volatile ( 1287 asm volatile (
1288 "movdqa %4,%%xmm5 \n" 1288 "movdqa %4,%%xmm5 \n"
1289 "movdqa %3,%%xmm4 \n" 1289 "movdqa %3,%%xmm4 \n"
1290 LABELALIGN 1290 LABELALIGN
1291 "1: \n" 1291 "1: \n"
1292 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1292 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1293 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1293 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1294 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1294 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1295 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1295 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
1296 "pmaddubsw %%xmm4,%%xmm0 \n" 1296 "pmaddubsw %%xmm4,%%xmm0 \n"
1297 "pmaddubsw %%xmm4,%%xmm1 \n" 1297 "pmaddubsw %%xmm4,%%xmm1 \n"
1298 "pmaddubsw %%xmm4,%%xmm2 \n" 1298 "pmaddubsw %%xmm4,%%xmm2 \n"
1299 "pmaddubsw %%xmm4,%%xmm3 \n" 1299 "pmaddubsw %%xmm4,%%xmm3 \n"
1300 "lea " MEMLEA(0x40,0) ",%0 \n" 1300 "lea " MEMLEA(0x40,0) ",%0 \n"
1301 "phaddw %%xmm1,%%xmm0 \n" 1301 "phaddw %%xmm1,%%xmm0 \n"
1302 "phaddw %%xmm3,%%xmm2 \n" 1302 "phaddw %%xmm3,%%xmm2 \n"
1303 "psrlw $0x7,%%xmm0 \n" 1303 "psrlw $0x7,%%xmm0 \n"
1304 "psrlw $0x7,%%xmm2 \n" 1304 "psrlw $0x7,%%xmm2 \n"
1305 "packuswb %%xmm2,%%xmm0 \n" 1305 "packuswb %%xmm2,%%xmm0 \n"
1306 "paddb %%xmm5,%%xmm0 \n" 1306 "paddb %%xmm5,%%xmm0 \n"
1307 "movdqu %%xmm0," MEMACCESS(1) " \n" 1307 "movdqu %%xmm0," MEMACCESS(1) " \n"
1308 "lea " MEMLEA(0x10,1) ",%1 \n" 1308 "lea " MEMLEA(0x10,1) ",%1 \n"
1309 "sub $0x10,%2 \n" 1309 "sub $0x10,%2 \n"
1310 "jg 1b \n" 1310 "jg 1b \n"
1311 : "+r"(src_rgba), // %0 1311 : "+r"(src_rgba), // %0
1312 "+r"(dst_y), // %1 1312 "+r"(dst_y), // %1
1313 "+r"(pix) // %2 1313 "+r"(width) // %2
1314 : "m"(kRGBAToY), // %3 1314 : "m"(kRGBAToY), // %3
1315 "m"(kAddY16) // %4 1315 "m"(kAddY16) // %4
1316 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1316 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1317 ); 1317 );
1318 } 1318 }
1319 1319
1320 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, 1320 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1321 uint8* dst_u, uint8* dst_v, int width) { 1321 uint8* dst_u, uint8* dst_v, int width) {
1322 asm volatile ( 1322 asm volatile (
1323 "movdqa %5,%%xmm3 \n" 1323 "movdqa %5,%%xmm3 \n"
(...skipping 1384 matching lines...) Expand 10 before | Expand all | Expand 10 after
2708 "+r"(dst), // %1 2708 "+r"(dst), // %1
2709 "+r"(temp_width) // %2 2709 "+r"(temp_width) // %2
2710 : "m"(kARGBShuffleMirror_AVX2) // %3 2710 : "m"(kARGBShuffleMirror_AVX2) // %3
2711 : "memory", "cc", NACL_R14 2711 : "memory", "cc", NACL_R14
2712 "xmm0", "xmm5" 2712 "xmm0", "xmm5"
2713 ); 2713 );
2714 } 2714 }
2715 #endif // HAS_ARGBMIRRORROW_AVX2 2715 #endif // HAS_ARGBMIRRORROW_AVX2
2716 2716
2717 #ifdef HAS_SPLITUVROW_AVX2 2717 #ifdef HAS_SPLITUVROW_AVX2
2718 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 2718 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
2719 asm volatile ( 2719 asm volatile (
2720 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2720 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2721 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 2721 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
2722 "sub %1,%2 \n" 2722 "sub %1,%2 \n"
2723 LABELALIGN 2723 LABELALIGN
2724 "1: \n" 2724 "1: \n"
2725 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 2725 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2726 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 2726 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2727 "lea " MEMLEA(0x40,0) ",%0 \n" 2727 "lea " MEMLEA(0x40,0) ",%0 \n"
2728 "vpsrlw $0x8,%%ymm0,%%ymm2 \n" 2728 "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
2729 "vpsrlw $0x8,%%ymm1,%%ymm3 \n" 2729 "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
2730 "vpand %%ymm5,%%ymm0,%%ymm0 \n" 2730 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
2731 "vpand %%ymm5,%%ymm1,%%ymm1 \n" 2731 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
2732 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" 2732 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2733 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" 2733 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
2734 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 2734 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2735 "vpermq $0xd8,%%ymm2,%%ymm2 \n" 2735 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2736 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 2736 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2737 MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) 2737 MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
2738 "lea " MEMLEA(0x20,1) ",%1 \n" 2738 "lea " MEMLEA(0x20,1) ",%1 \n"
2739 "sub $0x20,%3 \n" 2739 "sub $0x20,%3 \n"
2740 "jg 1b \n" 2740 "jg 1b \n"
2741 "vzeroupper \n" 2741 "vzeroupper \n"
2742 : "+r"(src_uv), // %0 2742 : "+r"(src_uv), // %0
2743 "+r"(dst_u), // %1 2743 "+r"(dst_u), // %1
2744 "+r"(dst_v), // %2 2744 "+r"(dst_v), // %2
2745 "+r"(pix) // %3 2745 "+r"(width) // %3
2746 : 2746 :
2747 : "memory", "cc", NACL_R14 2747 : "memory", "cc", NACL_R14
2748 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 2748 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2749 ); 2749 );
2750 } 2750 }
2751 #endif // HAS_SPLITUVROW_AVX2 2751 #endif // HAS_SPLITUVROW_AVX2
2752 2752
2753 #ifdef HAS_SPLITUVROW_SSE2 2753 #ifdef HAS_SPLITUVROW_SSE2
2754 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 2754 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
2755 asm volatile ( 2755 asm volatile (
2756 "pcmpeqb %%xmm5,%%xmm5 \n" 2756 "pcmpeqb %%xmm5,%%xmm5 \n"
2757 "psrlw $0x8,%%xmm5 \n" 2757 "psrlw $0x8,%%xmm5 \n"
2758 "sub %1,%2 \n" 2758 "sub %1,%2 \n"
2759 LABELALIGN 2759 LABELALIGN
2760 "1: \n" 2760 "1: \n"
2761 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2761 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2762 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 2762 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2763 "lea " MEMLEA(0x20,0) ",%0 \n" 2763 "lea " MEMLEA(0x20,0) ",%0 \n"
2764 "movdqa %%xmm0,%%xmm2 \n" 2764 "movdqa %%xmm0,%%xmm2 \n"
2765 "movdqa %%xmm1,%%xmm3 \n" 2765 "movdqa %%xmm1,%%xmm3 \n"
2766 "pand %%xmm5,%%xmm0 \n" 2766 "pand %%xmm5,%%xmm0 \n"
2767 "pand %%xmm5,%%xmm1 \n" 2767 "pand %%xmm5,%%xmm1 \n"
2768 "packuswb %%xmm1,%%xmm0 \n" 2768 "packuswb %%xmm1,%%xmm0 \n"
2769 "psrlw $0x8,%%xmm2 \n" 2769 "psrlw $0x8,%%xmm2 \n"
2770 "psrlw $0x8,%%xmm3 \n" 2770 "psrlw $0x8,%%xmm3 \n"
2771 "packuswb %%xmm3,%%xmm2 \n" 2771 "packuswb %%xmm3,%%xmm2 \n"
2772 "movdqu %%xmm0," MEMACCESS(1) " \n" 2772 "movdqu %%xmm0," MEMACCESS(1) " \n"
2773 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) 2773 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
2774 "lea " MEMLEA(0x10,1) ",%1 \n" 2774 "lea " MEMLEA(0x10,1) ",%1 \n"
2775 "sub $0x10,%3 \n" 2775 "sub $0x10,%3 \n"
2776 "jg 1b \n" 2776 "jg 1b \n"
2777 : "+r"(src_uv), // %0 2777 : "+r"(src_uv), // %0
2778 "+r"(dst_u), // %1 2778 "+r"(dst_u), // %1
2779 "+r"(dst_v), // %2 2779 "+r"(dst_v), // %2
2780 "+r"(pix) // %3 2780 "+r"(width) // %3
2781 : 2781 :
2782 : "memory", "cc", NACL_R14 2782 : "memory", "cc", NACL_R14
2783 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 2783 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2784 ); 2784 );
2785 } 2785 }
2786 #endif // HAS_SPLITUVROW_SSE2 2786 #endif // HAS_SPLITUVROW_SSE2
2787 2787
2788 #ifdef HAS_MERGEUVROW_AVX2 2788 #ifdef HAS_MERGEUVROW_AVX2
2789 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 2789 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2790 int width) { 2790 int width) {
(...skipping 276 matching lines...) Expand 10 before | Expand all | Expand 10 after
3067 asm volatile ( 3067 asm volatile (
3068 "rep stosl " MEMSTORESTRING(eax,0) " \n" 3068 "rep stosl " MEMSTORESTRING(eax,0) " \n"
3069 : "+D"(dst_argb), // %0 3069 : "+D"(dst_argb), // %0
3070 "+c"(width_tmp) // %1 3070 "+c"(width_tmp) // %1
3071 : "a"(v32) // %2 3071 : "a"(v32) // %2
3072 : "memory", "cc"); 3072 : "memory", "cc");
3073 } 3073 }
3074 #endif // HAS_SETROW_X86 3074 #endif // HAS_SETROW_X86
3075 3075
3076 #ifdef HAS_YUY2TOYROW_SSE2 3076 #ifdef HAS_YUY2TOYROW_SSE2
3077 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { 3077 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
3078 asm volatile ( 3078 asm volatile (
3079 "pcmpeqb %%xmm5,%%xmm5 \n" 3079 "pcmpeqb %%xmm5,%%xmm5 \n"
3080 "psrlw $0x8,%%xmm5 \n" 3080 "psrlw $0x8,%%xmm5 \n"
3081 LABELALIGN 3081 LABELALIGN
3082 "1: \n" 3082 "1: \n"
3083 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3083 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3084 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3084 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3085 "lea " MEMLEA(0x20,0) ",%0 \n" 3085 "lea " MEMLEA(0x20,0) ",%0 \n"
3086 "pand %%xmm5,%%xmm0 \n" 3086 "pand %%xmm5,%%xmm0 \n"
3087 "pand %%xmm5,%%xmm1 \n" 3087 "pand %%xmm5,%%xmm1 \n"
3088 "packuswb %%xmm1,%%xmm0 \n" 3088 "packuswb %%xmm1,%%xmm0 \n"
3089 "movdqu %%xmm0," MEMACCESS(1) " \n" 3089 "movdqu %%xmm0," MEMACCESS(1) " \n"
3090 "lea " MEMLEA(0x10,1) ",%1 \n" 3090 "lea " MEMLEA(0x10,1) ",%1 \n"
3091 "sub $0x10,%2 \n" 3091 "sub $0x10,%2 \n"
3092 "jg 1b \n" 3092 "jg 1b \n"
3093 : "+r"(src_yuy2), // %0 3093 : "+r"(src_yuy2), // %0
3094 "+r"(dst_y), // %1 3094 "+r"(dst_y), // %1
3095 "+r"(pix) // %2 3095 "+r"(width) // %2
3096 : 3096 :
3097 : "memory", "cc" 3097 : "memory", "cc"
3098 , "xmm0", "xmm1", "xmm5" 3098 , "xmm0", "xmm1", "xmm5"
3099 ); 3099 );
3100 } 3100 }
3101 3101
3102 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, 3102 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
3103 uint8* dst_u, uint8* dst_v, int pix) { 3103 uint8* dst_u, uint8* dst_v, int width) {
3104 asm volatile ( 3104 asm volatile (
3105 "pcmpeqb %%xmm5,%%xmm5 \n" 3105 "pcmpeqb %%xmm5,%%xmm5 \n"
3106 "psrlw $0x8,%%xmm5 \n" 3106 "psrlw $0x8,%%xmm5 \n"
3107 "sub %1,%2 \n" 3107 "sub %1,%2 \n"
3108 LABELALIGN 3108 LABELALIGN
3109 "1: \n" 3109 "1: \n"
3110 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3110 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3111 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3111 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3112 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 3112 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
3113 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 3113 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
3114 "lea " MEMLEA(0x20,0) ",%0 \n" 3114 "lea " MEMLEA(0x20,0) ",%0 \n"
3115 "pavgb %%xmm2,%%xmm0 \n" 3115 "pavgb %%xmm2,%%xmm0 \n"
3116 "pavgb %%xmm3,%%xmm1 \n" 3116 "pavgb %%xmm3,%%xmm1 \n"
3117 "psrlw $0x8,%%xmm0 \n" 3117 "psrlw $0x8,%%xmm0 \n"
3118 "psrlw $0x8,%%xmm1 \n" 3118 "psrlw $0x8,%%xmm1 \n"
3119 "packuswb %%xmm1,%%xmm0 \n" 3119 "packuswb %%xmm1,%%xmm0 \n"
3120 "movdqa %%xmm0,%%xmm1 \n" 3120 "movdqa %%xmm0,%%xmm1 \n"
3121 "pand %%xmm5,%%xmm0 \n" 3121 "pand %%xmm5,%%xmm0 \n"
3122 "packuswb %%xmm0,%%xmm0 \n" 3122 "packuswb %%xmm0,%%xmm0 \n"
3123 "psrlw $0x8,%%xmm1 \n" 3123 "psrlw $0x8,%%xmm1 \n"
3124 "packuswb %%xmm1,%%xmm1 \n" 3124 "packuswb %%xmm1,%%xmm1 \n"
3125 "movq %%xmm0," MEMACCESS(1) " \n" 3125 "movq %%xmm0," MEMACCESS(1) " \n"
3126 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) 3126 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3127 "lea " MEMLEA(0x8,1) ",%1 \n" 3127 "lea " MEMLEA(0x8,1) ",%1 \n"
3128 "sub $0x10,%3 \n" 3128 "sub $0x10,%3 \n"
3129 "jg 1b \n" 3129 "jg 1b \n"
3130 : "+r"(src_yuy2), // %0 3130 : "+r"(src_yuy2), // %0
3131 "+r"(dst_u), // %1 3131 "+r"(dst_u), // %1
3132 "+r"(dst_v), // %2 3132 "+r"(dst_v), // %2
3133 "+r"(pix) // %3 3133 "+r"(width) // %3
3134 : "r"((intptr_t)(stride_yuy2)) // %4 3134 : "r"((intptr_t)(stride_yuy2)) // %4
3135 : "memory", "cc", NACL_R14 3135 : "memory", "cc", NACL_R14
3136 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 3136 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3137 ); 3137 );
3138 } 3138 }
3139 3139
3140 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 3140 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3141 uint8* dst_u, uint8* dst_v, int pix) { 3141 uint8* dst_u, uint8* dst_v, int width) {
3142 asm volatile ( 3142 asm volatile (
3143 "pcmpeqb %%xmm5,%%xmm5 \n" 3143 "pcmpeqb %%xmm5,%%xmm5 \n"
3144 "psrlw $0x8,%%xmm5 \n" 3144 "psrlw $0x8,%%xmm5 \n"
3145 "sub %1,%2 \n" 3145 "sub %1,%2 \n"
3146 LABELALIGN 3146 LABELALIGN
3147 "1: \n" 3147 "1: \n"
3148 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3148 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3149 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3149 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3150 "lea " MEMLEA(0x20,0) ",%0 \n" 3150 "lea " MEMLEA(0x20,0) ",%0 \n"
3151 "psrlw $0x8,%%xmm0 \n" 3151 "psrlw $0x8,%%xmm0 \n"
3152 "psrlw $0x8,%%xmm1 \n" 3152 "psrlw $0x8,%%xmm1 \n"
3153 "packuswb %%xmm1,%%xmm0 \n" 3153 "packuswb %%xmm1,%%xmm0 \n"
3154 "movdqa %%xmm0,%%xmm1 \n" 3154 "movdqa %%xmm0,%%xmm1 \n"
3155 "pand %%xmm5,%%xmm0 \n" 3155 "pand %%xmm5,%%xmm0 \n"
3156 "packuswb %%xmm0,%%xmm0 \n" 3156 "packuswb %%xmm0,%%xmm0 \n"
3157 "psrlw $0x8,%%xmm1 \n" 3157 "psrlw $0x8,%%xmm1 \n"
3158 "packuswb %%xmm1,%%xmm1 \n" 3158 "packuswb %%xmm1,%%xmm1 \n"
3159 "movq %%xmm0," MEMACCESS(1) " \n" 3159 "movq %%xmm0," MEMACCESS(1) " \n"
3160 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) 3160 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3161 "lea " MEMLEA(0x8,1) ",%1 \n" 3161 "lea " MEMLEA(0x8,1) ",%1 \n"
3162 "sub $0x10,%3 \n" 3162 "sub $0x10,%3 \n"
3163 "jg 1b \n" 3163 "jg 1b \n"
3164 : "+r"(src_yuy2), // %0 3164 : "+r"(src_yuy2), // %0
3165 "+r"(dst_u), // %1 3165 "+r"(dst_u), // %1
3166 "+r"(dst_v), // %2 3166 "+r"(dst_v), // %2
3167 "+r"(pix) // %3 3167 "+r"(width) // %3
3168 : 3168 :
3169 : "memory", "cc", NACL_R14 3169 : "memory", "cc", NACL_R14
3170 "xmm0", "xmm1", "xmm5" 3170 "xmm0", "xmm1", "xmm5"
3171 ); 3171 );
3172 } 3172 }
3173 3173
3174 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { 3174 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
3175 asm volatile ( 3175 asm volatile (
3176 LABELALIGN 3176 LABELALIGN
3177 "1: \n" 3177 "1: \n"
3178 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3178 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3179 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3179 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3180 "lea " MEMLEA(0x20,0) ",%0 \n" 3180 "lea " MEMLEA(0x20,0) ",%0 \n"
3181 "psrlw $0x8,%%xmm0 \n" 3181 "psrlw $0x8,%%xmm0 \n"
3182 "psrlw $0x8,%%xmm1 \n" 3182 "psrlw $0x8,%%xmm1 \n"
3183 "packuswb %%xmm1,%%xmm0 \n" 3183 "packuswb %%xmm1,%%xmm0 \n"
3184 "movdqu %%xmm0," MEMACCESS(1) " \n" 3184 "movdqu %%xmm0," MEMACCESS(1) " \n"
3185 "lea " MEMLEA(0x10,1) ",%1 \n" 3185 "lea " MEMLEA(0x10,1) ",%1 \n"
3186 "sub $0x10,%2 \n" 3186 "sub $0x10,%2 \n"
3187 "jg 1b \n" 3187 "jg 1b \n"
3188 : "+r"(src_uyvy), // %0 3188 : "+r"(src_uyvy), // %0
3189 "+r"(dst_y), // %1 3189 "+r"(dst_y), // %1
3190 "+r"(pix) // %2 3190 "+r"(width) // %2
3191 : 3191 :
3192 : "memory", "cc" 3192 : "memory", "cc"
3193 , "xmm0", "xmm1" 3193 , "xmm0", "xmm1"
3194 ); 3194 );
3195 } 3195 }
3196 3196
3197 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, 3197 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
3198 uint8* dst_u, uint8* dst_v, int pix) { 3198 uint8* dst_u, uint8* dst_v, int width) {
3199 asm volatile ( 3199 asm volatile (
3200 "pcmpeqb %%xmm5,%%xmm5 \n" 3200 "pcmpeqb %%xmm5,%%xmm5 \n"
3201 "psrlw $0x8,%%xmm5 \n" 3201 "psrlw $0x8,%%xmm5 \n"
3202 "sub %1,%2 \n" 3202 "sub %1,%2 \n"
3203 LABELALIGN 3203 LABELALIGN
3204 "1: \n" 3204 "1: \n"
3205 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3205 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3206 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3206 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3207 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 3207 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
3208 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 3208 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
3209 "lea " MEMLEA(0x20,0) ",%0 \n" 3209 "lea " MEMLEA(0x20,0) ",%0 \n"
3210 "pavgb %%xmm2,%%xmm0 \n" 3210 "pavgb %%xmm2,%%xmm0 \n"
3211 "pavgb %%xmm3,%%xmm1 \n" 3211 "pavgb %%xmm3,%%xmm1 \n"
3212 "pand %%xmm5,%%xmm0 \n" 3212 "pand %%xmm5,%%xmm0 \n"
3213 "pand %%xmm5,%%xmm1 \n" 3213 "pand %%xmm5,%%xmm1 \n"
3214 "packuswb %%xmm1,%%xmm0 \n" 3214 "packuswb %%xmm1,%%xmm0 \n"
3215 "movdqa %%xmm0,%%xmm1 \n" 3215 "movdqa %%xmm0,%%xmm1 \n"
3216 "pand %%xmm5,%%xmm0 \n" 3216 "pand %%xmm5,%%xmm0 \n"
3217 "packuswb %%xmm0,%%xmm0 \n" 3217 "packuswb %%xmm0,%%xmm0 \n"
3218 "psrlw $0x8,%%xmm1 \n" 3218 "psrlw $0x8,%%xmm1 \n"
3219 "packuswb %%xmm1,%%xmm1 \n" 3219 "packuswb %%xmm1,%%xmm1 \n"
3220 "movq %%xmm0," MEMACCESS(1) " \n" 3220 "movq %%xmm0," MEMACCESS(1) " \n"
3221 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) 3221 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3222 "lea " MEMLEA(0x8,1) ",%1 \n" 3222 "lea " MEMLEA(0x8,1) ",%1 \n"
3223 "sub $0x10,%3 \n" 3223 "sub $0x10,%3 \n"
3224 "jg 1b \n" 3224 "jg 1b \n"
3225 : "+r"(src_uyvy), // %0 3225 : "+r"(src_uyvy), // %0
3226 "+r"(dst_u), // %1 3226 "+r"(dst_u), // %1
3227 "+r"(dst_v), // %2 3227 "+r"(dst_v), // %2
3228 "+r"(pix) // %3 3228 "+r"(width) // %3
3229 : "r"((intptr_t)(stride_uyvy)) // %4 3229 : "r"((intptr_t)(stride_uyvy)) // %4
3230 : "memory", "cc", NACL_R14 3230 : "memory", "cc", NACL_R14
3231 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 3231 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3232 ); 3232 );
3233 } 3233 }
3234 3234
3235 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, 3235 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3236 uint8* dst_u, uint8* dst_v, int pix) { 3236 uint8* dst_u, uint8* dst_v, int width) {
3237 asm volatile ( 3237 asm volatile (
3238 "pcmpeqb %%xmm5,%%xmm5 \n" 3238 "pcmpeqb %%xmm5,%%xmm5 \n"
3239 "psrlw $0x8,%%xmm5 \n" 3239 "psrlw $0x8,%%xmm5 \n"
3240 "sub %1,%2 \n" 3240 "sub %1,%2 \n"
3241 LABELALIGN 3241 LABELALIGN
3242 "1: \n" 3242 "1: \n"
3243 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3243 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3244 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3244 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3245 "lea " MEMLEA(0x20,0) ",%0 \n" 3245 "lea " MEMLEA(0x20,0) ",%0 \n"
3246 "pand %%xmm5,%%xmm0 \n" 3246 "pand %%xmm5,%%xmm0 \n"
3247 "pand %%xmm5,%%xmm1 \n" 3247 "pand %%xmm5,%%xmm1 \n"
3248 "packuswb %%xmm1,%%xmm0 \n" 3248 "packuswb %%xmm1,%%xmm0 \n"
3249 "movdqa %%xmm0,%%xmm1 \n" 3249 "movdqa %%xmm0,%%xmm1 \n"
3250 "pand %%xmm5,%%xmm0 \n" 3250 "pand %%xmm5,%%xmm0 \n"
3251 "packuswb %%xmm0,%%xmm0 \n" 3251 "packuswb %%xmm0,%%xmm0 \n"
3252 "psrlw $0x8,%%xmm1 \n" 3252 "psrlw $0x8,%%xmm1 \n"
3253 "packuswb %%xmm1,%%xmm1 \n" 3253 "packuswb %%xmm1,%%xmm1 \n"
3254 "movq %%xmm0," MEMACCESS(1) " \n" 3254 "movq %%xmm0," MEMACCESS(1) " \n"
3255 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) 3255 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3256 "lea " MEMLEA(0x8,1) ",%1 \n" 3256 "lea " MEMLEA(0x8,1) ",%1 \n"
3257 "sub $0x10,%3 \n" 3257 "sub $0x10,%3 \n"
3258 "jg 1b \n" 3258 "jg 1b \n"
3259 : "+r"(src_uyvy), // %0 3259 : "+r"(src_uyvy), // %0
3260 "+r"(dst_u), // %1 3260 "+r"(dst_u), // %1
3261 "+r"(dst_v), // %2 3261 "+r"(dst_v), // %2
3262 "+r"(pix) // %3 3262 "+r"(width) // %3
3263 : 3263 :
3264 : "memory", "cc", NACL_R14 3264 : "memory", "cc", NACL_R14
3265 "xmm0", "xmm1", "xmm5" 3265 "xmm0", "xmm1", "xmm5"
3266 ); 3266 );
3267 } 3267 }
3268 #endif // HAS_YUY2TOYROW_SSE2 3268 #endif // HAS_YUY2TOYROW_SSE2
3269 3269
3270 #ifdef HAS_YUY2TOYROW_AVX2 3270 #ifdef HAS_YUY2TOYROW_AVX2
3271 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { 3271 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
3272 asm volatile ( 3272 asm volatile (
3273 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3273 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3274 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3274 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3275 LABELALIGN 3275 LABELALIGN
3276 "1: \n" 3276 "1: \n"
3277 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3277 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3278 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3278 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3279 "lea " MEMLEA(0x40,0) ",%0 \n" 3279 "lea " MEMLEA(0x40,0) ",%0 \n"
3280 "vpand %%ymm5,%%ymm0,%%ymm0 \n" 3280 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3281 "vpand %%ymm5,%%ymm1,%%ymm1 \n" 3281 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3282 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" 3282 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3283 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3283 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3284 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 3284 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
3285 "lea " MEMLEA(0x20,1) ",%1 \n" 3285 "lea " MEMLEA(0x20,1) ",%1 \n"
3286 "sub $0x20,%2 \n" 3286 "sub $0x20,%2 \n"
3287 "jg 1b \n" 3287 "jg 1b \n"
3288 "vzeroupper \n" 3288 "vzeroupper \n"
3289 : "+r"(src_yuy2), // %0 3289 : "+r"(src_yuy2), // %0
3290 "+r"(dst_y), // %1 3290 "+r"(dst_y), // %1
3291 "+r"(pix) // %2 3291 "+r"(width) // %2
3292 : 3292 :
3293 : "memory", "cc" 3293 : "memory", "cc"
3294 , "xmm0", "xmm1", "xmm5" 3294 , "xmm0", "xmm1", "xmm5"
3295 ); 3295 );
3296 } 3296 }
3297 3297
3298 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, 3298 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3299 uint8* dst_u, uint8* dst_v, int pix) { 3299 uint8* dst_u, uint8* dst_v, int width) {
3300 asm volatile ( 3300 asm volatile (
3301 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3301 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3302 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3302 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3303 "sub %1,%2 \n" 3303 "sub %1,%2 \n"
3304 LABELALIGN 3304 LABELALIGN
3305 "1: \n" 3305 "1: \n"
3306 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3306 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3307 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3307 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3308 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 3308 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3309 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) 3309 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
(...skipping 10 matching lines...) Expand all
3320 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3320 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3321 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" 3321 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3322 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2 ,1) 3322 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2 ,1)
3323 "lea " MEMLEA(0x10,1) ",%1 \n" 3323 "lea " MEMLEA(0x10,1) ",%1 \n"
3324 "sub $0x20,%3 \n" 3324 "sub $0x20,%3 \n"
3325 "jg 1b \n" 3325 "jg 1b \n"
3326 "vzeroupper \n" 3326 "vzeroupper \n"
3327 : "+r"(src_yuy2), // %0 3327 : "+r"(src_yuy2), // %0
3328 "+r"(dst_u), // %1 3328 "+r"(dst_u), // %1
3329 "+r"(dst_v), // %2 3329 "+r"(dst_v), // %2
3330 "+r"(pix) // %3 3330 "+r"(width) // %3
3331 : "r"((intptr_t)(stride_yuy2)) // %4 3331 : "r"((intptr_t)(stride_yuy2)) // %4
3332 : "memory", "cc", NACL_R14 3332 : "memory", "cc", NACL_R14
3333 "xmm0", "xmm1", "xmm5" 3333 "xmm0", "xmm1", "xmm5"
3334 ); 3334 );
3335 } 3335 }
3336 3336
// Extract the chroma plane pair from a single row of YUY2 pixels (no
// vertical averaging - the "422" variant), 32 chroma output bytes per loop.
// In YUY2 the odd bytes of each 16-bit word carry U/V, so the row is
// right-shifted by 8 to isolate them, then split into U (even) and V (odd).
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    // ymm5 = 0x00ff repeated: mask selecting the low byte of each word.
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
    // %2 becomes dst_v - dst_u so V is addressed as (%1,%2,1).
    "sub        %1,%2                          \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    // Keep only the odd (chroma) bytes of each YUYV word.
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    // vpermq fixes the 128-bit lane interleave introduced by vpackuswb.
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    // ymm1 = U (even chroma bytes), ymm0 = V (odd chroma bytes).
    "vpand      %%ymm5,%%ymm0,%%ymm1           \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm1,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    // Store 16 U bytes to dst_u and 16 V bytes to dst_u + (dst_v - dst_u).
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x20,%3                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3373 3373
// Extract the luma plane from a row of UYVY pixels, 32 Y bytes per loop.
// In UYVY the odd byte of each 16-bit word is Y, so a right shift by 8
// isolates it before repacking.
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    // Keep only the odd (luma) bytes of each UYVY word.
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    // vpermq fixes the 128-bit lane interleave introduced by vpackuswb.
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
3398 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, 3398 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3399 uint8* dst_u, uint8* dst_v, int pix) { 3399 uint8* dst_u, uint8* dst_v, int width) {
3400 asm volatile ( 3400 asm volatile (
3401 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 3401 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3402 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 3402 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3403 "sub %1,%2 \n" 3403 "sub %1,%2 \n"
3404 3404
3405 LABELALIGN 3405 LABELALIGN
3406 "1: \n" 3406 "1: \n"
3407 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 3407 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3408 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 3408 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3409 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 3409 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
(...skipping 11 matching lines...) Expand all
3421 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3421 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3422 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" 3422 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3423 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2 ,1) 3423 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2 ,1)
3424 "lea " MEMLEA(0x10,1) ",%1 \n" 3424 "lea " MEMLEA(0x10,1) ",%1 \n"
3425 "sub $0x20,%3 \n" 3425 "sub $0x20,%3 \n"
3426 "jg 1b \n" 3426 "jg 1b \n"
3427 "vzeroupper \n" 3427 "vzeroupper \n"
3428 : "+r"(src_uyvy), // %0 3428 : "+r"(src_uyvy), // %0
3429 "+r"(dst_u), // %1 3429 "+r"(dst_u), // %1
3430 "+r"(dst_v), // %2 3430 "+r"(dst_v), // %2
3431 "+r"(pix) // %3 3431 "+r"(width) // %3
3432 : "r"((intptr_t)(stride_uyvy)) // %4 3432 : "r"((intptr_t)(stride_uyvy)) // %4
3433 : "memory", "cc", NACL_R14 3433 : "memory", "cc", NACL_R14
3434 "xmm0", "xmm1", "xmm5" 3434 "xmm0", "xmm1", "xmm5"
3435 ); 3435 );
3436 } 3436 }
3437 3437
// Extract the chroma plane pair from a single row of UYVY pixels (no
// vertical averaging - the "422" variant), 32 chroma output bytes per loop.
// In UYVY the even bytes of each 16-bit word carry U/V, so they are masked
// out with 0x00ff, then split into U (even) and V (odd).
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    // ymm5 = 0x00ff repeated: mask selecting the low byte of each word.
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
    // %2 becomes dst_v - dst_u so V is addressed as (%1,%2,1).
    "sub        %1,%2                          \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    // Keep only the even (chroma) bytes of each UYVY word.
    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    // vpermq fixes the 128-bit lane interleave introduced by vpackuswb.
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    // ymm1 = U (even chroma bytes), ymm0 = V (odd chroma bytes).
    "vpand      %%ymm5,%%ymm0,%%ymm1           \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm1,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    // Store 16 U bytes to dst_u and 16 V bytes to dst_u + (dst_v - dst_u).
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x20,%3                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3474 #endif // HAS_YUY2TOYROW_AVX2 3474 #endif // HAS_YUY2TOYROW_AVX2
3475 3475
3476 #ifdef HAS_ARGBBLENDROW_SSSE3 3476 #ifdef HAS_ARGBBLENDROW_SSSE3
3477 // Shuffle table for isolating alpha. 3477 // Shuffle table for isolating alpha.
3478 static uvec8 kShuffleAlpha = { 3478 static uvec8 kShuffleAlpha = {
(...skipping 1599 matching lines...) Expand 10 before | Expand all | Expand 10 after
5078 : "r"((intptr_t)(src_stride)) // %4 5078 : "r"((intptr_t)(src_stride)) // %4
5079 : "memory", "cc", NACL_R14 5079 : "memory", "cc", NACL_R14
5080 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 5080 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
5081 ); 5081 );
5082 } 5082 }
5083 #endif // HAS_INTERPOLATEROW_SSE2 5083 #endif // HAS_INTERPOLATEROW_SSE2
5084 5084
5085 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 5085 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
5086 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5086 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorder the channels of each 4-byte pixel according to the 16-byte
// pshufb control mask in 'shuffler', 8 pixels (32 bytes) per loop.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int width) {
  asm volatile (
    // Load the per-pixel byte-shuffle control once, outside the loop.
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
5111 #endif // HAS_ARGBSHUFFLEROW_SSSE3 5111 #endif // HAS_ARGBSHUFFLEROW_SSSE3
5112 5112
5113 #ifdef HAS_ARGBSHUFFLEROW_AVX2 5113 #ifdef HAS_ARGBSHUFFLEROW_AVX2
5114 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5114 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// AVX2 version of ARGBShuffleRow: the 16-byte pshufb control is broadcast
// to both 128-bit lanes of ymm5, processing 16 pixels (64 bytes) per loop.
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  asm volatile (
    // Broadcast the 128-bit shuffle control into both lanes of ymm5.
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
5140 #endif // HAS_ARGBSHUFFLEROW_AVX2 5140 #endif // HAS_ARGBSHUFFLEROW_AVX2
5141 5141
5142 #ifdef HAS_ARGBSHUFFLEROW_SSE2 5142 #ifdef HAS_ARGBSHUFFLEROW_SSE2
5143 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5143 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5144 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, 5144 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5145 const uint8* shuffler, int pix) { 5145 const uint8* shuffler, int width) {
5146 uintptr_t pixel_temp = 0u; 5146 uintptr_t pixel_temp = 0u;
5147 asm volatile ( 5147 asm volatile (
5148 "pxor %%xmm5,%%xmm5 \n" 5148 "pxor %%xmm5,%%xmm5 \n"
5149 "mov " MEMACCESS(4) ",%k2 \n" 5149 "mov " MEMACCESS(4) ",%k2 \n"
5150 "cmp $0x3000102,%k2 \n" 5150 "cmp $0x3000102,%k2 \n"
5151 "je 3012f \n" 5151 "je 3012f \n"
5152 "cmp $0x10203,%k2 \n" 5152 "cmp $0x10203,%k2 \n"
5153 "je 123f \n" 5153 "je 123f \n"
5154 "cmp $0x30201,%k2 \n" 5154 "cmp $0x30201,%k2 \n"
5155 "je 321f \n" 5155 "je 321f \n"
(...skipping 88 matching lines...) Expand 10 before | Expand all | Expand 10 after
5244 "packuswb %%xmm1,%%xmm0 \n" 5244 "packuswb %%xmm1,%%xmm0 \n"
5245 "movdqu %%xmm0," MEMACCESS(1) " \n" 5245 "movdqu %%xmm0," MEMACCESS(1) " \n"
5246 "lea " MEMLEA(0x10,1) ",%1 \n" 5246 "lea " MEMLEA(0x10,1) ",%1 \n"
5247 "sub $0x4,%3 \n" 5247 "sub $0x4,%3 \n"
5248 "jg 3012b \n" 5248 "jg 3012b \n"
5249 5249
5250 "99: \n" 5250 "99: \n"
5251 : "+r"(src_argb), // %0 5251 : "+r"(src_argb), // %0
5252 "+r"(dst_argb), // %1 5252 "+r"(dst_argb), // %1
5253 "+d"(pixel_temp), // %2 5253 "+d"(pixel_temp), // %2
5254 "+r"(pix) // %3 5254 "+r"(width) // %3
5255 : "r"(shuffler) // %4 5255 : "r"(shuffler) // %4
5256 : "memory", "cc", NACL_R14 5256 : "memory", "cc", NACL_R14
5257 "xmm0", "xmm1", "xmm5" 5257 "xmm0", "xmm1", "xmm5"
5258 ); 5258 );
5259 } 5259 }
5260 #endif // HAS_ARGBSHUFFLEROW_SSE2 5260 #endif // HAS_ARGBSHUFFLEROW_SSE2
5261 5261
5262 #ifdef HAS_I422TOYUY2ROW_SSE2 5262 #ifdef HAS_I422TOYUY2ROW_SSE2
5263 void I422ToYUY2Row_SSE2(const uint8* src_y, 5263 void I422ToYUY2Row_SSE2(const uint8* src_y,
5264 const uint8* src_u, 5264 const uint8* src_u,
(...skipping 319 matching lines...) Expand 10 before | Expand all | Expand 10 after
5584 ); 5584 );
5585 } 5585 }
5586 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 5586 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5587 5587
5588 #endif // defined(__x86_64__) || defined(__i386__) 5588 #endif // defined(__x86_64__) || defined(__i386__)
5589 5589
5590 #ifdef __cplusplus 5590 #ifdef __cplusplus
5591 } // extern "C" 5591 } // extern "C"
5592 } // namespace libyuv 5592 } // namespace libyuv
5593 #endif 5593 #endif
OLDNEW
« no previous file with comments | « source/row_common.cc ('k') | source/row_neon.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698