Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(740)

Side by Side Diff: third_party/libjpeg_turbo/simd/jcclrss2.asm

Issue 5862001: Integrate premultiply/unpremultiply operations into libjpeg-turbo.... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/
Patch Set: Created 10 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « third_party/libjpeg_turbo/jpeglib.h ('k') | third_party/libjpeg_turbo/simd/jccolss2.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 ; 1 ;
2 ; jcclrss2.asm - colorspace conversion (SSE2) 2 ; jcclrss2.asm - colorspace conversion (SSE2)
3 ; 3 ;
4 ; x86 SIMD extension for IJG JPEG library 4 ; x86 SIMD extension for IJG JPEG library
5 ; Copyright (C) 1999-2006, MIYASAKA Masaru. 5 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
6 ; For conditions of distribution and use, see copyright notice in jsimdext.inc 6 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
7 ; 7 ;
8 ; This file should be assembled with NASM (Netwide Assembler), 8 ; This file should be assembled with NASM (Netwide Assembler),
9 ; can *not* be assembled with Microsoft's MASM or any compatible 9 ; can *not* be assembled with Microsoft's MASM or any compatible
10 ; assembler (including Borland's Turbo Assembler). 10 ; assembler (including Borland's Turbo Assembler).
(...skipping 275 matching lines...) Expand 10 before | Expand all | Expand 10 after
286 punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) 286 punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
287 punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) 287 punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
288 288
289 punpcklbw xmmF,xmmH 289 punpcklbw xmmF,xmmH
290 punpckhbw xmmH,xmmH 290 punpckhbw xmmH,xmmH
291 psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) 291 psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
292 psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) 292 psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
293 293
294 %endif ; RGB_PIXELSIZE ; --------------- 294 %endif ; RGB_PIXELSIZE ; ---------------
295 295
296 %if PREMULTIPLY == 1 ; ---------------
297 ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE, xmm6=A(02468ACE)=AE
298 ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO, xmm7=A(13579BDF)=AO
299
300 ; Unpremultiply even registers (i.e. xmm0, xmm2, xmm4, and xmm6).
301 movdqa [wk(0)], xmm1
302 movdqa [wk(1)], xmm3
303 movdqa [wk(2)], xmm5
304 movdqa [wk(3)], xmm7
305
306 ; for (int i = 0; i < 8; ++i)
307 ; xmm6.word[i] = xmm6.word[i] ? xmm6.word[i] : 255; // a[i] ? a[i] : 255;
308 pxor xmm1, xmm1
309 pcmpeqw xmm1, xmm6
310 psrlw xmm1, 8
311 pmaxuw xmm6, xmm1
312
313 ; xmm0.dword[i] = r[i]; xmm2.dword[i] = g[i]; xmm4.dword[i] = b[i]; xmm6.dword[i] = a[i];
314 ; xmm1.dword[i] = r[4+i]; xmm3.dword[i] = g[4+i]; xmm5.dword[i] = b[4+i]; xmm7.dword[i] = a[4+i];
315 movdqa xmm1, xmm0
316 movdqa xmm3, xmm2
317 movdqa xmm5, xmm4
318 movdqa xmm7, xmm6
319
320 movdqa [wk(4)], xmm1
321 pxor xmm1, xmm1
322 punpcklwd xmm0, xmm1
323 punpcklwd xmm2, xmm1
324 punpcklwd xmm4, xmm1
325 punpcklwd xmm6, xmm1
326 movdqa xmm1, [wk(4)]
327
328 movdqa [wk(4)], xmm0
329 pxor xmm0, xmm0
330 punpckhwd xmm1, xmm0
331 punpckhwd xmm3, xmm0
332 punpckhwd xmm5, xmm0
333 punpckhwd xmm7, xmm0
334 movdqa xmm0, [wk(4)];
335
336 ; for (int i = 0; i < 4; ++i) {
337 ; xmm0.float[i] = (float)xmm0.dword[i]; // (float)r[i];
338 ; xmm2.float[i] = (float)xmm2.dword[i]; // (float)g[i];
339 ; xmm4.float[i] = (float)xmm4.dword[i]; // (float)b[i];
340 ; xmm6.float[i] = (float)xmm6.dword[i]; // (float)a[i];
341 ; xmm1.float[i] = (float)xmm1.dword[i]; // (float)r[4+i];
342 ; xmm3.float[i] = (float)xmm3.dword[i]; // (float)g[4+i];
343 ; xmm5.float[i] = (float)xmm5.dword[i]; // (float)b[4+i];
344 ; xmm7.float[i] = (float)xmm7.dword[i]; // (float)a[4+i];
345 ; }
346 cvtdq2ps xmm0, xmm0
347 cvtdq2ps xmm2, xmm2
348 cvtdq2ps xmm4, xmm4
349 cvtdq2ps xmm6, xmm6
350 cvtdq2ps xmm1, xmm1
351 cvtdq2ps xmm3, xmm3
352 cvtdq2ps xmm5, xmm5
353 cvtdq2ps xmm7, xmm7
354
355 ; for (int i = 0; i < 4; ++i) {
356 ; xmm6.float[i] = 255.0 / xmm6.float[i]; // 255.0 / (float)a[i];
357 ; xmm7.float[i] = 255.0 / xmm7.float[i]; // 255.0 / (float)a[4+i];
358 ; }
359 rcpps xmm6, xmm6
360 rcpps xmm7, xmm7
361 mulps xmm6, [GOTOFF(eax,PF_255)]
362 mulps xmm7, [GOTOFF(eax,PF_255)]
363
364 ; for (int i = 0; i < 4; ++i) {
365 ; xmm0.word[i] = (uint16)(xmm0.float[i] * xmm6.float[i]); // (uint16)((float)r[i] * 255.0 / (float)a[i]);
366 ; xmm2.word[i] = (uint16)(xmm2.float[i] * xmm6.float[i]); // (uint16)((float)g[i] * 255.0 / (float)a[i]);
367 ; xmm4.word[i] = (uint16)(xmm4.float[i] * xmm6.float[i]); // (uint16)((float)b[i] * 255.0 / (float)a[i]);
368 ; xmm0.word[4+i] = (uint16)(xmm1.float[i] * xmm7.float[i]); // (uint16)((float)r[4+i] * 255.0 / (float)a[4+i]);
369 ; xmm2.word[4+i] = (uint16)(xmm3.float[i] * xmm7.float[i]); // (uint16)((float)g[4+i] * 255.0 / (float)a[4+i]);
370 ; xmm4.word[4+i] = (uint16)(xmm5.float[i] * xmm7.float[i]); // (uint16)((float)b[4+i] * 255.0 / (float)a[4+i]);
371 ; }
372 mulps xmm0, xmm6
373 cvtps2dq xmm0, xmm0
374 mulps xmm1, xmm7
375 cvtps2dq xmm1, xmm1
376 packusdw xmm0, xmm1
377
378 mulps xmm2, xmm6
379 cvtps2dq xmm2, xmm2
380 mulps xmm3, xmm7
381 cvtps2dq xmm3, xmm3
382 packusdw xmm2, xmm3
383
384 mulps xmm4, xmm6
385 cvtps2dq xmm4, xmm4
386 mulps xmm5, xmm7
387 cvtps2dq xmm5, xmm5
388 packusdw xmm4, xmm5
389
390 movdqa xmm1, [wk(0)]
391 movdqa xmm3, [wk(1)]
392 movdqa xmm5, [wk(2)]
393 movdqa xmm7, [wk(3)]
394
395 ; Unpremultiply odd registers (i.e. xmm1, xmm3, xmm5, and xmm7).
396 movdqa [wk(0)], xmm0
397 movdqa [wk(1)], xmm2
398 movdqa [wk(2)], xmm4
399 movdqa [wk(3)], xmm6
400
401 ; for (int i = 0; i < 8; ++i)
402 ; xmm7.word[i] = xmm7.word[i] ? xmm7.word[i] : 255; // a[i] ? a[i] : 255;
403 pxor xmm0, xmm0
404 pcmpeqw xmm0, xmm7
405 psrlw xmm0, 8
406 pmaxuw xmm7, xmm0
407
408 ; xmm1.dword[i] = r[i]; xmm3.dword[i] = g[i]; xmm5.dword[i] = b[i]; xmm7.dword[i] = a[i];
409 ; xmm0.dword[i] = r[4+i]; xmm2.dword[i] = g[4+i]; xmm4.dword[i] = b[4+i]; xmm6.dword[i] = a[4+i];
410 movdqa xmm0, xmm1
411 movdqa xmm2, xmm3
412 movdqa xmm4, xmm5
413 movdqa xmm6, xmm7
414
415 movdqa [wk(4)], xmm0
416 pxor xmm0, xmm0
417 punpcklwd xmm1, xmm0
418 punpcklwd xmm3, xmm0
419 punpcklwd xmm5, xmm0
420 punpcklwd xmm7, xmm0
421 movdqa xmm0, [wk(4)]
422
423 movdqa [wk(4)], xmm1
424 pxor xmm1, xmm1
425 punpckhwd xmm0, xmm1
426 punpckhwd xmm2, xmm1
427 punpckhwd xmm4, xmm1
428 punpckhwd xmm6, xmm1
429 movdqa xmm1, [wk(4)];
430
431 ; for (int i = 0; i < 4; ++i) {
432 ; xmm1.float[i] = (float)xmm1.dword[i]; // (float)r[i];
433 ; xmm3.float[i] = (float)xmm3.dword[i]; // (float)g[i];
434 ; xmm5.float[i] = (float)xmm5.dword[i]; // (float)b[i];
435 ; xmm7.float[i] = (float)xmm7.dword[i]; // (float)a[i];
436 ; xmm0.float[i] = (float)xmm0.dword[i]; // (float)r[4+i];
437 ; xmm2.float[i] = (float)xmm2.dword[i]; // (float)g[4+i];
438 ; xmm4.float[i] = (float)xmm4.dword[i]; // (float)b[4+i];
439 ; xmm6.float[i] = (float)xmm6.dword[i]; // (float)a[4+i];
440 ; }
441 cvtdq2ps xmm1, xmm1
442 cvtdq2ps xmm3, xmm3
443 cvtdq2ps xmm5, xmm5
444 cvtdq2ps xmm7, xmm7
445 cvtdq2ps xmm0, xmm0
446 cvtdq2ps xmm2, xmm2
447 cvtdq2ps xmm4, xmm4
448 cvtdq2ps xmm6, xmm6
449
450 ; for (int i = 0; i < 4; ++i) {
451 ; xmm7.float[i] = 255.0 / xmm7.float[i]; // 255.0 / (float)a[i];
452 ; xmm6.float[i] = 255.0 / xmm6.float[i]; // 255.0 / (float)a[4+i];
453 ; }
454 rcpps xmm7, xmm7
455 rcpps xmm6, xmm6
456 mulps xmm7, [GOTOFF(eax,PF_255)]
457 mulps xmm6, [GOTOFF(eax,PF_255)]
458
459 ; for (int i = 0; i < 4; ++i) {
460 ; xmm1.word[i] = (uint16)(xmm1.float[i] * xmm7.float[i]); // (uint16)((float)r[i] * 255.0 / (float)a[i]);
461 ; xmm3.word[i] = (uint16)(xmm3.float[i] * xmm7.float[i]); // (uint16)((float)g[i] * 255.0 / (float)a[i]);
462 ; xmm5.word[i] = (uint16)(xmm5.float[i] * xmm7.float[i]); // (uint16)((float)b[i] * 255.0 / (float)a[i]);
463 ; xmm1.word[4+i] = (uint16)(xmm0.float[i] * xmm6.float[i]); // (uint16)((float)r[4+i] * 255.0 / (float)a[4+i]);
464 ; xmm3.word[4+i] = (uint16)(xmm2.float[i] * xmm6.float[i]); // (uint16)((float)g[4+i] * 255.0 / (float)a[4+i]);
465 ; xmm5.word[4+i] = (uint16)(xmm4.float[i] * xmm6.float[i]); // (uint16)((float)b[4+i] * 255.0 / (float)a[4+i]);
466 ; }
467 mulps xmm1, xmm7
468 cvtps2dq xmm1, xmm1
469 mulps xmm0, xmm6
470 cvtps2dq xmm0, xmm0
471 packusdw xmm1, xmm0
472
473 mulps xmm3, xmm7
474 cvtps2dq xmm3, xmm3
475 mulps xmm2, xmm6
476 cvtps2dq xmm2, xmm2
477 packusdw xmm3, xmm2
478
479 mulps xmm5, xmm7
480 cvtps2dq xmm5, xmm5
481 mulps xmm4, xmm6
482 cvtps2dq xmm4, xmm4
483 packusdw xmm5, xmm4
484
485 movdqa xmm0, [wk(0)]
486 movdqa xmm2, [wk(1)]
487 movdqa xmm4, [wk(2)]
488 movdqa xmm6, [wk(3)]
489 %endif ; PREMULTIPLY == 1
490
296 ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE 491 ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
297 ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO 492 ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
298 493
299 ; (Original) 494 ; (Original)
300 ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B 495 ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
301 ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE 496 ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
302 ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE 497 ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
303 ; 498 ;
304 ; (This implementation) 499 ; (This implementation)
305 ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G 500 ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
(...skipping 190 matching lines...) Expand 10 before | Expand all | Expand 10 after
496 ; pop ecx ; need not be preserved 691 ; pop ecx ; need not be preserved
497 pop ebx 692 pop ebx
498 mov esp,ebp ; esp <- aligned ebp 693 mov esp,ebp ; esp <- aligned ebp
499 pop esp ; esp <- original ebp 694 pop esp ; esp <- original ebp
500 pop ebp 695 pop ebp
501 ret 696 ret
502 697
503 ; For some reason, the OS X linker does not honor the request to align the 698 ; For some reason, the OS X linker does not honor the request to align the
504 ; segment unless we do this. 699 ; segment unless we do this.
505 align 16 700 align 16
OLDNEW
« no previous file with comments | « third_party/libjpeg_turbo/jpeglib.h ('k') | third_party/libjpeg_turbo/simd/jccolss2.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698