Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(15)

Unified Diff: third_party/libjpeg_turbo/simd/jcclrss2.asm

Issue 5862001: Integrate premultiply/unpremultiply operations into libjpeg-turbo.... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/
Patch Set: Created 10 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/libjpeg_turbo/jpeglib.h ('k') | third_party/libjpeg_turbo/simd/jccolss2.asm » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/libjpeg_turbo/simd/jcclrss2.asm
===================================================================
--- third_party/libjpeg_turbo/simd/jcclrss2.asm (revision 69212)
+++ third_party/libjpeg_turbo/simd/jcclrss2.asm (working copy)
@@ -293,6 +293,246 @@
%endif ; RGB_PIXELSIZE ; ---------------
+%if PREMULTIPLY == 1 ; ---------------
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE, xmm6=A(02468ACE)=AE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO, xmm7=A(13579BDF)=AO
+
+ ; Un-premultiply R, G and B by alpha: c = c * 255 / a, with a == 0 replaced by
+ ; 255 so that the reciprocal below is never taken of zero. The even registers
+ ; are processed first and the odd registers second; wk(0)-wk(3) park the half
+ ; that is not being processed and wk(4) is a one-register spill slot.
+ ;
+ ; Only SSE/SSE2 instructions are used: "por" substitutes the alpha value and
+ ; pslld/psrad/packssdw narrows dwords to words. (pmaxuw and packusdw are
+ ; SSE4.1 instructions and would fault on a CPU that only supports SSE2.)
+ ;
+ ; NOTE(review): assumes every word lane holds an 8-bit value (0..255), so each
+ ; quotient fits in 16 bits; that eax holds the GOT base expected by GOTOFF; and
+ ; that PF_255 is a 16-byte-aligned vector of four 255.0 floats -- confirm
+ ; against the rest of the file.
+
+ ; Unpremultiply even registers (i.e. xmm0, xmm2, xmm4, and xmm6).
+ movdqa [wk(0)], xmm1
+ movdqa [wk(1)], xmm3
+ movdqa [wk(2)], xmm5
+ movdqa [wk(3)], xmm7
+
+ ; for (int i = 0; i < 8; ++i)
+ ; xmm6.word[i] = xmm6.word[i] ? xmm6.word[i] : 255; // a[i] ? a[i] : 255;
+ pxor xmm1, xmm1
+ pcmpeqw xmm1, xmm6 ; 0xFFFF where a[i] == 0, otherwise 0
+ psrlw xmm1, 8 ; 0x00FF where a[i] == 0, otherwise 0
+ por xmm6, xmm1 ; only the zero lanes change, and they become 255
+
+ ; xmm0.dword[i] = r[i]; xmm2.dword[i] = g[i]; xmm4.dword[i] = b[i]; xmm6.dword[i] = a[i];
+ ; xmm1.dword[i] = r[4+i]; xmm3.dword[i] = g[4+i]; xmm5.dword[i] = b[4+i]; xmm7.dword[i] = a[4+i];
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ movdqa xmm5, xmm4
+ movdqa xmm7, xmm6
+
+ movdqa [wk(4)], xmm1 ; spill so xmm1 can act as the zero register
+ pxor xmm1, xmm1
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm1
+ punpcklwd xmm4, xmm1
+ punpcklwd xmm6, xmm1
+ movdqa xmm1, [wk(4)]
+
+ movdqa [wk(4)], xmm0 ; spill so xmm0 can act as the zero register
+ pxor xmm0, xmm0
+ punpckhwd xmm1, xmm0
+ punpckhwd xmm3, xmm0
+ punpckhwd xmm5, xmm0
+ punpckhwd xmm7, xmm0
+ movdqa xmm0, [wk(4)]
+
+ ; for (int i = 0; i < 4; ++i) {
+ ; xmm0.float[i] = (float)xmm0.dword[i]; // (float)r[i];
+ ; xmm2.float[i] = (float)xmm2.dword[i]; // (float)g[i];
+ ; xmm4.float[i] = (float)xmm4.dword[i]; // (float)b[i];
+ ; xmm6.float[i] = (float)xmm6.dword[i]; // (float)a[i];
+ ; xmm1.float[i] = (float)xmm1.dword[i]; // (float)r[4+i];
+ ; xmm3.float[i] = (float)xmm3.dword[i]; // (float)g[4+i];
+ ; xmm5.float[i] = (float)xmm5.dword[i]; // (float)b[4+i];
+ ; xmm7.float[i] = (float)xmm7.dword[i]; // (float)a[4+i];
+ ; }
+ cvtdq2ps xmm0, xmm0
+ cvtdq2ps xmm2, xmm2
+ cvtdq2ps xmm4, xmm4
+ cvtdq2ps xmm6, xmm6
+ cvtdq2ps xmm1, xmm1
+ cvtdq2ps xmm3, xmm3
+ cvtdq2ps xmm5, xmm5
+ cvtdq2ps xmm7, xmm7
+
+ ; for (int i = 0; i < 4; ++i) {
+ ; xmm6.float[i] = 255.0 / xmm6.float[i]; // 255.0 / (float)a[i];
+ ; xmm7.float[i] = 255.0 / xmm7.float[i]; // 255.0 / (float)a[4+i];
+ ; }
+ ; rcpps is an approximate reciprocal, so the scale is close to but not exactly 255 / a.
+ rcpps xmm6, xmm6
+ rcpps xmm7, xmm7
+ mulps xmm6, [GOTOFF(eax,PF_255)]
+ mulps xmm7, [GOTOFF(eax,PF_255)]
+
+ ; for (int i = 0; i < 4; ++i) {
+ ; xmm0.word[i] = (uint16)(xmm0.float[i] * xmm6.float[i]); // (uint16)((float)r[i] * 255.0 / (float)a[i]);
+ ; xmm2.word[i] = (uint16)(xmm2.float[i] * xmm6.float[i]); // (uint16)((float)g[i] * 255.0 / (float)a[i]);
+ ; xmm4.word[i] = (uint16)(xmm4.float[i] * xmm6.float[i]); // (uint16)((float)b[i] * 255.0 / (float)a[i]);
+ ; xmm0.word[4+i] = (uint16)(xmm1.float[i] * xmm7.float[i]); // (uint16)((float)r[4+i] * 255.0 / (float)a[4+i]);
+ ; xmm2.word[4+i] = (uint16)(xmm3.float[i] * xmm7.float[i]); // (uint16)((float)g[4+i] * 255.0 / (float)a[4+i]);
+ ; xmm4.word[4+i] = (uint16)(xmm5.float[i] * xmm7.float[i]); // (uint16)((float)b[4+i] * 255.0 / (float)a[4+i]);
+ ; }
+ ; cvtps2dq rounds using the current MXCSR rounding mode instead of truncating.
+ ; pslld/psrad sign-extend the low 16 bits of each dword so that packssdw copies
+ ; those 16 bits without saturating (SSE2 stand-in for packusdw).
+ mulps xmm0, xmm6
+ cvtps2dq xmm0, xmm0
+ pslld xmm0, 16
+ psrad xmm0, 16
+ mulps xmm1, xmm7
+ cvtps2dq xmm1, xmm1
+ pslld xmm1, 16
+ psrad xmm1, 16
+ packssdw xmm0, xmm1
+
+ mulps xmm2, xmm6
+ cvtps2dq xmm2, xmm2
+ pslld xmm2, 16
+ psrad xmm2, 16
+ mulps xmm3, xmm7
+ cvtps2dq xmm3, xmm3
+ pslld xmm3, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3
+
+ mulps xmm4, xmm6
+ cvtps2dq xmm4, xmm4
+ pslld xmm4, 16
+ psrad xmm4, 16
+ mulps xmm5, xmm7
+ cvtps2dq xmm5, xmm5
+ pslld xmm5, 16
+ psrad xmm5, 16
+ packssdw xmm4, xmm5
+
+ movdqa xmm1, [wk(0)]
+ movdqa xmm3, [wk(1)]
+ movdqa xmm5, [wk(2)]
+ movdqa xmm7, [wk(3)]
+
+ ; Unpremultiply odd registers (i.e. xmm1, xmm3, xmm5, and xmm7).
+ movdqa [wk(0)], xmm0
+ movdqa [wk(1)], xmm2
+ movdqa [wk(2)], xmm4
+ movdqa [wk(3)], xmm6
+
+ ; for (int i = 0; i < 8; ++i)
+ ; xmm7.word[i] = xmm7.word[i] ? xmm7.word[i] : 255; // a[i] ? a[i] : 255;
+ pxor xmm0, xmm0
+ pcmpeqw xmm0, xmm7 ; 0xFFFF where a[i] == 0, otherwise 0
+ psrlw xmm0, 8 ; 0x00FF where a[i] == 0, otherwise 0
+ por xmm7, xmm0 ; only the zero lanes change, and they become 255
+
+ ; xmm1.dword[i] = r[i]; xmm3.dword[i] = g[i]; xmm5.dword[i] = b[i]; xmm7.dword[i] = a[i];
+ ; xmm0.dword[i] = r[4+i]; xmm2.dword[i] = g[4+i]; xmm4.dword[i] = b[4+i]; xmm6.dword[i] = a[4+i];
+ movdqa xmm0, xmm1
+ movdqa xmm2, xmm3
+ movdqa xmm4, xmm5
+ movdqa xmm6, xmm7
+
+ movdqa [wk(4)], xmm0 ; spill so xmm0 can act as the zero register
+ pxor xmm0, xmm0
+ punpcklwd xmm1, xmm0
+ punpcklwd xmm3, xmm0
+ punpcklwd xmm5, xmm0
+ punpcklwd xmm7, xmm0
+ movdqa xmm0, [wk(4)]
+
+ movdqa [wk(4)], xmm1 ; spill so xmm1 can act as the zero register
+ pxor xmm1, xmm1
+ punpckhwd xmm0, xmm1
+ punpckhwd xmm2, xmm1
+ punpckhwd xmm4, xmm1
+ punpckhwd xmm6, xmm1
+ movdqa xmm1, [wk(4)]
+
+ ; for (int i = 0; i < 4; ++i) {
+ ; xmm1.float[i] = (float)xmm1.dword[i]; // (float)r[i];
+ ; xmm3.float[i] = (float)xmm3.dword[i]; // (float)g[i];
+ ; xmm5.float[i] = (float)xmm5.dword[i]; // (float)b[i];
+ ; xmm7.float[i] = (float)xmm7.dword[i]; // (float)a[i];
+ ; xmm0.float[i] = (float)xmm0.dword[i]; // (float)r[4+i];
+ ; xmm2.float[i] = (float)xmm2.dword[i]; // (float)g[4+i];
+ ; xmm4.float[i] = (float)xmm4.dword[i]; // (float)b[4+i];
+ ; xmm6.float[i] = (float)xmm6.dword[i]; // (float)a[4+i];
+ ; }
+ cvtdq2ps xmm1, xmm1
+ cvtdq2ps xmm3, xmm3
+ cvtdq2ps xmm5, xmm5
+ cvtdq2ps xmm7, xmm7
+ cvtdq2ps xmm0, xmm0
+ cvtdq2ps xmm2, xmm2
+ cvtdq2ps xmm4, xmm4
+ cvtdq2ps xmm6, xmm6
+
+ ; for (int i = 0; i < 4; ++i) {
+ ; xmm7.float[i] = 255.0 / xmm7.float[i]; // 255.0 / (float)a[i];
+ ; xmm6.float[i] = 255.0 / xmm6.float[i]; // 255.0 / (float)a[4+i];
+ ; }
+ rcpps xmm7, xmm7
+ rcpps xmm6, xmm6
+ mulps xmm7, [GOTOFF(eax,PF_255)]
+ mulps xmm6, [GOTOFF(eax,PF_255)]
+
+ ; for (int i = 0; i < 4; ++i) {
+ ; xmm1.word[i] = (uint16)(xmm1.float[i] * xmm7.float[i]); // (uint16)((float)r[i] * 255.0 / (float)a[i]);
+ ; xmm3.word[i] = (uint16)(xmm3.float[i] * xmm7.float[i]); // (uint16)((float)g[i] * 255.0 / (float)a[i]);
+ ; xmm5.word[i] = (uint16)(xmm5.float[i] * xmm7.float[i]); // (uint16)((float)b[i] * 255.0 / (float)a[i]);
+ ; xmm1.word[4+i] = (uint16)(xmm0.float[i] * xmm6.float[i]); // (uint16)((float)r[4+i] * 255.0 / (float)a[4+i]);
+ ; xmm3.word[4+i] = (uint16)(xmm2.float[i] * xmm6.float[i]); // (uint16)((float)g[4+i] * 255.0 / (float)a[4+i]);
+ ; xmm5.word[4+i] = (uint16)(xmm4.float[i] * xmm6.float[i]); // (uint16)((float)b[4+i] * 255.0 / (float)a[4+i]);
+ ; }
+ ; Same SSE2 round-then-narrow sequence as used for the even registers.
+ mulps xmm1, xmm7
+ cvtps2dq xmm1, xmm1
+ pslld xmm1, 16
+ psrad xmm1, 16
+ mulps xmm0, xmm6
+ cvtps2dq xmm0, xmm0
+ pslld xmm0, 16
+ psrad xmm0, 16
+ packssdw xmm1, xmm0
+
+ mulps xmm3, xmm7
+ cvtps2dq xmm3, xmm3
+ pslld xmm3, 16
+ psrad xmm3, 16
+ mulps xmm2, xmm6
+ cvtps2dq xmm2, xmm2
+ pslld xmm2, 16
+ psrad xmm2, 16
+ packssdw xmm3, xmm2
+
+ mulps xmm5, xmm7
+ cvtps2dq xmm5, xmm5
+ pslld xmm5, 16
+ psrad xmm5, 16
+ mulps xmm4, xmm6
+ cvtps2dq xmm4, xmm4
+ pslld xmm4, 16
+ psrad xmm4, 16
+ packssdw xmm5, xmm4
+
+ ; wk(3) was written after xmm6 had been turned into the approximate 255/a
+ ; scale, so xmm6 does not get the original AE words back here.
+ movdqa xmm0, [wk(0)]
+ movdqa xmm2, [wk(1)]
+ movdqa xmm4, [wk(2)]
+ movdqa xmm6, [wk(3)]
+%endif ; PREMULTIPLY == 1
+
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
« no previous file with comments | « third_party/libjpeg_turbo/jpeglib.h ('k') | third_party/libjpeg_turbo/simd/jccolss2.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698