Index: third_party/libjpeg_turbo/simd/jcclrss2.asm |
=================================================================== |
--- third_party/libjpeg_turbo/simd/jcclrss2.asm (revision 69212) |
+++ third_party/libjpeg_turbo/simd/jcclrss2.asm (working copy) |
@@ -293,6 +293,237 @@ |
%endif ; RGB_PIXELSIZE ; --------------- |
+%if PREMULTIPLY == 1 ; --------------- |
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE, xmm6=A(02468ACE)=AE |
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO, xmm7=A(13579BDF)=AO |
+ |
+ ; Unpremultiply R, G and B: c = c * 255 / a, with a == 0 treated as a == 255. |
+ ; Only SSE/SSE2 instructions are used: pmaxuw and packusdw are SSE4.1 and would |
+ ; fault on SSE2-only CPUs, so they are replaced by SSE2 equivalents below. |
+ ; wk(0)..wk(4) are used as scratch space. |
+ |
+ ; Unpremultiply even registers (i.e. xmm0, xmm2, xmm4, and xmm6). |
+ movdqa [wk(0)], xmm1 |
+ movdqa [wk(1)], xmm3 |
+ movdqa [wk(2)], xmm5 |
+ movdqa [wk(3)], xmm7 |
+ |
+ ; for (int i = 0; i < 8; ++i) |
+ ; xmm6.word[i] = xmm6.word[i] ? xmm6.word[i] : 255; // a[i] ? a[i] : 255; |
+ ; pcmpeqw/psrlw build 0x00FF in the lanes where a == 0 and 0 elsewhere, so a |
+ ; plain OR substitutes 255 for zero alpha (SSE2 stand-in for pmaxuw). |
+ pxor xmm1, xmm1 |
+ pcmpeqw xmm1, xmm6 |
+ psrlw xmm1, 8 |
+ por xmm6, xmm1 |
+ |
+ ; xmm0.dword[i] = r[i]; xmm2.dword[i] = g[i]; xmm4.dword[i] = b[i]; xmm6.dword[i] = a[i]; |
+ ; xmm1.dword[i] = r[4+i]; xmm3.dword[i] = g[4+i]; xmm5.dword[i] = b[4+i]; xmm7.dword[i] = a[4+i]; |
+ movdqa xmm1, xmm0 |
+ movdqa xmm3, xmm2 |
+ movdqa xmm5, xmm4 |
+ movdqa xmm7, xmm6 |
+ |
+ movdqa [wk(4)], xmm1 ; park xmm1 so it can serve as the zero register |
+ pxor xmm1, xmm1 |
+ punpcklwd xmm0, xmm1 |
+ punpcklwd xmm2, xmm1 |
+ punpcklwd xmm4, xmm1 |
+ punpcklwd xmm6, xmm1 |
+ movdqa xmm1, [wk(4)] |
+ |
+ movdqa [wk(4)], xmm0 ; park xmm0 so it can serve as the zero register |
+ pxor xmm0, xmm0 |
+ punpckhwd xmm1, xmm0 |
+ punpckhwd xmm3, xmm0 |
+ punpckhwd xmm5, xmm0 |
+ punpckhwd xmm7, xmm0 |
+ movdqa xmm0, [wk(4)] |
+ |
+ ; for (int i = 0; i < 4; ++i) { |
+ ; xmm0.float[i] = (float)xmm0.dword[i]; // (float)r[i]; |
+ ; xmm2.float[i] = (float)xmm2.dword[i]; // (float)g[i]; |
+ ; xmm4.float[i] = (float)xmm4.dword[i]; // (float)b[i]; |
+ ; xmm6.float[i] = (float)xmm6.dword[i]; // (float)a[i]; |
+ ; xmm1.float[i] = (float)xmm1.dword[i]; // (float)r[4+i]; |
+ ; xmm3.float[i] = (float)xmm3.dword[i]; // (float)g[4+i]; |
+ ; xmm5.float[i] = (float)xmm5.dword[i]; // (float)b[4+i]; |
+ ; xmm7.float[i] = (float)xmm7.dword[i]; // (float)a[4+i]; |
+ ; } |
+ cvtdq2ps xmm0, xmm0 |
+ cvtdq2ps xmm2, xmm2 |
+ cvtdq2ps xmm4, xmm4 |
+ cvtdq2ps xmm6, xmm6 |
+ cvtdq2ps xmm1, xmm1 |
+ cvtdq2ps xmm3, xmm3 |
+ cvtdq2ps xmm5, xmm5 |
+ cvtdq2ps xmm7, xmm7 |
+ |
+ ; for (int i = 0; i < 4; ++i) { |
+ ; xmm6.float[i] = 255.0 / xmm6.float[i]; // 255.0 / (float)a[i]; |
+ ; xmm7.float[i] = 255.0 / xmm7.float[i]; // 255.0 / (float)a[4+i]; |
+ ; } |
+ ; rcpps is an approximate reciprocal, so the quotient is approximate too. |
+ rcpps xmm6, xmm6 |
+ rcpps xmm7, xmm7 |
+ mulps xmm6, [GOTOFF(eax,PF_255)] |
+ mulps xmm7, [GOTOFF(eax,PF_255)] |
+ |
+ ; for (int i = 0; i < 4; ++i) { |
+ ; xmm0.word[i] = (uint16)(xmm0.float[i] * xmm6.float[i]); // (uint16)((float)r[i] * 255.0 / (float)a[i]); |
+ ; xmm2.word[i] = (uint16)(xmm2.float[i] * xmm6.float[i]); // (uint16)((float)g[i] * 255.0 / (float)a[i]); |
+ ; xmm4.word[i] = (uint16)(xmm4.float[i] * xmm6.float[i]); // (uint16)((float)b[i] * 255.0 / (float)a[i]); |
+ ; xmm0.word[4+i] = (uint16)(xmm1.float[i] * xmm7.float[i]); // (uint16)((float)r[4+i] * 255.0 / (float)a[4+i]); |
+ ; xmm2.word[4+i] = (uint16)(xmm3.float[i] * xmm7.float[i]); // (uint16)((float)g[4+i] * 255.0 / (float)a[4+i]); |
+ ; xmm4.word[4+i] = (uint16)(xmm5.float[i] * xmm7.float[i]); // (uint16)((float)b[4+i] * 255.0 / (float)a[4+i]); |
+ ; } |
+ ; cvtps2dq rounds using the current MXCSR rounding mode. pslld/psrad/packssdw keeps |
+ ; the low 16 bits of each dword; this matches packusdw provided every value fits in |
+ ; 16 bits (assumes 8-bit channel inputs, i.e. at most about 255 * 255 -- TODO confirm). |
+ mulps xmm0, xmm6 |
+ cvtps2dq xmm0, xmm0 |
+ mulps xmm1, xmm7 |
+ cvtps2dq xmm1, xmm1 |
+ pslld xmm0, 16 |
+ psrad xmm0, 16 |
+ pslld xmm1, 16 |
+ psrad xmm1, 16 |
+ packssdw xmm0, xmm1 |
+ |
+ mulps xmm2, xmm6 |
+ cvtps2dq xmm2, xmm2 |
+ mulps xmm3, xmm7 |
+ cvtps2dq xmm3, xmm3 |
+ pslld xmm2, 16 |
+ psrad xmm2, 16 |
+ pslld xmm3, 16 |
+ psrad xmm3, 16 |
+ packssdw xmm2, xmm3 |
+ |
+ mulps xmm4, xmm6 |
+ cvtps2dq xmm4, xmm4 |
+ mulps xmm5, xmm7 |
+ cvtps2dq xmm5, xmm5 |
+ pslld xmm4, 16 |
+ psrad xmm4, 16 |
+ pslld xmm5, 16 |
+ psrad xmm5, 16 |
+ packssdw xmm4, xmm5 |
+ |
+ movdqa xmm1, [wk(0)] |
+ movdqa xmm3, [wk(1)] |
+ movdqa xmm5, [wk(2)] |
+ movdqa xmm7, [wk(3)] |
+ |
+ ; Unpremultiply odd registers (i.e. xmm1, xmm3, xmm5, and xmm7). |
+ movdqa [wk(0)], xmm0 |
+ movdqa [wk(1)], xmm2 |
+ movdqa [wk(2)], xmm4 |
+ movdqa [wk(3)], xmm6 ; xmm6 holds the 255/a factors here, not the even alpha words |
+ |
+ ; for (int i = 0; i < 8; ++i) |
+ ; xmm7.word[i] = xmm7.word[i] ? xmm7.word[i] : 255; // a[i] ? a[i] : 255; |
+ ; 0x00FF mask in the zero-alpha lanes, then OR (SSE2 stand-in for pmaxuw). |
+ pxor xmm0, xmm0 |
+ pcmpeqw xmm0, xmm7 |
+ psrlw xmm0, 8 |
+ por xmm7, xmm0 |
+ |
+ ; xmm1.dword[i] = r[i]; xmm3.dword[i] = g[i]; xmm5.dword[i] = b[i]; xmm7.dword[i] = a[i]; |
+ ; xmm0.dword[i] = r[4+i]; xmm2.dword[i] = g[4+i]; xmm4.dword[i] = b[4+i]; xmm6.dword[i] = a[4+i]; |
+ movdqa xmm0, xmm1 |
+ movdqa xmm2, xmm3 |
+ movdqa xmm4, xmm5 |
+ movdqa xmm6, xmm7 |
+ |
+ movdqa [wk(4)], xmm0 ; park xmm0 so it can serve as the zero register |
+ pxor xmm0, xmm0 |
+ punpcklwd xmm1, xmm0 |
+ punpcklwd xmm3, xmm0 |
+ punpcklwd xmm5, xmm0 |
+ punpcklwd xmm7, xmm0 |
+ movdqa xmm0, [wk(4)] |
+ |
+ movdqa [wk(4)], xmm1 ; park xmm1 so it can serve as the zero register |
+ pxor xmm1, xmm1 |
+ punpckhwd xmm0, xmm1 |
+ punpckhwd xmm2, xmm1 |
+ punpckhwd xmm4, xmm1 |
+ punpckhwd xmm6, xmm1 |
+ movdqa xmm1, [wk(4)] |
+ |
+ ; for (int i = 0; i < 4; ++i) { |
+ ; xmm1.float[i] = (float)xmm1.dword[i]; // (float)r[i]; |
+ ; xmm3.float[i] = (float)xmm3.dword[i]; // (float)g[i]; |
+ ; xmm5.float[i] = (float)xmm5.dword[i]; // (float)b[i]; |
+ ; xmm7.float[i] = (float)xmm7.dword[i]; // (float)a[i]; |
+ ; xmm0.float[i] = (float)xmm0.dword[i]; // (float)r[4+i]; |
+ ; xmm2.float[i] = (float)xmm2.dword[i]; // (float)g[4+i]; |
+ ; xmm4.float[i] = (float)xmm4.dword[i]; // (float)b[4+i]; |
+ ; xmm6.float[i] = (float)xmm6.dword[i]; // (float)a[4+i]; |
+ ; } |
+ cvtdq2ps xmm1, xmm1 |
+ cvtdq2ps xmm3, xmm3 |
+ cvtdq2ps xmm5, xmm5 |
+ cvtdq2ps xmm7, xmm7 |
+ cvtdq2ps xmm0, xmm0 |
+ cvtdq2ps xmm2, xmm2 |
+ cvtdq2ps xmm4, xmm4 |
+ cvtdq2ps xmm6, xmm6 |
+ |
+ ; for (int i = 0; i < 4; ++i) { |
+ ; xmm7.float[i] = 255.0 / xmm7.float[i]; // 255.0 / (float)a[i]; |
+ ; xmm6.float[i] = 255.0 / xmm6.float[i]; // 255.0 / (float)a[4+i]; |
+ ; } |
+ rcpps xmm7, xmm7 |
+ rcpps xmm6, xmm6 |
+ mulps xmm7, [GOTOFF(eax,PF_255)] |
+ mulps xmm6, [GOTOFF(eax,PF_255)] |
+ |
+ ; for (int i = 0; i < 4; ++i) { |
+ ; xmm1.word[i] = (uint16)(xmm1.float[i] * xmm7.float[i]); // (uint16)((float)r[i] * 255.0 / (float)a[i]); |
+ ; xmm3.word[i] = (uint16)(xmm3.float[i] * xmm7.float[i]); // (uint16)((float)g[i] * 255.0 / (float)a[i]); |
+ ; xmm5.word[i] = (uint16)(xmm5.float[i] * xmm7.float[i]); // (uint16)((float)b[i] * 255.0 / (float)a[i]); |
+ ; xmm1.word[4+i] = (uint16)(xmm0.float[i] * xmm6.float[i]); // (uint16)((float)r[4+i] * 255.0 / (float)a[4+i]); |
+ ; xmm3.word[4+i] = (uint16)(xmm2.float[i] * xmm6.float[i]); // (uint16)((float)g[4+i] * 255.0 / (float)a[4+i]); |
+ ; xmm5.word[4+i] = (uint16)(xmm4.float[i] * xmm6.float[i]); // (uint16)((float)b[4+i] * 255.0 / (float)a[4+i]); |
+ ; } |
+ mulps xmm1, xmm7 |
+ cvtps2dq xmm1, xmm1 |
+ mulps xmm0, xmm6 |
+ cvtps2dq xmm0, xmm0 |
+ pslld xmm1, 16 |
+ psrad xmm1, 16 |
+ pslld xmm0, 16 |
+ psrad xmm0, 16 |
+ packssdw xmm1, xmm0 |
+ |
+ mulps xmm3, xmm7 |
+ cvtps2dq xmm3, xmm3 |
+ mulps xmm2, xmm6 |
+ cvtps2dq xmm2, xmm2 |
+ pslld xmm3, 16 |
+ psrad xmm3, 16 |
+ pslld xmm2, 16 |
+ psrad xmm2, 16 |
+ packssdw xmm3, xmm2 |
+ |
+ mulps xmm5, xmm7 |
+ cvtps2dq xmm5, xmm5 |
+ mulps xmm4, xmm6 |
+ cvtps2dq xmm4, xmm4 |
+ pslld xmm5, 16 |
+ psrad xmm5, 16 |
+ pslld xmm4, 16 |
+ psrad xmm4, 16 |
+ packssdw xmm5, xmm4 |
+ |
+ movdqa xmm0, [wk(0)] |
+ movdqa xmm2, [wk(1)] |
+ movdqa xmm4, [wk(2)] |
+ movdqa xmm6, [wk(3)] |
+%endif ; PREMULTIPLY == 1 |
+ |
; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE |
; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO |