| Index: third_party/libjpeg_turbo/simd/jcclrss2.asm
|
| ===================================================================
|
| --- third_party/libjpeg_turbo/simd/jcclrss2.asm (revision 69212)
|
| +++ third_party/libjpeg_turbo/simd/jcclrss2.asm (working copy)
|
| @@ -293,6 +293,201 @@
|
|
|
| %endif ; RGB_PIXELSIZE ; ---------------
|
|
|
| +%if PREMULTIPLY == 1 ; ---------------
|
| + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE, xmm6=A(02468ACE)=AE
|
| + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO, xmm7=A(13579BDF)=AO
|
| +
|
| + ; Unpremultiply even registers (i.e. xmm0, xmm2, xmm4, and xmm6).
| + ;
| + ; Each colour word c of an even pixel is replaced by round(c * 255 / a),
| + ; where a is that pixel's alpha word and a == 0 is treated as a == 255.
| + ; This half uses only SSE/SSE2 instructions: pmaxuw and packusdw are
| + ; SSE4.1 and would raise #UD on a CPU that only reports SSE2.
| + ;
| + ; Park the odd registers in wk(0)..wk(3) so that all eight xmm registers
| + ; can be used as scratch. wk(4) serves as one extra spill slot below.
| + movdqa [wk(0)], xmm1
| + movdqa [wk(1)], xmm3
| + movdqa [wk(2)], xmm5
| + movdqa [wk(3)], xmm7
| +
| + ; for (int i = 0; i < 8; ++i)
| + ; xmm6.word[i] = xmm6.word[i] ? xmm6.word[i] : 255; // a[i] ? a[i] : 255;
| + ; xmm1 becomes 0x00FF in the lanes where alpha is zero and 0 elsewhere, so
| + ; OR-ing it in gives exactly the unsigned word maximum of the two.
| + pxor xmm1, xmm1
| + pcmpeqw xmm1, xmm6
| + psrlw xmm1, 8
| + por xmm6, xmm1
| +
| + ; Zero-extend the eight words of each channel into two registers of dwords:
| + ; xmm0.dword[i] = r[i]; xmm2.dword[i] = g[i]; xmm4.dword[i] = b[i]; xmm6.dword[i] = a[i];
| + ; xmm1.dword[i] = r[4+i]; xmm3.dword[i] = g[4+i]; xmm5.dword[i] = b[4+i]; xmm7.dword[i] = a[4+i];
| + movdqa xmm1, xmm0
| + movdqa xmm3, xmm2
| + movdqa xmm5, xmm4
| + movdqa xmm7, xmm6
| +
| + ; Low halves: xmm1 is borrowed as the zero register, so spill it to wk(4).
| + movdqa [wk(4)], xmm1
| + pxor xmm1, xmm1
| + punpcklwd xmm0, xmm1
| + punpcklwd xmm2, xmm1
| + punpcklwd xmm4, xmm1
| + punpcklwd xmm6, xmm1
| + movdqa xmm1, [wk(4)]
| +
| + ; High halves: xmm0 is borrowed as the zero register, so spill it to wk(4).
| + movdqa [wk(4)], xmm0
| + pxor xmm0, xmm0
| + punpckhwd xmm1, xmm0
| + punpckhwd xmm3, xmm0
| + punpckhwd xmm5, xmm0
| + punpckhwd xmm7, xmm0
| + movdqa xmm0, [wk(4)]
| +
| + ; for (int i = 0; i < 4; ++i) {
| + ; xmm0.float[i] = (float)xmm0.dword[i]; // (float)r[i];
| + ; xmm2.float[i] = (float)xmm2.dword[i]; // (float)g[i];
| + ; xmm4.float[i] = (float)xmm4.dword[i]; // (float)b[i];
| + ; xmm6.float[i] = (float)xmm6.dword[i]; // (float)a[i];
| + ; xmm1.float[i] = (float)xmm1.dword[i]; // (float)r[4+i];
| + ; xmm3.float[i] = (float)xmm3.dword[i]; // (float)g[4+i];
| + ; xmm5.float[i] = (float)xmm5.dword[i]; // (float)b[4+i];
| + ; xmm7.float[i] = (float)xmm7.dword[i]; // (float)a[4+i];
| + ; }
| + cvtdq2ps xmm0, xmm0
| + cvtdq2ps xmm2, xmm2
| + cvtdq2ps xmm4, xmm4
| + cvtdq2ps xmm6, xmm6
| + cvtdq2ps xmm1, xmm1
| + cvtdq2ps xmm3, xmm3
| + cvtdq2ps xmm5, xmm5
| + cvtdq2ps xmm7, xmm7
| +
| + ; for (int i = 0; i < 4; ++i) {
| + ; xmm6.float[i] ~= 255.0 / xmm6.float[i]; // 255.0 / (float)a[i];
| + ; xmm7.float[i] ~= 255.0 / xmm7.float[i]; // 255.0 / (float)a[4+i];
| + ; }
| + ; rcpps is an approximate reciprocal (about 12 significant bits), not an
| + ; exact divide. PF_255 is defined outside this hunk; presumably four packed
| + ; 255.0f values, and eax presumably holds the GOT base here - confirm.
| + rcpps xmm6, xmm6
| + rcpps xmm7, xmm7
| + mulps xmm6, [GOTOFF(eax,PF_255)]
| + mulps xmm7, [GOTOFF(eax,PF_255)]
| +
| + ; for (int i = 0; i < 4; ++i) {
| + ; xmm0.word[i] = (uint16)round(xmm0.float[i] * xmm6.float[i]); // r[i] * 255.0 / a[i]
| + ; xmm2.word[i] = (uint16)round(xmm2.float[i] * xmm6.float[i]); // g[i] * 255.0 / a[i]
| + ; xmm4.word[i] = (uint16)round(xmm4.float[i] * xmm6.float[i]); // b[i] * 255.0 / a[i]
| + ; xmm0.word[4+i] = (uint16)round(xmm1.float[i] * xmm7.float[i]); // r[4+i] * 255.0 / a[4+i]
| + ; xmm2.word[4+i] = (uint16)round(xmm3.float[i] * xmm7.float[i]); // g[4+i] * 255.0 / a[4+i]
| + ; xmm4.word[4+i] = (uint16)round(xmm5.float[i] * xmm7.float[i]); // b[4+i] * 255.0 / a[4+i]
| + ; }
| + ; cvtps2dq rounds according to MXCSR (round-to-nearest unless it has been
| + ; changed); it does not truncate like a C cast.
| + ; pslld/psrad by 16 sign-extends the low word of each dword so that the
| + ; signed pack (packssdw) stores that low word unchanged. This equals an
| + ; unsigned-saturating pack for dwords in 0..65535, which holds as long as
| + ; the channel words are 8-bit samples (0..255) as the register map states.
| + mulps xmm0, xmm6
| + cvtps2dq xmm0, xmm0
| + mulps xmm1, xmm7
| + cvtps2dq xmm1, xmm1
| + pslld xmm0, 16
| + psrad xmm0, 16
| + pslld xmm1, 16
| + psrad xmm1, 16
| + packssdw xmm0, xmm1
| +
| + mulps xmm2, xmm6
| + cvtps2dq xmm2, xmm2
| + mulps xmm3, xmm7
| + cvtps2dq xmm3, xmm3
| + pslld xmm2, 16
| + psrad xmm2, 16
| + pslld xmm3, 16
| + psrad xmm3, 16
| + packssdw xmm2, xmm3
| +
| + mulps xmm4, xmm6
| + cvtps2dq xmm4, xmm4
| + mulps xmm5, xmm7
| + cvtps2dq xmm5, xmm5
| + pslld xmm4, 16
| + psrad xmm4, 16
| + pslld xmm5, 16
| + psrad xmm5, 16
| + packssdw xmm4, xmm5
| +
| + ; Bring the odd registers back. xmm6 is left holding the scaled reciprocals
| + ; of the low four alphas, not the original even alpha words.
| + movdqa xmm1, [wk(0)]
| + movdqa xmm3, [wk(1)]
| + movdqa xmm5, [wk(2)]
| + movdqa xmm7, [wk(3)]
|
| +
|
| + ; Unpremultiply odd registers (i.e. xmm1, xmm3, xmm5, and xmm7).
| + ;
| + ; Each colour word c of an odd pixel is replaced by round(c * 255 / a),
| + ; where a is that pixel's alpha word and a == 0 is treated as a == 255.
| + ; This half uses only SSE/SSE2 instructions: pmaxuw and packusdw are
| + ; SSE4.1 and would raise #UD on a CPU that only reports SSE2.
| + ;
| + ; Park the even registers in wk(0)..wk(3) so that all eight xmm registers
| + ; can be used as scratch. wk(4) serves as one extra spill slot below.
| + movdqa [wk(0)], xmm0
| + movdqa [wk(1)], xmm2
| + movdqa [wk(2)], xmm4
| + movdqa [wk(3)], xmm6
| +
| + ; for (int i = 0; i < 8; ++i)
| + ; xmm7.word[i] = xmm7.word[i] ? xmm7.word[i] : 255; // a[i] ? a[i] : 255;
| + ; xmm0 becomes 0x00FF in the lanes where alpha is zero and 0 elsewhere, so
| + ; OR-ing it in gives exactly the unsigned word maximum of the two.
| + pxor xmm0, xmm0
| + pcmpeqw xmm0, xmm7
| + psrlw xmm0, 8
| + por xmm7, xmm0
| +
| + ; Zero-extend the eight words of each channel into two registers of dwords:
| + ; xmm1.dword[i] = r[i]; xmm3.dword[i] = g[i]; xmm5.dword[i] = b[i]; xmm7.dword[i] = a[i];
| + ; xmm0.dword[i] = r[4+i]; xmm2.dword[i] = g[4+i]; xmm4.dword[i] = b[4+i]; xmm6.dword[i] = a[4+i];
| + movdqa xmm0, xmm1
| + movdqa xmm2, xmm3
| + movdqa xmm4, xmm5
| + movdqa xmm6, xmm7
| +
| + ; Low halves: xmm0 is borrowed as the zero register, so spill it to wk(4).
| + movdqa [wk(4)], xmm0
| + pxor xmm0, xmm0
| + punpcklwd xmm1, xmm0
| + punpcklwd xmm3, xmm0
| + punpcklwd xmm5, xmm0
| + punpcklwd xmm7, xmm0
| + movdqa xmm0, [wk(4)]
| +
| + ; High halves: xmm1 is borrowed as the zero register, so spill it to wk(4).
| + movdqa [wk(4)], xmm1
| + pxor xmm1, xmm1
| + punpckhwd xmm0, xmm1
| + punpckhwd xmm2, xmm1
| + punpckhwd xmm4, xmm1
| + punpckhwd xmm6, xmm1
| + movdqa xmm1, [wk(4)]
| +
| + ; for (int i = 0; i < 4; ++i) {
| + ; xmm1.float[i] = (float)xmm1.dword[i]; // (float)r[i];
| + ; xmm3.float[i] = (float)xmm3.dword[i]; // (float)g[i];
| + ; xmm5.float[i] = (float)xmm5.dword[i]; // (float)b[i];
| + ; xmm7.float[i] = (float)xmm7.dword[i]; // (float)a[i];
| + ; xmm0.float[i] = (float)xmm0.dword[i]; // (float)r[4+i];
| + ; xmm2.float[i] = (float)xmm2.dword[i]; // (float)g[4+i];
| + ; xmm4.float[i] = (float)xmm4.dword[i]; // (float)b[4+i];
| + ; xmm6.float[i] = (float)xmm6.dword[i]; // (float)a[4+i];
| + ; }
| + cvtdq2ps xmm1, xmm1
| + cvtdq2ps xmm3, xmm3
| + cvtdq2ps xmm5, xmm5
| + cvtdq2ps xmm7, xmm7
| + cvtdq2ps xmm0, xmm0
| + cvtdq2ps xmm2, xmm2
| + cvtdq2ps xmm4, xmm4
| + cvtdq2ps xmm6, xmm6
| +
| + ; for (int i = 0; i < 4; ++i) {
| + ; xmm7.float[i] ~= 255.0 / xmm7.float[i]; // 255.0 / (float)a[i];
| + ; xmm6.float[i] ~= 255.0 / xmm6.float[i]; // 255.0 / (float)a[4+i];
| + ; }
| + ; rcpps is an approximate reciprocal (about 12 significant bits), not an
| + ; exact divide. PF_255 is defined outside this hunk; presumably four packed
| + ; 255.0f values, and eax presumably holds the GOT base here - confirm.
| + rcpps xmm7, xmm7
| + rcpps xmm6, xmm6
| + mulps xmm7, [GOTOFF(eax,PF_255)]
| + mulps xmm6, [GOTOFF(eax,PF_255)]
| +
| + ; for (int i = 0; i < 4; ++i) {
| + ; xmm1.word[i] = (uint16)round(xmm1.float[i] * xmm7.float[i]); // r[i] * 255.0 / a[i]
| + ; xmm3.word[i] = (uint16)round(xmm3.float[i] * xmm7.float[i]); // g[i] * 255.0 / a[i]
| + ; xmm5.word[i] = (uint16)round(xmm5.float[i] * xmm7.float[i]); // b[i] * 255.0 / a[i]
| + ; xmm1.word[4+i] = (uint16)round(xmm0.float[i] * xmm6.float[i]); // r[4+i] * 255.0 / a[4+i]
| + ; xmm3.word[4+i] = (uint16)round(xmm2.float[i] * xmm6.float[i]); // g[4+i] * 255.0 / a[4+i]
| + ; xmm5.word[4+i] = (uint16)round(xmm4.float[i] * xmm6.float[i]); // b[4+i] * 255.0 / a[4+i]
| + ; }
| + ; cvtps2dq rounds according to MXCSR (round-to-nearest unless it has been
| + ; changed); it does not truncate like a C cast.
| + ; pslld/psrad by 16 sign-extends the low word of each dword so that the
| + ; signed pack (packssdw) stores that low word unchanged. This equals an
| + ; unsigned-saturating pack for dwords in 0..65535, which holds as long as
| + ; the channel words are 8-bit samples (0..255) as the register map states.
| + mulps xmm1, xmm7
| + cvtps2dq xmm1, xmm1
| + mulps xmm0, xmm6
| + cvtps2dq xmm0, xmm0
| + pslld xmm1, 16
| + psrad xmm1, 16
| + pslld xmm0, 16
| + psrad xmm0, 16
| + packssdw xmm1, xmm0
| +
| + mulps xmm3, xmm7
| + cvtps2dq xmm3, xmm3
| + mulps xmm2, xmm6
| + cvtps2dq xmm2, xmm2
| + pslld xmm3, 16
| + psrad xmm3, 16
| + pslld xmm2, 16
| + psrad xmm2, 16
| + packssdw xmm3, xmm2
| +
| + mulps xmm5, xmm7
| + cvtps2dq xmm5, xmm5
| + mulps xmm4, xmm6
| + cvtps2dq xmm4, xmm4
| + pslld xmm5, 16
| + psrad xmm5, 16
| + pslld xmm4, 16
| + psrad xmm4, 16
| + packssdw xmm5, xmm4
| +
| + ; Bring back the registers parked in wk(0)..wk(3) at the top of this half.
| + ; xmm7 is left holding the scaled reciprocals of the low four alphas, not
| + ; the original odd alpha words.
| + movdqa xmm0, [wk(0)]
| + movdqa xmm2, [wk(1)]
| + movdqa xmm4, [wk(2)]
| + movdqa xmm6, [wk(3)]
|
| +%endif ; PREMULTIPLY == 1
|
| +
|
| ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
|
| ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
|
|
|
|
|