| OLD | NEW | 
| (Empty) |  | 
 |    1 ; | 
 |    2 ; jcclrss2.asm - colorspace conversion (SSE2) | 
 |    3 ; | 
 |    4 ; x86 SIMD extension for IJG JPEG library | 
 |    5 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 
 |    6 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 
 |    7 ; | 
 |    8 ; This file should be assembled with NASM (Netwide Assembler), | 
 |    9 ; can *not* be assembled with Microsoft's MASM or any compatible | 
 |   10 ; assembler (including Borland's Turbo Assembler). | 
 |   11 ; NASM is available from http://nasm.sourceforge.net/ or | 
 |   12 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | 
 |   13 ; | 
 |   14 ; [TAB8] | 
 |   15  | 
 |   16 %include "jcolsamp.inc" | 
 |   17  | 
 |   18 ; -------------------------------------------------------------------------- | 
 |   19         SECTION SEG_TEXT | 
 |   20         BITS    32 | 
 |   21 ; | 
 |   22 ; Convert some rows of samples to the output colorspace. | 
 |   23 ; | 
 |   24 ; GLOBAL(void) | 
 |   25 ; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width, | 
 |   26 ;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf, | 
 |   27 ;                             JDIMENSION output_row, int num_rows); | 
 |   28 ; | 
 |   29  | 
 ; Stack-argument accessors. `b` is a base register holding the address of the
 ; saved caller ebp (the function below passes eax, set by `push ebp` /
 ; `mov eax,esp`). The first argument is therefore at (b)+8, with each
 ; following argument 4 bytes further on.
 |   30 %define img_width(b)    (b)+8                   ; JDIMENSION img_width | 
 |   31 %define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf | 
 |   32 %define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf | 
 |   33 %define output_row(b)   (b)+20          ; JDIMENSION output_row | 
 |   34 %define num_rows(b)     (b)+24          ; int num_rows | 
 |   35  | 
 ; Local frame, addressed from the aligned ebp set up in the prologue:
 ;   original_ebp : the slot at [ebp+0]; the prologue stores eax there with
 ;                  `mov [esp],eax` just before copying esp into ebp. The
 ;                  name itself is not referenced in the code shown.
 ;   wk(i)        : i-th XMMWORD-sized scratch slot below ebp; wk(0) is the
 ;                  lowest address and wk(WK_NUM-1) ends exactly at ebp.
 ;   WK_NUM       : number of scratch slots. It is used by wk(i) before it is
 ;                  defined, which works because %define bodies are expanded
 ;                  at the point of use.
 ;   gotptr       : one pointer-sized slot immediately below wk(0).
 |   36 %define original_ebp    ebp+0 | 
 |   37 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] | 
 |   38 %define WK_NUM          8 | 
 |   39 %define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr | 
 |   40  | 
 |   41         align   16 | 
 |   42  | 
 |   43         global  EXTN(jsimd_rgb_ycc_convert_sse2) | 
 |   44  | 
 ; --------------------------------------------------------------------------
 ; Entry point. For each of num_rows rows, reads one packed-pixel input row
 ; (RGB_PIXELSIZE bytes per pixel) and writes one row to each of the three
 ; output planes output_buf[0..2], starting at row index output_row. The
 ; existing comments label the three results Y, Cb and Cr.
 ;
 ; Arguments are read from the stack through the img_width()/input_buf()/...
 ; macros; the code shown sets no return value.
 ; Saved and restored here: ebp, ebx, esi, edi.
 ; Overwritten: eax, ecx, edx, xmm0-xmm7, flags.
 ;
 ; Register roles inside .rowloop:
 ;   esi = cursor into input_buf[]      edi = cursor into output_buf[0][]
 ;   ebx = cursor into output_buf[1][]  edx = cursor into output_buf[2][]
 ;   ecx = img_width                    eax = rows still to do
 ; and inside the column loop (after the row pointers are dereferenced):
 ;   esi = inptr, edi/ebx/edx = outptr0/1/2, ecx = columns (or bytes) left,
 ;   eax = GOT base as loaded by movpic.
 ;
 ; NOTE(review): xmmA..xmmH, EXTN, pushpic/poppic/movpic, get_GOT, GOTOFF and
 ; alignx are macros defined outside this file (presumably reached through
 ; jcolsamp.inc). The *pic macros look like they emit code only in
 ; position-independent builds -- confirm against their definitions.
 ; NOTE(review): in this listing long end-of-line comments are hard-wrapped;
 ; each short continuation row belongs to the comment on the row above it.
 ; --------------------------------------------------------------------------
 |   45 EXTN(jsimd_rgb_ycc_convert_sse2): | 
 |   46         push    ebp | 
 ; eax = esp right after `push ebp`, i.e. the address of the saved caller ebp.
 ; All stack arguments are addressed from this base further down.
 |   47         mov     eax,esp                         ; eax = original ebp | 
 ; Reserve at least 4 bytes, round esp down to an XMMWORD boundary, and store
 ; the unaligned base there so the epilogue can recover it with `pop esp`.
 |   48         sub     esp, byte 4 | 
 |   49         and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits | 
 |   50         mov     [esp],eax | 
 |   51         mov     ebp,esp                         ; ebp = aligned ebp | 
 ; Drop esp to wk(0): this reserves the WK_NUM aligned scratch slots.
 |   52         lea     esp, [wk(0)] | 
 ; The slot pushed here sits directly below wk(0), which is where gotptr points.
 |   53         pushpic eax             ; make a room for GOT address | 
 |   54         push    ebx | 
 |   55 ;       push    ecx             ; need not be preserved | 
 |   56 ;       push    edx             ; need not be preserved | 
 |   57         push    esi | 
 |   58         push    edi | 
 |   59  | 
 |   60         get_GOT ebx                     ; get GOT address | 
 |   61         movpic  POINTER [gotptr], ebx   ; save GOT address | 
 |   62  | 
 ; Nothing to do for a zero-width image.
 |   63         mov     ecx, JDIMENSION [img_width(eax)] | 
 |   64         test    ecx,ecx | 
 |   65         jz      near .return | 
 |   66  | 
 ; Park img_width on the stack while ecx is borrowed for output_row.
 |   67         push    ecx | 
 |   68  | 
 ; edi/ebx/edx = &output_buf[0..2][output_row]
 |   69         mov     esi, JSAMPIMAGE [output_buf(eax)] | 
 |   70         mov     ecx, JDIMENSION [output_row(eax)] | 
 |   71         mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] | 
 |   72         mov     ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] | 
 |   73         mov     edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] | 
 |   74         lea     edi, [edi+ecx*SIZEOF_JSAMPROW] | 
 |   75         lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW] | 
 |   76         lea     edx, [edx+ecx*SIZEOF_JSAMPROW] | 
 |   77  | 
 |   78         pop     ecx | 
 |   79  | 
 ; esi = input_buf, eax = num_rows. eax stops being the argument base here,
 ; so no stack argument is read after this point. Signed test: zero or
 ; negative row counts return immediately.
 |   80         mov     esi, JSAMPARRAY [input_buf(eax)] | 
 |   81         mov     eax, INT [num_rows(eax)] | 
 |   82         test    eax,eax | 
 |   83         jle     near .return | 
 |   84         alignx  16,7 | 
 ; ---- per-row loop ----------------------------------------------------------
 |   85 .rowloop: | 
 ; Save the row cursors and the column count; they are restored, in reverse
 ; order, once the row is finished.
 |   86         pushpic eax | 
 |   87         push    edx | 
 |   88         push    ebx | 
 |   89         push    edi | 
 |   90         push    esi | 
 |   91         push    ecx                     ; col | 
 |   92  | 
 ; Dereference the row cursors to get the actual sample pointers.
 |   93         mov     esi, JSAMPROW [esi]     ; inptr | 
 |   94         mov     edi, JSAMPROW [edi]     ; outptr0 | 
 |   95         mov     ebx, JSAMPROW [ebx]     ; outptr1 | 
 |   96         mov     edx, JSAMPROW [edx]     ; outptr2 | 
 |   97         movpic  eax, POINTER [gotptr]   ; load GOT address (eax) | 
 |   98  | 
 ; SIZEOF_XMMWORD or more columns left (unsigned compare): take the full-width
 ; load at .columnloop. Otherwise fall through into the partial load.
 |   99         cmp     ecx, byte SIZEOF_XMMWORD | 
 |  100         jae     near .columnloop | 
 |  101         alignx  16,7 | 
 |  102  | 
 |  103 %if RGB_PIXELSIZE == 3 ; --------------- | 
 |  104  | 
 ; Partial load, 3 bytes per pixel. ecx is converted to a byte count and the
 ; row tail is gathered from its end in 1/2/4/8-byte pieces selected by the
 ; low bits of that count; each newly loaded (lower-address) piece is placed
 ; below the data gathered so far, so memory order is kept.
 ; NOTE(review): eax is not cleared before use, so when the 1-byte step is
 ; skipped the bits above the loaded data are whatever eax held. Those lanes
 ; lie past the last valid pixel; the stores further down still write a full
 ; XMMWORD per plane, so this presumably relies on padded output rows --
 ; confirm with the caller.
 |  105 .column_ld1: | 
 ; eax/edx are borrowed as scratch for the 1- and 2-byte pieces.
 |  106         push    eax | 
 |  107         push    edx | 
 |  108         lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE | 
 |  109         test    cl, SIZEOF_BYTE | 
 |  110         jz      short .column_ld2 | 
 |  111         sub     ecx, byte SIZEOF_BYTE | 
 |  112         movzx   eax, BYTE [esi+ecx] | 
 |  113 .column_ld2: | 
 |  114         test    cl, SIZEOF_WORD | 
 |  115         jz      short .column_ld4 | 
 |  116         sub     ecx, byte SIZEOF_WORD | 
 |  117         movzx   edx, WORD [esi+ecx] | 
 |  118         shl     eax, WORD_BIT | 
 |  119         or      eax,edx | 
 |  120 .column_ld4: | 
 ; Move the gathered 0..3 bytes into the vector register, then give eax/edx back.
 |  121         movd    xmmA,eax | 
 |  122         pop     edx | 
 |  123         pop     eax | 
 |  124         test    cl, SIZEOF_DWORD | 
 |  125         jz      short .column_ld8 | 
 |  126         sub     ecx, byte SIZEOF_DWORD | 
 |  127         movd    xmmF, XMM_DWORD [esi+ecx] | 
 |  128         pslldq  xmmA, SIZEOF_DWORD | 
 |  129         por     xmmA,xmmF | 
 |  130 .column_ld8: | 
 |  131         test    cl, SIZEOF_MMWORD | 
 |  132         jz      short .column_ld16 | 
 |  133         sub     ecx, byte SIZEOF_MMWORD | 
 |  134         movq    xmmB, XMM_MMWORD [esi+ecx] | 
 |  135         pslldq  xmmA, SIZEOF_MMWORD | 
 |  136         por     xmmA,xmmB | 
 |  137 .column_ld16: | 
 ; One whole XMMWORD precedes the tail: the tail moves to xmmF and the first
 ; XMMWORD of the row segment is loaded into xmmA. ecx is forced to
 ; SIZEOF_XMMWORD so the loop tail counts this as the final iteration.
 |  138         test    cl, SIZEOF_XMMWORD | 
 |  139         jz      short .column_ld32 | 
 |  140         movdqa  xmmF,xmmA | 
 |  141         movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] | 
 |  142         mov     ecx, SIZEOF_XMMWORD | 
 |  143         jmp     short .rgb_ycc_cnv | 
 |  144 .column_ld32: | 
 ; `mov` does not modify flags, so the jz below still acts on the `test`.
 ; Two whole XMMWORDs precede the tail: tail -> xmmB, first two -> xmmA, xmmF.
 |  145         test    cl, 2*SIZEOF_XMMWORD | 
 |  146         mov     ecx, SIZEOF_XMMWORD | 
 |  147         jz      short .rgb_ycc_cnv | 
 |  148         movdqa  xmmB,xmmA | 
 |  149         movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] | 
 |  150         movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] | 
 |  151         jmp     short .rgb_ycc_cnv | 
 |  152         alignx  16,7 | 
 |  153  | 
 ; Full-width load: 3 unaligned XMMWORDs (movdqu, so inptr needs no alignment).
 |  154 .columnloop: | 
 |  155         movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] | 
 |  156         movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] | 
 |  157         movdqu  xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] | 
 |  158  | 
 ; De-interleave the packed bytes into per-component vectors. In the lane
 ; diagrams the first hex digit is the component index within a pixel and the
 ; second is the pixel index (0..F). Three shift/unpack rounds separate the
 ; components into even- and odd-pixel byte vectors, then unpacking against a
 ; zeroed register widens each byte to a 16-bit word.
 |  159 .rgb_ycc_cnv: | 
 |  160         ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) | 
 |  161         ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) | 
 |  162         ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) | 
 |  163  | 
 |  164         movdqa    xmmG,xmmA | 
 |  165         pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 2
     1 02 12) | 
 |  166         psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -
     - -- --) | 
 |  167  | 
 |  168         punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0
     A 12 1A) | 
 |  169         pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 0
     7 17 27) | 
 |  170  | 
 |  171         punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2
     C 05 0D) | 
 |  172         punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1
     F 27 2F) | 
 |  173  | 
 |  174         movdqa    xmmD,xmmA | 
 |  175         pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 2
     8 01 09) | 
 |  176         psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -
     - -- --) | 
 |  177  | 
 |  178         punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 0
     5 09 0D) | 
 |  179         pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1
     B 23 2B) | 
 |  180  | 
 |  181         punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 1
     6 1A 1E) | 
 |  182         punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 2
     7 2B 2F) | 
 |  183  | 
 |  184         movdqa    xmmE,xmmA | 
 |  185         pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 1
     4 18 1C) | 
 |  186         psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -
     - -- --) | 
 |  187  | 
 |  188         punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1
     A 1C 1E) | 
 |  189         pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 2
     5 29 2D) | 
 |  190  | 
 |  191         punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0
     B 0D 0F) | 
 |  192         punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2
     B 2D 2F) | 
 |  193  | 
 ; xmmH = 0; unpacking against it zero-extends each byte to a word.
 |  194         pxor      xmmH,xmmH | 
 |  195  | 
 |  196         movdqa    xmmC,xmmA | 
 |  197         punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E) | 
 |  198         punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E) | 
 |  199  | 
 |  200         movdqa    xmmB,xmmE | 
 |  201         punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E) | 
 |  202         punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F) | 
 |  203  | 
 |  204         movdqa    xmmF,xmmD | 
 |  205         punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F) | 
 |  206         punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F) | 
 |  207  | 
 |  208 %else ; RGB_PIXELSIZE == 4 ; ----------- | 
 |  209  | 
 ; Partial load, 4 bytes per pixel. ecx stays a pixel count; its low bits
 ; select 1-, 2-, 4- and 8-pixel pieces (4, 8, 16 and 32 bytes), gathered from
 ; the end of the row towards its start.
 ; NOTE(review): xmmA is not cleared first, so lanes past the last valid pixel
 ; hold unspecified data; see the full-width stores further down.
 |  210 .column_ld1: | 
 |  211         test    cl, SIZEOF_XMMWORD/16 | 
 |  212         jz      short .column_ld2 | 
 |  213         sub     ecx, byte SIZEOF_XMMWORD/16 | 
 |  214         movd    xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] | 
 |  215 .column_ld2: | 
 |  216         test    cl, SIZEOF_XMMWORD/8 | 
 |  217         jz      short .column_ld4 | 
 |  218         sub     ecx, byte SIZEOF_XMMWORD/8 | 
 |  219         movq    xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] | 
 |  220         pslldq  xmmA, SIZEOF_MMWORD | 
 |  221         por     xmmA,xmmE | 
 |  222 .column_ld4: | 
 ; Four more pixels: what was gathered so far moves up to xmmE and the
 ; lower-address XMMWORD takes its place in xmmA.
 |  223         test    cl, SIZEOF_XMMWORD/4 | 
 |  224         jz      short .column_ld8 | 
 |  225         sub     ecx, byte SIZEOF_XMMWORD/4 | 
 |  226         movdqa  xmmE,xmmA | 
 |  227         movdqu  xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] | 
 |  228 .column_ld8: | 
 ; `mov` does not modify flags, so the jz below still acts on the `test`.
 ; ecx is forced to SIZEOF_XMMWORD so the loop tail counts this as the final
 ; iteration. With 8 more pixels, xmmA/xmmE move up to xmmF/xmmH and the first
 ; two XMMWORDs of the row segment are loaded.
 |  229         test    cl, SIZEOF_XMMWORD/2 | 
 |  230         mov     ecx, SIZEOF_XMMWORD | 
 |  231         jz      short .rgb_ycc_cnv | 
 |  232         movdqa  xmmF,xmmA | 
 |  233         movdqa  xmmH,xmmE | 
 |  234         movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] | 
 |  235         movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] | 
 |  236         jmp     short .rgb_ycc_cnv | 
 |  237         alignx  16,7 | 
 |  238  | 
 ; Full-width load: 4 unaligned XMMWORDs.
 |  239 .columnloop: | 
 |  240         movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] | 
 |  241         movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] | 
 |  242         movdqu  xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] | 
 |  243         movdqu  xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] | 
 |  244  | 
 ; De-interleave four bytes per pixel. Lane diagrams: first hex digit is the
 ; component index within a pixel, second is the pixel index (0..F).
 |  245 .rgb_ycc_cnv: | 
 |  246         ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) | 
 |  247         ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) | 
 |  248         ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) | 
 |  249         ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) | 
 |  250  | 
 |  251         movdqa    xmmD,xmmA | 
 |  252         punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 2
     5 31 35) | 
 |  253         punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 2
     7 33 37) | 
 |  254  | 
 |  255         movdqa    xmmC,xmmF | 
 |  256         punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2
     D 39 3D) | 
 |  257         punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2
     F 3B 3F) | 
 |  258  | 
 |  259         movdqa    xmmB,xmmA | 
 |  260         punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 3
     4 38 3C) | 
 |  261         punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 3
     5 39 3D) | 
 |  262  | 
 |  263         movdqa    xmmG,xmmD | 
 |  264         punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 3
     6 3A 3E) | 
 |  265         punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 3
     7 3B 3F) | 
 |  266  | 
 |  267         movdqa    xmmE,xmmA | 
 |  268         punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1
     A 1C 1E) | 
 |  269         punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3
     A 3C 3E) | 
 |  270  | 
 |  271         movdqa    xmmH,xmmB | 
 |  272         punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1
     B 1D 1F) | 
 |  273         punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3
     B 3D 3F) | 
 |  274  | 
 ; xmmF = 0; unpacking against it zero-extends each byte to a word.
 |  275         pxor      xmmF,xmmF | 
 |  276  | 
 |  277         movdqa    xmmC,xmmA | 
 |  278         punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E) | 
 |  279         punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E) | 
 |  280  | 
 |  281         movdqa    xmmD,xmmB | 
 |  282         punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F) | 
 |  283         punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F) | 
 |  284  | 
 |  285         movdqa    xmmG,xmmE | 
 |  286         punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E) | 
 |  287         punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E) | 
 |  288  | 
 ; All eight registers are live, so no zero register is left for the last
 ; pair. The bytes are unpacked into the HIGH half of each word instead
 ; (against the zero in xmmF, and against xmmH itself), and the logical right
 ; shift by BYTE_BIT then leaves them zero-extended in the low half.
 |  289         punpcklbw xmmF,xmmH | 
 |  290         punpckhbw xmmH,xmmH | 
 |  291         psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) | 
 |  292         psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) | 
 |  293  | 
 |  294 %endif ; RGB_PIXELSIZE ; --------------- | 
 |  295  | 
 ; From here on the code uses physical register names. NOTE(review): the
 ; R/G/B assignment stated below depends on how xmmA..xmmH are mapped onto
 ; xmm0..xmm7, which is defined outside this file -- confirm there.
 ; Suffixes used in the comments: E/O = even/odd pixels, L/H = low/high four
 ; words of a register after widening to dwords.
 |  296         ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE | 
 |  297         ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO | 
 |  298  | 
 |  299         ; (Original) | 
 |  300         ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B | 
 |  301         ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE | 
 |  302         ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE | 
 |  303         ; | 
 |  304         ; (This implementation) | 
 |  305         ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G | 
 |  306         ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE | 
 |  307         ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE | 
 |  308  | 
 ; Spill R and B (even and odd); their registers are recycled below. wk() slots
 ; are XMMWORD-aligned relative to the aligned ebp, so movdqa is usable.
 |  309         movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE | 
 |  310         movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO | 
 |  311         movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE | 
 |  312         movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO | 
 |  313  | 
 ; ---- odd pixels: (R,G) terms ----
 ; Interleave RO with GO so each dword holds one (R,G) word pair; pmaddwd then
 ; produces coef0*R + coef1*G per dword. Two coefficient tables are applied to
 ; copies of the same pairs: the partial Y sum and the partial Cb sum.
 |  314         movdqa    xmm6,xmm1 | 
 |  315         punpcklwd xmm1,xmm3 | 
 |  316         punpckhwd xmm6,xmm3 | 
 |  317         movdqa    xmm7,xmm1 | 
 |  318         movdqa    xmm4,xmm6 | 
 |  319         pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FI
     X(0.337) | 
 |  320         pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FI
     X(0.337) | 
 |  321         pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-
     FIX(0.331) | 
 |  322         pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-
     FIX(0.331) | 
 |  323  | 
 ; Keep the partial Y sums for later; the (B,G) terms are added further down.
 |  324         movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) | 
 |  325         movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) | 
 |  326  | 
 ; Unpacking BO into the high word of zeroed dwords gives B << 16; shifting
 ; right by one yields the value the existing comments label B*FIX(0.500),
 ; without a multiply.
 |  327         pxor      xmm1,xmm1 | 
 |  328         pxor      xmm6,xmm6 | 
 |  329         punpcklwd xmm1,xmm5             ; xmm1=BOL | 
 |  330         punpckhwd xmm6,xmm5             ; xmm6=BOH | 
 |  331         psrld     xmm1,1                ; xmm1=BOL*FIX(0.500) | 
 |  332         psrld     xmm6,1                ; xmm6=BOH*FIX(0.500) | 
 |  333  | 
 ; NOTE(review): by its name PD_ONEHALFM1_CJ looks like a combined rounding
 ; and CENTERJSAMPLE bias; the table is defined outside this file -- confirm.
 |  334         movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] | 
 |  335  | 
 ; Cb(odd) = (R,G terms + B term + bias) >> SCALEBITS, narrowed to words.
 |  336         paddd     xmm7,xmm1 | 
 |  337         paddd     xmm4,xmm6 | 
 |  338         paddd     xmm7,xmm5 | 
 |  339         paddd     xmm4,xmm5 | 
 |  340         psrld     xmm7,SCALEBITS        ; xmm7=CbOL | 
 |  341         psrld     xmm4,SCALEBITS        ; xmm4=CbOH | 
 |  342         packssdw  xmm7,xmm4             ; xmm7=CbO | 
 |  343  | 
 |  344         movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE | 
 |  345  | 
 ; ---- even pixels: same (R,G) products and Cb computation ----
 |  346         movdqa    xmm6,xmm0 | 
 |  347         punpcklwd xmm0,xmm2 | 
 |  348         punpckhwd xmm6,xmm2 | 
 |  349         movdqa    xmm5,xmm0 | 
 |  350         movdqa    xmm4,xmm6 | 
 |  351         pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FI
     X(0.337) | 
 |  352         pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FI
     X(0.337) | 
 |  353         pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-
     FIX(0.331) | 
 |  354         pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-
     FIX(0.331) | 
 |  355  | 
 |  356         movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) | 
 |  357         movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) | 
 |  358  | 
 |  359         pxor      xmm0,xmm0 | 
 |  360         pxor      xmm6,xmm6 | 
 |  361         punpcklwd xmm0,xmm1             ; xmm0=BEL | 
 |  362         punpckhwd xmm6,xmm1             ; xmm6=BEH | 
 |  363         psrld     xmm0,1                ; xmm0=BEL*FIX(0.500) | 
 |  364         psrld     xmm6,1                ; xmm6=BEH*FIX(0.500) | 
 |  365  | 
 |  366         movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] | 
 |  367  | 
 |  368         paddd     xmm5,xmm0 | 
 |  369         paddd     xmm4,xmm6 | 
 |  370         paddd     xmm5,xmm1 | 
 |  371         paddd     xmm4,xmm1 | 
 |  372         psrld     xmm5,SCALEBITS        ; xmm5=CbEL | 
 |  373         psrld     xmm4,SCALEBITS        ; xmm4=CbEH | 
 |  374         packssdw  xmm5,xmm4             ; xmm5=CbE | 
 |  375  | 
 ; Re-interleave: odd samples go to the high byte of each word, even samples
 ; stay in the low byte, giving 16 bytes in pixel order. This assumes every
 ; 16-bit result fits in 8 bits.
 ; movdqa store: outptr1 must be XMMWORD-aligned, and a full XMMWORD is written
 ; even on the partial-load path. NOTE(review): relies on the caller supplying
 ; aligned, padded output rows -- confirm.
 |  376         psllw     xmm7,BYTE_BIT | 
 |  377         por       xmm5,xmm7             ; xmm5=Cb | 
 |  378         movdqa    XMMWORD [ebx], xmm5   ; Save Cb | 
 |  379  | 
 ; Reload the spilled inputs for the Y and Cr passes.
 |  380         movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO | 
 |  381         movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE | 
 |  382         movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO | 
 |  383  | 
 ; ---- odd pixels: (B,G) terms ----
 ; Same pairing trick with (B,G): one table supplies the remaining Y terms,
 ; the other the partial Cr sum. xmm3 (GO) is consumed here for the last time.
 |  384         movdqa    xmm4,xmm0 | 
 |  385         punpcklwd xmm0,xmm3 | 
 |  386         punpckhwd xmm4,xmm3 | 
 |  387         movdqa    xmm7,xmm0 | 
 |  388         movdqa    xmm5,xmm4 | 
 |  389         pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FI
     X(0.250) | 
 |  390         pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FI
     X(0.250) | 
 |  391         pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-
     FIX(0.418) | 
 |  392         pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-
     FIX(0.418) | 
 |  393  | 
 |  394         movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] | 
 |  395  | 
 ; Y(odd) = (saved R,G terms + B,G terms + PD_ONEHALF) >> SCALEBITS
 |  396         paddd     xmm0, XMMWORD [wk(4)] | 
 |  397         paddd     xmm4, XMMWORD [wk(5)] | 
 |  398         paddd     xmm0,xmm3 | 
 |  399         paddd     xmm4,xmm3 | 
 |  400         psrld     xmm0,SCALEBITS        ; xmm0=YOL | 
 |  401         psrld     xmm4,SCALEBITS        ; xmm4=YOH | 
 |  402         packssdw  xmm0,xmm4             ; xmm0=YO | 
 |  403  | 
 ; R << 16 >> 1, the same multiply-free half-scale used for B in the Cb pass.
 |  404         pxor      xmm3,xmm3 | 
 |  405         pxor      xmm4,xmm4 | 
 |  406         punpcklwd xmm3,xmm1             ; xmm3=ROL | 
 |  407         punpckhwd xmm4,xmm1             ; xmm4=ROH | 
 |  408         psrld     xmm3,1                ; xmm3=ROL*FIX(0.500) | 
 |  409         psrld     xmm4,1                ; xmm4=ROH*FIX(0.500) | 
 |  410  | 
 |  411         movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] | 
 |  412  | 
 ; Cr(odd) = (B,G terms + R term + bias) >> SCALEBITS
 |  413         paddd     xmm7,xmm3 | 
 |  414         paddd     xmm5,xmm4 | 
 |  415         paddd     xmm7,xmm1 | 
 |  416         paddd     xmm5,xmm1 | 
 |  417         psrld     xmm7,SCALEBITS        ; xmm7=CrOL | 
 |  418         psrld     xmm5,SCALEBITS        ; xmm5=CrOH | 
 |  419         packssdw  xmm7,xmm5             ; xmm7=CrO | 
 |  420  | 
 |  421         movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE | 
 |  422  | 
 ; ---- even pixels: (B,G) terms, then Y and Cr as above ----
 |  423         movdqa    xmm4,xmm6 | 
 |  424         punpcklwd xmm6,xmm2 | 
 |  425         punpckhwd xmm4,xmm2 | 
 |  426         movdqa    xmm1,xmm6 | 
 |  427         movdqa    xmm5,xmm4 | 
 |  428         pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FI
     X(0.250) | 
 |  429         pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FI
     X(0.250) | 
 |  430         pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-
     FIX(0.418) | 
 |  431         pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-
     FIX(0.418) | 
 |  432  | 
 |  433         movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] | 
 |  434  | 
 |  435         paddd     xmm6, XMMWORD [wk(6)] | 
 |  436         paddd     xmm4, XMMWORD [wk(7)] | 
 |  437         paddd     xmm6,xmm2 | 
 |  438         paddd     xmm4,xmm2 | 
 |  439         psrld     xmm6,SCALEBITS        ; xmm6=YEL | 
 |  440         psrld     xmm4,SCALEBITS        ; xmm4=YEH | 
 |  441         packssdw  xmm6,xmm4             ; xmm6=YE | 
 |  442  | 
 ; Merge odd (high byte) and even (low byte) Y samples and store one XMMWORD.
 ; The same alignment and padding caveat as for the Cb store applies to outptr0.
 |  443         psllw     xmm0,BYTE_BIT | 
 |  444         por       xmm6,xmm0             ; xmm6=Y | 
 |  445         movdqa    XMMWORD [edi], xmm6   ; Save Y | 
 |  446  | 
 |  447         pxor      xmm2,xmm2 | 
 |  448         pxor      xmm4,xmm4 | 
 |  449         punpcklwd xmm2,xmm3             ; xmm2=REL | 
 |  450         punpckhwd xmm4,xmm3             ; xmm4=REH | 
 |  451         psrld     xmm2,1                ; xmm2=REL*FIX(0.500) | 
 |  452         psrld     xmm4,1                ; xmm4=REH*FIX(0.500) | 
 |  453  | 
 |  454         movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] | 
 |  455  | 
 |  456         paddd     xmm1,xmm2 | 
 |  457         paddd     xmm5,xmm4 | 
 |  458         paddd     xmm1,xmm0 | 
 |  459         paddd     xmm5,xmm0 | 
 |  460         psrld     xmm1,SCALEBITS        ; xmm1=CrEL | 
 |  461         psrld     xmm5,SCALEBITS        ; xmm5=CrEH | 
 |  462         packssdw  xmm1,xmm5             ; xmm1=CrE | 
 |  463  | 
 ; Merge odd and even Cr samples and store one XMMWORD. The same alignment
 ; and padding caveat applies to outptr2.
 |  464         psllw     xmm7,BYTE_BIT | 
 |  465         por       xmm1,xmm7             ; xmm1=Cr | 
 |  466         movdqa    XMMWORD [edx], xmm1   ; Save Cr | 
 |  467  | 
 ; ---- column loop tail ----
 ; One XMMWORD of output per plane was produced: advance all pointers. Then,
 ; by unsigned compare: SIZEOF_XMMWORD or more columns left -> full load;
 ; 1..SIZEOF_XMMWORD-1 left -> partial load; 0 left -> row finished. On the
 ; partial-load path ecx was forced to SIZEOF_XMMWORD, so it reaches 0 here.
 |  468         sub     ecx, byte SIZEOF_XMMWORD | 
 |  469         add     esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr | 
 |  470         add     edi, byte SIZEOF_XMMWORD                ; outptr0 | 
 |  471         add     ebx, byte SIZEOF_XMMWORD                ; outptr1 | 
 |  472         add     edx, byte SIZEOF_XMMWORD                ; outptr2 | 
 |  473         cmp     ecx, byte SIZEOF_XMMWORD | 
 |  474         jae     near .columnloop | 
 |  475         test    ecx,ecx | 
 |  476         jnz     near .column_ld1 | 
 |  477  | 
 ; ---- row loop tail ----
 ; Restore the values saved at the top of .rowloop (reverse push order), step
 ; every row cursor to the next row pointer and count the row down.
 |  478         pop     ecx                     ; col | 
 |  479         pop     esi | 
 |  480         pop     edi | 
 |  481         pop     ebx | 
 |  482         pop     edx | 
 |  483         poppic  eax | 
 |  484  | 
 |  485         add     esi, byte SIZEOF_JSAMPROW       ; input_buf | 
 |  486         add     edi, byte SIZEOF_JSAMPROW | 
 |  487         add     ebx, byte SIZEOF_JSAMPROW | 
 |  488         add     edx, byte SIZEOF_JSAMPROW | 
 |  489         dec     eax                             ; num_rows | 
 |  490         jg      near .rowloop | 
 |  491  | 
 ; ---- epilogue ----
 ; Also reached by the two early exits above, where the stack holds exactly the
 ; three registers pushed in the prologue. `mov esp,ebp` discards the scratch
 ; area, `pop esp` reloads the unaligned base stored at [ebp] by the prologue,
 ; and `pop ebp` then restores the caller's ebp from that base.
 |  492 .return: | 
 |  493         pop     edi | 
 |  494         pop     esi | 
 |  495 ;       pop     edx             ; need not be preserved | 
 |  496 ;       pop     ecx             ; need not be preserved | 
 |  497         pop     ebx | 
 |  498         mov     esp,ebp         ; esp <- aligned ebp | 
 |  499         pop     esp             ; esp <- original ebp | 
 |  500         pop     ebp | 
 |  501         ret | 
 |  502  | 
 |  503 ; For some reason, the OS X linker does not honor the request to align the | 
 |  504 ; segment unless we do this. | 
 |  505         align   16 | 
| OLD | NEW |