| OLD | NEW | 
 | (Empty) | 
|    1 ; |  | 
|    2 ; jcclrss2.asm - colorspace conversion (SSE2) |  | 
|    3 ; |  | 
|    4 ; x86 SIMD extension for IJG JPEG library |  | 
|    5 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |  | 
|    6 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |  | 
|    7 ; |  | 
|    8 ; This file should be assembled with NASM (Netwide Assembler), |  | 
|    9 ; can *not* be assembled with Microsoft's MASM or any compatible |  | 
|   10 ; assembler (including Borland's Turbo Assembler). |  | 
|   11 ; NASM is available from http://nasm.sourceforge.net/ or |  | 
|   12 ; http://sourceforge.net/project/showfiles.php?group_id=6208 |  | 
|   13 ; |  | 
|   14 ; [TAB8] |  | 
|   15  |  | 
|   16 %include "jcolsamp.inc" |  | 
|   17  |  | 
|   18 ; -------------------------------------------------------------------------- |  | 
|   19 ; |  | 
|   20 ; Convert some rows of samples to the output colorspace. |  | 
|   21 ; |  | 
|   22 ; GLOBAL(void) |  | 
|   23 ; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width, |  | 
|   24 ;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf, |  | 
|   25 ;                             JDIMENSION output_row, int num_rows); |  | 
|   26 ; |  | 
|   27  |  | 
; Stack-argument offsets.  `b` is a base register that points at the slot
; written by the `push ebp` at function entry, so [b+4] is the return
; address and the first C argument is found at [b+8].
|   28 %define img_width(b)    (b)+8                   ; JDIMENSION img_width |  | 
|   29 %define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf |  | 
|   30 %define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf |  | 
|   31 %define output_row(b)   (b)+20          ; JDIMENSION output_row |  | 
|   32 %define num_rows(b)     (b)+24          ; int num_rows |  | 
|   33  |  | 
; Frame layout relative to the realigned ebp established by the prologue:
;   original_ebp          slot at [ebp+0] holding the unaligned frame address
;   wk(0) .. wk(WK_NUM-1) xmmword scratch slots directly below ebp
;   gotptr                pointer-sized slot directly below wk(0)
; WK_NUM is referenced by wk(i) before its own %define; this works because
; the macro body is only expanded where wk(i) is used.
|   34 %define original_ebp    ebp+0 |  | 
|   35 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] |  | 
|   36 %define WK_NUM          8 |  | 
|   37 %define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr |  | 
|   38  |  | 
; Function entry.  Builds a 16-byte-aligned frame with WK_NUM xmmword scratch
; slots, preserves ebx/esi/edi, then loads the arguments.  Control leaves
; through .return without converting anything when img_width is 0 or
; num_rows is <= 0.
|   39         align   16 |  | 
|   40  |  | 
|   41         global  EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE |  | 
|   42  |  | 
|   43 EXTN(jsimd_rgb_ycc_convert_sse2): |  | 
|   44         push    ebp |  | 
|   45         mov     eax,esp                         ; eax = original ebp |  | 
; Realign: reserve one dword, round esp down to a 16-byte boundary and park
; the unaligned frame address (eax) there so the epilogue can `pop esp` it.
|   46         sub     esp, byte 4 |  | 
|   47         and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits |  | 
|   48         mov     [esp],eax |  | 
|   49         mov     ebp,esp                         ; ebp = aligned ebp |  | 
; Move esp below the wk[] scratch array so later pushes cannot overwrite it.
|   50         lea     esp, [wk(0)] |  | 
|   51         pushpic eax             ; make a room for GOT address |  | 
|   52         push    ebx |  | 
|   53 ;       push    ecx             ; need not be preserved |  | 
|   54 ;       push    edx             ; need not be preserved |  | 
|   55         push    esi |  | 
|   56         push    edi |  | 
|   57  |  | 
; get_GOT / movpic are macros defined outside this file view; the GOT address
; is kept in the gotptr slot because ebx is reused as a data pointer below.
|   58         get_GOT ebx                     ; get GOT address |  | 
|   59         movpic  POINTER [gotptr], ebx   ; save GOT address |  | 
|   60  |  | 
; eax still addresses the unaligned frame, so arguments are read through it.
|   61         mov     ecx, JDIMENSION [img_width(eax)] |  | 
|   62         test    ecx,ecx |  | 
|   63         jz      near .return |  | 
|   64  |  | 
; ecx is borrowed for output_row; img_width waits on the stack meanwhile.
|   65         push    ecx |  | 
|   66  |  | 
; edi/ebx/edx = address of row pointer number output_row in output planes
; 0, 1 and 2 respectively.
|   67         mov     esi, JSAMPIMAGE [output_buf(eax)] |  | 
|   68         mov     ecx, JDIMENSION [output_row(eax)] |  | 
|   69         mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] |  | 
|   70         mov     ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] |  | 
|   71         mov     edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] |  | 
|   72         lea     edi, [edi+ecx*SIZEOF_JSAMPROW] |  | 
|   73         lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW] |  | 
|   74         lea     edx, [edx+ecx*SIZEOF_JSAMPROW] |  | 
|   75  |  | 
|   76         pop     ecx |  | 
|   77  |  | 
; esi = input row-pointer array; eax becomes the row counter from here on,
; so no further argument can be read through it.
|   78         mov     esi, JSAMPARRAY [input_buf(eax)] |  | 
|   79         mov     eax, INT [num_rows(eax)] |  | 
|   80         test    eax,eax |  | 
|   81         jle     near .return |  | 
|   82         alignx  16,7 |  | 
; Per-row setup.  Saves the row counter (eax), the four row-pointer cursors
; and the pixel count, then dereferences the cursors to obtain this row's
; data pointers.  A row with at least SIZEOF_XMMWORD pixels starts in
; .columnloop; a shorter row falls through to the partial load at .column_ld1.
; pushpic is a macro defined outside this file view; it is paired with the
; poppic eax executed at the end of the row.
|   83 .rowloop: |  | 
|   84         pushpic eax |  | 
|   85         push    edx |  | 
|   86         push    ebx |  | 
|   87         push    edi |  | 
|   88         push    esi |  | 
|   89         push    ecx                     ; col |  | 
|   90  |  | 
|   91         mov     esi, JSAMPROW [esi]     ; inptr |  | 
|   92         mov     edi, JSAMPROW [edi]     ; outptr0 |  | 
|   93         mov     ebx, JSAMPROW [ebx]     ; outptr1 |  | 
|   94         mov     edx, JSAMPROW [edx]     ; outptr2 |  | 
; eax is free inside the row (its counter value is on the stack), so it
; carries the GOT address used by the GOTOFF() constant references.
|   95         movpic  eax, POINTER [gotptr]   ; load GOT address (eax) |  | 
|   96  |  | 
; Unsigned compare: ecx is the number of pixels still to convert.
|   97         cmp     ecx, byte SIZEOF_XMMWORD |  | 
|   98         jae     near .columnloop |  | 
|   99         alignx  16,7 |  | 
|  100  |  | 
; 3-bytes-per-pixel input.
; .column_ld1 .. .column_ld32 assemble a partial group (fewer than
; SIZEOF_XMMWORD pixels).  The byte count col*3 is consumed bit by bit
; (1, 2, 4, 8, 16, 32): each chunk is read from the end of the data that
; remains, and the bytes gathered so far are shifted above it.  ecx is then
; forced to SIZEOF_XMMWORD so that the loop tail treats this group as the
; last one of the row.
; NOTE(review): when a size bit is clear the corresponding register keeps
; whatever it held before, so lanes past the valid pixel count contain
; unspecified bytes.
; .columnloop loads a full group of 3 xmmwords, and .rgb_ycc_cnv
; de-interleaves the packed bytes into even/odd 16-bit lanes per component.
; In the layout comments below each two-digit entry is <component><pixel>.
|  101 %if RGB_PIXELSIZE == 3 ; --------------- |  | 
|  102  |  | 
|  103 .column_ld1: |  | 
; eax (GOT address) and edx (outptr2) serve as scratch for the 1- and 2-byte
; chunks and are restored once the result has been moved into xmmA.
|  104         push    eax |  | 
|  105         push    edx |  | 
|  106         lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE |  | 
|  107         test    cl, SIZEOF_BYTE |  | 
|  108         jz      short .column_ld2 |  | 
|  109         sub     ecx, byte SIZEOF_BYTE |  | 
|  110         movzx   eax, BYTE [esi+ecx] |  | 
|  111 .column_ld2: |  | 
|  112         test    cl, SIZEOF_WORD |  | 
|  113         jz      short .column_ld4 |  | 
|  114         sub     ecx, byte SIZEOF_WORD |  | 
|  115         movzx   edx, WORD [esi+ecx] |  | 
|  116         shl     eax, WORD_BIT |  | 
|  117         or      eax,edx |  | 
|  118 .column_ld4: |  | 
|  119         movd    xmmA,eax |  | 
|  120         pop     edx |  | 
|  121         pop     eax |  | 
|  122         test    cl, SIZEOF_DWORD |  | 
|  123         jz      short .column_ld8 |  | 
|  124         sub     ecx, byte SIZEOF_DWORD |  | 
|  125         movd    xmmF, XMM_DWORD [esi+ecx] |  | 
|  126         pslldq  xmmA, SIZEOF_DWORD |  | 
|  127         por     xmmA,xmmF |  | 
|  128 .column_ld8: |  | 
|  129         test    cl, SIZEOF_MMWORD |  | 
|  130         jz      short .column_ld16 |  | 
|  131         sub     ecx, byte SIZEOF_MMWORD |  | 
|  132         movq    xmmB, XMM_MMWORD [esi+ecx] |  | 
|  133         pslldq  xmmA, SIZEOF_MMWORD |  | 
|  134         por     xmmA,xmmB |  | 
|  135 .column_ld16: |  | 
; One whole xmmword precedes the gathered tail: the tail moves up to xmmF.
|  136         test    cl, SIZEOF_XMMWORD |  | 
|  137         jz      short .column_ld32 |  | 
|  138         movdqa  xmmF,xmmA |  | 
|  139         movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] |  | 
|  140         mov     ecx, SIZEOF_XMMWORD |  | 
|  141         jmp     short .rgb_ycc_cnv |  | 
|  142 .column_ld32: |  | 
; mov does not alter flags, so the jz below still acts on the test result.
; Two whole xmmwords precede the gathered tail: the tail moves up to xmmB.
|  143         test    cl, 2*SIZEOF_XMMWORD |  | 
|  144         mov     ecx, SIZEOF_XMMWORD |  | 
|  145         jz      short .rgb_ycc_cnv |  | 
|  146         movdqa  xmmB,xmmA |  | 
|  147         movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] |  | 
|  148         movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] |  | 
|  149         jmp     short .rgb_ycc_cnv |  | 
|  150         alignx  16,7 |  | 
|  151  |  | 
|  152 .columnloop: |  | 
; Unaligned loads: no alignment is required of the input row.
|  153         movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] |  | 
|  154         movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] |  | 
|  155         movdqu  xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] |  | 
|  156  |  | 
|  157 .rgb_ycc_cnv: |  | 
|  158         ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) |  | 
|  159         ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) |  | 
|  160         ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) |  | 
|  161  |  | 
; Three rounds of shift-by-8 plus byte unpack turn the 3-byte interleave
; into per-component vectors split by even/odd pixel index.
|  162         movdqa    xmmG,xmmA |  | 
|  163         pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 2
     1 02 12) |  | 
|  164         psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -
     - -- --) |  | 
|  165  |  | 
|  166         punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0
     A 12 1A) |  | 
|  167         pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 0
     7 17 27) |  | 
|  168  |  | 
|  169         punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2
     C 05 0D) |  | 
|  170         punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1
     F 27 2F) |  | 
|  171  |  | 
|  172         movdqa    xmmD,xmmA |  | 
|  173         pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 2
     8 01 09) |  | 
|  174         psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -
     - -- --) |  | 
|  175  |  | 
|  176         punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 0
     5 09 0D) |  | 
|  177         pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1
     B 23 2B) |  | 
|  178  |  | 
|  179         punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 1
     6 1A 1E) |  | 
|  180         punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 2
     7 2B 2F) |  | 
|  181  |  | 
|  182         movdqa    xmmE,xmmA |  | 
|  183         pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 1
     4 18 1C) |  | 
|  184         psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -
     - -- --) |  | 
|  185  |  | 
|  186         punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1
     A 1C 1E) |  | 
|  187         pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 2
     5 29 2D) |  | 
|  188  |  | 
|  189         punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0
     B 0D 0F) |  | 
|  190         punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2
     B 2D 2F) |  | 
|  191  |  | 
; Zero-extend every byte to a 16-bit lane by unpacking against zero (xmmH).
|  192         pxor      xmmH,xmmH |  | 
|  193  |  | 
|  194         movdqa    xmmC,xmmA |  | 
|  195         punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E) |  | 
|  196         punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E) |  | 
|  197  |  | 
|  198         movdqa    xmmB,xmmE |  | 
|  199         punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E) |  | 
|  200         punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F) |  | 
|  201  |  | 
|  202         movdqa    xmmF,xmmD |  | 
|  203         punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F) |  | 
|  204         punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F) |  | 
|  205  |  | 
|  206 %else ; RGB_PIXELSIZE == 4 ; ----------- |  | 
|  207  |  | 
; 4-bytes-per-pixel input.
; .column_ld1 .. .column_ld8 assemble a partial group (fewer than
; SIZEOF_XMMWORD pixels).  Here ecx stays a pixel count; its bits
; (1, 2, 4, 8 pixels) select 4-, 8-, 16- and 32-byte chunks that are read
; from the end of the remaining data, with addresses scaled by
; RGB_PIXELSIZE.  ecx is then forced to SIZEOF_XMMWORD so that the loop tail
; treats this group as the last one of the row.
; NOTE(review): when a size bit is clear the corresponding register keeps
; whatever it held before, so lanes past the valid pixel count contain
; unspecified bytes.
|  208 .column_ld1: |  | 
|  209         test    cl, SIZEOF_XMMWORD/16 |  | 
|  210         jz      short .column_ld2 |  | 
|  211         sub     ecx, byte SIZEOF_XMMWORD/16 |  | 
|  212         movd    xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] |  | 
|  213 .column_ld2: |  | 
|  214         test    cl, SIZEOF_XMMWORD/8 |  | 
|  215         jz      short .column_ld4 |  | 
|  216         sub     ecx, byte SIZEOF_XMMWORD/8 |  | 
|  217         movq    xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] |  | 
|  218         pslldq  xmmA, SIZEOF_MMWORD |  | 
|  219         por     xmmA,xmmE |  | 
|  220 .column_ld4: |  | 
; Four whole pixels precede the gathered tail: the tail moves up to xmmE.
|  221         test    cl, SIZEOF_XMMWORD/4 |  | 
|  222         jz      short .column_ld8 |  | 
|  223         sub     ecx, byte SIZEOF_XMMWORD/4 |  | 
|  224         movdqa  xmmE,xmmA |  | 
|  225         movdqu  xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] |  | 
|  226 .column_ld8: |  | 
; mov does not alter flags, so the jz below still acts on the test result.
; Eight whole pixels precede the tail: xmmA/xmmE move up to xmmF/xmmH.
|  227         test    cl, SIZEOF_XMMWORD/2 |  | 
|  228         mov     ecx, SIZEOF_XMMWORD |  | 
|  229         jz      short .rgb_ycc_cnv |  | 
|  230         movdqa  xmmF,xmmA |  | 
|  231         movdqa  xmmH,xmmE |  | 
|  232         movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] |  | 
|  233         movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] |  | 
|  234         jmp     short .rgb_ycc_cnv |  | 
|  235         alignx  16,7 |  | 
|  236  |  | 
|  237 .columnloop: |  | 
; Full group: 4 xmmwords, read with unaligned loads.
|  238         movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] |  | 
|  239         movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] |  | 
|  240         movdqu  xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] |  | 
|  241         movdqu  xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] |  | 
|  242  |  | 
|  243 .rgb_ycc_cnv: |  | 
; In the layout comments each two-digit entry is <component><pixel>.
|  244         ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) |  | 
|  245         ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) |  | 
|  246         ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) |  | 
|  247         ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) |  | 
|  248  |  | 
; Byte, word, then byte unpacks transpose the 4-byte interleave into
; per-component vectors split by even/odd pixel index.
|  249         movdqa    xmmD,xmmA |  | 
|  250         punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 2
     5 31 35) |  | 
|  251         punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 2
     7 33 37) |  | 
|  252  |  | 
|  253         movdqa    xmmC,xmmF |  | 
|  254         punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2
     D 39 3D) |  | 
|  255         punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2
     F 3B 3F) |  | 
|  256  |  | 
|  257         movdqa    xmmB,xmmA |  | 
|  258         punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 3
     4 38 3C) |  | 
|  259         punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 3
     5 39 3D) |  | 
|  260  |  | 
|  261         movdqa    xmmG,xmmD |  | 
|  262         punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 3
     6 3A 3E) |  | 
|  263         punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 3
     7 3B 3F) |  | 
|  264  |  | 
|  265         movdqa    xmmE,xmmA |  | 
|  266         punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1
     A 1C 1E) |  | 
|  267         punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3
     A 3C 3E) |  | 
|  268  |  | 
|  269         movdqa    xmmH,xmmB |  | 
|  270         punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1
     B 1D 1F) |  | 
|  271         punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3
     B 3D 3F) |  | 
|  272  |  | 
; Zero-extend every byte to a 16-bit lane by unpacking against zero (xmmF).
|  273         pxor      xmmF,xmmF |  | 
|  274  |  | 
|  275         movdqa    xmmC,xmmA |  | 
|  276         punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E) |  | 
|  277         punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E) |  | 
|  278  |  | 
|  279         movdqa    xmmD,xmmB |  | 
|  280         punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F) |  | 
|  281         punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F) |  | 
|  282  |  | 
|  283         movdqa    xmmG,xmmE |  | 
|  284         punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E) |  | 
|  285         punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E) |  | 
|  286  |  | 
; No register is left to hold a copy of xmmH, so the last pair is widened
; differently: each data byte is placed in the high byte of a word and a
; logical right shift by BYTE_BIT then clears the low byte.
|  287         punpcklbw xmmF,xmmH |  | 
|  288         punpckhbw xmmH,xmmH |  | 
|  289         psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) |  | 
|  290         psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) |  | 
|  291  |  | 
|  292 %endif ; RGB_PIXELSIZE ; --------------- |  | 
|  293  |  | 
; Fixed-point RGB -> YCbCr for one group of pixels held as even/odd 16-bit
; lanes.  From here on physical register names are used; how the xmmA..xmmH
; aliases of the load code map onto xmm0..xmm7 is defined outside this file
; view (presumably in jcolsamp.inc, depending on the RGB byte order).
; Pattern used throughout: two components are word-interleaved so that one
; pmaddwd yields c1*x + c2*y per dword; a constant (PD_ONEHALF or
; PD_ONEHALFM1_CJ, defined elsewhere) is added; the sum is shifted right by
; SCALEBITS and packed back to words.  Odd-pixel results are then shifted
; into the high byte of each word and OR'ed with the even-pixel results to
; rebuild one xmmword of output bytes per plane.
|  294         ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE |  | 
|  295         ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO |  | 
|  296  |  | 
|  297         ; (Original) |  | 
|  298         ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B |  | 
|  299         ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE |  | 
|  300         ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE |  | 
|  301         ; |  | 
|  302         ; (This implementation) |  | 
|  303         ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G |  | 
|  304         ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE |  | 
|  305         ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE |  | 
|  306  |  | 
; R and B are spilled to the scratch array because xmm0/xmm1/xmm4/xmm5 are
; overwritten as temporaries below; G stays live in xmm2/xmm3.
|  307         movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE |  | 
|  308         movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO |  | 
|  309         movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE |  | 
|  310         movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO |  | 
|  311  |  | 
; ---- odd pixels: R/G part of Y (kept in wk(4..5)) and R/G part of Cb ----
|  312         movdqa    xmm6,xmm1 |  | 
|  313         punpcklwd xmm1,xmm3 |  | 
|  314         punpckhwd xmm6,xmm3 |  | 
|  315         movdqa    xmm7,xmm1 |  | 
|  316         movdqa    xmm4,xmm6 |  | 
|  317         pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FI
     X(0.337) |  | 
|  318         pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FI
     X(0.337) |  | 
|  319         pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-
     FIX(0.331) |  | 
|  320         pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-
     FIX(0.331) |  | 
|  321  |  | 
|  322         movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) |  | 
|  323         movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) |  | 
|  324  |  | 
; The 0.5 coefficient needs no multiply: unpacking a zero register with B
; puts each sample in the upper word of a dword (B << 16) and the shift
; right by 1 leaves B << 15.  (This equals B*FIX(0.5) if SCALEBITS is 16;
; SCALEBITS is defined elsewhere - TODO confirm.)
|  325         pxor      xmm1,xmm1 |  | 
|  326         pxor      xmm6,xmm6 |  | 
|  327         punpcklwd xmm1,xmm5             ; xmm1=BOL |  | 
|  328         punpckhwd xmm6,xmm5             ; xmm6=BOH |  | 
|  329         psrld     xmm1,1                ; xmm1=BOL*FIX(0.500) |  | 
|  330         psrld     xmm6,1                ; xmm6=BOH*FIX(0.500) |  | 
|  331  |  | 
|  332         movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] |  | 
|  333  |  | 
|  334         paddd     xmm7,xmm1 |  | 
|  335         paddd     xmm4,xmm6 |  | 
|  336         paddd     xmm7,xmm5 |  | 
|  337         paddd     xmm4,xmm5 |  | 
|  338         psrld     xmm7,SCALEBITS        ; xmm7=CbOL |  | 
|  339         psrld     xmm4,SCALEBITS        ; xmm4=CbOH |  | 
|  340         packssdw  xmm7,xmm4             ; xmm7=CbO |  | 
|  341  |  | 
|  342         movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE |  | 
|  343  |  | 
; ---- even pixels: R/G part of Y (kept in wk(6..7)) and R/G part of Cb ----
|  344         movdqa    xmm6,xmm0 |  | 
|  345         punpcklwd xmm0,xmm2 |  | 
|  346         punpckhwd xmm6,xmm2 |  | 
|  347         movdqa    xmm5,xmm0 |  | 
|  348         movdqa    xmm4,xmm6 |  | 
|  349         pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FI
     X(0.337) |  | 
|  350         pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FI
     X(0.337) |  | 
|  351         pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-
     FIX(0.331) |  | 
|  352         pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-
     FIX(0.331) |  | 
|  353  |  | 
|  354         movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) |  | 
|  355         movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) |  | 
|  356  |  | 
|  357         pxor      xmm0,xmm0 |  | 
|  358         pxor      xmm6,xmm6 |  | 
|  359         punpcklwd xmm0,xmm1             ; xmm0=BEL |  | 
|  360         punpckhwd xmm6,xmm1             ; xmm6=BEH |  | 
|  361         psrld     xmm0,1                ; xmm0=BEL*FIX(0.500) |  | 
|  362         psrld     xmm6,1                ; xmm6=BEH*FIX(0.500) |  | 
|  363  |  | 
|  364         movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] |  | 
|  365  |  | 
|  366         paddd     xmm5,xmm0 |  | 
|  367         paddd     xmm4,xmm6 |  | 
|  368         paddd     xmm5,xmm1 |  | 
|  369         paddd     xmm4,xmm1 |  | 
|  370         psrld     xmm5,SCALEBITS        ; xmm5=CbEL |  | 
|  371         psrld     xmm4,SCALEBITS        ; xmm4=CbEH |  | 
|  372         packssdw  xmm5,xmm4             ; xmm5=CbE |  | 
|  373  |  | 
; Merge: odd-pixel results go to the high byte of each word, even-pixel
; results stay in the low byte.  movdqa requires [ebx] to be 16-byte aligned.
|  374         psllw     xmm7,BYTE_BIT |  | 
|  375         por       xmm5,xmm7             ; xmm5=Cb |  | 
|  376         movdqa    XMMWORD [ebx], xmm5   ; Save Cb |  | 
|  377  |  | 
|  378         movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO |  | 
|  379         movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE |  | 
|  380         movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO |  | 
|  381  |  | 
; ---- odd pixels: B/G part of Y and of Cr; Y is completed with wk(4..5) ----
|  382         movdqa    xmm4,xmm0 |  | 
|  383         punpcklwd xmm0,xmm3 |  | 
|  384         punpckhwd xmm4,xmm3 |  | 
|  385         movdqa    xmm7,xmm0 |  | 
|  386         movdqa    xmm5,xmm4 |  | 
|  387         pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FI
     X(0.250) |  | 
|  388         pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FI
     X(0.250) |  | 
|  389         pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-
     FIX(0.418) |  | 
|  390         pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-
     FIX(0.418) |  | 
|  391  |  | 
; GO (xmm3) is no longer needed after the unpacks above, so xmm3 is reused.
|  392         movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] |  | 
|  393  |  | 
|  394         paddd     xmm0, XMMWORD [wk(4)] |  | 
|  395         paddd     xmm4, XMMWORD [wk(5)] |  | 
|  396         paddd     xmm0,xmm3 |  | 
|  397         paddd     xmm4,xmm3 |  | 
|  398         psrld     xmm0,SCALEBITS        ; xmm0=YOL |  | 
|  399         psrld     xmm4,SCALEBITS        ; xmm4=YOH |  | 
|  400         packssdw  xmm0,xmm4             ; xmm0=YO |  | 
|  401  |  | 
|  402         pxor      xmm3,xmm3 |  | 
|  403         pxor      xmm4,xmm4 |  | 
|  404         punpcklwd xmm3,xmm1             ; xmm3=ROL |  | 
|  405         punpckhwd xmm4,xmm1             ; xmm4=ROH |  | 
|  406         psrld     xmm3,1                ; xmm3=ROL*FIX(0.500) |  | 
|  407         psrld     xmm4,1                ; xmm4=ROH*FIX(0.500) |  | 
|  408  |  | 
|  409         movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] |  | 
|  410  |  | 
|  411         paddd     xmm7,xmm3 |  | 
|  412         paddd     xmm5,xmm4 |  | 
|  413         paddd     xmm7,xmm1 |  | 
|  414         paddd     xmm5,xmm1 |  | 
|  415         psrld     xmm7,SCALEBITS        ; xmm7=CrOL |  | 
|  416         psrld     xmm5,SCALEBITS        ; xmm5=CrOH |  | 
|  417         packssdw  xmm7,xmm5             ; xmm7=CrO |  | 
|  418  |  | 
|  419         movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE |  | 
|  420  |  | 
; ---- even pixels: B/G part of Y and of Cr; Y is completed with wk(6..7) ----
|  421         movdqa    xmm4,xmm6 |  | 
|  422         punpcklwd xmm6,xmm2 |  | 
|  423         punpckhwd xmm4,xmm2 |  | 
|  424         movdqa    xmm1,xmm6 |  | 
|  425         movdqa    xmm5,xmm4 |  | 
|  426         pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FI
     X(0.250) |  | 
|  427         pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FI
     X(0.250) |  | 
|  428         pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-
     FIX(0.418) |  | 
|  429         pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-
     FIX(0.418) |  | 
|  430  |  | 
; GE (xmm2) is no longer needed after the unpacks above, so xmm2 is reused.
|  431         movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] |  | 
|  432  |  | 
|  433         paddd     xmm6, XMMWORD [wk(6)] |  | 
|  434         paddd     xmm4, XMMWORD [wk(7)] |  | 
|  435         paddd     xmm6,xmm2 |  | 
|  436         paddd     xmm4,xmm2 |  | 
|  437         psrld     xmm6,SCALEBITS        ; xmm6=YEL |  | 
|  438         psrld     xmm4,SCALEBITS        ; xmm4=YEH |  | 
|  439         packssdw  xmm6,xmm4             ; xmm6=YE |  | 
|  440  |  | 
; movdqa requires [edi] to be 16-byte aligned.
|  441         psllw     xmm0,BYTE_BIT |  | 
|  442         por       xmm6,xmm0             ; xmm6=Y |  | 
|  443         movdqa    XMMWORD [edi], xmm6   ; Save Y |  | 
|  444  |  | 
|  445         pxor      xmm2,xmm2 |  | 
|  446         pxor      xmm4,xmm4 |  | 
|  447         punpcklwd xmm2,xmm3             ; xmm2=REL |  | 
|  448         punpckhwd xmm4,xmm3             ; xmm4=REH |  | 
|  449         psrld     xmm2,1                ; xmm2=REL*FIX(0.500) |  | 
|  450         psrld     xmm4,1                ; xmm4=REH*FIX(0.500) |  | 
|  451  |  | 
|  452         movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] |  | 
|  453  |  | 
|  454         paddd     xmm1,xmm2 |  | 
|  455         paddd     xmm5,xmm4 |  | 
|  456         paddd     xmm1,xmm0 |  | 
|  457         paddd     xmm5,xmm0 |  | 
|  458         psrld     xmm1,SCALEBITS        ; xmm1=CrEL |  | 
|  459         psrld     xmm5,SCALEBITS        ; xmm5=CrEH |  | 
|  460         packssdw  xmm1,xmm5             ; xmm1=CrE |  | 
|  461  |  | 
; movdqa requires [edx] to be 16-byte aligned.
|  462         psllw     xmm7,BYTE_BIT |  | 
|  463         por       xmm1,xmm7             ; xmm1=Cr |  | 
|  464         movdqa    XMMWORD [edx], xmm1   ; Save Cr |  | 
|  465  |  | 
; Column-loop tail: one group of SIZEOF_XMMWORD pixels is done.  Loop while
; at least a full group remains; a non-zero remainder goes through the
; partial-load path once.  That path sets ecx to SIZEOF_XMMWORD, so the
; subtraction here reaches zero after it.
|  466         sub     ecx, byte SIZEOF_XMMWORD |  | 
|  467         add     esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr |  | 
|  468         add     edi, byte SIZEOF_XMMWORD                ; outptr0 |  | 
|  469         add     ebx, byte SIZEOF_XMMWORD                ; outptr1 |  | 
|  470         add     edx, byte SIZEOF_XMMWORD                ; outptr2 |  | 
|  471         cmp     ecx, byte SIZEOF_XMMWORD |  | 
|  472         jae     near .columnloop |  | 
|  473         test    ecx,ecx |  | 
|  474         jnz     near .column_ld1 |  | 
|  475  |  | 
; Row-loop tail: restore what .rowloop pushed (reverse order), step every
; row-pointer cursor to the next row and count the row down.
|  476         pop     ecx                     ; col |  | 
|  477         pop     esi |  | 
|  478         pop     edi |  | 
|  479         pop     ebx |  | 
|  480         pop     edx |  | 
|  481         poppic  eax |  | 
|  482  |  | 
|  483         add     esi, byte SIZEOF_JSAMPROW       ; input_buf |  | 
|  484         add     edi, byte SIZEOF_JSAMPROW |  | 
|  485         add     ebx, byte SIZEOF_JSAMPROW |  | 
|  486         add     edx, byte SIZEOF_JSAMPROW |  | 
|  487         dec     eax                             ; num_rows |  | 
|  488         jg      near .rowloop |  | 
; Epilogue.  Restores the callee-saved registers pushed by the prologue, then
; unwinds the aligned frame: esp is pointed at the aligned ebp, `pop esp`
; reloads the unaligned frame address that the prologue stored there, and
; `pop ebp` restores the caller's ebp before returning.
|  490 .return: |  | 
|  491         pop     edi |  | 
|  492         pop     esi |  | 
|  493 ;       pop     edx             ; need not be preserved |  | 
|  494 ;       pop     ecx             ; need not be preserved |  | 
|  495         pop     ebx |  | 
|  496         mov     esp,ebp         ; esp <- aligned ebp |  | 
|  497         pop     esp             ; esp <- original ebp |  | 
|  498         pop     ebp |  | 
|  499         ret |  | 
|  500  |  | 
|  501 ; For some reason, the OS X linker does not honor the request to align the |  | 
|  502 ; segment unless we do this. |  | 
|  503         align   16 |  | 
| OLD | NEW |