;
; jcclrss2-64.asm - colorspace conversion (64-bit SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2009, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                             JDIMENSION output_row, int num_rows);
;
; For each of num_rows rows, reads img_width interleaved pixels of
; RGB_PIXELSIZE bytes from *input_buf++ and writes one Y, one Cb and one Cr
; sample per pixel to rows output_row, output_row+1, ... of output_buf[0],
; output_buf[1] and output_buf[2] respectively.
;
; Requirements visible from the code below:
;   - Pixels are handled SIZEOF_XMMWORD at a time.  A trailing partial group
;     is loaded piecewise, so no input byte beyond the row is read, but a
;     full XMMWORD is still stored to each output row.  Output rows must
;     therefore have room for the width rounded up to SIZEOF_XMMWORD samples.
;   - Output rows are written with movdqa and must be XMMWORD-aligned.
;     Input rows are read with movdqu and need no particular alignment.
;   - img_width and output_row are 32-bit values; only the low 32 bits of
;     their argument registers are used.  num_rows <= 0 or img_width == 0
;     returns without touching memory.
;
; NOTE(review): collect_args/uncollect_args, EXTN, the xmmA..xmmH aliases,
; RGB_PIXELSIZE, SCALEBITS and the PW_*/PD_* constant tables are not defined
; in this file; they are expected to come from the include file(s) or from
; the file that includes this one.
;

; Arguments as left by collect_args:
; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows

%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          8

        align   16

        global  EXTN(jsimd_rgb_ycc_convert_sse2)

EXTN(jsimd_rgb_ycc_convert_sse2):
        ; Build an XMMWORD-aligned frame: [rbp] holds the pre-alignment rsp,
        ; wk(0)..wk(WK_NUM-1) sit immediately below rbp.
        push    rbp
        mov     rax,rsp                         ; rax = rsp after push (-> saved rbp)
        sub     rsp, byte 4                     ; with rsp 8-byte aligned, the AND below
                                                ;  then leaves >= 8 bytes for the save
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax                       ; remember pre-alignment rsp
        mov     rbp,rsp                         ; rbp = aligned frame base
        lea     rsp, [wk(0)]                    ; reserve wk[WK_NUM]
        collect_args
        push    rbx

        mov     ecx, r10d                       ; img_width (32-bit; zero-extend,
                                                ;  upper half of r10 is not defined)
        test    rcx,rcx
        jz      near .return

        push    rcx                             ; keep img_width; rcx is reused below

        mov     rsi, r12                        ; output_buf
        mov     ecx, r13d                       ; output_row (32-bit; zero-extend)
        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
        mov     rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
        mov     rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; &output_buf[0][output_row]
        lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; &output_buf[1][output_row]
        lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; &output_buf[2][output_row]

        pop     rcx                             ; rcx = img_width

        mov     rsi, r11                        ; input_buf
        mov     eax, r14d                       ; num_rows
        test    eax,eax                         ; signed 32-bit test: num_rows is an int
        jle     near .return
.rowloop:
        ; rsi/rdi/rbx/rdx = row-pointer cursors, rcx = img_width, rax = rows left
        push    rdx
        push    rbx
        push    rdi
        push    rsi
        push    rcx                     ; col

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr0
        mov     rbx, JSAMPROW [rbx]     ; outptr1
        mov     rdx, JSAMPROW [rdx]     ; outptr2

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop

%if RGB_PIXELSIZE == 3 ; ---------------

        ; Partial group (rcx < SIZEOF_XMMWORD pixels): convert rcx to a byte
        ; count and gather byte/word/dword/qword/xmmword pieces starting from
        ; the end of the row, shifting what was already gathered upward so
        ; memory order is preserved.  rax and rdx are borrowed as scratch.
.column_ld1:
        push    rax
        push    rdx
        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_BYTE
        movzx   rax, BYTE [rsi+rcx]
.column_ld2:
        test    cl, SIZEOF_WORD
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_WORD
        movzx   rdx, WORD [rsi+rcx]
        shl     rax, WORD_BIT
        or      rax,rdx
.column_ld4:
        movd    xmmA,eax
        pop     rdx
        pop     rax
        test    cl, SIZEOF_DWORD
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_DWORD
        movd    xmmF, XMM_DWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_DWORD
        por     xmmA,xmmF
.column_ld8:
        test    cl, SIZEOF_MMWORD
        jz      short .column_ld16
        sub     rcx, byte SIZEOF_MMWORD
        movq    xmmB, XMM_MMWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmB
.column_ld16:
        test    cl, SIZEOF_XMMWORD
        jz      short .column_ld32
        movdqa  xmmF,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        mov     rcx, SIZEOF_XMMWORD     ; one full pass through the converter
        jmp     short .rgb_ycc_cnv
.column_ld32:
        test    cl, 2*SIZEOF_XMMWORD
        mov     rcx, SIZEOF_XMMWORD     ; (mov leaves the flags from test intact)
        jz      short .rgb_ycc_cnv
        movdqa  xmmB,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_ycc_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
        ; Notation "cp": c = component index within the pixel, p = pixel index.
        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        movdqa    xmmG,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

        movdqa    xmmD,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

        movdqa    xmmE,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

        pxor      xmmH,xmmH

        ; Widen bytes to words (zero-extend via unpack with zero).
        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmB,xmmE
        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)

        movdqa    xmmF,xmmD
        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; Partial group (rcx < SIZEOF_XMMWORD pixels): gather 1/2/4/8-pixel
        ; pieces starting from the end of the row; earlier pieces are moved
        ; into the higher registers so memory order is preserved.
.column_ld1:
        test    cl, SIZEOF_XMMWORD/16
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_XMMWORD/16
        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
        test    cl, SIZEOF_XMMWORD/8
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_XMMWORD/8
        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmE
.column_ld4:
        test    cl, SIZEOF_XMMWORD/4
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_XMMWORD/4
        movdqa  xmmE,xmmA
        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
        test    cl, SIZEOF_XMMWORD/2
        mov     rcx, SIZEOF_XMMWORD     ; (mov leaves the flags from test intact)
        jz      short .rgb_ycc_cnv
        movdqa  xmmF,xmmA
        movdqa  xmmH,xmmE
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_ycc_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
        ; Notation "cp": c = component index within the pixel, p = pixel index.
        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        movdqa    xmmD,xmmA
        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

        movdqa    xmmC,xmmF
        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

        movdqa    xmmB,xmmA
        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

        movdqa    xmmG,xmmD
        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

        movdqa    xmmE,xmmA
        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

        movdqa    xmmH,xmmB
        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

        pxor      xmmF,xmmF

        ; Widen bytes to words (zero-extend via unpack with zero).
        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmD,xmmB
        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)

        movdqa    xmmG,xmmE
        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)

        ; xmmF is the zero register here and is also a destination, so the
        ; bytes land in the high half of each word and are shifted down.
        punpcklbw xmmF,xmmH
        punpckhbw xmmH,xmmH
        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

        ; (Original)
        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
        ;
        ; (This implementation)
        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
        ;
        ; Even (E) and odd (O) pixels are computed separately as 16-bit
        ; results; the odd result is shifted into the high byte of each word
        ; and OR-ed with the even result to form the output bytes.

        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
        movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
        movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
        movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO

        movdqa    xmm6,xmm1
        punpcklwd xmm1,xmm3
        punpckhwd xmm6,xmm3
        movdqa    xmm7,xmm1
        movdqa    xmm4,xmm6
        pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
        pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
        pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

        movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
        movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

        ; Unpacking B into the high word of each dword, then shifting right
        ; by one, yields B scaled by half of 2^16.
        pxor      xmm1,xmm1
        pxor      xmm6,xmm6
        punpcklwd xmm1,xmm5             ; xmm1=BOL
        punpckhwd xmm6,xmm5             ; xmm6=BOH
        psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
        psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)

        movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]

        paddd     xmm7,xmm1
        paddd     xmm4,xmm6
        paddd     xmm7,xmm5
        paddd     xmm4,xmm5
        psrld     xmm7,SCALEBITS        ; xmm7=CbOL
        psrld     xmm4,SCALEBITS        ; xmm4=CbOH
        packssdw  xmm7,xmm4             ; xmm7=CbO

        movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE

        movdqa    xmm6,xmm0
        punpcklwd xmm0,xmm2
        punpckhwd xmm6,xmm2
        movdqa    xmm5,xmm0
        movdqa    xmm4,xmm6
        pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
        pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
        pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

        movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
        movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

        pxor      xmm0,xmm0
        pxor      xmm6,xmm6
        punpcklwd xmm0,xmm1             ; xmm0=BEL
        punpckhwd xmm6,xmm1             ; xmm6=BEH
        psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
        psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)

        movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]

        paddd     xmm5,xmm0
        paddd     xmm4,xmm6
        paddd     xmm5,xmm1
        paddd     xmm4,xmm1
        psrld     xmm5,SCALEBITS        ; xmm5=CbEL
        psrld     xmm4,SCALEBITS        ; xmm4=CbEH
        packssdw  xmm5,xmm4             ; xmm5=CbE

        psllw     xmm7,BYTE_BIT         ; odd samples -> high byte of each word
        por       xmm5,xmm7             ; xmm5=Cb
        movdqa    XMMWORD [rbx], xmm5   ; Save Cb

        movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
        movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO

        movdqa    xmm4,xmm0
        punpcklwd xmm0,xmm3
        punpckhwd xmm4,xmm3
        movdqa    xmm7,xmm0
        movdqa    xmm5,xmm4
        pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
        pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
        pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

        movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]

        paddd     xmm0, XMMWORD [wk(4)]
        paddd     xmm4, XMMWORD [wk(5)]
        paddd     xmm0,xmm3
        paddd     xmm4,xmm3
        psrld     xmm0,SCALEBITS        ; xmm0=YOL
        psrld     xmm4,SCALEBITS        ; xmm4=YOH
        packssdw  xmm0,xmm4             ; xmm0=YO

        pxor      xmm3,xmm3
        pxor      xmm4,xmm4
        punpcklwd xmm3,xmm1             ; xmm3=ROL
        punpckhwd xmm4,xmm1             ; xmm4=ROH
        psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
        psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)

        movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]

        paddd     xmm7,xmm3
        paddd     xmm5,xmm4
        paddd     xmm7,xmm1
        paddd     xmm5,xmm1
        psrld     xmm7,SCALEBITS        ; xmm7=CrOL
        psrld     xmm5,SCALEBITS        ; xmm5=CrOH
        packssdw  xmm7,xmm5             ; xmm7=CrO

        movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE

        movdqa    xmm4,xmm6
        punpcklwd xmm6,xmm2
        punpckhwd xmm4,xmm2
        movdqa    xmm1,xmm6
        movdqa    xmm5,xmm4
        pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
        pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
        pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

        movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]

        paddd     xmm6, XMMWORD [wk(6)]
        paddd     xmm4, XMMWORD [wk(7)]
        paddd     xmm6,xmm2
        paddd     xmm4,xmm2
        psrld     xmm6,SCALEBITS        ; xmm6=YEL
        psrld     xmm4,SCALEBITS        ; xmm4=YEH
        packssdw  xmm6,xmm4             ; xmm6=YE

        psllw     xmm0,BYTE_BIT         ; odd samples -> high byte of each word
        por       xmm6,xmm0             ; xmm6=Y
        movdqa    XMMWORD [rdi], xmm6   ; Save Y

        pxor      xmm2,xmm2
        pxor      xmm4,xmm4
        punpcklwd xmm2,xmm3             ; xmm2=REL
        punpckhwd xmm4,xmm3             ; xmm4=REH
        psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
        psrld     xmm4,1                ; xmm4=REH*FIX(0.500)

        movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]

        paddd     xmm1,xmm2
        paddd     xmm5,xmm4
        paddd     xmm1,xmm0
        paddd     xmm5,xmm0
        psrld     xmm1,SCALEBITS        ; xmm1=CrEL
        psrld     xmm5,SCALEBITS        ; xmm5=CrEH
        packssdw  xmm1,xmm5             ; xmm1=CrE

        psllw     xmm7,BYTE_BIT         ; odd samples -> high byte of each word
        por       xmm1,xmm7             ; xmm1=Cr
        movdqa    XMMWORD [rdx], xmm1   ; Save Cr

        sub     rcx, byte SIZEOF_XMMWORD
        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
        add     rbx, byte SIZEOF_XMMWORD                ; outptr1
        add     rdx, byte SIZEOF_XMMWORD                ; outptr2
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop        ; another full group remains
        test    rcx,rcx
        jnz     near .column_ld1        ; partial group remains

        pop     rcx                     ; col
        pop     rsi
        pop     rdi
        pop     rbx
        pop     rdx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
        add     rdi, byte SIZEOF_JSAMPROW
        add     rbx, byte SIZEOF_JSAMPROW
        add     rdx, byte SIZEOF_JSAMPROW
        dec     rax                             ; num_rows
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp,rbp         ; rsp <- aligned frame base
        pop     rsp             ; rsp <- pre-alignment rsp (-> saved rbp)
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16