| OLD | NEW | 
| (Empty) |  | 
;
; jcqnts2i.asm - sample data conversion and quantization (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
 |   18  | 
 |   19 %include "jsimdext.inc" | 
 |   20 %include "jdct.inc" | 
 |   21  | 
 ; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                      DCTELEM * workspace);
;
; In:     the three arguments are read from the caller's stack frame at
;         [ebp+8], [ebp+12] and [ebp+16] (see the %defines below).
; Out:    nothing is returned in a register; results are written through
;         the workspace pointer.
; Saved:  ebp, ebx, esi, edi (pushed on entry, popped before ret).
; Clobb:  eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, flags.
;
; Each pass of .convloop handles four sample rows: it loads 8 bytes from
; each row starting at byte offset start_col*SIZEOF_JSAMPLE, zero-extends
; every byte to a 16-bit word, adds 0xFF80 (-128 as a signed word) and
; stores the four resulting XMM words to the workspace.  The loop runs
; DCTSIZE/4 times.
;
; The stores use movdqa, so the workspace addresses produced by XMMBLOCK
; must be 16-byte aligned.  XMMBLOCK, EXTN, alignx and the SIZEOF_*/DCTSIZE
; constants come from the included files and are not visible here;
; XMMBLOCK(m,0,edi,SIZEOF_DCTELEM) presumably addresses workspace row m --
; TODO confirm against jsimdext.inc.
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; DCTELEM * workspace

        align   16
        global  EXTN(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        ; Constants kept live for the whole loop.
        pxor    xmm6,xmm6               ; xmm6=(all 0's)
        pcmpeqw xmm7,xmm7               ; xmm7=(all 1's) = {0xFFFF 0xFFFF ..}
        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]        ; (DCTELEM *)
        mov     ecx, DCTSIZE/4                  ; ecx = loop passes (4 rows each)
        alignx  16,7
.convloop:
        ; Rows 0 and 1 of this group: fetch row pointers, then 8 samples each.
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)

        ; Rows 2 and 3 of this group (ebx/edx are reused as row pointers).
        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
        movq    xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)

        ; Interleaving with the zero register widens each byte to a word;
        ; adding 0xFF80 then subtracts 128 from every word.
        punpcklbw xmm0,xmm6             ; xmm0=(01234567)
        punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF)
        paddw     xmm0,xmm7
        paddw     xmm1,xmm7
        punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN)
        punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV)
        paddw     xmm2,xmm7
        paddw     xmm3,xmm7

        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

        ; Advance past the four rows just processed.  dec must stay last:
        ; the add instructions also write ZF, which jnz tests.
        add     esi, byte 4*SIZEOF_JSAMPROW
        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
        dec     ecx
        jnz     short .convloop

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret
 |   97  | 
 ; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
;                      DCTELEM * workspace);
;
; In:     the three arguments are read from the caller's stack frame at
;         [ebp+8], [ebp+12] and [ebp+16] (see the %defines below).
; Out:    nothing is returned in a register; results are written through
;         the coef_block pointer.
; Saved:  ebp, esi, edi (pushed on entry, popped before ret).
; Clobb:  eax, edx, xmm0-xmm7, flags.  ebx and ecx are not touched.
;
; Per 16-bit element, each pass of .quantloop computes:
;   sign   = value >> (WORD_BIT-1)              (arithmetic shift)
;   mag    = (value ^ sign) - sign              (absolute value)
;   mag    = mag + correction
;   mag    = high word of (mag * reciprocal)    (unsigned, pmulhuw)
;   mag    = high word of (mag * scale)         (unsigned, pmulhuw)
;   result = (mag ^ sign) - sign                (sign restored)
; Four XMM words are processed per pass, and all three pointers advance by
; 32 elements; the loop runs DCTSIZE2/32 times.
;
; The divisors argument is addressed as three tables selected by the macros
; below: RECIPROCAL at block index DCTSIZE*0, CORRECTION at DCTSIZE*1 and
; SCALE at DCTSIZE*2.  XMMBLOCK itself is defined in an included file;
; presumably its first argument is a row index -- TODO confirm against
; jsimdext.inc.  All XMM memory operands here (movdqa, and the memory
; operands of paddw/pmulhuw) require 16-byte-aligned addresses.
;

%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; DCTELEM * divisors
%define workspace       ebp+16          ; DCTELEM * workspace

        align   16
        global  EXTN(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; unused
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        mov     eax, DCTSIZE2/32                ; eax = loop passes (32 elements each)
        alignx  16,7
.quantloop:
        ; Load four XMM words of input; keep one copy (xmm4-7) to become the
        ; sign masks and one copy (xmm0-3) to become the magnitudes.
        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm0,xmm4
        movdqa  xmm1,xmm5
        movdqa  xmm2,xmm6
        movdqa  xmm3,xmm7
        psraw   xmm4,(WORD_BIT-1)       ; xmm4-7 = per-word sign mask (0 or all 1's)
        psraw   xmm5,(WORD_BIT-1)
        psraw   xmm6,(WORD_BIT-1)
        psraw   xmm7,(WORD_BIT-1)
        pxor    xmm0,xmm4
        pxor    xmm1,xmm5
        pxor    xmm2,xmm6
        pxor    xmm3,xmm7
        psubw   xmm0,xmm4               ; if (xmm0 < 0) xmm0 = -xmm0;
        psubw   xmm1,xmm5               ; if (xmm1 < 0) xmm1 = -xmm1;
        psubw   xmm2,xmm6               ; if (xmm2 < 0) xmm2 = -xmm2;
        psubw   xmm3,xmm7               ; if (xmm3 < 0) xmm3 = -xmm3;

        paddw   xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
        paddw   xmm1, XMMWORD [CORRECTION(1,0,edx)]
        paddw   xmm2, XMMWORD [CORRECTION(2,0,edx)]
        paddw   xmm3, XMMWORD [CORRECTION(3,0,edx)]
        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
        pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)]  ; scale
        pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
        pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
        pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]

        ; Re-apply the original sign using the masks saved in xmm4-7.
        pxor    xmm0,xmm4
        pxor    xmm1,xmm5
        pxor    xmm2,xmm6
        pxor    xmm3,xmm7
        psubw   xmm0,xmm4
        psubw   xmm1,xmm5
        psubw   xmm2,xmm6
        psubw   xmm3,xmm7
        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

        ; Step input, divisor tables and output by 32 elements.  dec must
        ; stay last: the add instructions also write ZF, which jnz tests.
        add     esi, byte 32*SIZEOF_DCTELEM
        add     edx, byte 32*SIZEOF_DCTELEM
        add     edi, byte 32*SIZEOF_JCOEF
        dec     eax
        jnz     near .quantloop

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; unused
;       pop     ebx             ; unused
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
| OLD | NEW |