| OLD | NEW | 
|---|
| (Empty) |  | 
|  | 1 ; | 
|  | 2 ; jfss2fst.asm - fast integer FDCT (SSE2) | 
|  | 3 ; | 
|  | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 
|  | 5 ; | 
|  | 6 ; Based on | 
|  | 7 ; x86 SIMD extension for IJG JPEG library | 
|  | 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 
|  | 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 
|  | 10 ; | 
|  | 11 ; This file should be assembled with NASM (Netwide Assembler), | 
|  | 12 ; can *not* be assembled with Microsoft's MASM or any compatible | 
|  | 13 ; assembler (including Borland's Turbo Assembler). | 
|  | 14 ; NASM is available from http://nasm.sourceforge.net/ or | 
|  | 15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | 
|  | 16 ; | 
|  | 17 ; This file contains a fast, not so accurate integer implementation of | 
|  | 18 ; the forward DCT (Discrete Cosine Transform). The following code is | 
|  | 19 ; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c | 
|  | 20 ; for more details. | 
|  | 21 ; | 
|  | 22 ; [TAB8] | 
|  | 23 | 
|  | 24 %include "jsimdext.inc" | 
|  | 25 %include "jdct.inc" | 
|  | 26 | 
|  | 27 ; -------------------------------------------------------------------------- | 
|  | 28 | 
|  | 29 %define CONST_BITS      8       ; fractional bits of the FIX() constants below; 14 is also OK. | 
|  | 30 | 
|  | 31 %if CONST_BITS == 8 | 
|  | 32 F_0_382 equ      98             ; FIX(0.382683433) | 
|  | 33 F_0_541 equ     139             ; FIX(0.541196100) | 
|  | 34 F_0_707 equ     181             ; FIX(0.707106781) | 
|  | 35 F_1_306 equ     334             ; FIX(1.306562965) | 
|  | 36 %else | 
|  | 37 ; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30 and DESCALE() rounds it to CONST_BITS fractional bits. | 
|  | 38 %define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n)) | 
|  | 39 F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433) | 
|  | 40 F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100) | 
|  | 41 F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781) | 
|  | 42 F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965) | 
|  | 43 %endif | 
|  | 44 | 
|  | 45 ; -------------------------------------------------------------------------- | 
|  | 46         SECTION SEG_CONST | 
|  | 47 | 
|  | 48 ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow); operands are shifted left by this many bits before each pmulhw | 
|  | 49 ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (pmulhw keeps only the high 16 bits of each 16x16 product) | 
|  | 50 | 
|  | 51 %define PRE_MULTIPLY_SCALE_BITS   2 | 
|  | 52 %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | 
|  | 53 | 
|  | 54         alignz  16 | 
|  | 55         global  EXTN(jconst_fdct_ifast_sse2) | 
|  | 56 | 
|  | 57 EXTN(jconst_fdct_ifast_sse2): | 
|  | 58 | 
|  | 59 PW_F0707        times 8 dw  F_0_707 << CONST_SHIFT | 
|  | 60 PW_F0382        times 8 dw  F_0_382 << CONST_SHIFT | 
|  | 61 PW_F0541        times 8 dw  F_0_541 << CONST_SHIFT | 
|  | 62 PW_F1306        times 8 dw  F_1_306 << CONST_SHIFT | 
|  | 63 | 
|  | 64         alignz  16 | 
|  | 65 | 
|  | 66 ; -------------------------------------------------------------------------- | 
|  | 67         SECTION SEG_TEXT | 
|  | 68         BITS    32 | 
|  | 69 ; | 
|  | 70 ; Perform the forward DCT on one 8x8 block of samples. | 
|  | 71 ; The block is transformed in place: it is loaded from and stored back to data with movdqa (so data is presumably required to be 16-byte aligned). | 
|  | 72 ; GLOBAL(void) | 
|  | 73 ; jsimd_fdct_ifast_sse2 (DCTELEM * data) | 
|  | 74 ; Writes eax, edx and xmm0-xmm7; ebx is saved/restored via pushpic/poppic; two xmmword scratch slots wk(0..1) live on the aligned stack. | 
|  | 75 | 
|  | 76 %define data(b)         (b)+8           ; DCTELEM * data | 
|  | 77 | 
|  | 78 %define original_ebp    ebp+0 | 
|  | 79 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] | 
|  | 80 %define WK_NUM          2 | 
|  | 81 | 
|  | 82         align   16 | 
|  | 83         global  EXTN(jsimd_fdct_ifast_sse2) | 
|  | 84 | 
|  | 85 EXTN(jsimd_fdct_ifast_sse2): | 
|  | 86         push    ebp | 
|  | 87         mov     eax,esp                         ; eax = unaligned frame pointer (address of the saved ebp); arguments are addressed from it | 
|  | 88         sub     esp, byte 4 | 
|  | 89         and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits | 
|  | 90         mov     [esp],eax | 
|  | 91         mov     ebp,esp                         ; ebp = aligned ebp | 
|  | 92         lea     esp, [wk(0)] | 
|  | 93         pushpic ebx | 
|  | 94 ;       push    ecx             ; unused | 
|  | 95 ;       push    edx             ; need not be preserved | 
|  | 96 ;       push    esi             ; unused | 
|  | 97 ;       push    edi             ; unused | 
|  | 98 | 
|  | 99         get_GOT ebx             ; get GOT address | 
|  | 100 | 
|  | 101         ; ---- Pass 1: process rows (transpose in registers, then 1-D DCT). | 
|  | 102 | 
|  | 103         mov     edx, POINTER [data(eax)]        ; edx = data (DCTELEM *), read from [eax+8]; kept in edx for the stores in pass 2 | 
|  | 104 | 
|  | 105         movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] | 
|  | 106         movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] | 
|  | 107         movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] | 
|  | 108         movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] | 
|  | 109 | 
|  | 110         ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) | 
|  | 111         ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) | 
|  | 112 | 
|  | 113         movdqa    xmm4,xmm0             ; transpose coefficients(phase 1) | 
|  | 114         punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13) | 
|  | 115         punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17) | 
|  | 116         movdqa    xmm5,xmm2             ; transpose coefficients(phase 1) | 
|  | 117         punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33) | 
|  | 118         punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37) | 
|  | 119 | 
|  | 120         movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] | 
|  | 121         movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] | 
|  | 122         movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] | 
|  | 123         movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] | 
|  | 124 | 
|  | 125         ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67) | 
|  | 126         ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77) | 
|  | 127 | 
|  | 128         movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33) | 
|  | 129         movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37) | 
|  | 130 | 
|  | 131         movdqa    xmm2,xmm6             ; transpose coefficients(phase 1) | 
|  | 132         punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53) | 
|  | 133         punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57) | 
|  | 134         movdqa    xmm5,xmm1             ; transpose coefficients(phase 1) | 
|  | 135         punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73) | 
|  | 136         punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77) | 
|  | 137 | 
|  | 138         movdqa    xmm7,xmm6             ; transpose coefficients(phase 2) | 
|  | 139         punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71) | 
|  | 140         punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73) | 
|  | 141         movdqa    xmm3,xmm2             ; transpose coefficients(phase 2) | 
|  | 142         punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75) | 
|  | 143         punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77) | 
|  | 144 | 
|  | 145         movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33) | 
|  | 146         movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37) | 
|  | 147         movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73) | 
|  | 148         movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75) | 
|  | 149 | 
|  | 150         movdqa    xmm7,xmm0             ; transpose coefficients(phase 2) | 
|  | 151         punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31) | 
|  | 152         punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33) | 
|  | 153         movdqa    xmm2,xmm4             ; transpose coefficients(phase 2) | 
|  | 154         punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35) | 
|  | 155         punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37) | 
|  | 156 | 
|  | 157         movdqa     xmm1,xmm0            ; transpose coefficients(phase 3) | 
|  | 158         punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0 | 
|  | 159         punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1 | 
|  | 160         movdqa     xmm5,xmm2            ; transpose coefficients(phase 3) | 
|  | 161         punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6 | 
|  | 162         punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7 | 
|  | 163 | 
|  | 164         movdqa  xmm6,xmm1 | 
|  | 165         movdqa  xmm3,xmm0 | 
|  | 166         psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6 | 
|  | 167         psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7 | 
|  | 168         paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1 | 
|  | 169         paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0 | 
|  | 170 | 
|  | 171         movdqa  xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73) | 
|  | 172         movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75) | 
|  | 173         movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6 | 
|  | 174         movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7 | 
|  | 175 | 
|  | 176         movdqa     xmm1,xmm7            ; transpose coefficients(phase 3) | 
|  | 177         punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2 | 
|  | 178         punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3 | 
|  | 179         movdqa     xmm0,xmm4            ; transpose coefficients(phase 3) | 
|  | 180         punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4 | 
|  | 181         punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5 | 
|  | 182 | 
|  | 183         movdqa  xmm2,xmm1 | 
|  | 184         movdqa  xmm5,xmm7 | 
|  | 185         paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3 | 
|  | 186         paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2 | 
|  | 187         psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4 | 
|  | 188         psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5 | 
|  | 189 | 
|  | 190         ; -- Even part | 
|  | 191 | 
|  | 192         movdqa  xmm4,xmm3 | 
|  | 193         movdqa  xmm0,xmm6 | 
|  | 194         psubw   xmm3,xmm1               ; xmm3=tmp13 | 
|  | 195         psubw   xmm6,xmm7               ; xmm6=tmp12 | 
|  | 196         paddw   xmm4,xmm1               ; xmm4=tmp10 | 
|  | 197         paddw   xmm0,xmm7               ; xmm0=tmp11 | 
|  | 198 | 
|  | 199         paddw   xmm6,xmm3 | 
|  | 200         psllw   xmm6,PRE_MULTIPLY_SCALE_BITS | 
|  | 201         pmulhw  xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1 | 
|  | 202 | 
|  | 203         movdqa  xmm1,xmm4 | 
|  | 204         movdqa  xmm7,xmm3 | 
|  | 205         psubw   xmm4,xmm0               ; xmm4=data4 | 
|  | 206         psubw   xmm3,xmm6               ; xmm3=data6 | 
|  | 207         paddw   xmm1,xmm0               ; xmm1=data0 | 
|  | 208         paddw   xmm7,xmm6               ; xmm7=data2 | 
|  | 209 | 
|  | 210         movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6 | 
|  | 211         movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7 | 
|  | 212         movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=data4 | 
|  | 213         movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=data6 | 
|  | 214 | 
|  | 215         ; -- Odd part | 
|  | 216 | 
|  | 217         paddw   xmm2,xmm5               ; xmm2=tmp10 | 
|  | 218         paddw   xmm5,xmm0               ; xmm5=tmp11 | 
|  | 219         paddw   xmm0,xmm6               ; xmm0=tmp12, xmm6=tmp7 | 
|  | 220 | 
|  | 221         psllw   xmm2,PRE_MULTIPLY_SCALE_BITS | 
|  | 222         psllw   xmm0,PRE_MULTIPLY_SCALE_BITS | 
|  | 223 | 
|  | 224         psllw   xmm5,PRE_MULTIPLY_SCALE_BITS | 
|  | 225         pmulhw  xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3 | 
|  | 226 | 
|  | 227         movdqa  xmm4,xmm2               ; xmm4=tmp10 | 
|  | 228         psubw   xmm2,xmm0 | 
|  | 229         pmulhw  xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5 | 
|  | 230         pmulhw  xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) | 
|  | 231         pmulhw  xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) | 
|  | 232         paddw   xmm4,xmm2               ; xmm4=z2 | 
|  | 233         paddw   xmm0,xmm2               ; xmm0=z4 | 
|  | 234 | 
|  | 235         movdqa  xmm3,xmm6 | 
|  | 236         psubw   xmm6,xmm5               ; xmm6=z13 | 
|  | 237         paddw   xmm3,xmm5               ; xmm3=z11 | 
|  | 238 | 
|  | 239         movdqa  xmm2,xmm6 | 
|  | 240         movdqa  xmm5,xmm3 | 
|  | 241         psubw   xmm6,xmm4               ; xmm6=data3 | 
|  | 242         psubw   xmm3,xmm0               ; xmm3=data7 | 
|  | 243         paddw   xmm2,xmm4               ; xmm2=data5 | 
|  | 244         paddw   xmm5,xmm0               ; xmm5=data1 | 
|  | 245 | 
|  | 246         ; ---- Pass 2: process columns (transpose back, 1-D DCT, store results to data). | 
|  | 247 | 
|  | 248 ;       mov     edx, POINTER [data(eax)]        ; (DCTELEM *) | 
|  | 249 | 
|  | 250         ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) | 
|  | 251         ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) | 
|  | 252 | 
|  | 253         movdqa    xmm4,xmm1             ; transpose coefficients(phase 1) | 
|  | 254         punpcklwd xmm1,xmm5             ; xmm1=(00 01 10 11 20 21 30 31) | 
|  | 255         punpckhwd xmm4,xmm5             ; xmm4=(40 41 50 51 60 61 70 71) | 
|  | 256         movdqa    xmm0,xmm7             ; transpose coefficients(phase 1) | 
|  | 257         punpcklwd xmm7,xmm6             ; xmm7=(02 03 12 13 22 23 32 33) | 
|  | 258         punpckhwd xmm0,xmm6             ; xmm0=(42 43 52 53 62 63 72 73) | 
|  | 259 | 
|  | 260         movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=data4 (saved in wk(0) by pass 1) | 
|  | 261         movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=data6 (saved in wk(1) by pass 1) | 
|  | 262 | 
|  | 263         ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) | 
|  | 264         ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) | 
|  | 265 | 
|  | 266         movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33) | 
|  | 267         movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73) | 
|  | 268 | 
|  | 269         movdqa    xmm7,xmm5             ; transpose coefficients(phase 1) | 
|  | 270         punpcklwd xmm5,xmm2             ; xmm5=(04 05 14 15 24 25 34 35) | 
|  | 271         punpckhwd xmm7,xmm2             ; xmm7=(44 45 54 55 64 65 74 75) | 
|  | 272         movdqa    xmm0,xmm6             ; transpose coefficients(phase 1) | 
|  | 273         punpcklwd xmm6,xmm3             ; xmm6=(06 07 16 17 26 27 36 37) | 
|  | 274         punpckhwd xmm0,xmm3             ; xmm0=(46 47 56 57 66 67 76 77) | 
|  | 275 | 
|  | 276         movdqa    xmm2,xmm5             ; transpose coefficients(phase 2) | 
|  | 277         punpckldq xmm5,xmm6             ; xmm5=(04 05 06 07 14 15 16 17) | 
|  | 278         punpckhdq xmm2,xmm6             ; xmm2=(24 25 26 27 34 35 36 37) | 
|  | 279         movdqa    xmm3,xmm7             ; transpose coefficients(phase 2) | 
|  | 280         punpckldq xmm7,xmm0             ; xmm7=(44 45 46 47 54 55 56 57) | 
|  | 281         punpckhdq xmm3,xmm0             ; xmm3=(64 65 66 67 74 75 76 77) | 
|  | 282 | 
|  | 283         movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33) | 
|  | 284         movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73) | 
|  | 285         movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37) | 
|  | 286         movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57) | 
|  | 287 | 
|  | 288         movdqa    xmm2,xmm1             ; transpose coefficients(phase 2) | 
|  | 289         punpckldq xmm1,xmm6             ; xmm1=(00 01 02 03 10 11 12 13) | 
|  | 290         punpckhdq xmm2,xmm6             ; xmm2=(20 21 22 23 30 31 32 33) | 
|  | 291         movdqa    xmm7,xmm4             ; transpose coefficients(phase 2) | 
|  | 292         punpckldq xmm4,xmm0             ; xmm4=(40 41 42 43 50 51 52 53) | 
|  | 293         punpckhdq xmm7,xmm0             ; xmm7=(60 61 62 63 70 71 72 73) | 
|  | 294 | 
|  | 295         movdqa     xmm6,xmm1            ; transpose coefficients(phase 3) | 
|  | 296         punpcklqdq xmm1,xmm5            ; xmm1=(00 01 02 03 04 05 06 07)=data0 | 
|  | 297         punpckhqdq xmm6,xmm5            ; xmm6=(10 11 12 13 14 15 16 17)=data1 | 
|  | 298         movdqa     xmm0,xmm7            ; transpose coefficients(phase 3) | 
|  | 299         punpcklqdq xmm7,xmm3            ; xmm7=(60 61 62 63 64 65 66 67)=data6 | 
|  | 300         punpckhqdq xmm0,xmm3            ; xmm0=(70 71 72 73 74 75 76 77)=data7 | 
|  | 301 | 
|  | 302         movdqa  xmm5,xmm6 | 
|  | 303         movdqa  xmm3,xmm1 | 
|  | 304         psubw   xmm6,xmm7               ; xmm6=data1-data6=tmp6 | 
|  | 305         psubw   xmm1,xmm0               ; xmm1=data0-data7=tmp7 | 
|  | 306         paddw   xmm5,xmm7               ; xmm5=data1+data6=tmp1 | 
|  | 307         paddw   xmm3,xmm0               ; xmm3=data0+data7=tmp0 | 
|  | 308 | 
|  | 309         movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37) | 
|  | 310         movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57) | 
|  | 311         movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6 | 
|  | 312         movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7 | 
|  | 313 | 
|  | 314         movdqa     xmm6,xmm2            ; transpose coefficients(phase 3) | 
|  | 315         punpcklqdq xmm2,xmm7            ; xmm2=(20 21 22 23 24 25 26 27)=data2 | 
|  | 316         punpckhqdq xmm6,xmm7            ; xmm6=(30 31 32 33 34 35 36 37)=data3 | 
|  | 317         movdqa     xmm1,xmm4            ; transpose coefficients(phase 3) | 
|  | 318         punpcklqdq xmm4,xmm0            ; xmm4=(40 41 42 43 44 45 46 47)=data4 | 
|  | 319         punpckhqdq xmm1,xmm0            ; xmm1=(50 51 52 53 54 55 56 57)=data5 | 
|  | 320 | 
|  | 321         movdqa  xmm7,xmm6 | 
|  | 322         movdqa  xmm0,xmm2 | 
|  | 323         paddw   xmm6,xmm4               ; xmm6=data3+data4=tmp3 | 
|  | 324         paddw   xmm2,xmm1               ; xmm2=data2+data5=tmp2 | 
|  | 325         psubw   xmm7,xmm4               ; xmm7=data3-data4=tmp4 | 
|  | 326         psubw   xmm0,xmm1               ; xmm0=data2-data5=tmp5 | 
|  | 327 | 
|  | 328         ; -- Even part | 
|  | 329 | 
|  | 330         movdqa  xmm4,xmm3 | 
|  | 331         movdqa  xmm1,xmm5 | 
|  | 332         psubw   xmm3,xmm6               ; xmm3=tmp13 | 
|  | 333         psubw   xmm5,xmm2               ; xmm5=tmp12 | 
|  | 334         paddw   xmm4,xmm6               ; xmm4=tmp10 | 
|  | 335         paddw   xmm1,xmm2               ; xmm1=tmp11 | 
|  | 336 | 
|  | 337         paddw   xmm5,xmm3 | 
|  | 338         psllw   xmm5,PRE_MULTIPLY_SCALE_BITS | 
|  | 339         pmulhw  xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1 | 
|  | 340 | 
|  | 341         movdqa  xmm6,xmm4 | 
|  | 342         movdqa  xmm2,xmm3 | 
|  | 343         psubw   xmm4,xmm1               ; xmm4=data4 | 
|  | 344         psubw   xmm3,xmm5               ; xmm3=data6 | 
|  | 345         paddw   xmm6,xmm1               ; xmm6=data0 | 
|  | 346         paddw   xmm2,xmm5               ; xmm2=data2 | 
|  | 347 | 
|  | 348         movdqa  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 | 
|  | 349         movdqa  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 | 
|  | 350         movdqa  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 | 
|  | 351         movdqa  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 | 
|  | 352 | 
|  | 353         ; -- Odd part | 
|  | 354 | 
|  | 355         movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6 | 
|  | 356         movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7 | 
|  | 357 | 
|  | 358         paddw   xmm7,xmm0               ; xmm7=tmp10 | 
|  | 359         paddw   xmm0,xmm1               ; xmm0=tmp11 | 
|  | 360         paddw   xmm1,xmm5               ; xmm1=tmp12, xmm5=tmp7 | 
|  | 361 | 
|  | 362         psllw   xmm7,PRE_MULTIPLY_SCALE_BITS | 
|  | 363         psllw   xmm1,PRE_MULTIPLY_SCALE_BITS | 
|  | 364 | 
|  | 365         psllw   xmm0,PRE_MULTIPLY_SCALE_BITS | 
|  | 366         pmulhw  xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3 | 
|  | 367 | 
|  | 368         movdqa  xmm4,xmm7               ; xmm4=tmp10 | 
|  | 369         psubw   xmm7,xmm1 | 
|  | 370         pmulhw  xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5 | 
|  | 371         pmulhw  xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) | 
|  | 372         pmulhw  xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) | 
|  | 373         paddw   xmm4,xmm7               ; xmm4=z2 | 
|  | 374         paddw   xmm1,xmm7               ; xmm1=z4 | 
|  | 375 | 
|  | 376         movdqa  xmm3,xmm5 | 
|  | 377         psubw   xmm5,xmm0               ; xmm5=z13 | 
|  | 378         paddw   xmm3,xmm0               ; xmm3=z11 | 
|  | 379 | 
|  | 380         movdqa  xmm6,xmm5 | 
|  | 381         movdqa  xmm2,xmm3 | 
|  | 382         psubw   xmm5,xmm4               ; xmm5=data3 | 
|  | 383         psubw   xmm3,xmm1               ; xmm3=data7 | 
|  | 384         paddw   xmm6,xmm4               ; xmm6=data5 | 
|  | 385         paddw   xmm2,xmm1               ; xmm2=data1 | 
|  | 386 | 
|  | 387         movdqa  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 | 
|  | 388         movdqa  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 | 
|  | 389         movdqa  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 | 
|  | 390         movdqa  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 | 
|  | 391 | 
|  | 392 ;       pop     edi             ; unused | 
|  | 393 ;       pop     esi             ; unused | 
|  | 394 ;       pop     edx             ; need not be preserved | 
|  | 395 ;       pop     ecx             ; unused | 
|  | 396         poppic  ebx | 
|  | 397         mov     esp,ebp         ; esp <- aligned ebp | 
|  | 398         pop     esp             ; esp <- unaligned frame pointer stored at [aligned ebp] in the prologue | 
|  | 399         pop     ebp | 
|  | 400         ret | 
|  | 401 | 
|  | 402 ; For some reason, the OS X linker does not honor the request to align the | 
|  | 403 ; segment unless we do this. | 
|  | 404         align   16 | 
| OLD | NEW | 
|---|