Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(419)

Side by Side Diff: simd/jquant-sse.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « simd/jquant-mmx.asm ('k') | simd/jquantf-sse2.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 ;
2 ; jquant.asm - sample data conversion and quantization (SSE & MMX)
3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ;
6 ; Based on
7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
10 ;
11 ; This file should be assembled with NASM (Netwide Assembler); it
12 ; can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
16 ;
17 ; [TAB8]
18
19 %include "jsimdext.inc"
20 %include "jdct.inc"
21
22 ; --------------------------------------------------------------------------
23 SECTION SEG_TEXT ; SEG_TEXT is not defined in this file (presumably supplied by jsimdext.inc)
24 BITS 32 ; 32-bit code: the arguments below are read from the stack through ebp
25 ;
26 ; Load data into workspace, applying unsigned->signed conversion
27 ; (0x80 is subtracted from every sample byte, which is then sign-extended and converted to float)
28 ; GLOBAL(void)
29 ; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
30 ; FAST_FLOAT *workspace);
31 ; Each loop pass reads 8 bytes from each of two rows and stores 16 floats; DCTSIZE/2 passes in total.
32
33 %define sample_data ebp+8 ; JSAMPARRAY sample_data (first stack argument)
34 %define start_col ebp+12 ; JDIMENSION start_col (second stack argument)
35 %define workspace ebp+16 ; FAST_FLOAT *workspace (third stack argument)
36
37 align 16 ; start the function on a 16-byte boundary
38 global EXTN(jsimd_convsamp_float_sse) ; export the symbol; EXTN is a macro defined outside this file
39
40 EXTN(jsimd_convsamp_float_sse):
41 push ebp ; set up a frame: after the next instruction the arguments sit at ebp+8 and up
42 mov ebp,esp
43 push ebx ; ebx, esi and edi are overwritten below and restored before ret
44 ; push ecx ; need not be preserved
45 ; push edx ; need not be preserved
46 push esi
47 push edi
48 ; Build the bias constant in mm7 without loading it from memory.
49 pcmpeqw mm7,mm7 ; every word = 0xFFFF
50 psllw mm7,7 ; every word = 0xFF80
51 packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..): 0xFF80 packs (signed) to byte 0x80
52
53 mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
54 mov eax, JDIMENSION [start_col] ; eax = column offset applied to every row
55 mov edi, POINTER [workspace] ; (FAST_FLOAT *) output pointer
56 mov ecx, DCTSIZE/2 ; loop counter: two rows are handled per pass
57 alignx 16,7 ; alignx is a macro defined outside this file; presumably aligns the loop entry
58 .convloop:
59 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) first row of this pair
60 mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) second row of this pair
61
62 movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; 8 bytes of the first row, offset by start_col
63 movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; 8 bytes of the second row, offset by start_col
64
65 psubb mm0,mm7 ; mm0=(01234567), every byte minus 0x80
66 psubb mm1,mm7 ; mm1=(89ABCDEF), every byte minus 0x80
67 ; Widen bytes to words: the sample lands in the high byte; '*' marks a don't-care low byte.
68 punpcklbw mm2,mm0 ; mm2=(*0*1*2*3) ('*' = whatever mm2 held before; it is shifted out later)
69 punpckhbw mm0,mm0 ; mm0=(*4*5*6*7)
70 punpcklbw mm3,mm1 ; mm3=(*8*9*A*B) ('*' = whatever mm3 held before)
71 punpckhbw mm1,mm1 ; mm1=(*C*D*E*F)
72 ; Widen words to dwords: the sample is now the top byte of each dword.
73 punpcklwd mm4,mm2 ; mm4=(***0***1)
74 punpckhwd mm2,mm2 ; mm2=(***2***3)
75 punpcklwd mm5,mm0 ; mm5=(***4***5)
76 punpckhwd mm0,mm0 ; mm0=(***6***7)
77 ; Arithmetic right shift brings the sample down with sign extension (shift count presumably 32-8; constants defined elsewhere).
78 psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01)
79 psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23)
80 cvtpi2ps xmm0,mm4 ; xmm0=(01**) two int32 -> two floats in the low half; high half left as it was
81 cvtpi2ps xmm1,mm2 ; xmm1=(23**)
82 psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45)
83 psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67)
84 cvtpi2ps xmm2,mm5 ; xmm2=(45**)
85 cvtpi2ps xmm3,mm0 ; xmm3=(67**)
86 ; Same widening for the second row.
87 punpcklwd mm6,mm3 ; mm6=(***8***9)
88 punpckhwd mm3,mm3 ; mm3=(***A***B)
89 punpcklwd mm4,mm1 ; mm4=(***C***D) (mm4 is reused; its (01) value was already converted into xmm0)
90 punpckhwd mm1,mm1 ; mm1=(***E***F)
91
92 psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89)
93 psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB)
94 cvtpi2ps xmm4,mm6 ; xmm4=(89**)
95 cvtpi2ps xmm5,mm3 ; xmm5=(AB**)
96 psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD)
97 psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF)
98 cvtpi2ps xmm6,mm4 ; xmm6=(CD**)
99 cvtpi2ps xmm7,mm1 ; xmm7=(EF**)
100 ; movlhps copies the low half of the source into the high half of the destination, giving four floats per register.
101 movlhps xmm0,xmm1 ; xmm0=(0123)
102 movlhps xmm2,xmm3 ; xmm2=(4567)
103 movlhps xmm4,xmm5 ; xmm4=(89AB)
104 movlhps xmm6,xmm7 ; xmm6=(CDEF)
105 ; movaps is an aligned store, so these addresses must be 16-byte aligned. XMMBLOCK is defined outside this file; presumably (r,c) selects 4-float group c of row r.
106 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
107 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
108 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
109 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
110
111 add esi, byte 2*SIZEOF_JSAMPROW ; step past the two row pointers just used
112 add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; step the output past two rows of DCTSIZE floats
113 dec ecx
114 jnz near .convloop ; repeat until all DCTSIZE/2 row pairs are done
115
116 emms ; empty MMX state (MMX registers alias the x87 registers, so clear them before returning)
117
118 pop edi ; restore saved registers in reverse push order
119 pop esi
120 ; pop edx ; need not be preserved
121 ; pop ecx ; need not be preserved
122 pop ebx
123 pop ebp
124 ret
125
126
127 ; --------------------------------------------------------------------------
128 ;
129 ; Quantize/descale the coefficients, and store into coef_block
130 ; (each workspace float is multiplied by the matching divisors entry, converted to an integer and saturated to 16 bits)
131 ; GLOBAL(void)
132 ; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors,
133 ; FAST_FLOAT *workspace);
134 ; Each loop pass handles 16 values; DCTSIZE2/16 passes in total.
135
136 %define coef_block ebp+8 ; JCOEFPTR coef_block (first stack argument)
137 %define divisors ebp+12 ; FAST_FLOAT *divisors (second stack argument)
138 %define workspace ebp+16 ; FAST_FLOAT *workspace (third stack argument; same offset as the earlier definition in this file)
139
140 align 16 ; start the function on a 16-byte boundary
141 global EXTN(jsimd_quantize_float_sse) ; export the symbol; EXTN is a macro defined outside this file
142
143 EXTN(jsimd_quantize_float_sse):
144 push ebp ; set up a frame: after the next instruction the arguments sit at ebp+8 and up
145 mov ebp,esp
146 ; push ebx ; unused
147 ; push ecx ; unused
148 ; push edx ; need not be preserved
149 push esi ; esi and edi are overwritten below and restored before ret
150 push edi
151
152 mov esi, POINTER [workspace] ; esi = input floats
153 mov edx, POINTER [divisors] ; edx = per-coefficient multipliers
154 mov edi, JCOEFPTR [coef_block] ; edi = output coefficients
155 mov eax, DCTSIZE2/16 ; loop counter: 16 values are handled per pass
156 alignx 16,7 ; alignx is a macro defined outside this file; presumably aligns the loop entry
157 .quantloop:
158 movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] ; aligned loads: workspace must be 16-byte aligned
159 movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
160 mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] ; multiply by the matching divisors entries (memory operand must also be 16-byte aligned)
161 mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] ; NOTE(review): a multiply, so the table presumably holds reciprocals - confirm against the code that fills it
162 movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
163 movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
164 mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
165 mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
166 ; cvtps2pi converts only the low two floats, so copy each register's high two floats down into a spare register first.
167 movhlps xmm4,xmm0 ; low half of xmm4 = high two floats of xmm0
168 movhlps xmm5,xmm1 ; low half of xmm5 = high two floats of xmm1
169 ; Float -> int32 conversion; rounding follows the current MXCSR rounding mode.
170 cvtps2pi mm0,xmm0 ; values 0,1 of the first group
171 cvtps2pi mm1,xmm1 ; values 0,1 of the second group
172 cvtps2pi mm4,xmm4 ; values 2,3 of the first group
173 cvtps2pi mm5,xmm5 ; values 2,3 of the second group
174
175 movhlps xmm6,xmm2 ; low half of xmm6 = high two floats of xmm2
176 movhlps xmm7,xmm3 ; low half of xmm7 = high two floats of xmm3
177
178 cvtps2pi mm2,xmm2 ; values 0,1 of the third group
179 cvtps2pi mm3,xmm3 ; values 0,1 of the fourth group
180 cvtps2pi mm6,xmm6 ; values 2,3 of the third group
181 cvtps2pi mm7,xmm7 ; values 2,3 of the fourth group
182 ; Pack two int32 pairs into four signed-saturated 16-bit words, keeping the original order.
183 packssdw mm0,mm4
184 packssdw mm1,mm5
185 packssdw mm2,mm6
186 packssdw mm3,mm7
187 ; Each movq stores 8 bytes (four 16-bit results). MMBLOCK is defined outside this file.
188 movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
189 movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
190 movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
191 movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
192
193 add esi, byte 16*SIZEOF_FAST_FLOAT ; step all three pointers past the 16 values just processed
194 add edx, byte 16*SIZEOF_FAST_FLOAT
195 add edi, byte 16*SIZEOF_JCOEF
196 dec eax
197 jnz short .quantloop ; repeat until all DCTSIZE2/16 groups are done
198
199 emms ; empty MMX state (MMX registers alias the x87 registers, so clear them before returning)
200
201 pop edi ; restore saved registers in reverse push order
202 pop esi
203 ; pop edx ; need not be preserved
204 ; pop ecx ; unused
205 ; pop ebx ; unused
206 pop ebp
207 ret
208
209 ; For some reason, the OS X linker does not honor the request to align the
210 ; segment unless we do this.
211 align 16
OLDNEW
« no previous file with comments | « simd/jquant-mmx.asm ('k') | simd/jquantf-sse2.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698