simd/jcqntsse.asm - Issue 1953443002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: simd/jcqntsse.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 ;

2 ; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)

3 ;

4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

5 ;

6 ; Based on

7 ; x86 SIMD extension for IJG JPEG library

8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

10 ;

11 ; This file should be assembled with NASM (Netwide Assembler),

12 ; can not be assembled with Microsoft's MASM or any compatible

13 ; assembler (including Borland's Turbo Assembler).

14 ; NASM is available from http://nasm.sourceforge.net/ or

15 ; http://sourceforge.net/project/showfiles.php?group_id=6208

16 ;

17 ; [TAB8]

18

19 %include "jsimdext.inc"

20 %include "jdct.inc"

21

22 ; --------------------------------------------------------------------------

23 SECTION SEG_TEXT

24 BITS 32

25 ;

26 ; Load data into workspace, applying unsigned->signed conversion

27 ;

28 ; GLOBAL(void)

29 ; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,

30 ; FAST_FLOAT * workspace);

31 ;

32

33 %define sample_data ebp+8 ; JSAMPARRAY sample_data (1st stack argument)

34 %define start_col ebp+12 ; JDIMENSION start_col (2nd stack argument)

35 %define workspace ebp+16 ; FAST_FLOAT * workspace (3rd stack argument)

36

37 align 16

38 global EXTN(jsimd_convsamp_float_sse) PRIVATE

39

; Register roles inside the routine:
;   esi = current position in the sample_data row-pointer array
;   eax = start_col (column offset applied to every row)
;   edi = current write position in workspace
;   ecx = loop counter, DCTSIZE/2 iterations (two rows per iteration)
;   ebx, edx = pointers to the two sample rows handled in this iteration
;   mm7 = PB_CENTERJSAMPLE, subtracted to turn unsigned samples into signed
; ebx, esi and edi are saved and restored; eax, ecx and edx are clobbered.
; The stores use movaps, so workspace must be 16-byte aligned.

40 EXTN(jsimd_convsamp_float_sse):

41 push ebp

42 mov ebp,esp

43 push ebx

44 ; push ecx ; need not be preserved

45 ; push edx ; need not be preserved

46 push esi

47 push edi

48

49 pcmpeqw mm7,mm7 ; mm7 = all ones (0xFFFF in every word)

50 psllw mm7,7 ; every word = 0xFF80 (-128)

51 packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)

52

53 mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)

54 mov eax, JDIMENSION [start_col]

55 mov edi, POINTER [workspace] ; (FAST_FLOAT *)

56 mov ecx, DCTSIZE/2 ; two rows are converted per iteration

57 alignx 16,7

58 .convloop:

59 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)

60 mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)

61

62 movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; 8 samples of the first row

63 movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; 8 samples of the second row

64

65 psubb mm0,mm7 ; mm0=(01234567)

66 psubb mm1,mm7 ; mm1=(89ABCDEF)

67

; The unpacks below place each sample in the HIGH byte of its word and then
; in the high word of its dword.  The low-order bytes are whatever the
; destination register held before; they are discarded by the arithmetic
; right shifts (psrad), which also sign-extend the sample to 32 bits
; (presumably DWORD_BIT-BYTE_BIT = 24; the constants come from an include).

68 punpcklbw mm2,mm0 ; mm2=(0123)

69 punpckhbw mm0,mm0 ; mm0=(4567)

70 punpcklbw mm3,mm1 ; mm3=(89AB)

71 punpckhbw mm1,mm1 ; mm1=(CDEF)

72

73 punpcklwd mm4,mm2 ; mm4=(*0*1)

74 punpckhwd mm2,mm2 ; mm2=(*2*3)

75 punpcklwd mm5,mm0 ; mm5=(*4*5)

76 punpckhwd mm0,mm0 ; mm0=(*6*7)

77

78 psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01)

79 psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23)

80 cvtpi2ps xmm0,mm4 ; xmm0=(01**)

81 cvtpi2ps xmm1,mm2 ; xmm1=(23**)

82 psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45)

83 psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67)

84 cvtpi2ps xmm2,mm5 ; xmm2=(45**)

85 cvtpi2ps xmm3,mm0 ; xmm3=(67**)

86

87 punpcklwd mm6,mm3 ; mm6=(*8*9)

88 punpckhwd mm3,mm3 ; mm3=(*A*B)

89 punpcklwd mm4,mm1 ; mm4=(*C*D)

90 punpckhwd mm1,mm1 ; mm1=(*E*F)

91

92 psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89)

93 psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB)

94 cvtpi2ps xmm4,mm6 ; xmm4=(89**)

95 cvtpi2ps xmm5,mm3 ; xmm5=(AB**)

96 psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD)

97 psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF)

98 cvtpi2ps xmm6,mm4 ; xmm6=(CD**)

99 cvtpi2ps xmm7,mm1 ; xmm7=(EF**)

100

; movlhps copies the low two floats of the source into the high half of the
; destination, merging two float pairs into one register of four floats.

101 movlhps xmm0,xmm1 ; xmm0=(0123)

102 movlhps xmm2,xmm3 ; xmm2=(4567)

103 movlhps xmm4,xmm5 ; xmm4=(89AB)

104 movlhps xmm6,xmm7 ; xmm6=(CDEF)

105

106 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 ; first row, samples 0-3

107 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2 ; first row, samples 4-7

108 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4 ; second row, samples 0-3

109 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 ; second row, samples 4-7

110

111 add esi, byte 2*SIZEOF_JSAMPROW ; advance to the next pair of row pointers

112 add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; advance two workspace rows

113 dec ecx

114 jnz near .convloop

115

116 emms ; empty MMX state

117

118 pop edi

119 pop esi

120 ; pop edx ; need not be preserved

121 ; pop ecx ; need not be preserved

122 pop ebx

123 pop ebp

124 ret

125

126

127 ; --------------------------------------------------------------------------

128 ;

129 ; Quantize/descale the coefficients, and store into coef_block

130 ;

131 ; GLOBAL(void)

132 ; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,

133 ; FAST_FLOAT * workspace);

134 ;

135

136 %define coef_block ebp+8 ; JCOEFPTR coef_block (1st stack argument)

137 %define divisors ebp+12 ; FAST_FLOAT * divisors (2nd stack argument)

138 %define workspace ebp+16 ; FAST_FLOAT * workspace (3rd stack argument)

139

140 align 16

141 global EXTN(jsimd_quantize_float_sse) PRIVATE

142

; Register roles inside the routine:
;   esi = read position in workspace
;   edx = read position in divisors
;   edi = write position in coef_block
;   eax = loop counter, DCTSIZE2/16 iterations (16 coefficients per iteration)
; esi and edi are saved and restored; eax and edx are clobbered.
; Each workspace value is MULTIPLIED by the matching divisors entry, so the
; table presumably holds reciprocals of the quantization steps -- confirm
; against the code that builds it.
; Loads use movaps/mulps memory operands, so workspace and divisors must be
; 16-byte aligned.

143 EXTN(jsimd_quantize_float_sse):

144 push ebp

145 mov ebp,esp

146 ; push ebx ; unused

147 ; push ecx ; unused

148 ; push edx ; need not be preserved

149 push esi

150 push edi

151

152 mov esi, POINTER [workspace] ; esi = workspace

153 mov edx, POINTER [divisors] ; edx = divisors

154 mov edi, JCOEFPTR [coef_block] ; edi = coef_block

155 mov eax, DCTSIZE2/16 ; 16 coefficients are processed per iteration

156 alignx 16,7

157 .quantloop:

158 movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] ; load 4 workspace floats

159 movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] ; load the next 4

160 mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] ; scale by the matching divisors entries

161 mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]

162 movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] ; same for the second group of 8

163 movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]

164 mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]

165 mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

166

; cvtps2pi converts only the low two floats of its source, so the high two
; floats of each register are first copied down into a spare register.

167 movhlps xmm4,xmm0 ; low half of xmm4 = high two floats of xmm0

168 movhlps xmm5,xmm1 ; low half of xmm5 = high two floats of xmm1

169

170 cvtps2pi mm0,xmm0 ; two floats -> two 32-bit integers (MXCSR rounding mode)

171 cvtps2pi mm1,xmm1

172 cvtps2pi mm4,xmm4

173 cvtps2pi mm5,xmm5

174

175 movhlps xmm6,xmm2 ; low half of xmm6 = high two floats of xmm2

176 movhlps xmm7,xmm3 ; low half of xmm7 = high two floats of xmm3

177

178 cvtps2pi mm2,xmm2

179 cvtps2pi mm3,xmm3

180 cvtps2pi mm6,xmm6

181 cvtps2pi mm7,xmm7

182

; packssdw narrows 2+2 dwords into 4 words with signed saturation:
; the destination supplies the low two words, the source the high two.

183 packssdw mm0,mm4 ; mm0 = 4 coefficients from xmm0

184 packssdw mm1,mm5 ; mm1 = 4 coefficients from xmm1

185 packssdw mm2,mm6 ; mm2 = 4 coefficients from xmm2

186 packssdw mm3,mm7 ; mm3 = 4 coefficients from xmm3

187

188 movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 ; store 4 coefficients

189 movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1

190 movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2

191 movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

192

193 add esi, byte 16*SIZEOF_FAST_FLOAT ; advance all three pointers by 16 elements

194 add edx, byte 16*SIZEOF_FAST_FLOAT

195 add edi, byte 16*SIZEOF_JCOEF

196 dec eax

197 jnz short .quantloop

198

199 emms ; empty MMX state

200

201 pop edi

202 pop esi

203 ; pop edx ; need not be preserved

204 ; pop ecx ; unused

205 ; pop ebx ; unused

206 pop ebp

207 ret

208

209 ; For some reason, the OS X linker does not honor the request to align the

210 ; segment unless we do this.

211 align 16

OLD	NEW

« no previous file with comments | « simd/jcqnts2i-64.asm ('k') | simd/jcsammmx.asm » ('j') | no next file with comments »