simd/jcqnts2f.asm - Issue 1953443002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: simd/jcqnts2f.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 ;

2 ; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)

3 ;

4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

5 ;

6 ; Based on

7 ; x86 SIMD extension for IJG JPEG library

8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

10 ;

11 ; This file should be assembled with NASM (Netwide Assembler),

12 ; can not be assembled with Microsoft's MASM or any compatible

13 ; assembler (including Borland's Turbo Assembler).

14 ; NASM is available from http://nasm.sourceforge.net/ or

15 ; http://sourceforge.net/project/showfiles.php?group_id=6208

16 ;

17 ; [TAB8]

18

19 %include "jsimdext.inc"

20 %include "jdct.inc"

21

22 ; --------------------------------------------------------------------------

23 SECTION SEG_TEXT

24 BITS 32

25 ;

26 ; Load data into workspace, applying unsigned->signed conversion

27 ; Handles DCTSIZE rows, two per loop pass: 8 sample bytes per row are read at start_col, biased by -0x80, sign-extended to dwords and stored as 8 floats.

28 ; GLOBAL(void)

29 ; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,

30 ; FAST_FLOAT * workspace);

31 ; Args are read from the stack via ebp. Preserves ebx/esi/edi/ebp; clobbers eax, ecx, edx, xmm0-xmm3, xmm7. workspace is written with movaps, so it must be 16-byte aligned.

32

33 %define sample_data ebp+8 ; JSAMPARRAY sample_data

34 %define start_col ebp+12 ; JDIMENSION start_col

35 %define workspace ebp+16 ; FAST_FLOAT * workspace

36

37 align 16

38 global EXTN(jsimd_convsamp_float_sse2) PRIVATE

39

40 EXTN(jsimd_convsamp_float_sse2):

41 push ebp

42 mov ebp,esp

43 push ebx

44 ; push ecx ; need not be preserved

45 ; push edx ; need not be preserved

46 push esi

47 push edi

48

49 pcmpeqw xmm7,xmm7 ; every word = 0xFFFF

50 psllw xmm7,7 ; every word = 0xFF80 (-128)

51 packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

52

53 mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)

54 mov eax, JDIMENSION [start_col]

55 mov edi, POINTER [workspace] ; (FAST_FLOAT *)

56 mov ecx, DCTSIZE/2 ; loop count: two rows are converted per pass

57 alignx 16,7

58 .convloop:

59 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)

60 mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)

61

62 movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; 8 sample bytes of the first row

63 movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; 8 sample bytes of the second row

64

65 psubb xmm0,xmm7 ; xmm0=(01234567)

66 psubb xmm1,xmm7 ; xmm1=(89ABCDEF)

67

68 punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)

69 punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)

70

71 punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)

72 punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)

73 punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)

74 punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)

75

76 psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123), sample byte sign-extended from the top of each dword

77 psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)

78 cvtdq2ps xmm2,xmm2 ; xmm2=(0123)

79 cvtdq2ps xmm0,xmm0 ; xmm0=(4567)

80 psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)

81 psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)

82 cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)

83 cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)

84

85 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2

86 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0

87 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3

88 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

89

90 add esi, byte 2*SIZEOF_JSAMPROW ; step to the next pair of row pointers

91 add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; step past the two workspace rows just written

92 dec ecx

93 jnz short .convloop

94

95 pop edi

96 pop esi

97 ; pop edx ; need not be preserved

98 ; pop ecx ; need not be preserved

99 pop ebx

100 pop ebp

101 ret

102

103

104 ; --------------------------------------------------------------------------

105 ;

106 ; Multiply each workspace float by its divisors entry, convert to integer and store into coef_block

107 ; Each loop pass handles 16 values: mulps, then cvtps2dq (rounds per the current MXCSR mode), then packssdw (signed saturation to words).

108 ; GLOBAL(void)

109 ; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,

110 ; FAST_FLOAT * workspace);

111 ; Args are read from the stack via ebp. Restores esi/edi/ebp; clobbers eax, edx, xmm0-xmm3. All three buffers are accessed with aligned moves (movaps/mulps/movdqa).

112

113 %define coef_block ebp+8 ; arg 1: JCOEFPTR coef_block (output)

114 %define divisors ebp+12 ; arg 2: FAST_FLOAT * divisors (multiplied in)

115 %define workspace ebp+16 ; arg 3: FAST_FLOAT * workspace (input)

116

117 align 16

118 global EXTN(jsimd_quantize_float_sse2) PRIVATE

119

120 EXTN(jsimd_quantize_float_sse2):

121 push ebp

122 mov ebp,esp

123 ; push ebx ; not used by this routine

124 ; push ecx ; not used by this routine

125 ; push edx ; used, but need not be preserved

126 push esi

127 push edi

128

129 mov edi, JCOEFPTR [coef_block] ; edi = output cursor

130 mov edx, POINTER [divisors] ; edx = divisor-table cursor

131 mov esi, POINTER [workspace] ; esi = input cursor

132 mov eax, DCTSIZE2/16 ; eax = passes left, 16 values each

133 alignx 16,7

134 .nextchunk:

135 movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]

136 mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]

137 movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]

138 mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]

139 movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]

140 mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]

141 movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]

142 mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

143

144 cvtps2dq xmm0,xmm0 ; float -> int32, values 0-3 of block row 0

145 cvtps2dq xmm1,xmm1 ; float -> int32, values 4-7 of block row 0

146 packssdw xmm0,xmm1 ; 8 x int32 -> 8 x int16 (saturating)

147 movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0

148

149 cvtps2dq xmm2,xmm2 ; float -> int32, values 0-3 of block row 1

150 cvtps2dq xmm3,xmm3 ; float -> int32, values 4-7 of block row 1

151 packssdw xmm2,xmm3 ; 8 x int32 -> 8 x int16 (saturating)

152 movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2

153

154 ; advance all three cursors by the 16 elements just processed

155 add edi, byte 16*SIZEOF_JCOEF

156 add edx, byte 16*SIZEOF_FAST_FLOAT

157 add esi, byte 16*SIZEOF_FAST_FLOAT

158 dec eax

159 jnz short .nextchunk

160

161 pop edi

162 pop esi

163 ; pop edx ; used, but need not be preserved

164 ; pop ecx ; not used by this routine

165 ; pop ebx ; not used by this routine

166 pop ebp

167 ret

168

169 ; For some reason, the OS X linker does not honor the request to align the

170 ; segment unless we do this.

171 align 16

OLD	NEW

« no previous file with comments | « simd/jcqntmmx.asm ('k') | simd/jcqnts2f-64.asm » ('j') | no next file with comments »