simd/jcqnts2i-64.asm - Issue 1939823002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: simd/jcqnts2i-64.asm

Issue 1939823002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Test Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 ;

2 ; jcqnts2i-64.asm - sample data conversion and quantization (64-bit SSE2)

3 ;

4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

5 ; Copyright 2009 D. R. Commander

6 ;

7 ; Based on

8 ; x86 SIMD extension for IJG JPEG library

9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

11 ;

12 ; This file should be assembled with NASM (Netwide Assembler),

13 ; can not be assembled with Microsoft's MASM or any compatible

14 ; assembler (including Borland's Turbo Assembler).

15 ; NASM is available from http://nasm.sourceforge.net/ or

16 ; http://sourceforge.net/project/showfiles.php?group_id=6208

17 ;

18 ; [TAB8]

19

20 %include "jsimdext.inc"

21 %include "jdct.inc"

22

23 ; --------------------------------------------------------------------------

24 SECTION SEG_TEXT

25 BITS 64

26 ;

27 ; Load data into workspace, applying unsigned->signed conversion

28 ;

29 ; GLOBAL(void)

30 ; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,

31 ; DCTELEM * workspace);

32 ;

33

; --------------------------------------------------------------------------
; jsimd_convsamp_sse2
;
; Copies an 8-sample-wide strip out of DCTSIZE sample rows into the DCT
; workspace, widening every unsigned byte sample to a 16-bit word and
; adding -128 to it (unsigned -> signed conversion).
;
; Arguments, as seen after collect_args (macro defined outside this file;
; it is expected to deliver the C arguments in these registers):
;   r10 = JSAMPARRAY sample_data
;   r11 = JDIMENSION start_col
;   r12 = DCTELEM * workspace
;
; Register roles inside the function:
;   rsi       = cursor into sample_data (array of row pointers)
;   rax       = start_col, zero-extended to 64 bits
;   rdi       = cursor into workspace
;   rcx       = loop passes remaining (DCTSIZE/4 passes, 4 rows per pass)
;   rbx, rdx  = row pointers of the rows currently being loaded
;   xmm6      = all zero bits (high byte for the byte->word unpack)
;   xmm7      = 0xFF80 in every word, i.e. -128
;
; rbx is saved and restored here; rbp is used as the frame pointer.
; NOTE(review): xmm6/xmm7 are overwritten without a visible save in this
; function; presumably collect_args/uncollect_args preserve them where the
; ABI requires it (Win64) -- confirm against jsimdext.inc.
; NOTE(review): the movdqa stores fault on unaligned addresses, so the
; workspace must be 16-byte aligned (assuming XMMBLOCK yields offsets that
; are multiples of 16) -- confirm against the caller.
; --------------------------------------------------------------------------

        align   16
        global  EXTN(jsimd_convsamp_sse2) PRIVATE

EXTN(jsimd_convsamp_sse2):
        push    rbp
        mov     rax, rsp                ; rax = rsp at this point; presumably read by collect_args
        mov     rbp, rsp
        collect_args
        push    rbx                     ; rbx is used below as a row pointer

        pxor    xmm6, xmm6              ; xmm6 = (all 0's)
        pcmpeqw xmm7, xmm7              ; xmm7 = 0xFFFF in every word
        psllw   xmm7, 7                 ; xmm7 = {0xFF80 0xFF80 0xFF80 0xFF80 ..}

        mov     rsi, r10                ; rsi = sample_data
        mov     eax, r11d               ; rax = start_col (32-bit write clears the upper half)
        mov     rdi, r12                ; rdi = workspace
        mov     rcx, DCTSIZE/4          ; four rows are converted per pass
.convloop:
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *) row 0 of this pass
        mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *) row 1 of this pass

        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm0 = 8 samples (low half)
        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm1 = 8 samples (low half)

        mov     rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *) row 2 of this pass
        mov     rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *) row 3 of this pass

        movq    xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm2 = 8 samples (low half)
        movq    xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm3 = 8 samples (low half)

        ; Interleave each sample byte with a zero byte (byte -> word), then
        ; add -128 to every word.
        punpcklbw xmm0, xmm6            ; xmm0 = (01234567) as words
        punpcklbw xmm1, xmm6            ; xmm1 = (89ABCDEF) as words
        paddw   xmm0, xmm7
        paddw   xmm1, xmm7
        punpcklbw xmm2, xmm6            ; xmm2 = (GHIJKLMN) as words
        punpcklbw xmm3, xmm6            ; xmm3 = (OPQRSTUV) as words
        paddw   xmm2, xmm7
        paddw   xmm3, xmm7

        ; Store the four converted rows; XMMBLOCK(m,0,...) presumably
        ; addresses row m relative to rdi (macro not visible here).
        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

        add     rsi, byte 4*SIZEOF_JSAMPROW             ; next four row pointers
        add     rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; next four workspace rows
        dec     rcx
        jnz     short .convloop

        pop     rbx
        uncollect_args
        pop     rbp
        ret

92

93 ; --------------------------------------------------------------------------

94 ;

95 ; Quantize/descale the coefficients, and store into coef_block

96 ;

97 ; This implementation is based on an algorithm described in

98 ; "How to optimize for the Pentium family of microprocessors"

99 ; (http://www.agner.org/assem/).

100 ;

101 ; GLOBAL(void)

102 ; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,

103 ; DCTELEM * workspace);

104 ;

105

; The divisors table is addressed as three consecutive sub-tables, each
; DCTSIZE rows apart: reciprocals, corrections, then scales.
%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)

; --------------------------------------------------------------------------
; jsimd_quantize_sse2
;
; For every 16-bit coefficient in the workspace:
;   sign   = coefficient >> (WORD_BIT-1)         (arithmetic shift)
;   mag    = (coefficient ^ sign) - sign         (absolute value)
;   mag    = high word of ((mag + correction) * reciprocal)   (unsigned)
;   mag    = high word of (mag * scale)                       (unsigned)
;   result = (mag ^ sign) - sign                 (sign put back)
; and the result is written to coef_block.
;
; Arguments, as seen after collect_args (macro defined outside this file;
; it is expected to deliver the C arguments in these registers):
;   r10 = JCOEFPTR coef_block
;   r11 = DCTELEM * divisors
;   r12 = DCTELEM * workspace
;
; Register roles inside the loop:
;   rsi = workspace cursor      rdx = divisors cursor
;   rdi = coef_block cursor     rax = passes remaining (DCTSIZE2/32)
;   xmm0-xmm3 = values being quantized (one XMMWORD per lane)
;   xmm4-xmm7 = sign masks matching xmm0-xmm3
;
; Each pass handles four XMMWORDs; all loads of a pass are issued before
; its first store.  movdqa and the memory-operand forms of paddw/pmulhuw
; fault on unaligned addresses.
; NOTE(review): this implies workspace, divisors and coef_block are 16-byte
; aligned (assuming XMMBLOCK offsets are multiples of 16) -- confirm.
; NOTE(review): xmm6/xmm7 are overwritten without a visible save here;
; presumably collect_args/uncollect_args handle that where required.
; --------------------------------------------------------------------------

        align   16
        global  EXTN(jsimd_quantize_sse2) PRIVATE

EXTN(jsimd_quantize_sse2):
        push    rbp
        mov     rax, rsp                ; rax = rsp at this point; presumably read by collect_args
        mov     rbp, rsp
        collect_args

        mov     rdi, r10                ; rdi = coef_block (output)
        mov     rdx, r11                ; rdx = divisors
        mov     rsi, r12                ; rsi = workspace (input)
        mov     rax, DCTSIZE2/32        ; number of passes
.quantloop:
        ; ---- Phase 1: load each lane, split it into magnitude + sign mask
        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm0, xmm4
        psraw   xmm4, (WORD_BIT-1)      ; xmm4 = all-ones where negative, else zero
        pxor    xmm0, xmm4
        psubw   xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;

        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm1, xmm5
        psraw   xmm5, (WORD_BIT-1)
        pxor    xmm1, xmm5
        psubw   xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;

        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm2, xmm6
        psraw   xmm6, (WORD_BIT-1)
        pxor    xmm2, xmm6
        psubw   xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;

        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm3, xmm7
        psraw   xmm7, (WORD_BIT-1)
        pxor    xmm3, xmm7
        psubw   xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

        ; ---- Phase 2: (magnitude + correction) * reciprocal * scale,
        ;      keeping the high word of each unsigned product
        paddw   xmm0, XMMWORD [CORRECTION(0,0,rdx)]     ; correction + roundfactor
        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]     ; reciprocal
        pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)]          ; scale

        paddw   xmm1, XMMWORD [CORRECTION(1,0,rdx)]
        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
        pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]

        paddw   xmm2, XMMWORD [CORRECTION(2,0,rdx)]
        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
        pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]

        paddw   xmm3, XMMWORD [CORRECTION(3,0,rdx)]
        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
        pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]

        ; ---- Phase 3: reapply the saved sign and store the lane
        pxor    xmm0, xmm4
        psubw   xmm0, xmm4
        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0

        pxor    xmm1, xmm5
        psubw   xmm1, xmm5
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1

        pxor    xmm2, xmm6
        psubw   xmm2, xmm6
        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2

        pxor    xmm3, xmm7
        psubw   xmm3, xmm7
        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

        ; ---- Advance all three cursors by 32 elements and loop
        add     rsi, byte 32*SIZEOF_DCTELEM
        add     rdx, byte 32*SIZEOF_DCTELEM
        add     rdi, byte 32*SIZEOF_JCOEF
        dec     rax
        jnz     near .quantloop

        uncollect_args
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16

OLD	NEW

« config.h ('K') | « simd/jcqnts2i.asm ('k') | simd/jcqntsse.asm » ('j') | no next file with comments »