| OLD | NEW | 
|---|---|
| 1 ; | 1 ; | 
| 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
| 3 ; | 3 ; | 
| 4 ;  Use of this source code is governed by a BSD-style license and patent | 4 ;  Use of this source code is governed by a BSD-style license and patent | 
| 5 ;  grant that can be found in the LICENSE file in the root of the source | 5 ;  grant that can be found in the LICENSE file in the root of the source | 
| 6 ;  tree. All contributing project authors may be found in the AUTHORS | 6 ;  tree. All contributing project authors may be found in the AUTHORS | 
| 7 ;  file in the root of the source tree. | 7 ;  file in the root of the source tree. | 
| 8 ; | 8 ; | 
| 9 | 9 | 
| 10 | 10 | 
| 11 %include "vpx_ports/x86_abi_support.asm" | 11 %include "vpx_ports/x86_abi_support.asm" | 
| 12 %include "asm_enc_offsets.asm" | 12 %include "asm_enc_offsets.asm" | 
| 13 | 13 | 
| 14 | 14 | 
| 15 ; void vp8_regular_quantize_b_sse2 | arg | 15 ; void vp8_regular_quantize_b_sse2 | arg | 
| 16 ;  (BLOCK  *b,                     |  0 | 16 ;  (BLOCK  *b,                     |  0 | 
| 17 ;   BLOCKD *d)                     |  1 | 17 ;   BLOCKD *d)                     |  1 | 
| 18 | 18 | 
| 19 global sym(vp8_regular_quantize_b_sse2) | 19 global sym(vp8_regular_quantize_b_sse2) | 
| 20 sym(vp8_regular_quantize_b_sse2): | 20 sym(vp8_regular_quantize_b_sse2): | 
| 21     push        rbp | 21     push        rbp | 
| 22     mov         rbp, rsp | 22     mov         rbp, rsp | 
| 23     SAVE_XMM | 23     SAVE_XMM 7 | 
| 24     GET_GOT     rbx | 24     GET_GOT     rbx | 
| 25     push        rsi |  | 
| 26 | 25 | 
| 27 %if ABI_IS_32BIT | 26 %if ABI_IS_32BIT | 
| 28     push        rdi | 27     push        rdi | 
|  | 28     push        rsi | 
| 29 %else | 29 %else | 
| 30   %ifidn __OUTPUT_FORMAT__,x64 | 30   %ifidn __OUTPUT_FORMAT__,x64 | 
| 31     push        rdi | 31     push        rdi | 
|  | 32     push        rsi | 
| 32   %endif | 33   %endif | 
| 33 %endif | 34 %endif | 
| 34 | 35 | 
| 35     ALIGN_STACK 16, rax | 36     ALIGN_STACK 16, rax | 
| 36     %define BLOCKD_d          0  ;  8 | 37     %define zrun_zbin_boost   0  ;  8 | 
| 37     %define zrun_zbin_boost   8  ;  8 | 38     %define abs_minus_zbin    8  ; 32 | 
| 38     %define abs_minus_zbin    16 ; 32 | 39     %define temp_qcoeff       40 ; 32 | 
| 39     %define temp_qcoeff       48 ; 32 | 40     %define qcoeff            72 ; 32 | 
| 40     %define qcoeff            80 ; 32 | 41     %define stack_size        104 | 
| 41     %define stack_size        112 |  | 
| 42     sub         rsp, stack_size | 42     sub         rsp, stack_size | 
| 43     ; end prolog | 43     ; end prolog | 
| 44 | 44 | 
| 45 %if ABI_IS_32BIT | 45 %if ABI_IS_32BIT | 
| 46     mov         rdi, arg(0) | 46     mov         rdi, arg(0)                 ; BLOCK *b | 
|  | 47     mov         rsi, arg(1)                 ; BLOCKD *d | 
| 47 %else | 48 %else | 
| 48   %ifidn __OUTPUT_FORMAT__,x64 | 49   %ifidn __OUTPUT_FORMAT__,x64 | 
| 49     mov         rdi, rcx                    ; BLOCK *b | 50     mov         rdi, rcx                    ; BLOCK *b | 
| 50     mov         [rsp + BLOCKD_d], rdx | 51     mov         rsi, rdx                    ; BLOCKD *d | 
| 51   %else | 52   %else | 
| 52     ;mov         rdi, rdi                    ; BLOCK *b | 53     ;mov         rdi, rdi                    ; BLOCK *b | 
| 53     mov         [rsp + BLOCKD_d], rsi | 54     ;mov         rsi, rsi                    ; BLOCKD *d | 
| 54   %endif | 55   %endif | 
| 55 %endif | 56 %endif | 
| 56 | 57 | 
| 57     mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr | 58     mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr | 
| 58     mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr | 59     mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr | 
| 59     movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value | 60     movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value | 
| 60 | 61 | 
| 61     ; z | 62     ; z | 
| 62     movdqa      xmm0, [rdx] | 63     movdqa      xmm0, [rdx] | 
| 63     movdqa      xmm4, [rdx + 16] | 64     movdqa      xmm4, [rdx + 16] | 
| (...skipping 54 matching lines...) | |
| 118     paddw       xmm5, xmm7 | 119     paddw       xmm5, xmm7 | 
| 119 | 120 | 
| 120     movdqa      [rsp + temp_qcoeff], xmm1 | 121     movdqa      [rsp + temp_qcoeff], xmm1 | 
| 121     movdqa      [rsp + temp_qcoeff + 16], xmm5 | 122     movdqa      [rsp + temp_qcoeff + 16], xmm5 | 
| 122 | 123 | 
| 123     pxor        xmm6, xmm6 | 124     pxor        xmm6, xmm6 | 
| 124     ; zero qcoeff | 125     ; zero qcoeff | 
| 125     movdqa      [rsp + qcoeff], xmm6 | 126     movdqa      [rsp + qcoeff], xmm6 | 
| 126     movdqa      [rsp + qcoeff + 16], xmm6 | 127     movdqa      [rsp + qcoeff + 16], xmm6 | 
| 127 | 128 | 
| 128     mov         rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr | 129     mov         rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr | 
| 129     mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr | 130     mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr | 
| 130     mov         [rsp + zrun_zbin_boost], rsi | 131     mov         [rsp + zrun_zbin_boost], rdx | 
| 131 | 132 | 
| 132 %macro ZIGZAG_LOOP 1 | 133 %macro ZIGZAG_LOOP 1 | 
| 133     movsx       edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc |  | 
| 134 |  | 
| 135     ; x | 134     ; x | 
| 136     movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2] | 135     movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2] | 
| 137 | 136 | 
| 138     ; if (x >= zbin) | 137     ; if (x >= zbin) | 
| 139     sub         cx, WORD PTR[rsi]           ; x - zbin | 138     sub         cx, WORD PTR[rdx]           ; x - zbin | 
| 140     lea         rsi, [rsi + 2]              ; zbin_boost_ptr++ | 139     lea         rdx, [rdx + 2]              ; zbin_boost_ptr++ | 
| 141     jl          rq_zigzag_loop_%1           ; x < zbin | 140     jl          rq_zigzag_loop_%1           ; x < zbin | 
| 142 | 141 | 
| 143     movsx       edi, WORD PTR[rsp + temp_qcoeff + rdx *2] | 142     movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2] | 
| 144 | 143 | 
| 145     ; downshift by quant_shift[rdx] | 144     ; downshift by quant_shift[rc] | 
| 146     movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc] | 145     movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc] | 
| 147     sar         edi, cl                     ; also sets Z bit | 146     sar         edi, cl                     ; also sets Z bit | 
| 148     je          rq_zigzag_loop_%1           ; !y | 147     je          rq_zigzag_loop_%1           ; !y | 
| 149     mov         WORD PTR[rsp + qcoeff + rdx*2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] | 148     mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] | 
| 150     mov         rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost | 149     mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost | 
| 151 rq_zigzag_loop_%1: | 150 rq_zigzag_loop_%1: | 
| 152 %endmacro | 151 %endmacro | 
| 153 ZIGZAG_LOOP 0 | 152 ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c | 
| 154 ZIGZAG_LOOP 1 | 153 ZIGZAG_LOOP  0 | 
| 155 ZIGZAG_LOOP 2 | 154 ZIGZAG_LOOP  1 | 
| 156 ZIGZAG_LOOP 3 | 155 ZIGZAG_LOOP  4 | 
| 157 ZIGZAG_LOOP 4 | 156 ZIGZAG_LOOP  8 | 
| 158 ZIGZAG_LOOP 5 | 157 ZIGZAG_LOOP  5 | 
| 159 ZIGZAG_LOOP 6 | 158 ZIGZAG_LOOP  2 | 
| 160 ZIGZAG_LOOP 7 | 159 ZIGZAG_LOOP  3 | 
| 161 ZIGZAG_LOOP 8 | 160 ZIGZAG_LOOP  6 | 
| 162 ZIGZAG_LOOP 9 | 161 ZIGZAG_LOOP  9 | 
| 163 ZIGZAG_LOOP 10 |  | 
| 164 ZIGZAG_LOOP 11 |  | 
| 165 ZIGZAG_LOOP 12 | 162 ZIGZAG_LOOP 12 | 
| 166 ZIGZAG_LOOP 13 | 163 ZIGZAG_LOOP 13 | 
|  | 164 ZIGZAG_LOOP 10 | 
|  | 165 ZIGZAG_LOOP  7 | 
|  | 166 ZIGZAG_LOOP 11 | 
| 167 ZIGZAG_LOOP 14 | 167 ZIGZAG_LOOP 14 | 
| 168 ZIGZAG_LOOP 15 | 168 ZIGZAG_LOOP 15 | 
| 169 | 169 | 
| 170     movdqa      xmm2, [rsp + qcoeff] | 170     movdqa      xmm2, [rsp + qcoeff] | 
| 171     movdqa      xmm3, [rsp + qcoeff + 16] | 171     movdqa      xmm3, [rsp + qcoeff + 16] | 
| 172 | 172 | 
| 173 %if ABI_IS_32BIT | 173     mov         rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr | 
| 174     mov         rdi, arg(1) | 174     mov         rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr | 
| 175 %else |  | 
| 176     mov         rdi, [rsp + BLOCKD_d] |  | 
| 177 %endif |  | 
| 178 |  | 
| 179     mov         rcx, [rdi + vp8_blockd_dequant] ; dequant_ptr |  | 
| 180     mov         rsi, [rdi + vp8_blockd_dqcoeff] ; dqcoeff_ptr |  | 
| 181 | 175 | 
| 182     ; y ^ sz | 176     ; y ^ sz | 
| 183     pxor        xmm2, xmm0 | 177     pxor        xmm2, xmm0 | 
| 184     pxor        xmm3, xmm4 | 178     pxor        xmm3, xmm4 | 
| 185     ; x = (y ^ sz) - sz | 179     ; x = (y ^ sz) - sz | 
| 186     psubw       xmm2, xmm0 | 180     psubw       xmm2, xmm0 | 
| 187     psubw       xmm3, xmm4 | 181     psubw       xmm3, xmm4 | 
| 188 | 182 | 
| 189     ; dequant | 183     ; dequant | 
| 190     movdqa      xmm0, [rcx] | 184     movdqa      xmm0, [rcx] | 
| 191     movdqa      xmm1, [rcx + 16] | 185     movdqa      xmm1, [rcx + 16] | 
| 192 | 186 | 
| 193     mov         rcx, [rdi + vp8_blockd_qcoeff] ; qcoeff_ptr | 187     mov         rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr | 
| 194 | 188 | 
| 195     pmullw      xmm0, xmm2 | 189     pmullw      xmm0, xmm2 | 
| 196     pmullw      xmm1, xmm3 | 190     pmullw      xmm1, xmm3 | 
| 197 | 191 | 
| 198     movdqa      [rcx], xmm2        ; store qcoeff | 192     movdqa      [rcx], xmm2        ; store qcoeff | 
| 199     movdqa      [rcx + 16], xmm3 | 193     movdqa      [rcx + 16], xmm3 | 
| 200     movdqa      [rsi], xmm0        ; store dqcoeff | 194     movdqa      [rdi], xmm0        ; store dqcoeff | 
| 201     movdqa      [rsi + 16], xmm1 | 195     movdqa      [rdi + 16], xmm1 | 
| 202 | 196 | 
| 203     ; select the last value (in zig_zag order) for EOB | 197     ; select the last value (in zig_zag order) for EOB | 
| 204     pcmpeqw     xmm2, xmm6 | 198     pcmpeqw     xmm2, xmm6 | 
| 205     pcmpeqw     xmm3, xmm6 | 199     pcmpeqw     xmm3, xmm6 | 
| 206     ; ! | 200     ; ! | 
| 207     pcmpeqw     xmm6, xmm6 | 201     pcmpeqw     xmm6, xmm6 | 
| 208     pxor        xmm2, xmm6 | 202     pxor        xmm2, xmm6 | 
| 209     pxor        xmm3, xmm6 | 203     pxor        xmm3, xmm6 | 
| 210     ; mask inv_zig_zag | 204     ; mask inv_zig_zag | 
| 211     pand        xmm2, [GLOBAL(inv_zig_zag)] | 205     pand        xmm2, [GLOBAL(inv_zig_zag)] | 
| 212     pand        xmm3, [GLOBAL(inv_zig_zag + 16)] | 206     pand        xmm3, [GLOBAL(inv_zig_zag + 16)] | 
| 213     ; select the max value | 207     ; select the max value | 
| 214     pmaxsw      xmm2, xmm3 | 208     pmaxsw      xmm2, xmm3 | 
| 215     pshufd      xmm3, xmm2, 00001110b | 209     pshufd      xmm3, xmm2, 00001110b | 
| 216     pmaxsw      xmm2, xmm3 | 210     pmaxsw      xmm2, xmm3 | 
| 217     pshuflw     xmm3, xmm2, 00001110b | 211     pshuflw     xmm3, xmm2, 00001110b | 
| 218     pmaxsw      xmm2, xmm3 | 212     pmaxsw      xmm2, xmm3 | 
| 219     pshuflw     xmm3, xmm2, 00000001b | 213     pshuflw     xmm3, xmm2, 00000001b | 
| 220     pmaxsw      xmm2, xmm3 | 214     pmaxsw      xmm2, xmm3 | 
| 221     movd        eax, xmm2 | 215     movd        eax, xmm2 | 
| 222     and         eax, 0xff | 216     and         eax, 0xff | 
| 223     mov         [rdi + vp8_blockd_eob], eax | 217     mov         [rsi + vp8_blockd_eob], eax | 
| 224 | 218 | 
| 225     ; begin epilog | 219     ; begin epilog | 
| 226     add         rsp, stack_size | 220     add         rsp, stack_size | 
| 227     pop         rsp | 221     pop         rsp | 
| 228 %if ABI_IS_32BIT | 222 %if ABI_IS_32BIT | 
|  | 223     pop         rsi | 
| 229     pop         rdi | 224     pop         rdi | 
| 230 %else | 225 %else | 
| 231   %ifidn __OUTPUT_FORMAT__,x64 | 226   %ifidn __OUTPUT_FORMAT__,x64 | 
|  | 227     pop         rsi | 
| 232     pop         rdi | 228     pop         rdi | 
| 233   %endif | 229   %endif | 
| 234 %endif | 230 %endif | 
| 235     pop         rsi |  | 
| 236     RESTORE_GOT | 231     RESTORE_GOT | 
| 237     RESTORE_XMM | 232     RESTORE_XMM | 
| 238     pop         rbp | 233     pop         rbp | 
| 239     ret | 234     ret | 
| 240 | 235 | 
| 241 ; int vp8_fast_quantize_b_impl_sse2 | arg | 236 ; void vp8_fast_quantize_b_sse2 | arg | 
| 242 ;  (short *coeff_ptr,               |  0 | 237 ;  (BLOCK  *b,                  |  0 | 
| 243 ;   short *qcoeff_ptr,              |  1 | 238 ;   BLOCKD *d)                  |  1 | 
| 244 ;   short *dequant_ptr,             |  2 |  | 
| 245 ;   short *inv_scan_order,          |  3 |  | 
| 246 ;   short *round_ptr,               |  4 |  | 
| 247 ;   short *quant_ptr,               |  5 |  | 
| 248 ;   short *dqcoeff_ptr)             |  6 |  | 
| 249 | 239 | 
| 250 global sym(vp8_fast_quantize_b_impl_sse2) | 240 global sym(vp8_fast_quantize_b_sse2) | 
| 251 sym(vp8_fast_quantize_b_impl_sse2): | 241 sym(vp8_fast_quantize_b_sse2): | 
| 252     push        rbp | 242     push        rbp | 
| 253     mov         rbp, rsp | 243     mov         rbp, rsp | 
| 254     SHADOW_ARGS_TO_STACK 7 | 244     GET_GOT     rbx | 
|  | 245 | 
|  | 246 %if ABI_IS_32BIT | 
|  | 247     push        rdi | 
| 255     push        rsi | 248     push        rsi | 
|  | 249 %else | 
|  | 250   %ifidn __OUTPUT_FORMAT__,x64 | 
| 256     push        rdi | 251     push        rdi | 
|  | 252     push        rsi | 
|  | 253   %else | 
|  | 254     ; these registers are used for passing arguments | 
|  | 255   %endif | 
|  | 256 %endif | 
|  | 257 | 
| 257     ; end prolog | 258     ; end prolog | 
| 258 | 259 | 
| 259     mov         rdx, arg(0)                 ;coeff_ptr | 260 %if ABI_IS_32BIT | 
| 260     mov         rcx, arg(2)                 ;dequant_ptr | 261     mov         rdi, arg(0)                 ; BLOCK *b | 
| 261     mov         rdi, arg(4)                 ;round_ptr | 262     mov         rsi, arg(1)                 ; BLOCKD *d | 
| 262     mov         rsi, arg(5)                 ;quant_ptr | 263 %else | 
|  | 264   %ifidn __OUTPUT_FORMAT__,x64 | 
|  | 265     mov         rdi, rcx                    ; BLOCK *b | 
|  | 266     mov         rsi, rdx                    ; BLOCKD *d | 
|  | 267   %else | 
|  | 268     ;mov         rdi, rdi                    ; BLOCK *b | 
|  | 269     ;mov         rsi, rsi                    ; BLOCKD *d | 
|  | 270   %endif | 
|  | 271 %endif | 
| 263 | 272 | 
| 264     movdqa      xmm0, XMMWORD PTR[rdx] | 273     mov         rax, [rdi + vp8_block_coeff] | 
| 265     movdqa      xmm4, XMMWORD PTR[rdx + 16] | 274     mov         rcx, [rdi + vp8_block_round] | 
|  | 275     mov         rdx, [rdi + vp8_block_quant_fast] | 
| 266 | 276 | 
| 267     movdqa      xmm2, XMMWORD PTR[rdi]      ;round lo | 277     ; z = coeff | 
| 268     movdqa      xmm3, XMMWORD PTR[rdi + 16] ;round hi | 278     movdqa      xmm0, [rax] | 
|  | 279     movdqa      xmm4, [rax + 16] | 
| 269 | 280 | 
|  | 281     ; dup z so we can save sz | 
| 270     movdqa      xmm1, xmm0 | 282     movdqa      xmm1, xmm0 | 
| 271     movdqa      xmm5, xmm4 | 283     movdqa      xmm5, xmm4 | 
| 272 | 284 | 
| 273     psraw       xmm0, 15                    ;sign of z (aka sz) | 285     ; sz = z >> 15 | 
| 274     psraw       xmm4, 15                    ;sign of z (aka sz) | 286     psraw       xmm0, 15 | 
|  | 287     psraw       xmm4, 15 | 
| 275 | 288 | 
| 276     pxor        xmm1, xmm0 | 289     ; x = abs(z) = (z ^ sz) - sz | 
| 277     pxor        xmm5, xmm4 |  | 
| 278     psubw       xmm1, xmm0                  ;x = abs(z) |  | 
| 279     psubw       xmm5, xmm4                  ;x = abs(z) |  | 
| 280 |  | 
| 281     paddw       xmm1, xmm2 |  | 
| 282     paddw       xmm5, xmm3 |  | 
| 283 |  | 
| 284     pmulhw      xmm1, XMMWORD PTR[rsi] |  | 
| 285     pmulhw      xmm5, XMMWORD PTR[rsi + 16] |  | 
| 286 |  | 
| 287     mov         rdi, arg(1)                 ;qcoeff_ptr |  | 
| 288     mov         rsi, arg(6)                 ;dqcoeff_ptr |  | 
| 289 |  | 
| 290     movdqa      xmm2, XMMWORD PTR[rcx] |  | 
| 291     movdqa      xmm3, XMMWORD PTR[rcx + 16] |  | 
| 292 |  | 
| 293     pxor        xmm1, xmm0 | 290     pxor        xmm1, xmm0 | 
| 294     pxor        xmm5, xmm4 | 291     pxor        xmm5, xmm4 | 
| 295     psubw       xmm1, xmm0 | 292     psubw       xmm1, xmm0 | 
| 296     psubw       xmm5, xmm4 | 293     psubw       xmm5, xmm4 | 
| 297 | 294 | 
| 298     movdqa      XMMWORD PTR[rdi], xmm1 | 295     ; x += round | 
| 299     movdqa      XMMWORD PTR[rdi + 16], xmm5 | 296     paddw       xmm1, [rcx] | 
|  | 297     paddw       xmm5, [rcx + 16] | 
| 300 | 298 | 
| 301     pmullw      xmm2, xmm1 | 299     mov         rax, [rsi + vp8_blockd_qcoeff] | 
| 302     pmullw      xmm3, xmm5 | 300     mov         rcx, [rsi + vp8_blockd_dequant] | 
|  | 301     mov         rdi, [rsi + vp8_blockd_dqcoeff] | 
| 303 | 302 | 
| 304     mov         rdi, arg(3)                 ;inv_scan_order | 303     ; y = x * quant >> 16 | 
|  | 304     pmulhw      xmm1, [rdx] | 
|  | 305     pmulhw      xmm5, [rdx + 16] | 
| 305 | 306 | 
| 306     ; Start with 16 | 307     ; x = (y ^ sz) - sz | 
|  | 308     pxor        xmm1, xmm0 | 
|  | 309     pxor        xmm5, xmm4 | 
|  | 310     psubw       xmm1, xmm0 | 
|  | 311     psubw       xmm5, xmm4 | 
|  | 312 | 
|  | 313     ; qcoeff = x | 
|  | 314     movdqa      [rax], xmm1 | 
|  | 315     movdqa      [rax + 16], xmm5 | 
|  | 316 | 
|  | 317     ; x * dequant | 
|  | 318     movdqa      xmm2, xmm1 | 
|  | 319     movdqa      xmm3, xmm5 | 
|  | 320     pmullw      xmm2, [rcx] | 
|  | 321     pmullw      xmm3, [rcx + 16] | 
|  | 322 | 
|  | 323     ; dqcoeff = x * dequant | 
|  | 324     movdqa      [rdi], xmm2 | 
|  | 325     movdqa      [rdi + 16], xmm3 | 
|  | 326 | 
| 307     pxor        xmm4, xmm4                  ;clear all bits | 327     pxor        xmm4, xmm4                  ;clear all bits | 
| 308     pcmpeqw     xmm1, xmm4 | 328     pcmpeqw     xmm1, xmm4 | 
| 309     pcmpeqw     xmm5, xmm4 | 329     pcmpeqw     xmm5, xmm4 | 
| 310 | 330 | 
| 311     pcmpeqw     xmm4, xmm4                  ;set all bits | 331     pcmpeqw     xmm4, xmm4                  ;set all bits | 
| 312     pxor        xmm1, xmm4 | 332     pxor        xmm1, xmm4 | 
| 313     pxor        xmm5, xmm4 | 333     pxor        xmm5, xmm4 | 
| 314 | 334 | 
| 315     pand        xmm1, XMMWORD PTR[rdi] | 335     pand        xmm1, [GLOBAL(inv_zig_zag)] | 
| 316     pand        xmm5, XMMWORD PTR[rdi+16] | 336     pand        xmm5, [GLOBAL(inv_zig_zag + 16)] | 
| 317 | 337 | 
| 318     pmaxsw      xmm1, xmm5 | 338     pmaxsw      xmm1, xmm5 | 
| 319 | 339 | 
| 320     ; now down to 8 | 340     ; now down to 8 | 
| 321     pshufd      xmm5, xmm1, 00001110b | 341     pshufd      xmm5, xmm1, 00001110b | 
| 322 | 342 | 
| 323     pmaxsw      xmm1, xmm5 | 343     pmaxsw      xmm1, xmm5 | 
| 324 | 344 | 
| 325     ; only 4 left | 345     ; only 4 left | 
| 326     pshuflw     xmm5, xmm1, 00001110b | 346     pshuflw     xmm5, xmm1, 00001110b | 
| 327 | 347 | 
| 328     pmaxsw      xmm1, xmm5 | 348     pmaxsw      xmm1, xmm5 | 
| 329 | 349 | 
| 330     ; okay, just 2! | 350     ; okay, just 2! | 
| 331     pshuflw     xmm5, xmm1, 00000001b | 351     pshuflw     xmm5, xmm1, 00000001b | 
| 332 | 352 | 
| 333     pmaxsw      xmm1, xmm5 | 353     pmaxsw      xmm1, xmm5 | 
| 334 | 354 | 
| 335     movd        rax, xmm1 | 355     movd        eax, xmm1 | 
| 336     and         rax, 0xff | 356     and         eax, 0xff | 
| 337 | 357     mov         [rsi + vp8_blockd_eob], eax | 
| 338     movdqa      XMMWORD PTR[rsi], xmm2        ;store dqcoeff |  | 
| 339     movdqa      XMMWORD PTR[rsi + 16], xmm3   ;store dqcoeff |  | 
| 340 | 358 | 
| 341     ; begin epilog | 359     ; begin epilog | 
|  | 360 %if ABI_IS_32BIT | 
|  | 361     pop         rsi | 
| 342     pop         rdi | 362     pop         rdi | 
|  | 363 %else | 
|  | 364   %ifidn __OUTPUT_FORMAT__,x64 | 
| 343     pop         rsi | 365     pop         rsi | 
| 344     UNSHADOW_ARGS | 366     pop         rdi | 
|  | 367   %endif | 
|  | 368 %endif | 
|  | 369 | 
|  | 370     RESTORE_GOT | 
| 345     pop         rbp | 371     pop         rbp | 
| 346     ret | 372     ret | 
| 347 | 373 | 
| 348 SECTION_RODATA | 374 SECTION_RODATA | 
| 349 align 16 | 375 align 16 | 
| 350 zig_zag: |  | 
| 351   dw 0x0000, 0x0001, 0x0004, 0x0008 |  | 
| 352   dw 0x0005, 0x0002, 0x0003, 0x0006 |  | 
| 353   dw 0x0009, 0x000c, 0x000d, 0x000a |  | 
| 354   dw 0x0007, 0x000b, 0x000e, 0x000f |  | 
| 355 inv_zig_zag: | 376 inv_zig_zag: | 
| 356   dw 0x0001, 0x0002, 0x0006, 0x0007 | 377   dw 0x0001, 0x0002, 0x0006, 0x0007 | 
| 357   dw 0x0003, 0x0005, 0x0008, 0x000d | 378   dw 0x0003, 0x0005, 0x0008, 0x000d | 
| 358   dw 0x0004, 0x0009, 0x000c, 0x000e | 379   dw 0x0004, 0x0009, 0x000c, 0x000e | 
| 359   dw 0x000a, 0x000b, 0x000f, 0x0010 | 380   dw 0x000a, 0x000b, 0x000f, 0x0010 | 
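
For reference, a rough scalar sketch of what `vp8_regular_quantize_b_sse2` computes, written against the accessors visible in the diff (`vp8_block_coeff`, `vp8_block_zbin`, `vp8_block_zrun_zbin_boost`, ...). Plain arrays stand in for the `BLOCK`/`BLOCKD` fields, the `zig_zag` order matches the hard-coded `ZIGZAG_LOOP` arguments in the NEW column, and the `y = ...` quantization step follows the scalar C reference since those instructions fall in the skipped lines; treat it as an illustration, not the project's exact source.

```c
#include <string.h>

/* Raster position of each scan position: the zig_zag table the OLD code
 * read from .rodata and the NEW code hard-codes as ZIGZAG_LOOP arguments. */
static const int zig_zag[16] = {
    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

void regular_quantize_sketch(const short coeff[16], const short zbin[16],
                             short zbin_extra, const short zrun_zbin_boost[16],
                             const short round[16], const short quant[16],
                             const unsigned char quant_shift[16],
                             const short dequant[16],
                             short qcoeff[16], short dqcoeff[16], int *eob)
{
    const short *boost = zrun_zbin_boost;
    int i, last = -1;

    memset(qcoeff, 0, 16 * sizeof(*qcoeff));    /* "zero qcoeff" above */
    memset(dqcoeff, 0, 16 * sizeof(*dqcoeff));

    for (i = 0; i < 16; i++) {
        int rc = zig_zag[i];
        int z  = coeff[rc];
        int sz = z >> 31;               /* sign mask (psraw xmm, 15)        */
        int x  = (z ^ sz) - sz;         /* x = abs(z)                       */

        /* zbin grows with the current zero run: the "sub cx, [rdx]" /
         * "jl" pair in ZIGZAG_LOOP; boost advances every iteration.       */
        if (x >= zbin[rc] + *boost++ + zbin_extra) {
            int y;
            x += round[rc];
            y  = (((x * quant[rc]) >> 16) + x) >> quant_shift[rc];
            qcoeff[rc]  = (short)((y ^ sz) - sz);      /* restore the sign */
            dqcoeff[rc] = (short)(qcoeff[rc] * dequant[rc]);
            if (y) {
                last  = i;                  /* eob is a scan-order position */
                boost = zrun_zbin_boost;    /* nonzero coeff resets the run */
            }
        }
    }
    *eob = last + 1;
}
```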
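The fast path maps onto a sketch the same way: the NEW column's comments (`x = abs(z) = (z ^ sz) - sz`, `y = x * quant >> 16`, `x = (y ^ sz) - sz`) name each step, and the eob comes from the `pcmpeqw`/`pand`/`pmaxsw` tail masking against the `inv_zig_zag` table kept in `.rodata`. Again a hedged sketch with array parameters in place of the `BLOCK`/`BLOCKD` fields:

```c
/* Scan position + 1 for each raster position: the inv_zig_zag table from
 * .rodata at the end of the file. */
static const short inv_zig_zag[16] = {
    1, 2, 6, 7, 3, 5, 8, 13, 4, 9, 12, 14, 10, 11, 15, 16
};

void fast_quantize_sketch(const short coeff[16], const short round[16],
                          const short quant_fast[16], const short dequant[16],
                          short qcoeff[16], short dqcoeff[16], int *eob)
{
    int i, max = 0;

    for (i = 0; i < 16; i++) {
        int z  = coeff[i];
        int sz = z >> 31;                         /* sz = z >> 15 on words  */
        int x  = (z ^ sz) - sz;                   /* x = abs(z)             */
        int y  = ((x + round[i]) * quant_fast[i]) >> 16;   /* pmulhw        */

        x = (y ^ sz) - sz;                        /* x = (y ^ sz) - sz      */
        qcoeff[i]  = (short)x;
        dqcoeff[i] = (short)(x * dequant[i]);

        /* nonzero lanes keep inv_zig_zag[i]; the pmaxsw reduction takes
         * the max, i.e. 1 + the last nonzero position in scan order.      */
        if (x != 0 && inv_zig_zag[i] > max)
            max = inv_zig_zag[i];
    }
    *eob = max;
}
```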