source/libvpx/vp8/common/x86/variance_impl_sse2.asm - Issue 1162573005: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp8/common/x86/variance_impl_sse2.asm

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 ;	1 ;

2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 ;	3 ;

4 ; Use of this source code is governed by a BSD-style license	4 ; Use of this source code is governed by a BSD-style license

5 ; that can be found in the LICENSE file in the root of the source	5 ; that can be found in the LICENSE file in the root of the source

6 ; tree. An additional intellectual property rights grant can be found	6 ; tree. An additional intellectual property rights grant can be found

7 ; in the file PATENTS. All contributing project authors may	7 ; in the file PATENTS. All contributing project authors may

8 ; be found in the AUTHORS file in the root of the source tree.	8 ; be found in the AUTHORS file in the root of the source tree.

9 ;	9 ;

10	10

11	11

12 %include "vpx_ports/x86_abi_support.asm"	12 %include "vpx_ports/x86_abi_support.asm"

13	13

14 %define xmm_filter_shift 7	14 %define xmm_filter_shift 7

15	15

16 ;unsigned int vp8_get_mb_ss_sse2

17 ;(

18 ; short *src_ptr

19 ;)
; Returns the sum of squares of 256 consecutive 16-bit values starting at
; src_ptr: the loop below runs 8 times and consumes 64 bytes (32 words) per
; pass. The loads use movdqa, so src_ptr must be 16-byte aligned.
; The total is left in the low 32 bits of rax.

20 global sym(vp8_get_mb_ss_sse2) PRIVATE

21 sym(vp8_get_mb_ss_sse2):

22 push rbp

23 mov rbp, rsp

24 SHADOW_ARGS_TO_STACK 1

25 GET_GOT rbx

26 push rsi

27 push rdi

28 sub rsp, 16 ; reserve 16 bytes of stack; released again in the epilog

29 ; end prolog

30

31

32 mov rax, arg(0) ;[src_ptr]

33 mov rcx, 8 ; rcx = remaining 64-byte groups

34 pxor xmm4, xmm4 ; xmm4 = four 32-bit partial sums, start at zero

35

36 .NEXTROW:

37 movdqa xmm0, [rax] ; load 4 x 8 words (aligned loads)

38 movdqa xmm1, [rax+16]

39 movdqa xmm2, [rax+32]

40 movdqa xmm3, [rax+48]

41 pmaddwd xmm0, xmm0 ; square each word and add adjacent pairs -> 4 dwords

42 pmaddwd xmm1, xmm1

43 pmaddwd xmm2, xmm2

44 pmaddwd xmm3, xmm3

45

46 paddd xmm0, xmm1

47 paddd xmm2, xmm3

48 paddd xmm4, xmm0 ; fold this group's squares into the running sums

49 paddd xmm4, xmm2

50

51 add rax, 0x40 ; advance 64 bytes

52 dec rcx

; dec updates ZF but leaves CF untouched, so branch on ZF alone. The previous
; "ja" also tested CF, i.e. the stale carry-out of the "add rax, 0x40" above.
53 jnz .NEXTROW

54

; Horizontal add of the four dword lanes of xmm4 into lane 0.
55 movdqa xmm3,xmm4

56 psrldq xmm4,8 ; bring lanes 2,3 down to lanes 0,1

57 paddd xmm4,xmm3

58 movdqa xmm3,xmm4

59 psrldq xmm4,4 ; bring lane 1 down to lane 0

60 paddd xmm4,xmm3 ; lane 0 = total

61 movq rax,xmm4 ; low 32 bits of rax = total (upper 32 bits are lane 1, not part of the result)

62

63

64 ; begin epilog

65 add rsp, 16

66 pop rdi

67 pop rsi

68 RESTORE_GOT

69 UNSHADOW_ARGS

70 pop rbp

71 ret

72

73

74 ;unsigned int vp8_get16x16var_sse2

75 ;(

76 ; unsigned char * src_ptr,

77 ; int source_stride,

78 ; unsigned char * ref_ptr,

79 ; int recon_stride,

80 ; unsigned int * SSE,

81 ; int * Sum

82 ;)
; For a block of 16 rows x 16 bytes, computes over all byte pairs
;   *Sum = sum(src - ref)   and   *SSE = sum((src - ref)^2)
; and stores both as 32-bit values through the two output pointers.
; Rows are read with unaligned loads; strides are sign-extended ints.
; NOTE(review): the visible code does not load a result into rax before ret
; (rax last holds the Sum pointer), despite the "unsigned int" in the
; signature comment above - confirm callers ignore the return value.

83 global sym(vp8_get16x16var_sse2) PRIVATE

84 sym(vp8_get16x16var_sse2):

85 push rbp

86 mov rbp, rsp

87 SHADOW_ARGS_TO_STACK 6

88 SAVE_XMM 7

89 push rbx

90 push rsi

91 push rdi

92 ; end prolog

93

94 mov rsi, arg(0) ;[src_ptr]

95 mov rdi, arg(2) ;[ref_ptr]

96

97 movsxd rax, DWORD PTR arg(1) ;[source_stride]

98 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]

99

100 ; Prefetch data
; Rows 0..7 of the source block, then rows 0..7 of the reference block.
; rcx = 3*stride, rbx = pointer to row 4 (both are scratch here).

101 lea rcx, [rax+rax*2]

102 prefetcht0 [rsi]

103 prefetcht0 [rsi+rax]

104 prefetcht0 [rsi+rax*2]

105 prefetcht0 [rsi+rcx]

106 lea rbx, [rsi+rax*4]

107 prefetcht0 [rbx]

108 prefetcht0 [rbx+rax]

109 prefetcht0 [rbx+rax*2]

110 prefetcht0 [rbx+rcx]

111

112 lea rcx, [rdx+rdx*2]

113 prefetcht0 [rdi]

114 prefetcht0 [rdi+rdx]

115 prefetcht0 [rdi+rdx*2]

116 prefetcht0 [rdi+rcx]

117 lea rbx, [rdi+rdx*4]

118 prefetcht0 [rbx]

119 prefetcht0 [rbx+rdx]

120 prefetcht0 [rbx+rdx*2]

121 prefetcht0 [rbx+rcx]

122

123 pxor xmm0, xmm0 ; clear xmm0 for unpack

124 pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs

125

126 pxor xmm6, xmm6 ; clear xmm6 for accumulating sse

127 mov rcx, 16 ; row counter

128

; Per row: xmm7 gathers the differences in eight 16-bit lanes, xmm6 gathers
; the squared differences in four 32-bit lanes.
129 .var16loop:

130 movdqu xmm1, XMMWORD PTR [rsi] ; 16 source bytes (unaligned load)

131 movdqu xmm2, XMMWORD PTR [rdi] ; 16 reference bytes (unaligned load)

132

133 prefetcht0 [rsi+rax*8] ; prefetch the row 8 strides ahead

134 prefetcht0 [rdi+rdx*8]

135

136 movdqa xmm3, xmm1

137 movdqa xmm4, xmm2

138

139

140 punpcklbw xmm1, xmm0 ; zero-extend low 8 source bytes to words

141 punpckhbw xmm3, xmm0 ; zero-extend high 8 source bytes to words

142

143 punpcklbw xmm2, xmm0 ; same for the reference bytes

144 punpckhbw xmm4, xmm0

145

146

147 psubw xmm1, xmm2 ; word differences src - ref (low half)

148 psubw xmm3, xmm4 ; word differences src - ref (high half)

149

150 paddw xmm7, xmm1 ; accumulate differences before they are squared

151 pmaddwd xmm1, xmm1 ; square and add adjacent pairs -> 4 dwords

152

153 paddw xmm7, xmm3

154 pmaddwd xmm3, xmm3

155

156 paddd xmm6, xmm1

157 paddd xmm6, xmm3

158

159 add rsi, rax ; next source row

160 add rdi, rdx ; next reference row

161

162 sub rcx, 1

163 jnz .var16loop

164

165

; Reduce the accumulators. xmm1 takes over the SSE lanes; xmm6/xmm5 are
; reused to widen the eight signed word sums in xmm7 to dwords: each word is
; placed in the upper half of a dword, then arithmetic-shifted right by 16.
166 movdqa xmm1, xmm6

167 pxor xmm6, xmm6

168

169 pxor xmm5, xmm5

170 punpcklwd xmm6, xmm7

171

172 punpckhwd xmm5, xmm7

173 psrad xmm5, 16

174

175 psrad xmm6, 16

176 paddd xmm6, xmm5 ; xmm6 = four signed dword partial sums of differences

177

; Horizontal adds, interleaved: xmm1/xmm2 reduce the SSE, xmm6/xmm7 reduce
; the sum. Unpacking against the zero register xmm0 spreads the four dwords
; over two registers so that adding them pairs lane 0 with 2 and lane 1 with 3.
178 movdqa xmm2, xmm1

179 punpckldq xmm1, xmm0

180

181 punpckhdq xmm2, xmm0

182 movdqa xmm7, xmm6

183

184 paddd xmm1, xmm2

185 punpckldq xmm6, xmm0

186

187 punpckhdq xmm7, xmm0

188 paddd xmm6, xmm7

189

190 movdqa xmm2, xmm1

191 movdqa xmm7, xmm6

192

193 psrldq xmm1, 8 ; move the upper partial down to lane 0

194 psrldq xmm6, 8

195

196 paddd xmm7, xmm6 ; xmm7 lane 0 = total sum of differences

197 paddd xmm1, xmm2 ; xmm1 lane 0 = total sum of squared differences

198

199 mov rax, arg(5) ;[Sum]

200 mov rdi, arg(4) ;[SSE]

201

202 movd DWORD PTR [rax], xmm7 ; *Sum

203 movd DWORD PTR [rdi], xmm1 ; *SSE

204

205

206 ; begin epilog

207 pop rdi

208 pop rsi

209 pop rbx

210 RESTORE_XMM

211 UNSHADOW_ARGS

212 pop rbp

213 ret

214

215

216

217

218 ;unsigned int vp8_get8x8var_sse2

219 ;(

220 ; unsigned char * src_ptr,

221 ; int source_stride,

222 ; unsigned char * ref_ptr,

223 ; int recon_stride,

224 ; unsigned int * SSE,

225 ; int * Sum

226 ;)
; For a block of 8 rows x 8 bytes, computes over all byte pairs
;   *Sum = sum(src - ref)   and   *SSE = sum((src - ref)^2)
; and stores both as 32-bit values through the two output pointers.
; The eight rows are fully unrolled. xmm7 gathers the differences in 16-bit
; lanes; xmm1 gathers the squared differences in 32-bit lanes.
; NOTE(review): the visible code does not load a result into rax before ret
; (rax last holds the Sum pointer), despite the "unsigned int" in the
; signature comment above - confirm callers ignore the return value.

227 global sym(vp8_get8x8var_sse2) PRIVATE

228 sym(vp8_get8x8var_sse2):

229 push rbp

230 mov rbp, rsp

231 SHADOW_ARGS_TO_STACK 6

232 SAVE_XMM 7

233 GET_GOT rbx

234 push rsi

235 push rdi

236 sub rsp, 16 ; reserve 16 bytes of stack; not referenced in the body below

237 ; end prolog

238

239 mov rsi, arg(0) ;[src_ptr]

240 mov rdi, arg(2) ;[ref_ptr]

241

242 movsxd rax, DWORD PTR arg(1) ;[source_stride]

243 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]

244

245 pxor xmm0, xmm0 ; clear xmm0 for unpack

246 pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs

247

; Row 0. Its squared differences initialise the SSE accumulator xmm1.
248 movq xmm1, QWORD PTR [rsi]

249 movq xmm2, QWORD PTR [rdi]

250

251 punpcklbw xmm1, xmm0 ; zero-extend the 8 bytes to words

252 punpcklbw xmm2, xmm0

253

254 psubsw xmm1, xmm2 ; word differences src - ref

255 paddw xmm7, xmm1

256

257 pmaddwd xmm1, xmm1 ; square and add adjacent pairs -> dwords

258

; Row 1.
259 movq xmm2, QWORD PTR[rsi + rax]

260 movq xmm3, QWORD PTR[rdi + rdx]

261

262 punpcklbw xmm2, xmm0

263 punpcklbw xmm3, xmm0

264

265 psubsw xmm2, xmm3

266 paddw xmm7, xmm2

267

268 pmaddwd xmm2, xmm2

269 paddd xmm1, xmm2

270

271

; Row 2.
272 movq xmm2, QWORD PTR[rsi + rax * 2]

273 movq xmm3, QWORD PTR[rdi + rdx * 2]

274

275 punpcklbw xmm2, xmm0

276 punpcklbw xmm3, xmm0

277

278 psubsw xmm2, xmm3

279 paddw xmm7, xmm2

280

281 pmaddwd xmm2, xmm2

282 paddd xmm1, xmm2

283

284

; Advance both pointers by two rows; row 3 is then at +1 stride.
285 lea rsi, [rsi + rax * 2]

286 lea rdi, [rdi + rdx * 2]

287 movq xmm2, QWORD PTR[rsi + rax]

288 movq xmm3, QWORD PTR[rdi + rdx]

289

290 punpcklbw xmm2, xmm0

291 punpcklbw xmm3, xmm0

292

293 psubsw xmm2, xmm3

294 paddw xmm7, xmm2

295

296 pmaddwd xmm2, xmm2

297 paddd xmm1, xmm2

298

; Row 4.
299 movq xmm2, QWORD PTR[rsi + rax *2]

300 movq xmm3, QWORD PTR[rdi + rdx *2]

301

302 punpcklbw xmm2, xmm0

303 punpcklbw xmm3, xmm0

304

305 psubsw xmm2, xmm3

306 paddw xmm7, xmm2

307

308 pmaddwd xmm2, xmm2

309 paddd xmm1, xmm2

310

311

; Advance two more rows; row 5 is then at +1 stride.
312 lea rsi, [rsi + rax * 2]

313 lea rdi, [rdi + rdx * 2]

314

315

316 movq xmm2, QWORD PTR[rsi + rax]

317 movq xmm3, QWORD PTR[rdi + rdx]

318

319 punpcklbw xmm2, xmm0

320 punpcklbw xmm3, xmm0

321

322 psubsw xmm2, xmm3

323 paddw xmm7, xmm2

324

325 pmaddwd xmm2, xmm2

326 paddd xmm1, xmm2

327

; Row 6.
328 movq xmm2, QWORD PTR[rsi + rax *2]

329 movq xmm3, QWORD PTR[rdi + rdx *2]

330

331 punpcklbw xmm2, xmm0

332 punpcklbw xmm3, xmm0

333

334 psubsw xmm2, xmm3

335 paddw xmm7, xmm2

336

337 pmaddwd xmm2, xmm2

338 paddd xmm1, xmm2

339

340

; Advance two more rows; row 7 (the last) is then at +1 stride.
341 lea rsi, [rsi + rax * 2]

342 lea rdi, [rdi + rdx * 2]

343

344 movq xmm2, QWORD PTR[rsi + rax]

345 movq xmm3, QWORD PTR[rdi + rdx]

346

347 punpcklbw xmm2, xmm0

348 punpcklbw xmm3, xmm0

349

350 psubsw xmm2, xmm3

351 paddw xmm7, xmm2

352

353 pmaddwd xmm2, xmm2

354 paddd xmm1, xmm2

355

356

; Reduce the accumulators, interleaved: xmm6/xmm7 reduce the sum, xmm1/xmm2
; reduce the SSE. The sum path uses 16-bit adds (paddw) throughout: the word
; unpacks against zero do not sign-extend, so only the low word of the final
; lane is meaningful; it is sign-extended with movsx further down.
357 movdqa xmm6, xmm7

358 punpcklwd xmm6, xmm0

359

360 punpckhwd xmm7, xmm0

361 movdqa xmm2, xmm1

362

363 paddw xmm6, xmm7

364 punpckldq xmm1, xmm0

365

366 punpckhdq xmm2, xmm0

367 movdqa xmm7, xmm6

368

369 paddd xmm1, xmm2

370 punpckldq xmm6, xmm0

371

372 punpckhdq xmm7, xmm0

373 paddw xmm6, xmm7

374

375 movdqa xmm2, xmm1

376 movdqa xmm7, xmm6

377

378 psrldq xmm1, 8 ; move the upper partial down to lane 0

379 psrldq xmm6, 8

380

381 paddw xmm7, xmm6 ; low word of xmm7 = total sum of differences (16-bit)

382 paddd xmm1, xmm2 ; xmm1 lane 0 = total sum of squared differences

383

384 mov rax, arg(5) ;[Sum]

385 mov rdi, arg(4) ;[SSE]

386

387 movq rdx, xmm7

388 movsx rcx, dx ; sign-extend the 16-bit sum

389

390 mov dword ptr [rax], ecx ; *Sum

391 movd DWORD PTR [rdi], xmm1 ; *SSE

392

393 ; begin epilog

394 add rsp, 16

395 pop rdi

396 pop rsi

397 RESTORE_GOT

398 RESTORE_XMM

399 UNSHADOW_ARGS

400 pop rbp

401 ret

402

403 ;void vp8_filter_block2d_bil_var_sse2	16 ;void vp8_filter_block2d_bil_var_sse2

404 ;(	17 ;(

405 ; unsigned char *ref_ptr,	18 ; unsigned char *ref_ptr,

406 ; int ref_pixels_per_line,	19 ; int ref_pixels_per_line,

407 ; unsigned char *src_ptr,	20 ; unsigned char *src_ptr,

408 ; int src_pixels_per_line,	21 ; int src_pixels_per_line,

409 ; unsigned int Height,	22 ; unsigned int Height,

410 ; int xoffset,	23 ; int xoffset,

411 ; int yoffset,	24 ; int yoffset,

412 ; int *sum,	25 ; int *sum,

(...skipping 937 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
; Table of eight two-tap filters, 16-byte aligned. Each row holds the first
; weight replicated into 8 words followed by the second weight replicated
; into 8 words (32 bytes per row). The second weight steps by 16 from row to
; row (0, 16, ..., 112) and the two weights of a row always add up to 128,
; i.e. 1 << xmm_filter_shift (defined as 7 at the top of the file).
; NOTE(review): the code that indexes this table lies in the skipped part of
; the diff - presumably rows are selected by xoffset/yoffset; confirm there.
1350 align 16	963 align 16

1351 vp8_bilinear_filters_sse2:	964 vp8_bilinear_filters_sse2:

1352 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0	965 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0

1353 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16	966 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16

1354 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32	967 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32

1355 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48	968 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48

1356 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64	969 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64

1357 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80	970 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80

1358 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96	971 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96

1359 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112	972 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112

OLD	NEW

« no previous file with comments | « source/libvpx/vp8/common/x86/variance_impl_mmx.asm ('k') | source/libvpx/vp8/common/x86/variance_mmx.c » ('j') | no next file with comments »