source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm - Issue 554673004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_postproc_mmx.asm

Issue 554673004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 ;

2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 ;

4 ; Use of this source code is governed by a BSD-style license

5 ; that can be found in the LICENSE file in the root of the source

6 ; tree. An additional intellectual property rights grant can be found

7 ; in the file PATENTS. All contributing project authors may

8 ; be found in the AUTHORS file in the root of the source tree.

9 ;

10

11

12 %include "vpx_ports/x86_abi_support.asm"

13

14 %define VP9_FILTER_WEIGHT 128

15 %define VP9_FILTER_SHIFT 7

16

17 ;void vp9_post_proc_down_and_across_mmx

18 ;(

19 ; unsigned char *src_ptr,

20 ; unsigned char *dst_ptr,

21 ; int src_pixels_per_line,

22 ; int dst_pixels_per_line,

23 ; int rows,

24 ; int cols,

25 ; int flimit

26 ;)
;
; For every row: a vertical 5-tap blur (rows r-2..r+2 of src) is written to
; dst four pixels at a time, then a horizontal 5-tap blur (p-2..p+2) is run
; in place over that dst row. In both passes a pixel keeps its unfiltered
; value when any of its four neighbours differs from it by more than flimit.
;
; Register roles inside the loops:
;   rsi = source pointer, rdi = destination pointer
;   rax = source pitch (negated temporarily to reach the rows above)
;   rbx = address of the Blur tap table
;   rcx = rows remaining, rdx = column offset within the row
;   mm0 = zero, mm2 = flimit replicated into four words
;   mm1 = centre pixels, mm3 = weighted sum, mm7 = "over threshold" mask
27 global sym(vp9_post_proc_down_and_across_mmx) PRIVATE

28 sym(vp9_post_proc_down_and_across_mmx):

29 push rbp

30 mov rbp, rsp

31 SHADOW_ARGS_TO_STACK 7

32 GET_GOT rbx

33 push rsi

34 push rdi

35 ; end prolog

36

37 %if ABI_IS_32BIT=1 && CONFIG_PIC=1

38 ; move the global rd onto the stack, since we don't have enough registers

39 ; to do PIC addressing

40 movq mm0, [GLOBAL(rd)]

41 sub rsp, 8

42 movq [rsp], mm0

43 %define RD [rsp]
; NOTE(review): RD is rsp-relative, but rsp moves after the "push rbx" below
; and the "push rax" before .acrossnextcol, and no matching "add rsp, 8" is
; visible before the epilog pops. This 32-bit PIC path looks suspect --
; confirm against the macro definitions in x86_abi_support.asm.

44 %else

45 %define RD [GLOBAL(rd)]

46 %endif

47

48 push rbx

49 lea rbx, [GLOBAL(Blur)]

50 movd mm2, dword ptr arg(6) ;flimit

51 punpcklwd mm2, mm2 ; replicate the low word of flimit ...

52 punpckldq mm2, mm2 ; ... into all four words of mm2

53

54 mov rsi, arg(0) ;src_ptr

55 mov rdi, arg(1) ;dst_ptr

56

57 movsxd rcx, DWORD PTR arg(4) ;rows

58 movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line (source pitch)

59 pxor mm0, mm0 ; mm0 = 00000000

60

61 .nextrow:

62

63 xor rdx, rdx ; clear out rdx for use as loop counter

64 .nextcol:

65

66 pxor mm7, mm7 ; mm7 = 00000000

67 movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps

68 movq mm3, [rsi] ; mm3 = r0 p0..p7

69 punpcklbw mm3, mm0 ; mm3 = r0 p0..p3 as words

70 movq mm1, mm3 ; mm1 = r0 p0..p3 (kept as the centre pixels)

71 pmullw mm3, mm6 ; mm3 = r0 p0..p3 * kernel 2 taps

72

73 movq mm6, [rbx + 48] ; mm6 = kernel 3 taps

74 movq mm5, [rsi + rax] ; mm5 = r1 p0..p7

75 punpcklbw mm5, mm0 ; mm5 = r1 p0..p3

76 pmullw mm6, mm5 ; mm6 = r1 p0..p3 * kernel 3 taps

77 paddusw mm3, mm6 ; mm3 += mm6

78

79 ; thresholding

80 movq mm7, mm1 ; mm7 = r0 p0..p3

81 psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3 (saturating)

82 psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3 (saturating)

83 paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)

84 pcmpgtw mm7, mm2 ; mm7 = mask of words where abs diff > flimit

85

86 movq mm6, [rbx + 64 ] ; mm6 = kernel 4 taps

87 movq mm5, [rsi + 2*rax] ; mm5 = r2 p0..p7

88 punpcklbw mm5, mm0 ; mm5 = r2 p0..p3

89 pmullw mm6, mm5 ; mm6 = r2 p0..p3 * kernel 4 taps

90 paddusw mm3, mm6 ; mm3 += mm6

91

92 ; thresholding

93 movq mm6, mm1 ; mm6 = r0 p0..p3

94 psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3

95 psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r0 p0..p3

96 paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)

97 pcmpgtw mm6, mm2

98 por mm7, mm6 ; accumulate thresholds

99

100

101 neg rax ; rax = -pitch, to address the rows above

102 movq mm6, [rbx ] ; kernel 0 taps

103 movq mm5, [rsi+2*rax] ; mm5 = r-2 p0..p7

104 punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3

105 pmullw mm6, mm5 ; mm6 = r-2 p0..p3 * kernel 0 taps

106 paddusw mm3, mm6 ; mm3 += mm6

107

108 ; thresholding

109 movq mm6, mm1 ; mm6 = r0 p0..p3

110 psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r-2 p0..p3

111 psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - r0 p0..p3

112 paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)

113 pcmpgtw mm6, mm2

114 por mm7, mm6 ; accumulate thresholds

115

116 movq mm6, [rbx + 16] ; kernel 1 taps

117 movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7

118 punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3

119 pmullw mm6, mm4 ; mm6 = r-1 p0..p3 * kernel 1 taps

120 paddusw mm3, mm6 ; mm3 += mm6

121

122 ; thresholding

123 movq mm6, mm1 ; mm6 = r0 p0..p3

124 psubusw mm6, mm4 ; mm6 = r0 p0..p3 - r-1 p0..p3

125 psubusw mm4, mm1 ; mm4 = r-1 p0..p3 - r0 p0..p3

126 paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)

127 pcmpgtw mm6, mm2

128 por mm7, mm6 ; accumulate thresholds

129

130

131 paddusw mm3, RD ; mm3 += round value

132 psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128

133

134 pand mm1, mm7 ; mm1 = source pixels where a difference exceeded flimit

135 pandn mm7, mm3 ; mm7 = blurred result everywhere else

136 paddusw mm1, mm7 ; combination

137

138 packuswb mm1, mm0 ; pack to bytes

139

140 movd [rdi], mm1 ; store four output pixels

141 neg rax ; pitch is positive

142

143

144 add rsi, 4

145 add rdi, 4

146 add rdx, 4

147

148 cmp edx, dword ptr arg(5) ;cols

149 jl .nextcol

150 ; done with all the cols, start the across filtering in place

151 sub rsi, rdx ; rewind both pointers to the start of the row

152 sub rdi, rdx

153

154

155 push rax ; save the source pitch; rax is reused below

156 xor rdx, rdx

157 mov rax, [rdi-4]; eax = the four bytes just before the row start in dst

158

159 .acrossnextcol:

160 pxor mm7, mm7 ; mm7 = 00000000

161 movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps

162 movq mm4, [rdi+rdx] ; mm4 = p0..p7

163 movq mm3, mm4 ; mm3 = p0..p7

164 punpcklbw mm3, mm0 ; mm3 = p0..p3

165 movq mm1, mm3 ; mm1 = p0..p3

166 pmullw mm3, mm6 ; mm3 = p0..p3 * kernel 2 taps

167

168 movq mm6, [rbx + 48] ; mm6 = kernel 3 taps

169 psrlq mm4, 8 ; mm4 = p1..p7

170 movq mm5, mm4 ; mm5 = p1..p7

171 punpcklbw mm5, mm0 ; mm5 = p1..p4

172 pmullw mm6, mm5 ; mm6 = p1..p4 * kernel 3 taps

173 paddusw mm3, mm6 ; mm3 += mm6

174

175 ; thresholding

176 movq mm7, mm1 ; mm7 = p0..p3

177 psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4

178 psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3

179 paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4)

180 pcmpgtw mm7, mm2

181

182 movq mm6, [rbx + 64 ] ; mm6 = kernel 4 taps

183 psrlq mm4, 8 ; mm4 = p2..p7

184 movq mm5, mm4 ; mm5 = p2..p7

185 punpcklbw mm5, mm0 ; mm5 = p2..p5

186 pmullw mm6, mm5 ; mm6 = p2..p5 * kernel 4 taps

187 paddusw mm3, mm6 ; mm3 += mm6

188

189 ; thresholding

190 movq mm6, mm1 ; mm6 = p0..p3

191 psubusw mm6, mm5 ; mm6 = p0..p3 - p2..p5

192 psubusw mm5, mm1 ; mm5 = p2..p5 - p0..p3

193 paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p2..p5)

194 pcmpgtw mm6, mm2

195 por mm7, mm6 ; accumulate thresholds

196

197

198 movq mm6, [rbx ] ; mm6 = kernel 0 taps

199 movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5

200 movq mm5, mm4 ; mm5 = p-2..p5

201 punpcklbw mm5, mm0 ; mm5 = p-2..p1

202 pmullw mm6, mm5 ; mm6 = p-2..p1 * kernel 0 taps

203 paddusw mm3, mm6 ; mm3 += mm6

204

205 ; thresholding

206 movq mm6, mm1 ; mm6 = p0..p3

207 psubusw mm6, mm5 ; mm6 = p0..p3 - p-2..p1

208 psubusw mm5, mm1 ; mm5 = p-2..p1 - p0..p3

209 paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p-2..p1)

210 pcmpgtw mm6, mm2

211 por mm7, mm6 ; accumulate thresholds

212

213 movq mm6, [rbx + 16] ; mm6 = kernel 1 taps

214 psrlq mm4, 8 ; mm4 = p-1..p5

215 punpcklbw mm4, mm0 ; mm4 = p-1..p2

216 pmullw mm6, mm4 ; mm6 = p-1..p2 * kernel 1 taps

217 paddusw mm3, mm6 ; mm3 += mm6

218

219 ; thresholding

220 movq mm6, mm1 ; mm6 = p0..p3

221 psubusw mm6, mm4 ; mm6 = p0..p3 - p-1..p2

222 psubusw mm4, mm1 ; mm4 = p-1..p2 - p0..p3

223 paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p-1..p2)

224 pcmpgtw mm6, mm2

225 por mm7, mm6 ; accumulate thresholds

226

227 paddusw mm3, RD ; mm3 += round value

228 psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128

229

230 pand mm1, mm7 ; mm1 = source pixels where a difference exceeded flimit

231 pandn mm7, mm3 ; mm7 = blurred result everywhere else

232 paddusw mm1, mm7 ; combination

233

234 packuswb mm1, mm0 ; pack to bytes

; The store is delayed by one group of four so that the next iteration's
; p-2/p-1 loads above still see unfiltered pixels.
235 mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes

236 movd eax, mm1 ; hold this group's result until the next iteration

237

238 add rdx, 4

239 cmp edx, dword ptr arg(5) ;cols

240 jl .acrossnextcol;

241

242 mov DWORD PTR [rdi+rdx-4], eax ; flush the last group of the row

243 pop rax ; restore the source pitch

244

245 ; done with this row

246 add rsi,rax ; next line

247 movsxd rax, dword ptr arg(3) ;dst_pixels_per_line (destination pitch)

248 add rdi,rax ; next destination

249 movsxd rax, dword ptr arg(2) ;src_pixels_per_line (source pitch)

250

251 dec rcx ; decrement count

252 jnz .nextrow ; next row

253 pop rbx

254

255 ; begin epilog

256 pop rdi

257 pop rsi

258 RESTORE_GOT

259 UNSHADOW_ARGS

260 pop rbp

261 ret

262 %undef RD

263

264

265 ;void vp9_mbpost_proc_down_mmx(unsigned char *dst,

266 ; int pitch, int rows, int cols,int flimit)
;
; Works on one 4-pixel-wide column strip at a time. For each strip a running
; sum (mm5, four words) and running sum of squares (mm6/mm7, four dwords) are
; kept over a 15-row vertical window. A pixel is replaced by
; (pixel + sum + vp9_rv[row & 127]) >> 4 when 15*sumsq - sum*sum < flimit,
; otherwise it is left unchanged. Results pass through the stack buffer d[]
; and are written back 8 rows late, so rows still inside the window are read
; unfiltered.
;
; Register roles inside .loop_row:
;   rsi = row leaving the window (8 rows above the current row); also the
;         row that receives the delayed output
;   rdi = row entering the window (7 rows below the current row)
;   rax = pitch, rdx = row counter, rcx = scratch index
;   mm0 = zero
267 extern sym(vp9_rv)

268 global sym(vp9_mbpost_proc_down_mmx) PRIVATE

269 sym(vp9_mbpost_proc_down_mmx):

270 push rbp

271 mov rbp, rsp

272 SHADOW_ARGS_TO_STACK 5

273 GET_GOT rbx

274 push rsi

275 push rdi

276 ; end prolog

277

278 ALIGN_STACK 16, rax

279 sub rsp, 136

280

281 ; unsigned char d[16][8] at [rsp]

282 ; create flimit2 at [rsp+128]

283 mov eax, dword ptr arg(4) ;flimit

284 mov [rsp+128], eax

285 mov [rsp+128+4], eax ; flimit duplicated into both dwords for psubd

286 %define flimit2 [rsp+128]

287

288 %if ABI_IS_32BIT=0

289 lea r8, [GLOBAL(sym(vp9_rv))]

290 %endif

291

292 ;rows +=8;

293 add dword ptr arg(2), 8 ; extra iterations flush the 8-row output delay

294

295 ;for(c=0; c<cols; c+=4)

296 .loop_col:

297 mov rsi, arg(0) ;s

298 pxor mm0, mm0 ; mm0 = 0, used for unpacking

299

300 movsxd rax, dword ptr arg(1) ;pitch ;

301 neg rax ; rax = -pitch

302

303 lea rsi, [rsi + rax*8] ; rsi = s - pitch*8

304 neg rax ; rax = +pitch again

305

306

307 pxor mm5, mm5 ; mm5 = running sum (4 words)

308 pxor mm6, mm6 ; mm6 = running sum of squares, pixels 0..1

309

310 pxor mm7, mm7 ; mm7 = running sum of squares, pixels 2..3

311 mov rdi, rsi

312

313 mov rcx, 15 ; prime the window with the first 15 rows

314

315 .loop_initvar:

316 movd mm1, DWORD PTR [rdi];

317 punpcklbw mm1, mm0 ; four pixels as words

318

319 paddw mm5, mm1 ; sum += x

320 pmullw mm1, mm1 ; x*x

321

322 movq mm2, mm1 ;

323 punpcklwd mm1, mm0 ; low two squares as dwords

324

325 punpckhwd mm2, mm0 ; high two squares as dwords

326 paddd mm6, mm1 ;

327

328 paddd mm7, mm2 ;

329 lea rdi, [rdi+rax] ; next row

330

331 dec rcx

332 jne .loop_initvar

333 ;save the var and sum

334 xor rdx, rdx ; rdx = row counter

335 .loop_row:

336 movd mm1, DWORD PTR [rsi] ; [s-pitch*8]

337 movd mm2, DWORD PTR [rdi] ; [s+pitch*7]

338

339 punpcklbw mm1, mm0

340 punpcklbw mm2, mm0

341

342 paddw mm5, mm2 ; sum += entering row

343 psubw mm5, mm1 ; sum -= leaving row

344

345 pmullw mm2, mm2

346 movq mm4, mm2

347

348 punpcklwd mm2, mm0

349 punpckhwd mm4, mm0

350

351 paddd mm6, mm2 ; sumsq += entering^2

352 paddd mm7, mm4

353

354 pmullw mm1, mm1

355 movq mm2, mm1

356

357 punpcklwd mm1, mm0

358 psubd mm6, mm1 ; sumsq -= leaving^2

359

360 punpckhwd mm2, mm0

361 psubd mm7, mm2

362

363

364 movq mm3, mm6

365 pslld mm3, 4

366

367 psubd mm3, mm6 ; mm3 = 15 * sumsq (pixels 0..1)

368 movq mm1, mm5

369

370 movq mm4, mm5

371 pmullw mm1, mm1 ; low 16 bits of sum*sum

372

373 pmulhw mm4, mm4 ; high 16 bits of sum*sum

374 movq mm2, mm1

375

376 punpcklwd mm1, mm4 ; mm1 = sum*sum as dwords (pixels 0..1)

377 punpckhwd mm2, mm4 ; mm2 = sum*sum as dwords (pixels 2..3)

378

379 movq mm4, mm7

380 pslld mm4, 4

381

382 psubd mm4, mm7 ; mm4 = 15 * sumsq (pixels 2..3)

383

384 psubd mm3, mm1

385 psubd mm4, mm2

386

387 psubd mm3, flimit2

388 psubd mm4, flimit2

389

390 psrad mm3, 31 ; all ones where 15*sumsq - sum*sum < flimit

391 psrad mm4, 31

392

393 packssdw mm3, mm4

394 packsswb mm3, mm0 ; mm3 = per-pixel byte mask

395

396 movd mm1, DWORD PTR [rsi+rax*8] ; current row (8 rows below rsi)

397

398 movq mm2, mm1 ; mm2 = unfiltered pixels

399 punpcklbw mm1, mm0

400

401 paddw mm1, mm5 ; pixel + sum

402 mov rcx, rdx

403

404 and rcx, 127 ; index into vp9_rv wraps at 128 entries

405 %if ABI_IS_32BIT=1 && CONFIG_PIC=1

406 push rax

407 lea rax, [GLOBAL(sym(vp9_rv))]

408 movq mm4, [rax + rcx*2] ;vp9_rv[rcx*2]

409 pop rax

410 %elif ABI_IS_32BIT=0

411 movq mm4, [r8 + rcx*2] ;vp9_rv[rcx*2]

412 %else

413 movq mm4, [sym(vp9_rv) + rcx*2]

414 %endif

415 paddw mm1, mm4

416 ;paddw xmm1, eight8s

417 psraw mm1, 4

418

419 packuswb mm1, mm0

420 pand mm1, mm3 ; filtered value where the mask is set

421

422 pandn mm3, mm2 ; original value elsewhere

423 por mm1, mm3

424

425 and rcx, 15

426 movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]

427

428 mov rcx, rdx

429 sub rcx, 8

430

431 and rcx, 15

432 movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4] = result computed 8 rows ago

433

434 movd [rsi], mm1 ; delayed write-back into the row leaving the window

435 lea rsi, [rsi+rax]

436

437 lea rdi, [rdi+rax]

438 add rdx, 1

439

440 cmp edx, dword arg(2) ;rows

441 jl .loop_row

442

443

; NOTE(review): these are dword operations on the shadowed argument slots. On
; a 64-bit build a carry out of the low 32 bits of the pointer in arg(0)
; would be lost -- confirm whether that matters for callers.
444 add dword arg(0), 4 ; s += 4

445 sub dword arg(3), 4 ; cols -= 4

446 cmp dword arg(3), 0

447 jg .loop_col

448

449 add rsp, 136

450 pop rsp ; presumably undoes ALIGN_STACK (macro defined in the include)

451

452 ; begin epilog

453 pop rdi

454 pop rsi

455 RESTORE_GOT

456 UNSHADOW_ARGS

457 pop rbp

458 ret

459 %undef flimit2

460

461

462 ;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,

463 ; unsigned char blackclamp[16],

464 ; unsigned char whiteclamp[16],

465 ; unsigned char bothclamp[16],

466 ; unsigned int width, unsigned int height, int pitch)
;
; For each of `height` rows: picks an offset (LIBVPX_RAND() & 0xff) into the
; noise buffer, clamps the row's pixels with the black/both/white vectors and
; adds the noise bytes, 8 pixels per iteration. The pixels at `start` are
; modified in place.
467 global sym(vp9_plane_add_noise_mmx) PRIVATE

468 sym(vp9_plane_add_noise_mmx):

469 push rbp

470 mov rbp, rsp

471 SHADOW_ARGS_TO_STACK 8

472 GET_GOT rbx

473 push rsi

474 push rdi

475 ; end prolog

476

477 .addnoise_loop:

478 call sym(LIBVPX_RAND) WRT_PLT

479 mov rcx, arg(1) ;noise

480 and rax, 0xff ; keep the low 8 bits of the random value as the offset

481 add rcx, rax ; rcx = noise + random offset

482

483 ; we rely on the fact that the clamping vectors are stored contiguously

484 ; in black/white/both order. Note that we have to reload this here because

485 ; rdx could be trashed by rand()

486 mov rdx, arg(2) ; blackclamp

487

488

489 mov rdi, rcx ; rdi = noise source for this row

490 movsxd rcx, dword arg(5) ;[Width]

491 mov rsi, arg(0) ;Pos

492 xor rax,rax ; rax = byte offset within the row

493

494 .addnoise_nextset:

495 movq mm1,[rsi+rax] ; get the source

496

497 psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise

498 paddusb mm1, [rdx+32] ;bothclamp

499 psubusb mm1, [rdx+16] ;whiteclamp

500

501 movq mm2,[rdi+rax] ; get the noise for this line

502 paddb mm1,mm2 ; add it in

503 movq [rsi+rax],mm1 ; store the result

504

505 add rax,8 ; move to the next 8 pixels

506

507 cmp rax, rcx

508 jl .addnoise_nextset

509

510 movsxd rax, dword arg(7) ; Pitch

511 add arg(0), rax ; Start += Pitch

512 sub dword arg(6), 1 ; Height -= 1

513 jg .addnoise_loop

514

515 ; begin epilog

516 pop rdi

517 pop rsi

518 RESTORE_GOT

519 UNSHADOW_ARGS

520 pop rbp

521 ret

522

523

524 SECTION_RODATA

525 align 16

; Blur: tap table for vp9_post_proc_down_and_across_mmx, one 16-byte entry
; per tap. The code reads four words with movq at offsets 0, 16, 32, 48 and 64
; (kernel taps 0..4). The taps are 16, 16, 64, 16, 16, which sum to
; VP9_FILTER_WEIGHT (128).
526 Blur:

527 times 16 dw 16 ; taps 0 and 1 (offsets 0 and 16)

528 times 8 dw 64 ; tap 2, the centre pixel (offset 32)

529 times 16 dw 16 ; taps 3 and 4 (offsets 48 and 64)

530 times 8 dw 0 ; offset 80, not read by the code in this file

531

; rd: rounding value added before the shift by VP9_FILTER_SHIFT (0x40 = 128/2).
532 rd:

533 times 4 dw 0x40

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/common/vp9_rtcd_defs.pl ('k') | source/libvpx/vp9/decoder/vp9_decodeframe.c » ('j') | no next file with comments »