OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright 2014 The Android Open Source Project | |
3 * | |
4 * Use of this source code is governed by a BSD-style license that can be | |
5 * found in the LICENSE file. | |
6 */ | |
7 | |
#ifdef CRBUG_399842_FIXED

#if defined(__clang__) || (defined(__GNUC__) && !defined(SK_BUILD_FOR_MAC))

/* Keep the DWARF call-frame info in sync with a 4-byte pushl: the CFA
 * offset grows by 4 and REG is recorded at the new top of stack. */
#define CFI_PUSH(REG) \
  .cfi_adjust_cfa_offset 4; \
  .cfi_rel_offset REG, 0

/* Inverse of CFI_PUSH, emitted when the register is popped again. */
#define CFI_POP(REG) \
  .cfi_adjust_cfa_offset -4; \
  .cfi_restore REG

/* push/pop wrappers that keep unwind info correct. */
#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)
/* Shared epilogue for every path that executed PUSH(%edi) on entry. */
#define RETURN POP(%edi); ret

/* EXTRACT_ALPHA(var1, var2)
 * In:  %var1 = four source pixels,
 *      %xmm6 = 256 (0x0100) in every 16-bit lane (set up in the prologue),
 *      %xmm5 = four destination pixels (loaded by the caller).
 * Out: %var2 = source alpha replicated into every 16-bit lane,
 *      %xmm4 = inverse alpha, i.e. 256 - alpha, per 16-bit lane,
 *      %xmm3 = copy of the destination pixels (%xmm5).
 * The two movdqa copies are unrelated to the shuffles; they are interleaved
 * here for instruction scheduling. */
#define EXTRACT_ALPHA(var1, var2) \
    movdqa      %var1, %var2;        /* Clone source pixels to extract alpha */\
    psrlw       $8, %var2;           /* Discard red and blue, leaving alpha and green */\
    pshufhw     $0xF5, %var2, %var2; /* Repeat alpha for scaling (high) */\
    movdqa      %xmm6, %xmm4; \
    pshuflw     $0xF5, %var2, %var2; /* Repeat alpha for scaling (low) */\
    movdqa      %xmm5, %xmm3; \
    psubw       %var2, %xmm4         /* Finalize alpha calculations */

/* SCALE_PIXELS
 * In:  %xmm5 = %xmm3 = destination pixels, %xmm4 = inverse-alpha lanes
 *      (both produced by EXTRACT_ALPHA).
 * Out: %xmm5 = scaled red/blue in the low byte of each 16-bit lane,
 *      %xmm3 = scaled alpha/green in the high byte of each 16-bit lane.
 * The caller recombines the two halves with pblendvb (byte mask 0x00FF
 * per lane, implicitly in %xmm0) and adds the source with paddb. */
#define SCALE_PIXELS \
    psllw       $8, %xmm5;           /* Filter out red and blue components */\
    pmulhuw     %xmm4, %xmm5;        /* Scale red and blue */\
    psrlw       $8, %xmm3;           /* Filter out alpha and green components */\
    pmullw      %xmm4, %xmm3         /* Scale alpha and green */
39 | |
/*
 * void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
 *                                 const SkPMColor* SK_RESTRICT src,
 *                                 int count, U8CPU alpha)
 *
 * This function is divided into six blocks: initialization, blit 4-15 pixels,
 * blit 0-3 pixels, align destination for 16+ pixel blits,
 * blit 16+ pixels with source unaligned, blit 16+ pixels with source aligned.
 * There is some code reuse between the blocks.
 *
 * The primary optimization comes from checking the source pixels' alpha value.
 * If the alpha is zero, the pixel can be skipped entirely.
 * If the alpha is fully opaque, the pixel can be copied directly to the
 * destination.
 * According to collected statistics, these two cases are the most common.
 * The main loop(s) uses pre-loading and unrolling in an attempt to reduce the
 * worst-case memory latency.
 */
57 | |
#ifdef __clang__
    .text
#else
    .section .text.sse4.2,"ax",@progbits
    .type S32A_Opaque_BlitRow32_SSE4_asm, @function
#endif
    .p2align 4
#if defined(SK_BUILD_FOR_MAC)
    .global _S32A_Opaque_BlitRow32_SSE4_asm
    .private_extern _S32A_Opaque_BlitRow32_SSE4_asm
_S32A_Opaque_BlitRow32_SSE4_asm:
#else
    .global S32A_Opaque_BlitRow32_SSE4_asm
    .hidden S32A_Opaque_BlitRow32_SSE4_asm
S32A_Opaque_BlitRow32_SSE4_asm:
#endif
    .cfi_startproc
    //
    // ABI: i386 cdecl — arguments on the stack:
    //   4(%esp) = dst, 8(%esp) = src, 12(%esp) = count.
    //   NOTE(review): the fourth C argument (U8CPU alpha, 16(%esp)) is never
    //   read; this is the fully-opaque variant.
    //
    // Register roles for the whole function:
    //   %eax = src pointer      %edx = dst pointer
    //   %ecx = pixel counter    %edi = byte offset (saved via PUSH/POP on
    //                                  the 4+ pixel paths; callee-saved)
    //   %xmm7 = 0xFF000000 per-pixel alpha mask (ptest operand)
    //   %xmm6 = 256 in every 16-bit lane (inverse-alpha constant)
    //   %xmm0 = 0x00FF in every 16-bit lane (implicit pblendvb byte mask)
    //
    // Alpha classification idiom used throughout:
    //   ptest %xmm7, src  =>  ZF set when every alpha byte is 0,
    //                         CF set when every alpha byte is 0xFF, so
    //   jz = all transparent (skip), jc = all opaque (plain copy),
    //   ja = mixed alphas (full blend needed).
    //
    movl        8(%esp), %eax           // Source pointer
    movl        12(%esp), %ecx          // Pixel count
    movl        4(%esp), %edx           // Destination pointer
    prefetcht0  (%eax)

    // Setup SSE constants
    pcmpeqd     %xmm7, %xmm7            // 0xFF000000 mask to check alpha
    pslld       $24, %xmm7
    pcmpeqw     %xmm6, %xmm6            // 16-bit 256 to calculate inv. alpha
    psrlw       $15, %xmm6
    psllw       $8, %xmm6
    pcmpeqw     %xmm0, %xmm0            // 0x00FF00FF mask (Must be in xmm0 because of pblendvb)
    psrlw       $8, %xmm0
    subl        $4, %ecx                // Check if we have only 0-3 pixels
    js          .LReallySmall
    PUSH(%edi)                          // %edi saved from here on; RETURN restores it
    cmpl        $11, %ecx               // Do we have enough pixels to run the main loop?
    ja          .LBigBlit

    // Handle small blits (4-15 pixels)
    ////////////////////////////////////////////////////////////////////////////////
    xorl        %edi, %edi              // Reset offset to zero

.LSmallLoop:
    lddqu       (%eax, %edi), %xmm1     // Load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LSmallAlphaNotOpaqueOrZero
    jz          .LSmallAlphaZero        // If all alphas are zero, skip the pixels completely
    movdqu      %xmm1, (%edx, %edi)     // All opaque: store four destination pixels
.LSmallAlphaZero:
    addl        $16, %edi
    subl        $4, %ecx                // Check if there are four additional pixels, at least
    jns         .LSmallLoop
    jmp         .LSmallRemaining

    // Handle mixed alphas (calculate and scale)
    .p2align 4
.LSmallAlphaNotOpaqueOrZero:
    lddqu       (%edx, %edi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addl        $16, %edi
    subl        $4, %ecx                // Check if there are four additional pixels, at least
    pblendvb    %xmm5, %xmm3            // Mask in %xmm0, implicitly
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqu      %xmm1, -16(%edx, %edi)  // Store four destination pixels
    jns         .LSmallLoop

    // Handle the last 0-3 pixels (also used by the main loops)
    // On entry %ecx is in [-4, -1]; -4 means nothing left.  The last four
    // pixels are reloaded OVERLAPPING the already-written ones, and pblendw
    // merges only the genuinely-new pixels so nothing is double-blended.
.LSmallRemaining:
    cmpl        $-4, %ecx               // Check if we are done
    je          .LSmallExit
    sall        $2, %ecx                // Calculate byte offset for last pixels (count * 4)
    addl        %ecx, %edi

    lddqu       (%eax, %edi), %xmm1     // Load last four source pixels (overlapping)
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    jc          .LSmallRemainingStoreAll// If all alphas are opaque, just store (overlapping)
    jz          .LSmallExit             // If all alphas are zero, skip the pixels completely

    // Handle mixed alphas (calculate and scale).  This is SCALE_PIXELS done
    // by hand into %xmm3/%xmm2 so %xmm5 keeps the untouched destination
    // pixels for the partial pblendw merges below.
    lddqu       (%edx, %edi), %xmm5     // Load last four destination pixels (overlapping)
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    psllw       $8, %xmm3               // Filter out red and blue components
    pmulhuw     %xmm4, %xmm3            // Scale red and blue
    movdqa      %xmm5, %xmm2
    psrlw       $8, %xmm2               // Filter out alpha and green components
    pmullw      %xmm4, %xmm2            // Scale alpha and green

    cmpl        $-8, %ecx               // Check how many pixels should be written
    pblendvb    %xmm3, %xmm2            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm2, %xmm1            // Add source and destination pixels together
    jb          .LSmallPixelsLeft1
    ja          .LSmallPixelsLeft3      // To avoid double-blending the overlapping pixels...
    pblendw     $0xF0, %xmm1, %xmm5     // Merge only the final two pixels to the destination
    movdqu      %xmm5, (%edx, %edi)     // Store last two destination pixels
.LSmallExit:
    RETURN

.LSmallPixelsLeft1:
    pblendw     $0xC0, %xmm1, %xmm5     // Merge only the final pixel to the destination
    movdqu      %xmm5, (%edx, %edi)     // Store last destination pixel
    RETURN

.LSmallPixelsLeft3:
    pblendw     $0xFC, %xmm1, %xmm5     // Merge only the final three pixels to the destination
    movdqu      %xmm5, (%edx, %edi)     // Store last three destination pixels
    RETURN

.LSmallRemainingStoreAll:
    movdqu      %xmm1, (%edx, %edi)     // Store last destination pixels (overwrite)
    RETURN

    // Handle really small blits (0-3 pixels).  %edi was never pushed on this
    // path, so it exits with a plain ret rather than RETURN.
    ////////////////////////////////////////////////////////////////////////////////
.LReallySmall:
    addl        $4, %ecx                // Undo the earlier bias; %ecx = real count (0-3)
    jle         .LReallySmallExit
    pcmpeqd     %xmm1, %xmm1            // Fill unused lanes with opaque pixels
    cmp         $2, %ecx                // Check how many pixels should be read
    pinsrd      $0x0, (%eax), %xmm1     // Load one source pixel
    pinsrd      $0x0, (%edx), %xmm5     // Load one destination pixel
    jb          .LReallySmallCalc
    pinsrd      $0x1, 4(%eax), %xmm1    // Load second source pixel
    pinsrd      $0x1, 4(%edx), %xmm5    // Load second destination pixel
    je          .LReallySmallCalc
    pinsrd      $0x2, 8(%eax), %xmm1    // Load third source pixel
    pinsrd      $0x2, 8(%edx), %xmm5    // Load third destination pixel

.LReallySmallCalc:
    ptest       %xmm7, %xmm1            // Check if all alphas are opaque
    jc          .LReallySmallStore      // If all alphas are opaque, just store

    // Handle mixed alphas (calculate and scale).  Variant of SCALE_PIXELS
    // using pand + pmullw + psrlw for the red/blue half.
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    pand        %xmm0, %xmm5            // Filter out red and blue components
    pmullw      %xmm4, %xmm5            // Scale red and blue
    psrlw       $8, %xmm3               // Filter out alpha and green components
    pmullw      %xmm4, %xmm3            // Scale alpha and green

    psrlw       $8, %xmm5               // Combine results
    pblendvb    %xmm5, %xmm3            // Mask in %xmm0, implicitly
    paddb       %xmm3, %xmm1            // Add source and destination pixels together

.LReallySmallStore:
    cmp         $2, %ecx                // Check how many pixels should be written
    pextrd      $0x0, %xmm1, (%edx)     // Store one destination pixel
    jb          .LReallySmallExit
    pextrd      $0x1, %xmm1, 4(%edx)    // Store second destination pixel
    je          .LReallySmallExit
    pextrd      $0x2, %xmm1, 8(%edx)    // Store third destination pixel
.LReallySmallExit:
    ret

    // Handle bigger blit operations (16+ pixels)
    ////////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LBigBlit:
    // Align destination?
    testl       $0xF, %edx
    lddqu       (%eax), %xmm1           // Pre-load four source pixels
    jz          .LAligned

    movl        %edx, %edi              // Calculate alignment of destination pointer
    negl        %edi
    andl        $0xF, %edi              // %edi = bytes (4/8/12) to the next 16-byte boundary

    // Handle 1-3 pixels to align destination
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    jz          .LAlignDone             // If all alphas are zero, just skip
    lddqu       (%edx), %xmm5           // Load four destination pixels
    jc          .LAlignStore            // If all alphas are opaque, just store

    // Handle mixed alphas (calculate and scale) — same hand-expanded
    // SCALE_PIXELS as in .LSmallRemaining, keeping %xmm5 intact for pblendw.
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    psllw       $8, %xmm3               // Filter out red and blue components
    pmulhuw     %xmm4, %xmm3            // Scale red and blue
    movdqa      %xmm5, %xmm2
    psrlw       $8, %xmm2               // Filter out alpha and green components
    pmullw      %xmm4, %xmm2            // Scale alpha and green

    pblendvb    %xmm3, %xmm2            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm2, %xmm1            // Add source and destination pixels together

.LAlignStore:
    cmp         $8, %edi                // Check how many pixels should be written
    jb          .LAlignPixelsLeft1
    ja          .LAlignPixelsLeft3
    pblendw     $0x0F, %xmm1, %xmm5     // Blend two pixels
    jmp         .LAlignStorePixels

.LAlignPixelsLeft1:
    pblendw     $0x03, %xmm1, %xmm5     // Blend one pixel
    jmp         .LAlignStorePixels

.LAlignPixelsLeft3:
    pblendw     $0x3F, %xmm1, %xmm5     // Blend three pixels

.LAlignStorePixels:
    movdqu      %xmm5, (%edx)           // Store destination pixels

.LAlignDone:
    addl        %edi, %eax              // Adjust pointers and pixel count
    addl        %edi, %edx
    shrl        $2, %edi                // Bytes -> pixels consumed by alignment
    lddqu       (%eax), %xmm1           // Pre-load new source pixels (after alignment)
    subl        %edi, %ecx

.LAligned:                              // Destination is guaranteed to be 16 byte aligned
    xorl        %edi, %edi              // Reset offset to zero
    subl        $8, %ecx                // Decrease counter (Reserve four pixels for the cleanup)
    testl       $0xF, %eax              // Check alignment of source pointer
    jz          .LAlignedLoop

    // Source not aligned to destination
    ////////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LUnalignedLoop:                        // Main loop for unaligned, handles eight pixels per iteration
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero00
    lddqu       16(%eax, %edi), %xmm2   // Pre-load four source pixels (doesn't disturb flags)
    jz          .LAlphaZero00
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels

.LAlphaZero00:
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero01
    lddqu       32(%eax, %edi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero01
    movdqa      %xmm2, 16(%edx, %edi)   // Store four destination pixels

.LAlphaZero01:
    addl        $32, %edi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LUnalignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup0

    .p2align 4
.LAlphaNotOpaqueOrZero00:
    movdqa      (%edx, %edi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    lddqu       16(%eax, %edi), %xmm2   // Pre-load four source pixels
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels

    // Handle next four pixels
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero01
    lddqu       32(%eax, %edi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero02
    movdqa      %xmm2, 16(%edx, %edi)   // Store four destination pixels
.LAlphaZero02:
    addl        $32, %edi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LUnalignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup0

    .p2align 4
.LAlphaNotOpaqueOrZero01:               // Second unrolled half: pixels in %xmm2, scratch is %xmm1
    movdqa      16(%edx, %edi), %xmm5   // Load four destination pixels
    EXTRACT_ALPHA(xmm2, xmm1)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    lddqu       32(%eax, %edi), %xmm1   // Pre-load four source pixels
    addl        $32, %edi
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm2            // Add source and destination pixels together
    subl        $8, %ecx
    movdqa      %xmm2, -16(%edx, %edi)  // Store four destination pixels
    jae         .LUnalignedLoop
    addl        $8, %ecx                // Adjust pixel count

    // Cleanup - handle pending pixels from loop
.LLoopCleanup0:
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero02
    jz          .LAlphaZero03
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
.LAlphaZero03:
    addl        $16, %edi
    subl        $4, %ecx
    js          .LSmallRemaining        // Reuse code from small loop

.LRemain0:
    lddqu       (%eax, %edi), %xmm1     // Load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero02
    jz          .LAlphaZero04
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
.LAlphaZero04:
    addl        $16, %edi
    subl        $4, %ecx
    jmp         .LSmallRemaining        // Reuse code from small loop

.LAlphaNotOpaqueOrZero02:
    movdqa      (%edx, %edi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addl        $16, %edi
    subl        $4, %ecx
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, -16(%edx, %edi)  // Store four destination pixels
    js          .LSmallRemaining        // Reuse code from small loop
    jmp         .LRemain0

    // Source aligned to destination — identical structure to the unaligned
    // loop, but source loads use movdqa instead of lddqu.
    ////////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LAlignedLoop:                          // Main loop for aligned, handles eight pixels per iteration
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero10
    movdqa      16(%eax, %edi), %xmm2   // Pre-load four source pixels
    jz          .LAlphaZero10
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels

.LAlphaZero10:
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero11
    movdqa      32(%eax, %edi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero11
    movdqa      %xmm2, 16(%edx, %edi)   // Store four destination pixels

.LAlphaZero11:
    addl        $32, %edi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LAlignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup1

    .p2align 4
.LAlphaNotOpaqueOrZero10:
    movdqa      (%edx, %edi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    movdqa      16(%eax, %edi), %xmm2   // Pre-load four source pixels
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels

    // Handle next four pixels
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero11
    movdqa      32(%eax, %edi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero12
    movdqa      %xmm2, 16(%edx, %edi)   // Store four destination pixels
.LAlphaZero12:
    addl        $32, %edi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LAlignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup1

    .p2align 4
.LAlphaNotOpaqueOrZero11:               // Second unrolled half: pixels in %xmm2, scratch is %xmm1
    movdqa      16(%edx, %edi), %xmm5   // Load four destination pixels
    EXTRACT_ALPHA(xmm2, xmm1)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha
    movdqa      32(%eax, %edi), %xmm1   // Pre-load four source pixels

    addl        $32, %edi
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm2            // Add source and destination pixels together
    subl        $8, %ecx
    movdqa      %xmm2, -16(%edx, %edi)  // Store four destination pixels
    jae         .LAlignedLoop
    addl        $8, %ecx                // Adjust pixel count

    // Cleanup - handle pending pixels from loop
.LLoopCleanup1:
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero12
    jz          .LAlphaZero13
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
.LAlphaZero13:
    addl        $16, %edi
    subl        $4, %ecx
    js          .LSmallRemaining        // Reuse code from small loop

.LRemain1:
    movdqa      (%eax, %edi), %xmm1     // Load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero12
    jz          .LAlphaZero14
    movdqa      %xmm1, (%edx, %edi)     // Store four destination pixels
.LAlphaZero14:
    addl        $16, %edi
    subl        $4, %ecx
    jmp         .LSmallRemaining        // Reuse code from small loop

.LAlphaNotOpaqueOrZero12:
    movdqa      (%edx, %edi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addl        $16, %edi
    subl        $4, %ecx
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, -16(%edx, %edi)  // Store four destination pixels
    js          .LSmallRemaining        // Reuse code from small loop
    jmp         .LRemain1

    .cfi_endproc
#ifndef __clang__
    .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm
#endif
#endif

#endif // CRBUG_399842_FIXED
OLD | NEW |