src/opts/SkBlitRow_opts_SSE4_asm.S - Issue 311053009: Revert of Add SSE4 optimization of S32A_Opaque_Blitrow

Side by Side Diff: src/opts/SkBlitRow_opts_SSE4_asm.S

Issue 311053009: Revert of Add SSE4 optimization of S32A_Opaque_Blitrow (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 6 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 /*

2 * Copyright 2013 The Android Open Source Project

3 *

4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.

6 */

7

8 #if !defined(_MSC_VER)

9

10 #define CFI_PUSH(REG) \

11 .cfi_adjust_cfa_offset 4; \

12 .cfi_rel_offset REG, 0

13 /* CFI_PUSH / CFI_POP: unwind-info bookkeeping for a 4-byte push / pop of REG. */

14 #define CFI_POP(REG) \

15 .cfi_adjust_cfa_offset -4; \

16 .cfi_restore REG

17 /* PUSH / POP pair the stack operation with its CFI. RETURN restores %edi and returns; because CFI applies in address order (not control-flow order) it then re-applies CFI_PUSH so code placed after the ret, which still runs with %edi pushed, keeps correct unwind info. NOTE(review): .LReallySmall runs before the push and is still described imprecisely. */

18 #define PUSH(REG) pushl REG; CFI_PUSH (REG)

19 #define POP(REG) popl REG; CFI_POP (REG)

20 #define RETURN POP(%edi); ret; CFI_PUSH (%edi)

21 /* EXTRACT_ALPHA(var1, var2): var1 = four source pixels, var2 = scratch. Leaves xmm4 = 256 - alpha in every 16-bit word and copies the destination pixels from xmm5 into xmm3. Expects xmm6 to hold 256 in every word. */

22 #define EXTRACT_ALPHA(var1, var2) \

23 movdqa %var1, %var2; /* Clone source pixels to extract alpha */\

24 psrlw $8, %var2; /* Discard red and blue, leaving alpha and green */\

25 pshufhw $0xF5, %var2, %var2; /* Repeat alpha for scaling (high) */\

26 movdqa %xmm6, %xmm4; /* Start from 256 in every word */\

27 pshuflw $0xF5, %var2, %var2; /* Repeat alpha for scaling (low) */\

28 movdqa %xmm5, %xmm3; /* Second copy of the destination pixels */\

29 psubw %var2, %xmm4 /* xmm4 = 256 - alpha for every word */

30 /* SCALE_PIXELS: multiplies the destination pixels by xmm4. Scaled red/blue end up in the low byte of each word of xmm5, scaled alpha/green in the high byte of each word of xmm3. */

31 #define SCALE_PIXELS \

32 psllw $8, %xmm5; /* Isolate red and blue (moved to the high byte of each word) */ \

33 pmulhuw %xmm4, %xmm5; /* Scale red and blue (high half of the product) */\

34 psrlw $8, %xmm3; /* Isolate alpha and green (moved to the low byte of each word) */\

35 pmullw %xmm4, %xmm3 /* Scale alpha and green (low half of the product) */

36

37

38 /*

39 * void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,

40 * const SkPMColor* SK_RESTRICT src,

41 * int count, U8CPU alpha)

42 *

43 * This function is divided into six blocks: initialization, blit 4-15 pixels,

44 * blit 0-3 pixels, align destination for 16+ pixel blits,

45 * blit 16+ pixels with source unaligned, blit 16+ pixels with source aligned.

46 * There is some code reuse between the blocks.

47 *

48 * The primary optimization comes from checking the source pixels' alpha value.

49 * If the alpha is zero, the pixel can be skipped entirely.

50 * If the alpha is fully opaque, the pixel can be copied directly to the destination.

51 * According to collected statistics, these two cases are the most common.

52 * The main loops use pre-loading and unrolling in an attempt to reduce the

53 * worst-case memory latency.

54 */

55 // Only the section / .type choice depends on the toolchain. Both symbol spellings are exported unconditionally so the function links whether or not the target prefixes C symbols with an underscore.

56 #ifdef __clang__

57 .text

58 #else

59 .section .text.sse4.2,"ax",@progbits

60 .type S32A_Opaque_BlitRow32_SSE4_asm, @function

61 #endif

62 .global S32A_Opaque_BlitRow32_SSE4_asm

63 .global _S32A_Opaque_BlitRow32_SSE4_asm

64 // Entry: the three arguments used here are read from the stack; the alpha argument in the prototype comment is never loaded

65 .p2align 4

66 _S32A_Opaque_BlitRow32_SSE4_asm:

67 S32A_Opaque_BlitRow32_SSE4_asm:

68 .cfi_startproc

69 movl 8(%esp), %eax // Source pointer

70 movl 12(%esp), %ecx // Pixel count

71 movl 4(%esp), %edx // Destination pointer

72 prefetcht0 (%eax) // Hint the first source bytes into cache

73

74 // Setup SSE constants

75 pcmpeqd %xmm7, %xmm7 // 0xFF000000 mask to check alpha

76 pslld $24, %xmm7 // All-ones shifted left leaves only the top byte of each dword

77 pcmpeqw %xmm6, %xmm6 // 16-bit 256 to calculate inv. alpha

78 psrlw $15, %xmm6 // Each word = 1

79 psllw $8, %xmm6 // Each word = 256

80 pcmpeqw %xmm0, %xmm0 // 0x00FF00FF mask (Must be in xmm0 because of pblendvb)

81 psrlw $8, %xmm0 // Each word = 0x00FF

82 subl $4, %ecx // Check if we have only 0-3 pixels

83 js .LReallySmall // Fewer than four pixels (%edi is not pushed on that path)

84 PUSH(%edi) // %edi serves as the byte offset from here on

85 cmpl $11, %ecx // Do we have enough pixels to run the main loop?

86 ja .LBigBlit // %ecx = count - 4 > 11, i.e. 16 or more pixels

87

88 // Handle small blits (4-15 pixels); %ecx = pixels left - 4, %edi = byte offset

89 ////////////////////////////////////////////////////////////////////////////////

90 xorl %edi, %edi // Reset offset to zero

91

92 .LSmallLoop:

93 lddqu (%eax, %edi), %xmm1 // Load four source pixels

94 ptest %xmm7, %xmm1 // Check if all alphas are zero (ZF) or opaque (CF)

95 ja .LSmallAlphaNotOpaqueOrZero // Neither flag set: mixed alphas, must blend

96 jz .LSmallAlphaZero // If all alphas are zero, skip the pixels completely

97 movdqu %xmm1, (%edx, %edi) // Store four destination pixels

98 .LSmallAlphaZero:

99 addl $16, %edi // Advance by four pixels

100 subl $4, %ecx // Check if there are at least four additional pixels

101 jns .LSmallLoop

102 jmp .LSmallRemaining

103

104 // Handle mixed alphas (calculate and scale)

105 .p2align 4

106 .LSmallAlphaNotOpaqueOrZero:

107 lddqu (%edx, %edi), %xmm5 // Load four destination pixels

108 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value

109 SCALE_PIXELS // Scale pixels using alpha

110

111 addl $16, %edi // Advance by four pixels

112 subl $4, %ecx // Check if another four pixels remain (flags consumed by jns below)

113 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

114 paddb %xmm3, %xmm1 // Add source and destination pixels together

115 movdqu %xmm1, -16(%edx, %edi) // Store four destination pixels

116 jns .LSmallLoop

117

118 // Handle the last 0-3 pixels (also used by the big unaligned loop)

119 .LSmallRemaining:

120 cmpl $-4, %ecx // Check if we are done

121 je .LSmallExit

122 sall $2, %ecx // Calculate offset for last pixels (negative byte count)

123 addl %ecx, %edi // Step back so the final four-pixel access ends on the last pixel

124

125 lddqu (%eax, %edi), %xmm1 // Load last four source pixels (overlapping)

126 ptest %xmm7, %xmm1 // Check if all alphas are zero (ZF) or opaque (CF)

127 jc .LSmallRemainingStoreAll// If all alphas are opaque, just store (overlapping)

128 jz .LSmallExit // If all alphas are zero, skip the pixels completely

129

130 // Handle mixed alphas (calculate and scale)

131 lddqu (%edx, %edi), %xmm5 // Load last four destination pixels (overlapping)

132 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value

133

134 psllw $8, %xmm3 // Isolate red and blue components

135 pmulhuw %xmm4, %xmm3 // Scale red and blue

136 movdqa %xmm5, %xmm2 // Work on a copy so %xmm5 keeps the original destination pixels

137 psrlw $8, %xmm2 // Isolate alpha and green components

138 pmullw %xmm4, %xmm2 // Scale alpha and green

139

140 cmpl $-8, %ecx // Check how many pixels should be written (-12: one, -8: two, -4: three)

141 pblendvb %xmm0, %xmm3, %xmm2 // Combine results

142 paddb %xmm2, %xmm1 // Add source and destination pixels together

143 jb .LSmallPixelsLeft1

144 ja .LSmallPixelsLeft3 // To avoid double-blending the overlapping pixels...

145 pblendw $0xF0, %xmm1, %xmm5 // Merge only the final two pixels to the destination

146 movdqu %xmm5, (%edx, %edi) // Store last two destination pixels

147 .LSmallExit:

148 RETURN

149

150 .LSmallPixelsLeft1:

151 pblendw $0xC0, %xmm1, %xmm5 // Merge only the final pixel to the destination

152 movdqu %xmm5, (%edx, %edi) // Store last destination pixel

153 RETURN

154

155 .LSmallPixelsLeft3:

156 pblendw $0xFC, %xmm1, %xmm5 // Merge only the final three pixels to the destination

157 movdqu %xmm5, (%edx, %edi) // Store last three destination pixels

158 RETURN

159

160 .LSmallRemainingStoreAll:

161 movdqu %xmm1, (%edx, %edi) // Store last destination pixels (overwrite)

162 RETURN

163

164 // Handle really small blits (0-3 pixels); %edi has not been pushed on this path

165 ////////////////////////////////////////////////////////////////////////////////

166 .LReallySmall:

167 addl $4, %ecx // Undo the earlier subtraction: %ecx = pixel count

168 jle .LReallySmallExit // Nothing to do for a count of zero or less

169 pcmpeqd %xmm1, %xmm1 // Preset all lanes to ones so lanes that are not loaded test as opaque

170 cmp $2, %ecx // Check how many pixels should be read

171 pinsrd $0x0, (%eax), %xmm1 // Load one source pixel

172 pinsrd $0x0, (%edx), %xmm5 // Load one destination pixel

173 jb .LReallySmallCalc

174 pinsrd $0x1, 4(%eax), %xmm1 // Load second source pixel

175 pinsrd $0x1, 4(%edx), %xmm5 // Load second destination pixel

176 je .LReallySmallCalc

177 pinsrd $0x2, 8(%eax), %xmm1 // Load third source pixel

178 pinsrd $0x2, 8(%edx), %xmm5 // Load third destination pixel

179

180 .LReallySmallCalc:

181 ptest %xmm7, %xmm1 // Check if all alphas are opaque

182 jc .LReallySmallStore // If all alphas are opaque, just store

183

184 // Handle mixed alphas (calculate and scale)

185 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value

186

187 pand %xmm0, %xmm5 // Isolate red and blue components

188 pmullw %xmm4, %xmm5 // Scale red and blue

189 psrlw $8, %xmm3 // Isolate alpha and green components

190 pmullw %xmm4, %xmm3 // Scale alpha and green

191

192 psrlw $8, %xmm5 // Move scaled red and blue down to the low byte of each word

193 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

194 paddb %xmm3, %xmm1 // Add source and destination pixels together

195

196 .LReallySmallStore:

197 cmp $2, %ecx // Check how many pixels should be written

198 pextrd $0x0, %xmm1, (%edx) // Store one destination pixel

199 jb .LReallySmallExit

200 pextrd $0x1, %xmm1, 4(%edx) // Store second destination pixel

201 je .LReallySmallExit

202 pextrd $0x2, %xmm1, 8(%edx) // Store third destination pixel

203 .LReallySmallExit:

204 ret // Plain ret: there is no saved %edi to pop on this path

205

206 // Handle bigger blit operations (16+ pixels)

207 ////////////////////////////////////////////////////////////////////////////////

208 .p2align 4

209 .LBigBlit:

210 // Align destination?

211 testl $0xF, %edx // Flags are consumed by the jz below (lddqu leaves them alone)

212 lddqu (%eax), %xmm1 // Pre-load four source pixels

213 jz .LAligned

214

215 movl %edx, %edi // Calculate alignment of destination pointer

216 negl %edi

217 andl $0xF, %edi // %edi = bytes up to the next 16-byte boundary

218

219 // Handle 1-3 pixels to align destination

220 ptest %xmm7, %xmm1 // Check if all alphas are zero (ZF) or opaque (CF)

221 jz .LAlignDone // If all alphas are zero, just skip

222 lddqu (%edx), %xmm5 // Load four destination pixels

223 jc .LAlignStore // If all alphas are opaque, just store

224

225 // Handle mixed alphas (calculate and scale)

226 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value

227

228 psllw $8, %xmm3 // Isolate red and blue components

229 pmulhuw %xmm4, %xmm3 // Scale red and blue

230 movdqa %xmm5, %xmm2 // Work on a copy so %xmm5 keeps the original destination pixels

231 psrlw $8, %xmm2 // Isolate alpha and green components

232 pmullw %xmm4, %xmm2 // Scale alpha and green

233

234 pblendvb %xmm0, %xmm3, %xmm2 // Combine results

235 paddb %xmm2, %xmm1 // Add source and destination pixels together

236

237 .LAlignStore:

238 cmp $8, %edi // Check how many pixels should be written (by alignment byte count)

239 jb .LAlignPixelsLeft1

240 ja .LAlignPixelsLeft3

241 pblendw $0x0F, %xmm1, %xmm5 // Blend two pixels

242 jmp .LAlignStorePixels

243

244 .LAlignPixelsLeft1:

245 pblendw $0x03, %xmm1, %xmm5 // Blend one pixel

246 jmp .LAlignStorePixels

247

248 .LAlignPixelsLeft3:

249 pblendw $0x3F, %xmm1, %xmm5 // Blend three pixels

250

251 .LAlignStorePixels:

252 movdqu %xmm5, (%edx) // Store destination pixels (lanes not blended keep their old value)

253

254 .LAlignDone:

255 addl %edi, %eax // Adjust pointers and pixel count

256 addl %edi, %edx

257 shrl $2, %edi // Bytes -> pixels

258 lddqu (%eax), %xmm1 // Pre-load new source pixels (after alignment)

259 subl %edi, %ecx

260

261 .LAligned: // Destination is guaranteed to be 16 byte aligned

262 xorl %edi, %edi // Reset offset to zero

263 subl $8, %ecx // Decrease counter (Reserve four pixels for the cleanup)

264 testl $0xF, %eax // Check alignment of source pointer

265 jz .LAlignedLoop // Otherwise fall through into the unaligned-source loop

266

267 // Source not aligned to destination

268 ////////////////////////////////////////////////////////////////////////////////

269 .p2align 4

270 .LUnalignedLoop: // Main loop for unaligned, handles eight pixels per iteration

271 ptest %xmm7, %xmm1 // Check if all alphas are zero (ZF) or opaque (CF)

272 ja .LAlphaNotOpaqueOrZero00

273 lddqu 16(%eax, %edi), %xmm2 // Pre-load four source pixels

274 jz .LAlphaZero00

275 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

276

277 .LAlphaZero00:

278 ptest %xmm7, %xmm2 // Check if all alphas are zero (ZF) or opaque (CF)

279 ja .LAlphaNotOpaqueOrZero01

280 lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels

281 jz .LAlphaZero01

282 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels

283

284 .LAlphaZero01:

285 addl $32, %edi // Adjust offset and pixel count

286 subl $8, %ecx

287 jae .LUnalignedLoop // Continue while the subtraction did not borrow

288 addl $8, %ecx // Adjust pixel count

289 jmp .LLoopCleanup0

290

291 .p2align 4

292 .LAlphaNotOpaqueOrZero00:

293 movdqa (%edx, %edi), %xmm5 // Load four destination pixels

294 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value

295 SCALE_PIXELS // Scale pixels using alpha

296

297 lddqu 16(%eax, %edi), %xmm2 // Pre-load four source pixels

298 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

299 paddb %xmm3, %xmm1 // Add source and destination pixels together

300 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

301

302 // Handle next four pixels

303 ptest %xmm7, %xmm2 // Check if all alphas are zero (ZF) or opaque (CF)

304 ja .LAlphaNotOpaqueOrZero01

305 lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels

306 jz .LAlphaZero02

307 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels

308 .LAlphaZero02:

309 addl $32, %edi // Adjust offset and pixel count

310 subl $8, %ecx

311 jae .LUnalignedLoop // Continue while the subtraction did not borrow

312 addl $8, %ecx // Adjust pixel count

313 jmp .LLoopCleanup0

314

315 .p2align 4

316 .LAlphaNotOpaqueOrZero01:

317 movdqa 16(%edx, %edi), %xmm5 // Load four destination pixels

318 EXTRACT_ALPHA(xmm2, xmm1) // Extract and clone alpha value (%xmm1 is scratch here)

319 SCALE_PIXELS // Scale pixels using alpha

320

321 lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels

322 addl $32, %edi

323 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

324 paddb %xmm3, %xmm2 // Add source and destination pixels together

325 subl $8, %ecx // Flags are consumed by the jae below

326 movdqa %xmm2, -16(%edx, %edi) // Store four destination pixels

327 jae .LUnalignedLoop

328 addl $8, %ecx // Adjust pixel count

329

330 // Cleanup - handle pending pixels from loop

331 .LLoopCleanup0:

332 ptest %xmm7, %xmm1 // Check if all alphas are zero (ZF) or opaque (CF)

333 ja .LAlphaNotOpaqueOrZero02

334 jz .LAlphaZero03

335 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

336 .LAlphaZero03:

337 addl $16, %edi

338 subl $4, %ecx

339 js .LSmallRemaining // Reuse code from small loop

340 lddqu (%eax, %edi), %xmm1 // Pre-load four source pixels

341 jmp .LLoopCleanup0

342

343 .LAlphaNotOpaqueOrZero02:

344 movdqa (%edx, %edi), %xmm5 // Load four destination pixels

345 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value

346 SCALE_PIXELS // Scale pixels using alpha

347

348 addl $16, %edi

349 subl $4, %ecx // Flags are consumed by the js below

350 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

351 paddb %xmm3, %xmm1 // Add source and destination pixels together

352 movdqa %xmm1, -16(%edx, %edi) // Store four destination pixels

353 js .LSmallRemaining // Reuse code from small loop

354 lddqu (%eax, %edi), %xmm1 // Pre-load four source pixels

355 jmp .LLoopCleanup0

356

357 // Source aligned to destination

358 ////////////////////////////////////////////////////////////////////////////////

359 .p2align 4

360 .LAlignedLoop: // Main loop for aligned, handles eight pixels per iteration

361 ptest %xmm7, %xmm1 // Check if all alphas are zero (ZF) or opaque (CF)

362 ja .LAlphaNotOpaqueOrZero10

363 movdqa 16(%eax, %edi), %xmm2 // Pre-load four source pixels

364 jz .LAlphaZero10

365 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

366

367 .LAlphaZero10:

368 ptest %xmm7, %xmm2 // Check if all alphas are zero (ZF) or opaque (CF)

369 ja .LAlphaNotOpaqueOrZero11

370 movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels

371 jz .LAlphaZero11

372 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels

373

374 .LAlphaZero11:

375 addl $32, %edi // Adjust offset and pixel count

376 subl $8, %ecx

377 jae .LAlignedLoop // Continue while the subtraction did not borrow

378 jmp .LLoopCleanup1

379

380 .p2align 4

381 .LAlphaNotOpaqueOrZero10:

382 movdqa (%edx, %edi), %xmm5 // Load four destination pixels

383 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value

384 SCALE_PIXELS // Scale pixels using alpha

385

386 movdqa 16(%eax, %edi), %xmm2 // Pre-load four source pixels

387 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

388 paddb %xmm3, %xmm1 // Add source and destination pixels together

389 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

390

391 // Handle next four pixels

392 ptest %xmm7, %xmm2 // Check if all alphas are zero (ZF) or opaque (CF)

393 ja .LAlphaNotOpaqueOrZero11

394 movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels

395 jz .LAlphaZero12

396 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels

397 .LAlphaZero12:

398 addl $32, %edi // Adjust offset and pixel count

399 subl $8, %ecx

400 jae .LAlignedLoop // Continue while the subtraction did not borrow

401 jmp .LLoopCleanup1

402

403 .p2align 4

404 .LAlphaNotOpaqueOrZero11:

405 movdqa 16(%edx, %edi), %xmm5 // Load four destination pixels

406 EXTRACT_ALPHA(xmm2, xmm1) // Extract and clone alpha value (%xmm1 is scratch here)

407 SCALE_PIXELS // Scale pixels using alpha

408 movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels

409

410 addl $32, %edi

411 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

412 paddb %xmm3, %xmm2 // Add source and destination pixels together

413 subl $8, %ecx // Flags are consumed by the jae below

414 movdqa %xmm2, -16(%edx, %edi) // Store four destination pixels

415 jae .LAlignedLoop

416

417 // Cleanup - handle four pending pixels from loop

418 .LLoopCleanup1:

419 ptest %xmm7, %xmm1 // Check if all alphas are zero (ZF) or opaque (CF)

420 ja .LAlphaNotOpaqueOrZero12

421 jz .LAlphaZero13

422 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

423 .LAlphaZero13:

424 addl $8, %ecx // Adjust pixel count: %ecx = pixels left after these four

425 jz .LExit

426 addl $16, %edi

427 jmp .LRemainLoop1

428

429 .LAlphaNotOpaqueOrZero12:

430 movdqa (%edx, %edi), %xmm5 // Load four destination pixels

431 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value

432 SCALE_PIXELS // Scale pixels using alpha

433

434 addl $8, %ecx // Adjust pixel count (flags are consumed by the jz below)

435 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

436 paddb %xmm3, %xmm1 // Add source and destination pixels together

437 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

438 jz .LExit

439 addl $16, %edi

440

441 // Handle last 1-7 pixels

442 .LRemainLoop1:

443 movdqa (%eax, %edi), %xmm1 // Load four source pixels

444 ptest %xmm7, %xmm1 // Check if all alphas are zero (ZF) or opaque (CF)

445 ja .LRemainAlphaNotOpaqueOrZero1

446 jz .LRemainAlphaZero1

447

448 // All alphas were opaque (copy)

449 subl $4, %ecx // Check if we have more than four pixels left

450 jle .LRemainStore

451 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

452 addl $16, %edi

453 jmp .LRemainLoop1

454

455 // All alphas were zero (skip)

456 .p2align 4

457 .LRemainAlphaZero1:

458 subl $4, %ecx // Check if we have more than four pixels left

459 jle .LExit

460 addl $16, %edi

461 jmp .LRemainLoop1

462

463 // Handle mixed alphas (calculate and scale)

464 .p2align 4

465 .LRemainAlphaNotOpaqueOrZero1:

466 movdqa (%edx, %edi), %xmm5 // Load four destination pixels

467 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value

468 SCALE_PIXELS // Scale pixels using alpha

469

470 subl $4, %ecx // Flags are consumed by the jle below and by the jz at .LRemainStore

471 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

472 paddb %xmm3, %xmm1 // Add source and destination pixels together

473 jle .LRemainStore

474 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

475 addl $16, %edi

476 jmp .LRemainLoop1

477

478 // Store the last 1-4 pixels. NOTE(review): with 1-3 pixels left a full 16 bytes of source and destination are still accessed; destination bytes past the last pixel are written back unchanged - confirm callers tolerate this

479 .p2align 4

480 .LRemainStore:

481 jz .LRemainFull // %ecx == 0: exactly four pixels left

482 movdqa (%edx, %edi), %xmm5 // Load four destination pixels

483 cmp $-2, %ecx // Check how many pixels should be written (-3: one, -2: two, -1: three)

484 jb .LRemainPixelsLeft11

485 ja .LRemainPixelsLeft13

486 pblendw $0x0F, %xmm1, %xmm5 // Merge only the first two pixels

487 movdqa %xmm5, (%edx, %edi) // Store last 2 destination pixels

488 .LExit:

489 RETURN

490

491 .LRemainPixelsLeft11:

492 pblendw $0x03, %xmm1, %xmm5 // Merge only the first pixel

493 movdqa %xmm5, (%edx, %edi) // Store last destination pixel

494 RETURN

495

496 .LRemainPixelsLeft13:

497 pblendw $0x3F, %xmm1, %xmm5 // Merge only the first three pixels

498 movdqa %xmm5, (%edx, %edi) // Store last 3 destination pixels

499 RETURN

500

501 .LRemainFull:

502 movdqa %xmm1, (%edx, %edi) // Store last 4 destination pixels

503 RETURN

504

505 .cfi_endproc // Closes the region opened by .cfi_startproc at the entry label

506 #ifndef __clang__

507 .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm // Symbol size = bytes from the entry label to here

508 #endif // !__clang__

509 #endif // !defined(_MSC_VER)

OLD	NEW

« no previous file with comments | « src/opts/SkBlitRow_opts_SSE4.h ('k') | src/opts/SkBlitRow_opts_SSE4_x64_asm.S » ('j') | no next file with comments »