src/opts/SkBlitRow_opts_SSE4_x64_asm.S - Issue 289473009: Add SSE4 optimization of S32A_Opaque_Blitrow

Side by Side Diff: src/opts/SkBlitRow_opts_SSE4_x64_asm.S

Issue 289473009: Add SSE4 optimization of S32A_Opaque_Blitrow (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * Copyright 2013 The Android Open Source Project

	3 *

	4 * Use of this source code is governed by a BSD-style license that can be

	5 * found in the LICENSE file.

	6 */

	7

	8 #if !defined(_MSC_VER)

	9

	10 /*

	11 * void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,

	12 * const SkPMColor* SK_RESTRICT src,

	13 * int count, U8CPU alpha)

	14 *

	15 * The primary optimization comes from checking the source pixels' alpha value.

	16 * If the alpha is zero, the pixel can be skipped entirely.

	17 * If the alpha is fully opaque, the pixel can be copied directly to the destination.

	18 * According to collected statistics, these two cases are the most common.

	19 * The main loop(s) uses pre-loading and unrolling in an attempt to reduce the

	20 * worst-case memory latency.

	21 */

	22

	23 .section .text.sse4,"ax",@progbits

	24 .type S32A_Opaque_BlitRow32_SSE4_asm, @function

	25 .globl S32A_Opaque_BlitRow32_SSE4_asm

	26

	27 .p2align 4

	28 S32A_Opaque_BlitRow32_SSE4_asm:

	29 .cfi_startproc

	30 prefetcht0 (%rsi) // Prefetch the start of the source row

	31 movl %edx, %ecx // Pixel count (the previous ecx value, presumably the alpha argument, is discarded unread)

	32 movq %rdi, %rdx // Destination pointer

	33 movq %rsi, %rax // Source pointer

	34

	35 // Setup SSE constants (xmm0, xmm6 and xmm7 are only read, never written, after this point)

	36 movdqa .LAlphaCheckMask(%rip), %xmm7 // 0xFF000000 mask to check alpha

	37 movdqa .LInverseAlphaCalc(%rip), %xmm6// 16-bit 256 to calculate inv. alpha

	38 movdqa .LResultMergeMask(%rip), %xmm0 // 0x00FF00FF mask (Must be in xmm0 because of pblendvb)

	39

	40 subl $4, %ecx // ecx = count - 4; negative means we have only 0-3 pixels

	41 js .LReallySmall

	42 cmpl $11, %ecx // Do we have enough pixels (16 or more) to run the main loop?

	43 ja .LBigBlit // Otherwise fall through to the small-blit code below

	44

	45 // Handle small blits (4-15 pixels), four pixels per iteration with unaligned loads/stores

	46 // ********************************

	47 xorq %rdi, %rdi // Reset offset to zero (rdi = byte offset into both rows)

	48

	49 .LSmallLoop:

	50 lddqu (%rax, %rdi), %xmm1 // Load four source pixels

	51 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque (ZF = all zero, CF = all opaque)

	52 ja .LSmallAlphaNotOpaqueOrZero // Neither flag set: alphas are mixed, blend

	53 jz .LSmallAlphaZero // All alphas zero: destination stays as it is

	54 movdqu %xmm1, (%rdx, %rdi) // All opaque: store four destination pixels (plain copy)

	55 .LSmallAlphaZero:

	56 addq $16, %rdi

	57 subl $4, %ecx // Check if there are four additional pixels, at least

	58 jns .LSmallLoop

	59 jmp .LSmallRemaining // ecx is now -4..-1, i.e. 0-3 pixels are left

	60

	61 // Handle mixed alphas (calculate and scale)

	62 .p2align 4

	63 .LSmallAlphaNotOpaqueOrZero:

	64 lddqu (%rdx, %rdi), %xmm5 // Load four destination pixels

	65

	66 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	67 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green

	68 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	69 movdqa %xmm6, %xmm4 // xmm4 = 256 in every 16-bit lane

	70 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	71 movdqa %xmm5, %xmm3 // Clone destination pixels

	72 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - source alpha)

	73

	74 psllw $8, %xmm5 // Filter out red and blue components

	75 pmulhuw %xmm4, %xmm5 // Scale red and blue

	76 psrlw $8, %xmm3 // Filter out alpha and green components

	77 pmullw %xmm4, %xmm3 // Scale alpha and green

	78

	79 addq $16, %rdi

	80 subl $4, %ecx // Count down four pixels; the sign flag is consumed by the jns below

	81 pblendvb %xmm0, %xmm5, %xmm3 // Combine results (mask in xmm0 takes the red/blue bytes from xmm5)

	82 paddb %xmm3, %xmm1 // Add source and destination pixels together

	83 movdqu %xmm1, -16(%rdx, %rdi) // Store four destination pixels

	84 jns .LSmallLoop // Flags still come from the subl; otherwise fall through to .LSmallRemaining

	85

	86 // Handle the last 0-3 pixels (also used by the big unaligned loop)

	87 .LSmallRemaining:

	88 cmpl $-4, %ecx // Check if we are done (ecx = pixels left - 4)

	89 je .LSmallExit

	90 sall $2, %ecx // Calculate offset for last pixels (negative byte count)

	91 movslq %ecx, %rcx // Sign-extend so it can be added to the 64-bit offset

	92 addq %rcx, %rdi // Step back so the 16-byte access ends at the last pixel

	93

	94 lddqu (%rax, %rdi), %xmm1 // Load last four source pixels (overlapping)

	95 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque (ZF = all zero, CF = all opaque)

	96 jc .LSmallRemainingStoreAll// If all alphas are opaque, just store

	97 jz .LSmallExit // If all alphas are zero, nothing to do

	98

	99 // Handle mixed alphas (calculate and scale)

	100 lddqu (%rdx, %rdi), %xmm5 // Load last four destination pixels (overlapping)

	101 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	102 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green

	103 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	104 movdqa %xmm6, %xmm4 // xmm4 = 256 in every 16-bit lane

	105 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	106 movdqa %xmm5, %xmm3 // Clone destination pixels (xmm5 is kept intact for the pblendw below)

	107 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - source alpha)

	108

	109 psllw $8, %xmm3 // Filter out red and blue components

	110 pmulhuw %xmm4, %xmm3 // Scale red and blue

	111 movdqa %xmm5, %xmm2 // Second clone of the destination pixels

	112 psrlw $8, %xmm2 // Filter out alpha and green components

	113 pmullw %xmm4, %xmm2 // Scale alpha and green

	114

	115 cmpl $-8, %ecx // Check how many pixels should be written (ecx is -12, -8 or -4 for 1, 2 or 3)

	116 pblendvb %xmm0, %xmm3, %xmm2 // Combine results

	117 paddb %xmm2, %xmm1 // Add source and destination pixels together

	118 jb .LSmallPixelsLeft1 // Flags still come from the cmpl above

	119 ja .LSmallPixelsLeft3

	120 pblendw $0xF0, %xmm1, %xmm5 // Take the upper two pixels from the blend result, keep the rest of dst

	121 movdqu %xmm5, (%rdx, %rdi) // Store last two destination pixels

	122 .LSmallExit:

	123 ret

	124

	125 .LSmallPixelsLeft1:

	126 pblendw $0xC0, %xmm1, %xmm5 // Take only the top pixel from the blend result

	127 movdqu %xmm5, (%rdx, %rdi) // Store last destination pixel

	128 ret

	129

	130 .LSmallPixelsLeft3:

	131 pblendw $0xFC, %xmm1, %xmm5 // Take the upper three pixels from the blend result

	132 movdqu %xmm5, (%rdx, %rdi) // Store last three destination pixels

	133 ret

	134

	135 .LSmallRemainingStoreAll:

	136 movdqu %xmm1, (%rdx, %rdi) // Store last destination pixels (overwrite)

	137 ret

	138

	139 // Handle really small blits (0-3 pixels), one 32-bit lane per pixel

	140 // **************************************

	141 .LReallySmall:

	142 addl $4, %ecx // Undo the earlier "subl $4": ecx = pixel count again

	143 jle .LReallySmallExit // Nothing to do for a count of zero or less

	144 pcmpeqd %xmm1, %xmm1 // xmm1 = all ones, so lanes that are not loaded below test as opaque

	145 cmpl $2, %ecx // Check how many pixels should be read

	146 pinsrd $0x0, (%rax), %xmm1 // Load one source pixel

	147 pinsrd $0x0, (%rdx), %xmm5 // Load one destination pixel

	148 jb .LReallySmallCalc // count == 1 (flags still come from the cmpl above)

	149 pinsrd $0x1, 4(%rax), %xmm1 // Load second source pixel

	150 pinsrd $0x1, 4(%rdx), %xmm5 // Load second destination pixel

	151 je .LReallySmallCalc // count == 2

	152 pinsrd $0x2, 8(%rax), %xmm1 // Load third source pixel

	153 pinsrd $0x2, 8(%rdx), %xmm5 // Load third destination pixel

	154

	155 .LReallySmallCalc:

	156 ptest %xmm7, %xmm1 // Check if all alphas are opaque

	157 jc .LReallySmallStore // If all alphas are opaque, just store

	158

	159 // Handle mixed alphas (calculate and scale)

	160 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	161 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green

	162 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	163 movdqa %xmm6, %xmm4 // xmm4 = 256 in every 16-bit lane

	164 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	165 movdqa %xmm5, %xmm3 // Clone destination pixels

	166 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - source alpha)

	167

	168 pand %xmm0, %xmm5 // Filter out red and blue components

	169 pmullw %xmm4, %xmm5 // Scale red and blue

	170 psrlw $8, %xmm3 // Filter out alpha and green components

	171 pmullw %xmm4, %xmm3 // Scale alpha and green

	172

	173 psrlw $8, %xmm5 // Combine results (move scaled red/blue back to the low byte of each lane)

	174 pblendvb %xmm0, %xmm5, %xmm3 // Mask in xmm0 takes the red/blue bytes from xmm5

	175 paddb %xmm3, %xmm1 // Add source and destination pixels together

	176

	177 .LReallySmallStore:

	178 cmpl $2, %ecx // Check how many pixels should be written

	179 pextrd $0x0, %xmm1, (%rdx) // Store one destination pixel

	180 jb .LReallySmallExit // count == 1

	181 pextrd $0x1, %xmm1, 4(%rdx) // Store second destination pixel

	182 je .LReallySmallExit // count == 2

	183 pextrd $0x2, %xmm1, 8(%rdx) // Store third destination pixel

	184 .LReallySmallExit:

	185 ret

	186

	187 // Handle bigger blit operations (16+ pixels)

	188 // ******************************************

	189 .p2align 4

	190 .LBigBlit:

	191 // Align destination?

	192 testl $0xF, %edx // ZF set when the destination is already 16-byte aligned

	193 lddqu (%rax), %xmm1 // Pre-load four source pixels

	194 jz .LAligned // Uses the flags from the testl above

	195

	196 movq %rdx, %rdi // Calculate alignment of destination pointer

	197 negq %rdi

	198 andl $0xF, %edi // edi = number of bytes up to the next 16-byte boundary

	199

	200 // Handle 1-3 pixels to align destination (NOTE(review): assumes dst is at least 4-byte aligned - confirm)

	201 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque (ZF = all zero, CF = all opaque)

	202 jz .LAlignDone // If all alphas are zero, just skip

	203 lddqu (%rdx), %xmm5 // Load four destination pixels

	204 jc .LAlignStore // If all alphas are opaque, just store

	205

	206 // Handle mixed alphas (calculate and scale)

	207 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	208 psrlw $8, %xmm2 // Discard red and blue

	209 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	210 movdqa %xmm6, %xmm4 // xmm4 = 256 in every 16-bit lane

	211 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	212 movdqa %xmm5, %xmm3 // Clone destination pixels (xmm5 is kept intact for the pblendw below)

	213 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - source alpha)

	214

	215 psllw $8, %xmm3 // Filter out red and blue components

	216 pmulhuw %xmm4, %xmm3 // Scale red and blue

	217 movdqa %xmm5, %xmm2 // Second clone of the destination pixels

	218 psrlw $8, %xmm2 // Filter out alpha and green components

	219 pmullw %xmm4, %xmm2 // Scale alpha and green

	220

	221 pblendvb %xmm0, %xmm3, %xmm2 // Combine results

	222 paddb %xmm2, %xmm1 // Add source and destination pixels together

	223

	224 .LAlignStore:

	225 cmpl $8, %edi // Check how many pixels should be written (edi is the byte count to the boundary)

	226 jb .LAlignPixelsLeft1

	227 ja .LAlignPixelsLeft3

	228 pblendw $0x0F, %xmm1, %xmm5 // Blend two pixels

	229 jmp .LAlignStorePixels

	230

	231 .LAlignPixelsLeft1:

	232 pblendw $0x03, %xmm1, %xmm5 // Blend one pixel

	233 jmp .LAlignStorePixels

	234

	235 .LAlignPixelsLeft3:

	236 pblendw $0x3F, %xmm1, %xmm5 // Blend three pixels

	237

	238 .LAlignStorePixels:

	239 movdqu %xmm5, (%rdx) // Store destination pixels

	240

	241 .LAlignDone:

	242 addq %rdi, %rax // Adjust pointers and pixel count

	243 addq %rdi, %rdx

	244 shrq $2, %rdi // Bytes -> pixels

	245 lddqu (%rax), %xmm1 // Pre-load new source pixels (after alignment)

	246 subl %edi, %ecx

	247

	248 .LAligned: // Destination is guaranteed to be 16 byte aligned

	249 xorq %rdi, %rdi // Reset offset to zero

	250 subl $8, %ecx // Decrease counter (Reserve four pixels for the cleanup)

	251 testl $0xF, %eax // Check alignment of source pointer

	252 jz .LAlignedLoop

	253

	254 // Source not aligned to destination

	255 // *********************************

	256 .p2align 4

	257 .LUnalignedLoop: // Main loop for unaligned, handles eight pixels per iteration

	258 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque (ZF = all zero, CF = all opaque)

	259 ja .LAlphaNotOpaqueOrZero00

	260 lddqu 16(%rax, %rdi), %xmm2 // Pre-load four source pixels

	261 jz .LAlphaZero00 // Flags still come from the ptest above

	262 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels

	263

	264 .LAlphaZero00:

	265 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque

	266 ja .LAlphaNotOpaqueOrZero01

	267 lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels

	268 jz .LAlphaZero01

	269 movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels

	270

	271 .LAlphaZero01:

	272 addq $32, %rdi // Adjust offset and pixel count

	273 subl $8, %ecx

	274 jae .LUnalignedLoop // Loop while the subtraction did not borrow

	275 addl $8, %ecx // Adjust pixel count

	276 jmp .LLoopCleanup0

	277

	278 .p2align 4

	279 .LAlphaNotOpaqueOrZero00:

	280 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels

	281 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	282 psrlw $8, %xmm2 // Discard red and blue

	283 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	284 movdqa %xmm6, %xmm4 // xmm4 = 256 in every 16-bit lane

	285 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	286 movdqa %xmm5, %xmm3 // Clone destination pixels

	287 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - source alpha)

	288

	289 psllw $8, %xmm5 // Filter out red and blue components

	290 pmulhuw %xmm4, %xmm5 // Scale red and blue

	291 psrlw $8, %xmm3 // Filter out alpha and green components

	292 pmullw %xmm4, %xmm3 // Scale alpha and green

	293

	294 lddqu 16(%rax, %rdi), %xmm2 // Pre-load four source pixels

	295 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

	296 paddb %xmm3, %xmm1 // Add source and destination pixels together

	297 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels

	298

	299 // Handle next four pixels

	300 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque

	301 ja .LAlphaNotOpaqueOrZero01

	302 lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels

	303 jz .LAlphaZero02

	304 movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels

	305 .LAlphaZero02:

	306 addq $32, %rdi // Adjust offset and pixel count

	307 subl $8, %ecx

	308 jae .LUnalignedLoop // Loop while the subtraction did not borrow

	309 addl $8, %ecx // Adjust pixel count

	310 jmp .LLoopCleanup0

	311

	312 .p2align 4

	313 .LAlphaNotOpaqueOrZero01:

	314 movdqa 16(%rdx, %rdi), %xmm5 // Load four destination pixels

	315

	316 movdqa %xmm2, %xmm1 // Clone source pixels to extract alpha

	317 psrlw $8, %xmm1 // Discard red and blue

	318 pshufhw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (high)

	319 movdqa %xmm6, %xmm4 // xmm4 = 256 in every 16-bit lane

	320 pshuflw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (low)

	321 movdqa %xmm5, %xmm3 // Clone destination pixels

	322 psubw %xmm1, %xmm4 // Finalize alpha calculations (xmm4 = 256 - source alpha)

	323

	324 psllw $8, %xmm5 // Filter out red and blue components

	325 pmulhuw %xmm4, %xmm5 // Scale red and blue

	326 psrlw $8, %xmm3 // Filter out alpha and green components

	327 pmullw %xmm4, %xmm3 // Scale alpha and green

	328

	329 lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels

	330 addq $32, %rdi

	331 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

	332 paddb %xmm3, %xmm2 // Add source and destination pixels together

	333 subl $8, %ecx

	334 movdqa %xmm2, -16(%rdx, %rdi) // Store four destination pixels

	335 jae .LUnalignedLoop // Flags still come from the subl above

	336 addl $8, %ecx // Adjust pixel count

	337

	338 // Cleanup - handle pending pixels from loop (xmm1 already holds the four source pixels for offset rdi)

	339 .LLoopCleanup0:

	340 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque (ZF = all zero, CF = all opaque)

	341 ja .LAlphaNotOpaqueOrZero02

	342 jz .LAlphaZero03

	343 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels

	344 .LAlphaZero03:

	345 addq $16, %rdi

	346 subl $4, %ecx

	347 js .LSmallRemaining // Reuse code from small loop

	348 lddqu (%rax, %rdi), %xmm1 // Pre-load four source pixels

	349 jmp .LLoopCleanup0

	350

	351 .LAlphaNotOpaqueOrZero02:

	352 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels

	353 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	354 psrlw $8, %xmm2 // Discard red and blue

	355 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	356 movdqa %xmm6, %xmm4 // xmm4 = 256 in every 16-bit lane

	357 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	358 movdqa %xmm5, %xmm3 // Clone destination pixels

	359 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - source alpha)

	360

	361 psllw $8, %xmm5 // Filter out red and blue components

	362 pmulhuw %xmm4, %xmm5 // Scale red and blue

	363 psrlw $8, %xmm3 // Filter out alpha and green components

	364 pmullw %xmm4, %xmm3 // Scale alpha and green

	365

	366 addq $16, %rdi

	367 subl $4, %ecx // The sign flag is consumed by the js below

	368 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

	369 paddb %xmm3, %xmm1 // Add source and destination pixels together

	370 movdqa %xmm1, -16(%rdx, %rdi) // Store four destination pixels

	371 js .LSmallRemaining // Reuse code from small loop

	372 lddqu (%rax, %rdi), %xmm1 // Pre-load four source pixels

	373 jmp .LLoopCleanup0

	374

	375 // Source aligned to destination (both pointers are 16-byte aligned here, so movdqa is used throughout)

	376 // *****************************

	377 .p2align 4

	378 .LAlignedLoop: // Main loop for aligned, handles eight pixels per iteration

	379 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque (ZF = all zero, CF = all opaque)

	380 ja .LAlphaNotOpaqueOrZero10

	381 movdqa 16(%rax, %rdi), %xmm2 // Pre-load four source pixels

	382 jz .LAlphaZero10 // Flags still come from the ptest above

	383 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels

	384

	385 .LAlphaZero10:

	386 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque

	387 ja .LAlphaNotOpaqueOrZero11

	388 movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels

	389 jz .LAlphaZero11

	390 movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels

	391

	392 .LAlphaZero11:

	393 addq $32, %rdi // Adjust offset and pixel count

	394 subl $8, %ecx

	395 jae .LAlignedLoop // Loop while the subtraction did not borrow

	396 jmp .LLoopCleanup1

	397

	398 .p2align 4

	399 .LAlphaNotOpaqueOrZero10:

	400 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels

	401 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	402 psrlw $8, %xmm2 // Discard red and blue

	403 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	404 movdqa %xmm6, %xmm4 // xmm4 = 256 in every 16-bit lane

	405 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	406 movdqa %xmm5, %xmm3 // Clone destination pixels

	407 psubw %xmm2, %xmm4 // Finalize alpha calculations (xmm4 = 256 - source alpha)

	408

	409 psllw $8, %xmm5 // Filter out red and blue components

	410 pmulhuw %xmm4, %xmm5 // Scale red and blue

	411 psrlw $8, %xmm3 // Filter out alpha and green components

	412 pmullw %xmm4, %xmm3 // Scale alpha and green

	413

	414 movdqa 16(%rax, %rdi), %xmm2 // Pre-load four source pixels

	415 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

	416 paddb %xmm3, %xmm1 // Add source and destination pixels together

	417 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels

	418

	419 // Handle next four pixels

	420 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque

	421 ja .LAlphaNotOpaqueOrZero11

	422 movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels

	423 jz .LAlphaZero12

	424 movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels

	425 .LAlphaZero12:

	426 addq $32, %rdi // Adjust offset and pixel count

	427 subl $8, %ecx

	428 jae .LAlignedLoop // Loop while the subtraction did not borrow

	429 jmp .LLoopCleanup1

	430

	431 .p2align 4

	432 .LAlphaNotOpaqueOrZero11:

	433 movdqa 16(%rdx, %rdi), %xmm5 // Load four destination pixels

	434

	435 movdqa %xmm2, %xmm1 // Clone source pixels to extract alpha

	436 psrlw $8, %xmm1 // Discard red and blue

	437 pshufhw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (high)

	438 movdqa %xmm6, %xmm4 // xmm4 = 256 in every 16-bit lane

	439 pshuflw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (low)

	440 movdqa %xmm5, %xmm3 // Clone destination pixels

	441 psubw %xmm1, %xmm4 // Finalize alpha calculations (xmm4 = 256 - source alpha)

	442

	443 psllw $8, %xmm5 // Filter out red and blue components

	444 pmulhuw %xmm4, %xmm5 // Scale red and blue

	445 psrlw $8, %xmm3 // Filter out alpha and green components

	446 pmullw %xmm4, %xmm3 // Scale alpha and green

	447 movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels

	448

	449 addq $32, %rdi

	450 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

	451 paddb %xmm3, %xmm2 // Add source and destination pixels together

	452 subl $8, %ecx

	453 movdqa %xmm2, -16(%rdx, %rdi) // Store four destination pixels

	454 jae .LAlignedLoop // Flags still come from the subl above; otherwise fall through to the cleanup

	455

	456 // Cleanup - handle the pixels left over by the aligned loop.

	457 // On entry xmm1 holds four pre-loaded source pixels for offset rdi and ecx

	458 // still carries the loop bias, exactly the state the unaligned loop is in

	459 // before its own "addl $8, %ecx". The tail is therefore shared with the

	460 // unaligned path: .LLoopCleanup0 blends whole groups of four pixels (aligned

	461 // destination accesses, lddqu for the source), and .LSmallRemaining finishes

	462 // the last 1-3 pixels with an overlapping 16-byte access that ends at the

	463 // last pixel. Unlike a full 16-byte movdqa load/store at the row end, this

	464 // never reads or rewrites memory beyond the count pixels of src and dst.

	465 .LLoopCleanup1:

	466 addl $8, %ecx // Adjust pixel count to the bias .LLoopCleanup0 expects

	467 jmp .LLoopCleanup0 // Shared tail; it returns to the caller itself

	564

	565 .cfi_endproc

	566 .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm

	567

	568 // Constants for SSE code (loaded into xmm7, xmm6 and xmm0 at function entry)

	569 .pushsection .rodata.sse4,"a",@progbits

	570 .p2align 4 // 16-byte alignment: the constants are loaded with movdqa

	571 .LAlphaCheckMask:
	mtklein 2014/05/16 18:06:38: Looks like the differences here are: 1) calling convention is different (of course) 2) pointers and offsets go in different registers and use different op suffixes (of course) 3) We store constants in the x86-64 code but construct them programmatically in the x86 code. Can we do 3) in both? Or, more aggressively, perhaps we can just forget about writing x86 fast paths and focus on x86-64? I take it Silvermont is x86-64, right? And so will be all future chips?
	henrik.smiding 2014/05/20 15:10:29 (in reply to mtklein's comment of 2014/05/16 18:06:38, quoted in full above): I tested doing a position independent version in 32-bit. It was slower. Probably due to the time it took to fetch the IP. Yes, Silvermont supports x86-64. But just like on Windows, many apps still run in x86 or x32 mode. In Android L there will be both a 64-bit and a 32-bit version of Skia, simultaneously. Some legacy apps with native 32-bit code will still use 32-bit Skia. I'm guessing the webview app will also run both in 32 and 64-bit versions.
	572 .long 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000 // .LAlphaCheckMask data: top byte of each 32-bit pixel, used with ptest to check alpha

	573 .LInverseAlphaCalc:

	574 .word 256, 256, 256, 256, 256, 256, 256, 256 // 256 per 16-bit lane; the blend code subtracts the source alpha from it

	575 .LResultMergeMask:

	576 .long 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF // pblendvb selector: bytes 0 and 2 of each pixel are taken from the red/blue result

	577 .popsection

	578 #endif

OLD	NEW

« src/opts/SkBlitRow_opts_SSE4_asm.S ('K') | « src/opts/SkBlitRow_opts_SSE4_asm.S ('k') | src/opts/opts_check_x86.cpp » ('j') | no next file with comments »