src/opts/SkBlitRow_opts_SSE4_asm.S - Issue 289473009: Add SSE4 optimization of S32A_Opaque_Blitrow

Side by Side Diff: src/opts/SkBlitRow_opts_SSE4_asm.S

Issue 289473009: Add SSE4 optimization of S32A_Opaque_Blitrow (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * Copyright 2013 The Android Open Source Project

	3 *

	4 * Use of this source code is governed by a BSD-style license that can be

	5 * found in the LICENSE file.

	6 */

	7

	8 #if !defined(_MSC_VER)

	9

	10 #define CFI_PUSH(REG) \

	11 .cfi_adjust_cfa_offset 4; \

	12 .cfi_rel_offset REG, 0

	13

	14 #define CFI_POP(REG) \

	15 .cfi_adjust_cfa_offset -4;\

	16 .cfi_restore REG

	17

	18 #define PUSH(REG) pushl REG; CFI_PUSH (REG)

	19 #define POP(REG) popl REG; CFI_POP (REG)

	20

	21 /*

	22 * void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,

	23 * const SkPMColor* SK_RESTRICT src,

	24 * int count, U8CPU alpha)

	25 *

	26 * The primary optimization comes from checking the source pixels' alpha value.

	27 * If the alpha is zero, the pixel can be skipped entirely.

	28 * If the alpha is fully opaque, the pixel can be copied directly to the destination.

	29 * According to collected statistics, these two cases are the most common.

	30 * The main loop(s) uses pre-loading and unrolling in an attempt to reduce the

	31 * worst-case memory latency.

	32 */

	33

	34 .section .text.sse4,"ax",@progbits

	35 .type S32A_Opaque_BlitRow32_SSE4_asm, @function

	36 .globl S32A_Opaque_BlitRow32_SSE4_asm

	37

	38 .p2align 4

	39 S32A_Opaque_BlitRow32_SSE4_asm:

	40 .cfi_startproc

	41 movl 8(%esp), %eax // Source pointer

	42 movl 12(%esp), %ecx // Pixel count

	43 movl 4(%esp), %edx // Destination pointer

	44 prefetcht0 (%eax)

	45

	46 // Setup SSE constants

	47 pcmpeqd %xmm7, %xmm7 // 0xFF000000 mask to check alpha

	48 pcmpeqw %xmm6, %xmm6 // 16-bit 256 to calculate inv. alpha
	mtklein 2014/05/16 18:06:38 Does the interlaced instruction scheduling here really help? If not, it sure hurts comprehensibility. henrik.smiding 2014/05/20 15:10:29 On 2014/05/16 18:06:38, mtklein wrote: > Does the interlaced instruction scheduling here really help? If not, it sure > hurts comprehensibility. On a Haswell core, probably not. On a Silvermont/Atom, especially for SSE, it does. But in this case I see no problem doing it in sequence, since it's initialization and not a loop.
	49 pslld $24, %xmm7

	50 pcmpeqw %xmm0, %xmm0 // 0x00FF00FF mask (Must be in xmm0 because of pblendvb)

	51 psrlw $15, %xmm6

	52 psrlw $8, %xmm0

	53 subl $4, %ecx // Check if we have only 0-3 pixels

	54 psllw $8, %xmm6

	55 js .LReallySmall

	56 PUSH(%edi)

	57 cmpl $11, %ecx // Do we have enough pixels to run the main loop?

	58 ja .LBigBlit

	59

	60 // Handle small blits (4-15 pixels)

	61 // ********************************

	62 xorl %edi, %edi // Reset offset to zero

	63

	64 .LSmallLoop:

	65 lddqu (%eax, %edi), %xmm1 // Load four source pixels

	66 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
	mtklein 2014/05/16 18:06:38 Is this the sort of place intrinsics fail us? I guess you'd have to have used two _mm_test* here to get the same effect as this three-way ptest? henrik.smiding 2014/05/20 15:10:29 On 2014/05/16 18:06:38, mtklein wrote: > Is this the sort of place intrinsics fail us? I guess you'd have to have used > two _mm_test* here to get the same effect as this three-way ptest? That's correct. It was all about not making the worst case worse (pixel alphas in the range 1-254). I'm guessing there was a problem with register allocation and op-code ordering as well.
	67 ja .LSmallAlphaNotOpaqueOrZero

	68 jz .LSmallAlphaZero

	69 movdqu %xmm1, (%edx, %edi) // Store four destination pixels

	70 .LSmallAlphaZero:

	71 addl $16, %edi

	72 subl $4, %ecx // Check if there are four additional pixels, at least

	73 jns .LSmallLoop

	74 jmp .LSmallRemaining

	75

	76 // Handle mixed alphas (calculate and scale)

	77 .p2align 4

	78 .LSmallAlphaNotOpaqueOrZero:

	79 lddqu (%edx, %edi), %xmm5 // Load four destination pixels

	80

	81 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	82 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green

	83 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	84 movdqa %xmm6, %xmm4

	85 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	86 movdqa %xmm5, %xmm3

	87 psubw %xmm2, %xmm4 // Finalize alpha calculations

	88

	89 psllw $8, %xmm5 // Filter out red and blue components

	90 pmulhuw %xmm4, %xmm5 // Scale red and blue

	91 psrlw $8, %xmm3 // Filter out alpha and green components

	92 pmullw %xmm4, %xmm3 // Scale alpha and green

	93

	94 addl $16, %edi

	95 subl $4, %ecx // Check if we can store all four pixels

	96 pblendvb %xmm0, %xmm5, %xmm3

	97 paddb %xmm3, %xmm1 // Add source and destination pixels together

	98 movdqu %xmm1, -16(%edx, %edi) // Store four destination pixels

	99 jns .LSmallLoop

	100

	101 // Handle the last 0-3 pixels (also used by the big unaligned loop)

	102 .LSmallRemaining:

	103 cmpl $-4, %ecx // Check if we are done

	104 je .LSmallExit

	105 sall $2, %ecx // Calculate offset for last pixels

	106 addl %ecx, %edi

	107

	108 lddqu (%eax, %edi), %xmm1 // Load last four source pixels (overlapping)
	mtklein 2014/05/16 18:06:38 I was expecting we'd fall back on non-SIMD or do something complicated to handle the tail. I will have to remember this trick. Can you add a note that we can only do all four when alpha == 0 or FF, and that when we actually blend we have to be careful about not double-blending the overlapping pixels? henrik.smiding 2014/05/20 15:10:29 On 2014/05/16 18:06:38, mtklein wrote: > I was expecting we'd fall back on non-SIMD or do something complicated to handle > the tail. I will have to remember this trick. Can you add a note that we can > only do all four when alpha == 0 or FF, and that when we actually blend we have > to be careful about not double-blending the overlapping pixels? I've improved the comments a bit.
	109 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque

	110 jc .LSmallRemainingStoreAll// If all alphas are opaque, just store

	111 jz .LSmallExit

	112

	113 // Handle mixed alphas (calculate and scale)
	mtklein 2014/05/16 18:06:38 Can we share or macro away this big blend block? I think I count it 11 times, sometimes with slightly different register choices. Even if it can't all be shared, it'd be nice to factor apart the required and incidental differences. henrik.smiding 2014/05/20 15:10:29 On 2014/05/16 18:06:38, mtklein wrote: > Can we share or macro away this big blend block? I think I count it 11 times, > sometimes with slightly different register choices. Even if it can't all be > shared, it'd be nice to factor apart the required and incidental differences. Done. I replaced about 200 lines of code with macros. It shouldn't affect performance at all.
	114 lddqu (%edx, %edi), %xmm5 // Load last four destination pixels (overlapping)

	115 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	116 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green

	117 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	118 movdqa %xmm6, %xmm4

	119 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	120 movdqa %xmm5, %xmm3

	121 psubw %xmm2, %xmm4 // Finalize alpha calculations

	122

	123 psllw $8, %xmm3 // Filter out red and blue components

	124 pmulhuw %xmm4, %xmm3 // Scale red and blue

	125 movdqa %xmm5, %xmm2

	126 psrlw $8, %xmm2 // Filter out alpha and green components

	127 pmullw %xmm4, %xmm2 // Scale alpha and green

	128

	129 cmpl $-8, %ecx // Check how many pixels should be written

	130 pblendvb %xmm0, %xmm3, %xmm2 // Combine results

	131 paddb %xmm2, %xmm1 // Add source and destination pixels together

	132 jb .LSmallPixelsLeft1

	133 ja .LSmallPixelsLeft3

	134 pblendw $0xF0, %xmm1, %xmm5

	135 movdqu %xmm5, (%edx, %edi) // Store last two destination pixels

	136 .LSmallExit:

	137 POP(%edi)

	138 ret

	139

	140 .LSmallPixelsLeft1:

	141 pblendw $0xC0, %xmm1, %xmm5

	142 movdqu %xmm5, (%edx, %edi) // Store last destination pixel

	143 POP(%edi)

	144 ret

	145

	146 .LSmallPixelsLeft3:

	147 pblendw $0xFC, %xmm1, %xmm5

	148 movdqu %xmm5, (%edx, %edi) // Store last three destination pixels

	149 POP(%edi)

	150 ret

	151

	152 .LSmallRemainingStoreAll:

	153 movdqu %xmm1, (%edx, %edi) // Store last destination pixels (overwrite)

	154 POP(%edi)

	155 ret

	156

	157 // Handle really small blits (0-3 pixels)

	158 // **************************************

	159 .LReallySmall:

	160 addl $4, %ecx

	161 jle .LReallySmallExit

	162 pcmpeqd %xmm1, %xmm1

	163 cmp $2, %ecx // Check how many pixels should be read

	164 pinsrd $0x0, (%eax), %xmm1 // Load one source pixel

	165 pinsrd $0x0, (%edx), %xmm5 // Load one destination pixel

	166 jb .LReallySmallCalc

	167 pinsrd $0x1, 4(%eax), %xmm1 // Load second source pixel

	168 pinsrd $0x1, 4(%edx), %xmm5 // Load second destination pixel

	169 je .LReallySmallCalc

	170 pinsrd $0x2, 8(%eax), %xmm1 // Load third source pixel

	171 pinsrd $0x2, 8(%edx), %xmm5 // Load third destination pixel

	172

	173 .LReallySmallCalc:

	174 ptest %xmm7, %xmm1 // Check if all alphas are opaque

	175 jc .LReallySmallStore // If all alphas are opaque, just store

	176

	177 // Handle mixed alphas (calculate and scale)

	178 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	179 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green

	180 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	181 movdqa %xmm6, %xmm4

	182 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	183 movdqa %xmm5, %xmm3

	184 psubw %xmm2, %xmm4 // Finalize alpha calculations

	185

	186 pand %xmm0, %xmm5 // Filter out red and blue components

	187 pmullw %xmm4, %xmm5 // Scale red and blue

	188 psrlw $8, %xmm3 // Filter out alpha and green components

	189 pmullw %xmm4, %xmm3 // Scale alpha and green

	190

	191 psrlw $8, %xmm5 // Combine results

	192 pblendvb %xmm0, %xmm5, %xmm3

	193 paddb %xmm3, %xmm1 // Add source and destination pixels together

	194

	195 .LReallySmallStore:

	196 cmp $2, %ecx // Check how many pixels should be written

	197 pextrd $0x0, %xmm1, (%edx) // Store one destination pixel

	198 jb .LReallySmallExit

	199 pextrd $0x1, %xmm1, 4(%edx) // Store second destination pixel

	200 je .LReallySmallExit

	201 pextrd $0x2, %xmm1, 8(%edx) // Store third destination pixel

	202 .LReallySmallExit:

	203 ret

	204

	205 // Handle bigger blit operations (16+ pixels)

	206 // ******************************************

	207 .p2align 4

	208 .LBigBlit:

	209 // Align destination?

	210 testl $0xF, %edx

	211 lddqu (%eax), %xmm1 // Pre-load four source pixels

	212 jz .LAligned

	213

	214 movl %edx, %edi // Calculate alignment of destination pointer

	215 negl %edi

	216 andl $0xF, %edi

	217

	218 // Handle 1-3 pixels to align destination

	219 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
	mtklein 2014/05/16 18:06:38 Do you think we're benefitting by having everything here in SSE? I'm wondering if it'd simplify things to handle these edgy cases (small, unaligned) in portable code and call into this asm only for the long-haul when we can assert dst is aligned and count >= 4 or 16? If it doesn't kill performance, I want to break this monster method up into chunks that are easier to reason about. If we're going to handle all of unaligned dst, aligned dst/unaligned src, and aligned src+dst, maybe we can pull those tests out into C++ code up front, and have it dispatch to three different methods in here? henrik.smiding 2014/05/20 15:10:29 On 2014/05/16 18:06:38, mtklein wrote: > Do you think we're benefitting by having everything here in SSE? I'm wondering > if it'd simplify things to handle these edgy cases (small, unaligned) in > portable code and call into this asm only for the long-haul when we can assert > dst is aligned and count >= 4 or 16? > > If it doesn't kill performance, I want to break this monster method up into > chunks that are easier to reason about. If we're going to handle all of > unaligned dst, aligned dst/unaligned src, and aligned src+dst, maybe we can pull > those tests out into C++ code up front, and have it dispatch to three different > methods in here? That would kill performance of short blits, like 1-16 pixels, which are quite common. It would also force at least three copies of the initialization code, and duplication of the code paths that are reused between the blocks. If you want to reduce the number of lines further, I could test if the 'perfect alignment' path could be removed without affecting performance.
	220 jz .LAlignDone // If all alphas are zero, just skip

	221 lddqu (%edx), %xmm5 // Load four destination pixels

	222 jc .LAlignStore // If all alphas are opaque, just store

	223

	224 // Handle mixed alphas (calculate and scale)

	225 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	226 psrlw $8, %xmm2 // Discard red and blue

	227 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	228 movdqa %xmm6, %xmm4

	229 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	230 movdqa %xmm5, %xmm3

	231 psubw %xmm2, %xmm4 // Finalize alpha calculations

	232

	233 psllw $8, %xmm3 // Filter out red and blue components

	234 pmulhuw %xmm4, %xmm3 // Scale red and blue

	235 movdqa %xmm5, %xmm2

	236 psrlw $8, %xmm2 // Filter out alpha and green components

	237 pmullw %xmm4, %xmm2 // Scale alpha and green

	238

	239 pblendvb %xmm0, %xmm3, %xmm2 // Combine results

	240 paddb %xmm2, %xmm1 // Add source and destination pixels together

	241

	242 .LAlignStore:

	243 cmp $8, %edi // Check how many pixels should be written

	244 jb .LAlignPixelsLeft1

	245 ja .LAlignPixelsLeft3

	246 pblendw $0x0F, %xmm1, %xmm5 // Blend two pixels

	247 jmp .LAlignStorePixels

	248

	249 .LAlignPixelsLeft1:

	250 pblendw $0x03, %xmm1, %xmm5 // Blend one pixel

	251 jmp .LAlignStorePixels

	252

	253 .LAlignPixelsLeft3:

	254 pblendw $0x3F, %xmm1, %xmm5 // Blend three pixels

	255

	256 .LAlignStorePixels:

	257 movdqu %xmm5, (%edx) // Store destination pixels

	258

	259 .LAlignDone:

	260 addl %edi, %eax // Adjust pointers and pixel count

	261 addl %edi, %edx

	262 shrl $2, %edi

	263 lddqu (%eax), %xmm1 // Pre-load new source pixels (after alignment)

	264 subl %edi, %ecx

	265

	266 .LAligned: // Destination is guaranteed to be 16-byte aligned

	267 xorl %edi, %edi // Reset offset to zero

	268 subl $8, %ecx // Decrease counter (Reserve four pixels for the cleanup)

	269 testl $0xF, %eax // Check alignment of source pointer

	270 jz .LAlignedLoop

	271

	272 // Source not aligned to destination

	273 // *********************************

	274 .p2align 4

	275 .LUnalignedLoop: // Main loop for unaligned, handles eight pixels per iteration

	276 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque

	277 ja .LAlphaNotOpaqueOrZero00

	278 lddqu 16(%eax, %edi), %xmm2 // Pre-load four source pixels

	279 jz .LAlphaZero00

	280 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

	281

	282 .LAlphaZero00:

	283 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque

	284 ja .LAlphaNotOpaqueOrZero01

	285 lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels

	286 jz .LAlphaZero01

	287 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels

	288

	289 .LAlphaZero01:

	290 addl $32, %edi // Adjust offset and pixel count

	291 subl $8, %ecx

	292 jae .LUnalignedLoop

	293 addl $8, %ecx // Adjust pixel count

	294 jmp .LLoopCleanup0

	295

	296 .p2align 4

	297 .LAlphaNotOpaqueOrZero00:

	298 movdqa (%edx, %edi), %xmm5 // Load four destination pixels

	299 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	300 psrlw $8, %xmm2 // Discard red and blue

	301 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	302 movdqa %xmm6, %xmm4

	303 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	304 movdqa %xmm5, %xmm3

	305 psubw %xmm2, %xmm4 // Finalize alpha calculations

	306

	307 psllw $8, %xmm5 // Filter out red and blue components

	308 pmulhuw %xmm4, %xmm5 // Scale red and blue

	309 psrlw $8, %xmm3 // Filter out alpha and green components

	310 pmullw %xmm4, %xmm3 // Scale alpha and green

	311

	312 lddqu 16(%eax, %edi), %xmm2 // Pre-load four source pixels

	313 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

	314 paddb %xmm3, %xmm1 // Add source and destination pixels together

	315 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

	316

	317 // Handle next four pixels

	318 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque

	319 ja .LAlphaNotOpaqueOrZero01

	320 lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels

	321 jz .LAlphaZero02

	322 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels

	323 .LAlphaZero02:

	324 addl $32, %edi // Adjust offset and pixel count

	325 subl $8, %ecx

	326 jae .LUnalignedLoop

	327 addl $8, %ecx // Adjust pixel count

	328 jmp .LLoopCleanup0

	329

	330 .p2align 4

	331 .LAlphaNotOpaqueOrZero01:

	332 movdqa 16(%edx, %edi), %xmm5 // Load four destination pixels

	333

	334 movdqa %xmm2, %xmm1 // Clone source pixels to extract alpha

	335 psrlw $8, %xmm1 // Discard red and blue

	336 pshufhw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (high)

	337 movdqa %xmm6, %xmm4

	338 pshuflw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (low)

	339 movdqa %xmm5, %xmm3

	340 psubw %xmm1, %xmm4 // Finalize alpha calculations

	341

	342 psllw $8, %xmm5 // Filter out red and blue components

	343 pmulhuw %xmm4, %xmm5 // Scale red and blue

	344 psrlw $8, %xmm3 // Filter out alpha and green components

	345 pmullw %xmm4, %xmm3 // Scale alpha and green

	346

	347 lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels

	348 addl $32, %edi

	349 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

	350 paddb %xmm3, %xmm2 // Add source and destination pixels together

	351 subl $8, %ecx

	352 movdqa %xmm2, -16(%edx, %edi) // Store four destination pixels

	353 jae .LUnalignedLoop

	354 addl $8, %ecx // Adjust pixel count

	355

	356 // Cleanup - handle pending pixels from loop

	357 .LLoopCleanup0:

	358 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque

	359 ja .LAlphaNotOpaqueOrZero02

	360 jz .LAlphaZero03

	361 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

	362 .LAlphaZero03:

	363 addl $16, %edi

	364 subl $4, %ecx

	365 js .LSmallRemaining // Reuse code from small loop

	366 lddqu (%eax, %edi), %xmm1 // Pre-load four source pixels

	367 jmp .LLoopCleanup0

	368

	369 .LAlphaNotOpaqueOrZero02:

	370 movdqa (%edx, %edi), %xmm5 // Load four destination pixels

	371 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	372 psrlw $8, %xmm2 // Discard red and blue

	373 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	374 movdqa %xmm6, %xmm4

	375 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	376 movdqa %xmm5, %xmm3

	377 psubw %xmm2, %xmm4 // Finalize alpha calculations

	378

	379 psllw $8, %xmm5 // Filter out red and blue components

	380 pmulhuw %xmm4, %xmm5 // Scale red and blue

	381 psrlw $8, %xmm3 // Filter out alpha and green components

	382 pmullw %xmm4, %xmm3 // Scale alpha and green

	383

	384 addl $16, %edi

	385 subl $4, %ecx

	386 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

	387 paddb %xmm3, %xmm1 // Add source and destination pixels together

	388 movdqa %xmm1, -16(%edx, %edi) // Store four destination pixels

	389 js .LSmallRemaining // Reuse code from small loop

	390 lddqu (%eax, %edi), %xmm1 // Pre-load four source pixels

	391 jmp .LLoopCleanup0

	392

	393 // Source aligned to destination

	394 // *****************************

	395 .p2align 4

	396 .LAlignedLoop: // Main loop for aligned, handles eight pixels per iteration

	397 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque

	398 ja .LAlphaNotOpaqueOrZero10

	399 movdqa 16(%eax, %edi), %xmm2 // Pre-load four source pixels

	400 jz .LAlphaZero10

	401 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

	402

	403 .LAlphaZero10:

	404 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque

	405 ja .LAlphaNotOpaqueOrZero11

	406 movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels

	407 jz .LAlphaZero11

	408 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels

	409

	410 .LAlphaZero11:

	411 addl $32, %edi // Adjust offset and pixel count

	412 subl $8, %ecx

	413 jae .LAlignedLoop

	414 jmp .LLoopCleanup1

	415

	416 .p2align 4

	417 .LAlphaNotOpaqueOrZero10:

	418 movdqa (%edx, %edi), %xmm5 // Load four destination pixels

	419 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	420 psrlw $8, %xmm2 // Discard red and blue

	421 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	422 movdqa %xmm6, %xmm4

	423 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	424 movdqa %xmm5, %xmm3

	425 psubw %xmm2, %xmm4 // Finalize alpha calculations

	426

	427 psllw $8, %xmm5 // Filter out red and blue components

	428 pmulhuw %xmm4, %xmm5 // Scale red and blue

	429 psrlw $8, %xmm3 // Filter out alpha and green components

	430 pmullw %xmm4, %xmm3 // Scale alpha and green

	431

	432 movdqa 16(%eax, %edi), %xmm2 // Pre-load four source pixels

	433 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

	434 paddb %xmm3, %xmm1 // Add source and destination pixels together

	435 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

	436

	437 // Handle next four pixels

	438 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque

	439 ja .LAlphaNotOpaqueOrZero11

	440 movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels

	441 jz .LAlphaZero12

	442 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels

	443 .LAlphaZero12:

	444 addl $32, %edi // Adjust offset and pixel count

	445 subl $8, %ecx

	446 jae .LAlignedLoop

	447 jmp .LLoopCleanup1

	448

	449 .p2align 4

	450 .LAlphaNotOpaqueOrZero11:

	451 movdqa 16(%edx, %edi), %xmm5 // Load four destination pixels

	452

	453 movdqa %xmm2, %xmm1 // Clone source pixels to extract alpha

	454 psrlw $8, %xmm1 // Discard red and blue

	455 pshufhw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (high)

	456 movdqa %xmm6, %xmm4

	457 pshuflw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (low)

	458 movdqa %xmm5, %xmm3

	459 psubw %xmm1, %xmm4 // Finalize alpha calculations

	460

	461 psllw $8, %xmm5 // Filter out red and blue components

	462 pmulhuw %xmm4, %xmm5 // Scale red and blue

	463 psrlw $8, %xmm3 // Filter out alpha and green components

	464 pmullw %xmm4, %xmm3 // Scale alpha and green

	465 movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels

	466

	467 addl $32, %edi

	468 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

	469 paddb %xmm3, %xmm2 // Add source and destination pixels together

	470 subl $8, %ecx

	471 movdqa %xmm2, -16(%edx, %edi) // Store four destination pixels

	472 jae .LAlignedLoop

	473

	474 // Cleanup - handle four pending pixels from loop

	475 .LLoopCleanup1:

	476 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque

	477 ja .LAlphaNotOpaqueOrZero12

	478 jz .LAlphaZero13

	479 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

	480 .LAlphaZero13:

	481 addl $8, %ecx // Adjust offset and pixel count

	482 jz .LExit

	483 addl $16, %edi

	484 jmp .LRemainLoop1

	485

	486 .LAlphaNotOpaqueOrZero12:

	487 movdqa (%edx, %edi), %xmm5 // Load four destination pixels

	488 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	489 psrlw $8, %xmm2 // Discard red and blue

	490 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	491 movdqa %xmm6, %xmm4

	492 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	493 movdqa %xmm5, %xmm3

	494 psubw %xmm2, %xmm4 // Finalize alpha calculations

	495

	496 psllw $8, %xmm5 // Filter out red and blue components

	497 pmulhuw %xmm4, %xmm5 // Scale red and blue

	498 psrlw $8, %xmm3 // Filter out alpha and green components

	499 pmullw %xmm4, %xmm3 // Scale alpha and green

	500

	501 addl $8, %ecx // Adjust offset and pixel count

	502 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

	503 paddb %xmm3, %xmm1 // Add source and destination pixels together

	504 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

	505 jz .LExit

	506 addl $16, %edi

	507

	508 // Handle last 1-7 pixels

	509 .LRemainLoop1:

	510 movdqa (%eax, %edi), %xmm1 // Load four source pixels

	511 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque

	512 ja .LRemainAlphaNotOpaqueOrZero1

	513 jz .LRemainAlphaZero1

	514

	515 // All alphas were opaque (copy)

	516 subl $4, %ecx // Check if we have more than four pixels left

	517 jle .LRemainStore

	518 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

	519 addl $16, %edi

	520 jmp .LRemainLoop1

	521

	522 // All alphas were zero (skip)

	523 .p2align 4

	524 .LRemainAlphaZero1:

	525 subl $4, %ecx // Check if we have more than four pixels left

	526 jle .LExit

	527 addl $16, %edi

	528 jmp .LRemainLoop1

	529

	530 // Handle mixed alphas (calculate and scale)

	531 .p2align 4

	532 .LRemainAlphaNotOpaqueOrZero1:

	533 movdqa (%edx, %edi), %xmm5 // Load four destination pixels

	534

	535 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha

	536 psrlw $8, %xmm2 // Discard red and blue

	537 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)

	538 movdqa %xmm6, %xmm4

	539 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)

	540 movdqa %xmm5, %xmm3

	541 psubw %xmm2, %xmm4 // Finalize alpha calculations

	542

	543 psllw $8, %xmm5 // Filter out red and blue components

	544 pmulhuw %xmm4, %xmm5 // Scale red and blue

	545 psrlw $8, %xmm3 // Filter out alpha and green components

	546 pmullw %xmm4, %xmm3 // Scale alpha and green

	547

	548 subl $4, %ecx

	549 pblendvb %xmm0, %xmm5, %xmm3 // Combine results

	550 paddb %xmm3, %xmm1 // Add source and destination pixels together

	551 jle .LRemainStore

	552 movdqa %xmm1, (%edx, %edi) // Store four destination pixels

	553 addl $16, %edi

	554 jmp .LRemainLoop1

	555

	556 // Store the last 1-4 pixels

	557 .p2align 4

	558 .LRemainStore:

	559 jz .LRemainFull

	560 movdqa (%edx, %edi), %xmm5 // Load four destination pixels

	561 cmp $-2, %ecx // Check how many pixels should be written

	562 jb .LRemainPixelsLeft11

	563 ja .LRemainPixelsLeft13

	564 pblendw $0x0F, %xmm1, %xmm5

	565 movdqa %xmm5, (%edx, %edi) // Store last 2 destination pixels

	566 .LExit:

	567 POP(%edi) // Exit

	568 ret

	569

	570 .LRemainPixelsLeft11:

	571 pblendw $0x03, %xmm1, %xmm5

	572 movdqa %xmm5, (%edx, %edi) // Store last destination pixel

	573 POP(%edi) // Exit

	574 ret

	575

	576 .LRemainPixelsLeft13:

	577 pblendw $0x3F, %xmm1, %xmm5

	578 movdqa %xmm5, (%edx, %edi) // Store last 3 destination pixels

	579 POP(%edi) // Exit

	580 ret

	581

	582 .LRemainFull:

	583 movdqa %xmm1, (%edx, %edi) // Store last 4 destination pixels

	584 POP(%edi) // Exit

	585 ret

	586

	587 .cfi_endproc

	588 .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm

	589 #endif

OLD	NEW

« src/opts/SkBlitRow_opts_SSE4.h ('K') | « src/opts/SkBlitRow_opts_SSE4.h ('k') | src/opts/SkBlitRow_opts_SSE4_x64_asm.S » ('j') | src/opts/SkBlitRow_opts_SSE4_x64_asm.S » ('J')