| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * Copyright 2013 The Android Open Source Project | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license that can be | |
| 5 * found in the LICENSE file. | |
| 6 */ | |
| 7 | |
| 8 #if !defined(_MSC_VER) | |
| 9 | |
/*
 * Stack helpers that keep the call-frame information (CFI) in step with every
 * pushl/popl of a 4-byte register.
 *
 *   CFI_PUSH(REG)  - record that the CFA moved by 4 and REG is saved at the
 *                    new top of stack.
 *   CFI_POP(REG)   - record that the CFA moved back by 4 and REG is restored.
 *   PUSH(REG)      - pushl REG plus CFI_PUSH.
 *   POP(REG)       - popl REG plus CFI_POP.
 *   RETURN         - pop the saved %edi and return.
 *
 * CFI directives describe every instruction assembled after them, not only
 * the executed path. RETURN is used at exits in the middle of the function,
 * so it saves the CFI state before the pop and reinstates it after the ret.
 * Without that, all code following an exit would be described as if %edi had
 * already been popped. A path that runs without the matching PUSH(%edi), and
 * therefore leaves with a bare ret, is not covered by this and still needs
 * its own CFI adjustment.
 */
| 10 #define CFI_PUSH(REG) \ | |
| 11 .cfi_adjust_cfa_offset 4; \ | |
| 12 .cfi_rel_offset REG, 0 | |
| 13 | |
| 14 #define CFI_POP(REG) \ | |
| 15 .cfi_adjust_cfa_offset -4; \ | |
| 16 .cfi_restore REG | |
| 17 | |
| 18 #define PUSH(REG) pushl REG; CFI_PUSH (REG) | |
| 19 #define POP(REG) popl REG; CFI_POP (REG) | |
| 20 #define RETURN .cfi_remember_state; POP(%edi); ret; .cfi_restore_state | |
| 21 | |
/*
 * EXTRACT_ALPHA(var1, var2)
 *   var1, var2 are XMM register names written without the leading '%'.
 *   In : var1  = four source pixels (left unmodified)
 *        %xmm5 = four destination pixels
 *        %xmm6 = constant the alpha is subtracted from, per 16-bit word
 *   Out: var2  = each pixel's upper 16-bit word, shifted right by 8 and
 *                copied into both words of that pixel
 *        %xmm4 = %xmm6 - var2, per 16-bit word
 *        %xmm3 = copy of %xmm5
 *   Clobbers var2, %xmm3 and %xmm4.
 */
| 22 #define EXTRACT_ALPHA(var1, var2) \ | |
| 23 movdqa %var1, %var2; /* Clone source pixels to extract alpha
*/\ | |
| 24 psrlw $8, %var2; /* Discard red and blue, leaving alpha a
nd green */\ | |
| 25 pshufhw $0xF5, %var2, %var2; /* Repeat alpha for scaling (high) */\ | |
| 26 movdqa %xmm6, %xmm4; \ | |
| 27 pshuflw $0xF5, %var2, %var2; /* Repeat alpha for scaling (low) */\ | |
| 28 movdqa %xmm5, %xmm3; \ | |
| 29 psubw %var2, %xmm4 /* Finalize alpha calculations */ | |
| 30 | |
/*
 * SCALE_PIXELS
 *   In : %xmm5 and %xmm3 both hold the same four destination pixels,
 *        %xmm4 holds the per-word multiplier.
 *   Out: %xmm5 = high 16 bits of ((word << 8) * %xmm4), i.e. the scaled low
 *                byte of every word
 *        %xmm3 = low 16 bits of ((word >> 8) * %xmm4), i.e. the scaled high
 *                byte of every word, with the result in the upper byte
 *   The two halves are merged afterwards by the caller (pblendvb).
 *   Clobbers %xmm3 and %xmm5; %xmm4 is only read.
 */
| 31 #define SCALE_PIXELS \ | |
| 32 psllw $8, %xmm5; /* Filter out red and blue components */
\ | |
| 33 pmulhuw %xmm4, %xmm5; /* Scale red and blue */\ | |
| 34 psrlw $8, %xmm3; /* Filter out alpha and green components
*/\ | |
| 35 pmullw %xmm4, %xmm3 /* Scale alpha and green */ | |
| 36 | |
| 37 | |
| 38 /* | |
| 39 * void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, | |
| 40 * const SkPMColor* SK_RESTRICT src, | |
| 41 * int count, U8CPU alpha) | |
| 42 * | |
| 43 * This function is divided into six blocks: initialization, blit 4-15 pixels, | |
| 44 * blit 0-3 pixels, align destination for 16+ pixel blits, | |
| 45 * blit 16+ pixels with source unaligned, blit 16+ pixels with source aligned. | |
| 46 * There is some code reuse between the blocks. | |
| 47 * | |
| 48 * The primary optimization comes from checking the source pixels' alpha value. | |
| 49 * If the alpha is zero, the pixel can be skipped entirely. | |
| 50 * If the alpha is fully opaque, the pixel can be copied directly to the destina
tion. | |
| 51 * According to collected statistics, these two cases are the most common. | |
| 52 * The main loop(s) use pre-loading and unrolling in an attempt to reduce the | |
| 53 * worst-case memory latency. | |
| 54 */ | |
| 55 | |
/*
 * Entry point, constant setup and dispatch.
 *
 * Arguments are read from the stack:
 *    4(%esp) -> %edx  destination pointer
 *    8(%esp) -> %eax  source pointer
 *   12(%esp) -> %ecx  pixel count
 *
 * Constants that stay live for the whole function:
 *   %xmm7 = 0xFF000000 in every dword  (alpha mask for ptest)
 *   %xmm6 = 256 in every 16-bit word   (minuend for the inverse alpha)
 *   %xmm0 = 0x00FF in every 16-bit word (byte selector for pblendvb)
 *
 * After "subl $4, %ecx" the counter holds (count - 4):
 *   negative       -> .LReallySmall, taken before %edi is pushed
 *   0..11          -> falls through to the small-blit code
 *   above 11       -> .LBigBlit
 */
| 56 #ifdef __clang__ | |
| 57 .text | |
| 58 .global _S32A_Opaque_BlitRow32_SSE4_asm | |
| 59 #else | |
| 60 .section .text.sse4.2,"ax",@progbits | |
| 61 .type S32A_Opaque_BlitRow32_SSE4_asm, @function | |
| 62 .global S32A_Opaque_BlitRow32_SSE4_asm | |
| 63 #endif | |
| 64 | |
| 65 .p2align 4 | |
/* Both spellings of the symbol are defined, with and without the leading underscore. */
| 66 _S32A_Opaque_BlitRow32_SSE4_asm: | |
| 67 S32A_Opaque_BlitRow32_SSE4_asm: | |
| 68 .cfi_startproc | |
| 69 movl 8(%esp), %eax // Source pointer | |
| 70 movl 12(%esp), %ecx // Pixel count | |
| 71 movl 4(%esp), %edx // Destination pointer | |
| 72 prefetcht0 (%eax) | |
| 73 | |
| 74 // Setup SSE constants | |
| 75 pcmpeqd %xmm7, %xmm7 // 0xFF000000 mask to check alpha | |
| 76 pslld $24, %xmm7 | |
| 77 pcmpeqw %xmm6, %xmm6 // 16-bit 256 to calculate inv. alpha | |
| 78 psrlw $15, %xmm6 | |
| 79 psllw $8, %xmm6 | |
| 80 pcmpeqw %xmm0, %xmm0 // 0x00FF00FF mask (Must be in xmm0 beca
use of pblendvb) | |
| 81 psrlw $8, %xmm0 | |
| 82 subl $4, %ecx // Check if we have only 0-3 pixels | |
| 83 js .LReallySmall | |
/* From here on %edi is saved on the stack; exits must use RETURN. */
| 84 PUSH(%edi) | |
| 85 cmpl $11, %ecx // Do we have enough pixels to run the m
ain loop? | |
| 86 ja .LBigBlit | |
| 87 | |
/*
 * Small blits.
 *
 * Register roles in this section:
 *   %eax = source base, %edx = destination base
 *   %edi = byte offset applied to both buffers
 *   %ecx = pixels left minus 4
 *
 * "ptest %xmm7, %xmm1" tests only the alpha bytes of the four source pixels:
 *   ZF set         -> every alpha is zero
 *   CF set         -> every alpha is 0xFF
 *   neither ("ja") -> mixed, the pixels have to be blended
 * All loads and stores here use unaligned forms (lddqu / movdqu).
 */
| 88 // Handle small blits (4-15 pixels) | |
| 89 ////////////////////////////////////////////////////////////////////////////
//// | |
| 90 xorl %edi, %edi // Reset offset to zero | |
| 91 | |
| 92 .LSmallLoop: | |
| 93 lddqu (%eax, %edi), %xmm1 // Load four source pixels | |
| 94 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaqu
e | |
| 95 ja .LSmallAlphaNotOpaqueOrZero | |
| 96 jz .LSmallAlphaZero // If all alphas are zero, skip the pixe
ls completely | |
| 97 movdqu %xmm1, (%edx, %edi) // Store four destination pixels | |
| 98 .LSmallAlphaZero: | |
| 99 addl $16, %edi | |
| 100 subl $4, %ecx // Check if there are four additional pi
xels, at least | |
| 101 jns .LSmallLoop | |
| 102 jmp .LSmallRemaining | |
| 103 | |
| 104 // Handle mixed alphas (calculate and scale) | |
| 105 .p2align 4 | |
| 106 .LSmallAlphaNotOpaqueOrZero: | |
| 107 lddqu (%edx, %edi), %xmm5 // Load four destination pixels | |
| 108 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value | |
| 109 SCALE_PIXELS // Scale pixels using alpha | |
| 110 | |
/* The flags set by the subl below are consumed by the jns four instructions later; the SSE instructions in between leave them untouched. */
| 111 addl $16, %edi | |
| 112 subl $4, %ecx // Check if we can store all four pixels | |
| 113 pblendvb %xmm0, %xmm5, %xmm3 | |
| 114 paddb %xmm3, %xmm1 // Add source and destination pixels tog
ether | |
| 115 movdqu %xmm1, -16(%edx, %edi) // Store four destination pixels | |
| 116 jns .LSmallLoop | |
| 117 | |
| 118 // Handle the last 0-3 pixels (also used by the big unaligned loop) | |
/*
 * Entered with %ecx in -4..-1 (pixels left minus 4); -4 means nothing is left.
 * After the shift %ecx is a negative byte count that moves %edi backwards, so
 * the 16-byte load and store below overlap pixels that were already handled.
 */
| 119 .LSmallRemaining: | |
| 120 cmpl $-4, %ecx // Check if we are done | |
| 121 je .LSmallExit | |
| 122 sall $2, %ecx // Calculate offset for last pixels | |
| 123 addl %ecx, %edi | |
| 124 | |
| 125 lddqu (%eax, %edi), %xmm1 // Load last four source pixels (overlap
ping) | |
| 126 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaqu
e | |
| 127 jc .LSmallRemainingStoreAll// If all alphas are opaque, just store
(overlapping) | |
| 128 jz .LSmallExit // If all alphas are zero, skip the pixe
ls completely | |
| 129 | |
| 130 // Handle mixed alphas (calculate and scale) | |
| 131 lddqu (%edx, %edi), %xmm5 // Load last four destination pixels (ov
erlapping) | |
| 132 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value | |
| 133 | |
/* Unlike SCALE_PIXELS this variant scales into %xmm3/%xmm2 and keeps the original destination in %xmm5 for the partial merge below. */
| 134 psllw $8, %xmm3 // Filter out red and blue components | |
| 135 pmulhuw %xmm4, %xmm3 // Scale red and blue | |
| 136 movdqa %xmm5, %xmm2 | |
| 137 psrlw $8, %xmm2 // Filter out alpha and green components | |
| 138 pmullw %xmm4, %xmm2 // Scale alpha and green | |
| 139 | |
/* %ecx is -12, -8 or -4 here (1, 2 or 3 pixels left); the unsigned jb/ja below select the 1- and 3-pixel cases. */
| 140 cmpl $-8, %ecx // Check how many pixels should be writt
en | |
| 141 pblendvb %xmm0, %xmm3, %xmm2 // Combine results | |
| 142 paddb %xmm2, %xmm1 // Add source and destination pixels tog
ether | |
| 143 jb .LSmallPixelsLeft1 | |
| 144 ja .LSmallPixelsLeft3 // To avoid double-blending the overlapp
ing pixels... | |
| 145 pblendw $0xF0, %xmm1, %xmm5 // Merge only the final two pixels to th
e destination | |
| 146 movdqu %xmm5, (%edx, %edi) // Store last two destination pixels | |
| 147 .LSmallExit: | |
| 148 RETURN | |
| 149 | |
| 150 .LSmallPixelsLeft1: | |
| 151 pblendw $0xC0, %xmm1, %xmm5 // Merge only the final pixel to the des
tination | |
| 152 movdqu %xmm5, (%edx, %edi) // Store last destination pixel | |
| 153 RETURN | |
| 154 | |
| 155 .LSmallPixelsLeft3: | |
| 156 pblendw $0xFC, %xmm1, %xmm5 // Merge only the final three pixels to
the destination | |
| 157 movdqu %xmm5, (%edx, %edi) // Store last three destination pixels | |
| 158 RETURN | |
| 159 | |
| 160 .LSmallRemainingStoreAll: | |
| 161 movdqu %xmm1, (%edx, %edi) // Store last destination pixels (overwr
ite) | |
| 162 RETURN | |
| 163 | |
/*
 * Really small blits.
 *
 * Reached with %ecx = count - 4 (negative). The addl restores the count and
 * a count of zero or less exits immediately. %edi has not been pushed on
 * this path, so it leaves with a plain ret instead of RETURN.
 *
 * Pixels are moved one dword at a time with pinsrd/pextrd, so no memory
 * outside the 1-3 pixels is touched. %xmm1 starts as all ones, so lanes that
 * are never loaded carry alpha 0xFF in the ptest below.
 * The cmp result is reused by the jb/je pairs; pinsrd/pextrd do not modify
 * the flags.
 */
| 164 // Handle really small blits (0-3 pixels) | |
| 165 ////////////////////////////////////////////////////////////////////////////
//// | |
| 166 .LReallySmall: | |
| 167 addl $4, %ecx | |
| 168 jle .LReallySmallExit | |
| 169 pcmpeqd %xmm1, %xmm1 | |
| 170 cmp $2, %ecx // Check how many pixels should be read | |
| 171 pinsrd $0x0, (%eax), %xmm1 // Load one source pixel | |
| 172 pinsrd $0x0, (%edx), %xmm5 // Load one destination pixel | |
| 173 jb .LReallySmallCalc | |
| 174 pinsrd $0x1, 4(%eax), %xmm1 // Load second source pixel | |
| 175 pinsrd $0x1, 4(%edx), %xmm5 // Load second destination pixel | |
| 176 je .LReallySmallCalc | |
| 177 pinsrd $0x2, 8(%eax), %xmm1 // Load third source pixel | |
| 178 pinsrd $0x2, 8(%edx), %xmm5 // Load third destination pixel | |
| 179 | |
| 180 .LReallySmallCalc: | |
| 181 ptest %xmm7, %xmm1 // Check if all alphas are opaque | |
| 182 jc .LReallySmallStore // If all alphas are opaque, just store | |
| 183 | |
| 184 // Handle mixed alphas (calculate and scale) | |
| 185 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value | |
| 186 | |
/* This path masks with pand and uses pmullw followed by a right shift, rather than the psllw/pmulhuw pair in SCALE_PIXELS. */
| 187 pand %xmm0, %xmm5 // Filter out red and blue components | |
| 188 pmullw %xmm4, %xmm5 // Scale red and blue | |
| 189 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 190 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 191 | |
| 192 psrlw $8, %xmm5 // Combine results | |
| 193 pblendvb %xmm0, %xmm5, %xmm3 | |
| 194 paddb %xmm3, %xmm1 // Add source and destination pixels tog
ether | |
| 195 | |
| 196 .LReallySmallStore: | |
| 197 cmp $2, %ecx // Check how many pixels should be writt
en | |
| 198 pextrd $0x0, %xmm1, (%edx) // Store one destination pixel | |
| 199 jb .LReallySmallExit | |
| 200 pextrd $0x1, %xmm1, 4(%edx) // Store second destination pixel | |
| 201 je .LReallySmallExit | |
| 202 pextrd $0x2, %xmm1, 8(%edx) // Store third destination pixel | |
| 203 .LReallySmallExit: | |
| 204 ret | |
| 205 | |
/*
 * Big blits: align the destination, then pick a main loop.
 *
 * If the destination is not 16-byte aligned, %edi = (-%edx) & 0xF is the
 * number of bytes up to the next 16-byte boundary. Four pixels are blended,
 * but pblendw merges only the leading pixels that lie before the boundary
 * into the original destination vector (%xmm5) before it is stored.
 * Afterwards both pointers advance by %edi bytes and %ecx drops by %edi / 4
 * pixels.
 *
 * At .LAligned %edi is reset to a byte offset of zero, %xmm1 holds the four
 * pre-loaded source pixels at that offset, and the source alignment decides
 * between .LAlignedLoop and the unaligned loop that follows.
 */
| 206 // Handle bigger blit operations (16+ pixels) | |
| 207 ////////////////////////////////////////////////////////////////////////////
//// | |
| 208 .p2align 4 | |
| 209 .LBigBlit: | |
| 210 // Align destination? | |
/* lddqu does not modify the flags, so the jz below still tests the result of testl. */
| 211 testl $0xF, %edx | |
| 212 lddqu (%eax), %xmm1 // Pre-load four source pixels | |
| 213 jz .LAligned | |
| 214 | |
| 215 movl %edx, %edi // Calculate alignment of destination po
inter | |
| 216 negl %edi | |
| 217 andl $0xF, %edi | |
| 218 | |
| 219 // Handle 1-3 pixels to align destination | |
| 220 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaqu
e | |
| 221 jz .LAlignDone // If all alphas are zero, just skip | |
| 222 lddqu (%edx), %xmm5 // Load four destination pixels | |
| 223 jc .LAlignStore // If all alphas are opaque, just store | |
| 224 | |
| 225 // Handle mixed alphas (calculate and scale) | |
| 226 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value | |
| 227 | |
| 228 psllw $8, %xmm3 // Filter out red and blue components | |
| 229 pmulhuw %xmm4, %xmm3 // Scale red and blue | |
| 230 movdqa %xmm5, %xmm2 | |
| 231 psrlw $8, %xmm2 // Filter out alpha and green components | |
| 232 pmullw %xmm4, %xmm2 // Scale alpha and green | |
| 233 | |
| 234 pblendvb %xmm0, %xmm3, %xmm2 // Combine results | |
| 235 paddb %xmm2, %xmm1 // Add source and destination pixels tog
ether | |
| 236 | |
/* Assumes the destination is 4-byte aligned, so %edi is 4, 8 or 12 here (1, 2 or 3 pixels) - TODO confirm with callers. */
| 237 .LAlignStore: | |
| 238 cmp $8, %edi // Check how many pixels should be writt
en | |
| 239 jb .LAlignPixelsLeft1 | |
| 240 ja .LAlignPixelsLeft3 | |
| 241 pblendw $0x0F, %xmm1, %xmm5 // Blend two pixels | |
| 242 jmp .LAlignStorePixels | |
| 243 | |
| 244 .LAlignPixelsLeft1: | |
| 245 pblendw $0x03, %xmm1, %xmm5 // Blend one pixel | |
| 246 jmp .LAlignStorePixels | |
| 247 | |
| 248 .LAlignPixelsLeft3: | |
| 249 pblendw $0x3F, %xmm1, %xmm5 // Blend three pixels | |
| 250 | |
| 251 .LAlignStorePixels: | |
| 252 movdqu %xmm5, (%edx) // Store destination pixels | |
| 253 | |
| 254 .LAlignDone: | |
| 255 addl %edi, %eax // Adjust pointers and pixel count | |
| 256 addl %edi, %edx | |
| 257 shrl $2, %edi | |
| 258 lddqu (%eax), %xmm1 // Pre-load new source pixels (after ali
gnment) | |
| 259 subl %edi, %ecx | |
| 260 | |
| 261 .LAligned: // Destination is guaranteed to be 16 by
te aligned | |
| 262 xorl %edi, %edi // Reset offset to zero | |
| 263 subl $8, %ecx // Decrease counter (Reserve four pixels
for the cleanup) | |
| 264 testl $0xF, %eax // Check alignment of source pointer | |
| 265 jz .LAlignedLoop | |
| 266 | |
/*
 * Main loop for a 16-byte aligned destination and an unaligned source.
 *
 * On every entry to .LUnalignedLoop:
 *   %xmm1 = four source pixels already loaded from offset %edi
 *   %edi  = byte offset into both buffers
 *   %ecx  = loop counter, decremented by 8 per iteration; the loop repeats
 *           while the subtraction does not borrow ("jae")
 * Source loads use lddqu; destination accesses use movdqa.
 * Each iteration handles eight pixels as two groups of four and pre-loads
 * the next group before the current one is stored. The second group lives
 * in %xmm2, and the pre-load for the next iteration goes back into %xmm1.
 *
 * When the loop ends, 8 is added back to %ecx and .LLoopCleanup0 handles the
 * pending group in %xmm1 plus any further full groups. The last 0-3 pixels
 * are handed to .LSmallRemaining.
 */
| 267 // Source not aligned to destination | |
| 268 ////////////////////////////////////////////////////////////////////////////
//// | |
| 269 .p2align 4 | |
| 270 .LUnalignedLoop: // Main loop for unaligned, handles eigh
t pixels per iteration | |
| 271 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaqu
e | |
| 272 ja .LAlphaNotOpaqueOrZero00 | |
| 273 lddqu 16(%eax, %edi), %xmm2 // Pre-load four source pixels | |
| 274 jz .LAlphaZero00 | |
| 275 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 276 | |
| 277 .LAlphaZero00: | |
| 278 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaqu
e | |
| 279 ja .LAlphaNotOpaqueOrZero01 | |
| 280 lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 281 jz .LAlphaZero01 | |
| 282 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels | |
| 283 | |
| 284 .LAlphaZero01: | |
| 285 addl $32, %edi // Adjust offset and pixel count | |
| 286 subl $8, %ecx | |
| 287 jae .LUnalignedLoop | |
| 288 addl $8, %ecx // Adjust pixel count | |
| 289 jmp .LLoopCleanup0 | |
| 290 | |
| 291 .p2align 4 | |
| 292 .LAlphaNotOpaqueOrZero00: | |
| 293 movdqa (%edx, %edi), %xmm5 // Load four destination pixels | |
| 294 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value | |
| 295 SCALE_PIXELS // Scale pixels using alpha | |
| 296 | |
| 297 lddqu 16(%eax, %edi), %xmm2 // Pre-load four source pixels | |
| 298 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 299 paddb %xmm3, %xmm1 // Add source and destination pixels tog
ether | |
| 300 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 301 | |
| 302 // Handle next four pixels | |
| 303 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaqu
e | |
| 304 ja .LAlphaNotOpaqueOrZero01 | |
| 305 lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 306 jz .LAlphaZero02 | |
| 307 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels | |
| 308 .LAlphaZero02: | |
| 309 addl $32, %edi // Adjust offset and pixel count | |
| 310 subl $8, %ecx | |
| 311 jae .LUnalignedLoop | |
| 312 addl $8, %ecx // Adjust pixel count | |
| 313 jmp .LLoopCleanup0 | |
| 314 | |
/* Second group of the iteration: the source pixels are in %xmm2 and %xmm1 is used as the alpha scratch register before being reloaded. */
| 315 .p2align 4 | |
| 316 .LAlphaNotOpaqueOrZero01: | |
| 317 movdqa 16(%edx, %edi), %xmm5 // Load four destination pixels | |
| 318 EXTRACT_ALPHA(xmm2, xmm1) // Extract and clone alpha value | |
| 319 SCALE_PIXELS // Scale pixels using alpha | |
| 320 | |
| 321 lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 322 addl $32, %edi | |
| 323 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 324 paddb %xmm3, %xmm2 // Add source and destination pixels tog
ether | |
| 325 subl $8, %ecx | |
| 326 movdqa %xmm2, -16(%edx, %edi) // Store four destination pixels | |
| 327 jae .LUnalignedLoop | |
| 328 addl $8, %ecx // Adjust pixel count | |
| 329 | |
| 330 // Cleanup - handle pending pixels from loop | |
| 331 .LLoopCleanup0: | |
| 332 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaqu
e | |
| 333 ja .LAlphaNotOpaqueOrZero02 | |
| 334 jz .LAlphaZero03 | |
| 335 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 336 .LAlphaZero03: | |
| 337 addl $16, %edi | |
| 338 subl $4, %ecx | |
| 339 js .LSmallRemaining // Reuse code from small loop | |
| 340 lddqu (%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 341 jmp .LLoopCleanup0 | |
| 342 | |
| 343 .LAlphaNotOpaqueOrZero02: | |
| 344 movdqa (%edx, %edi), %xmm5 // Load four destination pixels | |
| 345 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value | |
| 346 SCALE_PIXELS // Scale pixels using alpha | |
| 347 | |
| 348 addl $16, %edi | |
| 349 subl $4, %ecx | |
| 350 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 351 paddb %xmm3, %xmm1 // Add source and destination pixels tog
ether | |
| 352 movdqa %xmm1, -16(%edx, %edi) // Store four destination pixels | |
| 353 js .LSmallRemaining // Reuse code from small loop | |
| 354 lddqu (%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 355 jmp .LLoopCleanup0 | |
| 356 | |
/*
 * Main loop for the case where source and destination are both 16-byte
 * aligned.
 *
 * It has the same structure as the unaligned loop: %xmm1 holds the
 * pre-loaded group at offset %edi, eight pixels are handled per iteration,
 * and %ecx is decremented by 8 with "jae" as the loop condition. Here every
 * source load is a movdqa.
 *
 * After the loop, .LLoopCleanup1 handles the pending group in %xmm1 and adds
 * 8 back to %ecx; zero means done. .LRemainLoop1 then handles the remaining
 * pixels in whole 16-byte vectors. When fewer than four pixels are left,
 * .LRemainStore merges only the leading 1-3 pixels into the destination with
 * pblendw.
 * NOTE(review): in that last step the 16-byte movdqa loads cover up to three
 * pixels beyond the requested count (within the same aligned 16 bytes) -
 * confirm this is acceptable for callers and memory-checking tools.
 */
| 357 // Source aligned to destination | |
| 358 ////////////////////////////////////////////////////////////////////////////
//// | |
| 359 .p2align 4 | |
| 360 .LAlignedLoop: // Main loop for aligned, handles eight
pixels per iteration | |
| 361 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaqu
e | |
| 362 ja .LAlphaNotOpaqueOrZero10 | |
| 363 movdqa 16(%eax, %edi), %xmm2 // Pre-load four source pixels | |
| 364 jz .LAlphaZero10 | |
| 365 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 366 | |
| 367 .LAlphaZero10: | |
| 368 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaqu
e | |
| 369 ja .LAlphaNotOpaqueOrZero11 | |
| 370 movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 371 jz .LAlphaZero11 | |
| 372 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels | |
| 373 | |
| 374 .LAlphaZero11: | |
| 375 addl $32, %edi // Adjust offset and pixel count | |
| 376 subl $8, %ecx | |
| 377 jae .LAlignedLoop | |
| 378 jmp .LLoopCleanup1 | |
| 379 | |
| 380 .p2align 4 | |
| 381 .LAlphaNotOpaqueOrZero10: | |
| 382 movdqa (%edx, %edi), %xmm5 // Load four destination pixels | |
| 383 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value | |
| 384 SCALE_PIXELS // Scale pixels using alpha | |
| 385 | |
| 386 movdqa 16(%eax, %edi), %xmm2 // Pre-load four source pixels | |
| 387 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 388 paddb %xmm3, %xmm1 // Add source and destination pixels tog
ether | |
| 389 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 390 | |
| 391 // Handle next four pixels | |
| 392 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaqu
e | |
| 393 ja .LAlphaNotOpaqueOrZero11 | |
| 394 movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 395 jz .LAlphaZero12 | |
| 396 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels | |
| 397 .LAlphaZero12: | |
| 398 addl $32, %edi // Adjust offset and pixel count | |
| 399 subl $8, %ecx | |
| 400 jae .LAlignedLoop | |
| 401 jmp .LLoopCleanup1 | |
| 402 | |
| 403 .p2align 4 | |
| 404 .LAlphaNotOpaqueOrZero11: | |
| 405 movdqa 16(%edx, %edi), %xmm5 // Load four destination pixels | |
| 406 EXTRACT_ALPHA(xmm2, xmm1) // Extract and clone alpha value | |
| 407 SCALE_PIXELS // Scale pixels using alpha | |
| 408 movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 409 | |
| 410 addl $32, %edi | |
| 411 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 412 paddb %xmm3, %xmm2 // Add source and destination pixels tog
ether | |
| 413 subl $8, %ecx | |
| 414 movdqa %xmm2, -16(%edx, %edi) // Store four destination pixels | |
| 415 jae .LAlignedLoop | |
| 416 | |
| 417 // Cleanup - handle four pending pixels from loop | |
| 418 .LLoopCleanup1: | |
| 419 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaqu
e | |
| 420 ja .LAlphaNotOpaqueOrZero12 | |
| 421 jz .LAlphaZero13 | |
| 422 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 423 .LAlphaZero13: | |
| 424 addl $8, %ecx // Adjust offset and pixel count | |
| 425 jz .LExit | |
| 426 addl $16, %edi | |
| 427 jmp .LRemainLoop1 | |
| 428 | |
| 429 .LAlphaNotOpaqueOrZero12: | |
| 430 movdqa (%edx, %edi), %xmm5 // Load four destination pixels | |
| 431 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value | |
| 432 SCALE_PIXELS // Scale pixels using alpha | |
| 433 | |
/* The jz three instructions below tests the result of this addl; the SSE instructions and the store in between do not change the flags. */
| 434 addl $8, %ecx // Adjust offset and pixel count | |
| 435 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 436 paddb %xmm3, %xmm1 // Add source and destination pixels tog
ether | |
| 437 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 438 jz .LExit | |
| 439 addl $16, %edi | |
| 440 | |
| 441 // Handle last 1-7 pixels | |
| 442 .LRemainLoop1: | |
| 443 movdqa (%eax, %edi), %xmm1 // Load four source pixels | |
| 444 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaqu
e | |
| 445 ja .LRemainAlphaNotOpaqueOrZero1 | |
| 446 jz .LRemainAlphaZero1 | |
| 447 | |
| 448 // All alphas were opaque (copy) | |
| 449 subl $4, %ecx // Check if we have more than four pixel
s left | |
| 450 jle .LRemainStore | |
| 451 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 452 addl $16, %edi | |
| 453 jmp .LRemainLoop1 | |
| 454 | |
| 455 // All alphas were zero (skip) | |
| 456 .p2align 4 | |
| 457 .LRemainAlphaZero1: | |
| 458 subl $4, %ecx // Check if we have more than four pixel
s left | |
| 459 jle .LExit | |
| 460 addl $16, %edi | |
| 461 jmp .LRemainLoop1 | |
| 462 | |
| 463 // Handle mixed alphas (calculate and scale) | |
| 464 .p2align 4 | |
| 465 .LRemainAlphaNotOpaqueOrZero1: | |
| 466 movdqa (%edx, %edi), %xmm5 // Load four destination pixels | |
| 467 EXTRACT_ALPHA(xmm1, xmm2) // Extract and clone alpha value | |
| 468 SCALE_PIXELS // Scale pixels using alpha | |
| 469 | |
| 470 subl $4, %ecx | |
| 471 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 472 paddb %xmm3, %xmm1 // Add source and destination pixels tog
ether | |
| 473 jle .LRemainStore | |
| 474 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 475 addl $16, %edi | |
| 476 jmp .LRemainLoop1 | |
| 477 | |
| 478 // Store the last 1-4 pixels | |
/*
 * Entered with the flags of the preceding "subl $4, %ecx" still valid:
 * zero means exactly four pixels remain, otherwise %ecx is -3, -2 or -1
 * (1, 2 or 3 pixels) and the unsigned jb/ja below select the 1- and 3-pixel
 * cases.
 */
| 479 .p2align 4 | |
| 480 .LRemainStore: | |
| 481 jz .LRemainFull | |
| 482 movdqa (%edx, %edi), %xmm5 // Load four destination pixels | |
| 483 cmp $-2, %ecx // Check how many pixels should be writt
en | |
| 484 jb .LRemainPixelsLeft11 | |
| 485 ja .LRemainPixelsLeft13 | |
| 486 pblendw $0x0F, %xmm1, %xmm5 | |
| 487 movdqa %xmm5, (%edx, %edi) // Store last 2 destination pixels | |
| 488 .LExit: | |
| 489 RETURN | |
| 490 | |
| 491 .LRemainPixelsLeft11: | |
| 492 pblendw $0x03, %xmm1, %xmm5 | |
| 493 movdqa %xmm5, (%edx, %edi) // Store last destination pixel | |
| 494 RETURN | |
| 495 | |
| 496 .LRemainPixelsLeft13: | |
| 497 pblendw $0x3F, %xmm1, %xmm5 | |
| 498 movdqa %xmm5, (%edx, %edi) // Store last 3 destination pixels | |
| 499 RETURN | |
| 500 | |
| 501 .LRemainFull: | |
| 502 movdqa %xmm1, (%edx, %edi) // Store last 4 destination pixels | |
| 503 RETURN | |
| 504 | |
| 505 .cfi_endproc | |
| 506 #ifndef __clang__ | |
| 507 .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm | |
| 508 #endif | |
| 509 #endif | |
| OLD | NEW |