/*
 * Copyright 2013 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#if defined(__clang__) || (defined(__GNUC__) && !defined(SK_BUILD_FOR_MAC))

#define EXTRACT_ALPHA(var1, var2) \
    movdqa      %var1, %var2;           /* Clone source pixels to extract alpha */\
    psrlw       $8, %var2;              /* Discard red and blue, leaving alpha and green */\
    pshufhw     $0xF5, %var2, %var2;    /* Repeat alpha for scaling (high) */\
    movdqa      %xmm6, %xmm4;           \
    pshuflw     $0xF5, %var2, %var2;    /* Repeat alpha for scaling (low) */\
    movdqa      %xmm5, %xmm3;           \
    psubw       %var2, %xmm4            /* Finalize alpha calculations */

#define SCALE_PIXELS \
    psllw       $8, %xmm5;              /* Filter out red and blue components */\
    pmulhuw     %xmm4, %xmm5;           /* Scale red and blue */\
    psrlw       $8, %xmm3;              /* Filter out alpha and green components */\
    pmullw      %xmm4, %xmm3            /* Scale alpha and green */

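/*
 * For reference: together with the pblendvb/paddb pairs used below, the
 * EXTRACT_ALPHA and SCALE_PIXELS macros scale every destination channel by
 * (256 - source alpha) / 256; the following paddb adds the source pixel.
 * A minimal scalar sketch of the scaling half, assuming 32-bit premultiplied
 * pixels with alpha in the top byte (the helper name is illustrative only):
 *
 *     static inline uint32_t scale_by_inv_alpha(uint32_t dst, uint32_t srcA) {
 *         uint32_t inv = 256 - srcA;  // the 16-bit inverse alpha kept in %xmm4
 *         uint32_t rb  = (((dst & 0x00FF00FF) * inv) >> 8) & 0x00FF00FF;  // red/blue
 *         uint32_t ag  = (((dst >> 8) & 0x00FF00FF) * inv) & 0xFF00FF00;  // alpha/green
 *         return rb | ag;
 *     }
 */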

/*
 * void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
 *                                 const SkPMColor* SK_RESTRICT src,
 *                                 int count, U8CPU alpha)
 *
 * This function is divided into six blocks: initialization, blit 4-15 pixels,
 * blit 0-3 pixels, align destination for 16+ pixel blits,
 * blit 16+ pixels with source unaligned, blit 16+ pixels with source aligned.
 * There is some code reuse between the blocks.
 *
 * The primary optimization comes from checking the source pixels' alpha value.
 * If the alpha is zero, the pixel can be skipped entirely.
 * If the alpha is fully opaque, the pixel can be copied directly to the destination.
 * According to collected statistics, these two cases are the most common.
 * The main loops use pre-loading and unrolling in an attempt to reduce the
 * worst-case memory latency.
 */
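
/*
 * For reference, a minimal scalar sketch of the per-pixel decision described
 * above, assuming premultiplied 32-bit pixels with alpha in the top byte
 * (scale_by_inv_alpha is the illustrative helper sketched after SCALE_PIXELS):
 *
 *     for (int i = 0; i < count; i++) {
 *         uint32_t s = src[i];
 *         uint32_t a = s >> 24;
 *         if (a == 0x00) continue;                          // transparent: skip
 *         else if (a == 0xFF) dst[i] = s;                   // opaque: plain copy
 *         else dst[i] = s + scale_by_inv_alpha(dst[i], a);  // mixed: blend
 *     }
 */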

#ifdef __clang__
    .text
#else
    .section .text.sse4.2,"ax",@progbits
    .type S32A_Opaque_BlitRow32_SSE4_asm, @function
#endif
    .global S32A_Opaque_BlitRow32_SSE4_asm
    .global _S32A_Opaque_BlitRow32_SSE4_asm

    .p2align 4
S32A_Opaque_BlitRow32_SSE4_asm:
_S32A_Opaque_BlitRow32_SSE4_asm:
    .cfi_startproc
    prefetcht0  (%rsi)
    movl        %edx, %ecx              // Pixel count
    movq        %rdi, %rdx              // Destination pointer
    movq        %rsi, %rax              // Source pointer

    // Setup SSE constants
    movdqa      .LAlphaCheckMask(%rip), %xmm7   // 0xFF000000 mask to check alpha
    movdqa      .LInverseAlphaCalc(%rip), %xmm6 // 16-bit 256 to calculate inv. alpha
    movdqa      .LResultMergeMask(%rip), %xmm0  // 0x00FF00FF mask (Must be in xmm0 because of pblendvb)

    subl        $4, %ecx                // Check if we have only 0-3 pixels
    js          .LReallySmall
    cmpl        $11, %ecx               // Do we have enough pixels to run the main loop?
    ja          .LBigBlit

    // Handle small blits (4-15 pixels)
    ////////////////////////////////////////////////////////////////////////////
    xorq        %rdi, %rdi              // Reset offset to zero

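    // Note on the alpha checks below: "ptest %xmm7, %xmm1" with the 0xFF000000
    // mask sets ZF when every alpha byte is 0x00 and CF when every alpha byte
    // is 0xFF, so jz means all transparent, jc means all opaque, and ja
    // (CF=0 and ZF=0) means a mix of both. A rough SSE4.1 intrinsics equivalent:
    //     int all_zero   = _mm_testz_si128(pixels, mask);   // ZF
    //     int all_opaque = _mm_testc_si128(pixels, mask);   // CF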
.LSmallLoop:
    lddqu       (%rax, %rdi), %xmm1     // Load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LSmallAlphaNotOpaqueOrZero
    jz          .LSmallAlphaZero
    movdqu      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LSmallAlphaZero:
    addq        $16, %rdi
    subl        $4, %ecx                // Check if there are four additional pixels, at least
    jns         .LSmallLoop
    jmp         .LSmallRemaining

    // Handle mixed alphas (calculate and scale)
    .p2align 4
.LSmallAlphaNotOpaqueOrZero:
    lddqu       (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addq        $16, %rdi
    subl        $4, %ecx                // Check if we can store all four pixels
    pblendvb    %xmm5, %xmm3            // Mask in %xmm0, implicitly
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqu      %xmm1, -16(%rdx, %rdi)  // Store four destination pixels
    jns         .LSmallLoop

    // Handle the last 0-3 pixels (also used by the big unaligned loop)
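    // Note: rather than falling back to scalar code, the block below rewinds
    // %rdi so that one final 16-byte load/store covers exactly the last four
    // pixels of the row, overlapping pixels that were already written. pblendw
    // then merges only the 1-3 pixels that still need updating, so partially
    // transparent pixels are never blended twice (an all-opaque group may be
    // rewritten wholesale, which is harmless since its result equals the source).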
.LSmallRemaining:
    cmpl        $-4, %ecx               // Check if we are done
    je          .LSmallExit
    sall        $2, %ecx                // Calculate offset for last pixels
    movslq      %ecx, %rcx
    addq        %rcx, %rdi

    lddqu       (%rax, %rdi), %xmm1     // Load last four source pixels (overlapping)
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    jc          .LSmallRemainingStoreAll // If all alphas are opaque, just store (overlapping)
    jz          .LSmallExit             // If all alphas are zero, skip the pixels completely

    // Handle mixed alphas (calculate and scale)
    lddqu       (%rdx, %rdi), %xmm5     // Load last four destination pixels (overlapping)
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    psllw       $8, %xmm3               // Filter out red and blue components
    pmulhuw     %xmm4, %xmm3            // Scale red and blue
    movdqa      %xmm5, %xmm2
    psrlw       $8, %xmm2               // Filter out alpha and green components
    pmullw      %xmm4, %xmm2            // Scale alpha and green

    cmpl        $-8, %ecx               // Check how many pixels should be written
    pblendvb    %xmm3, %xmm2            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm2, %xmm1            // Add source and destination pixels together
    jb          .LSmallPixelsLeft1
    ja          .LSmallPixelsLeft3      // To avoid double-blending the overlapping pixels...
    pblendw     $0xF0, %xmm1, %xmm5     // Merge only the final two pixels to the destination
    movdqu      %xmm5, (%rdx, %rdi)     // Store last two destination pixels
.LSmallExit:
    ret

.LSmallPixelsLeft1:
    pblendw     $0xC0, %xmm1, %xmm5     // Merge only the final pixel to the destination
    movdqu      %xmm5, (%rdx, %rdi)     // Store last destination pixel
    ret

.LSmallPixelsLeft3:
    pblendw     $0xFC, %xmm1, %xmm5     // Merge only the final three pixels to the destination
    movdqu      %xmm5, (%rdx, %rdi)     // Store last three destination pixels
    ret

.LSmallRemainingStoreAll:
    movdqu      %xmm1, (%rdx, %rdi)     // Store last destination pixels (overwrite)
    ret

    // Handle really small blits (0-3 pixels)
    ////////////////////////////////////////////////////////////////////////////
.LReallySmall:
    addl        $4, %ecx
    jle         .LReallySmallExit
    pcmpeqd     %xmm1, %xmm1
    cmpl        $2, %ecx                // Check how many pixels should be read
    pinsrd      $0x0, (%rax), %xmm1     // Load one source pixel
    pinsrd      $0x0, (%rdx), %xmm5     // Load one destination pixel
    jb          .LReallySmallCalc
    pinsrd      $0x1, 4(%rax), %xmm1    // Load second source pixel
    pinsrd      $0x1, 4(%rdx), %xmm5    // Load second destination pixel
    je          .LReallySmallCalc
    pinsrd      $0x2, 8(%rax), %xmm1    // Load third source pixel
    pinsrd      $0x2, 8(%rdx), %xmm5    // Load third destination pixel

.LReallySmallCalc:
    ptest       %xmm7, %xmm1            // Check if all alphas are opaque
    jc          .LReallySmallStore      // If all alphas are opaque, just store

    // Handle mixed alphas (calculate and scale)
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    pand        %xmm0, %xmm5            // Filter out red and blue components
    pmullw      %xmm4, %xmm5            // Scale red and blue
    psrlw       $8, %xmm3               // Filter out alpha and green components
    pmullw      %xmm4, %xmm3            // Scale alpha and green

    psrlw       $8, %xmm5               // Combine results
    pblendvb    %xmm5, %xmm3            // Mask in %xmm0, implicitly
    paddb       %xmm3, %xmm1            // Add source and destination pixels together

.LReallySmallStore:
    cmpl        $2, %ecx                // Check how many pixels should be written
    pextrd      $0x0, %xmm1, (%rdx)     // Store one destination pixel
    jb          .LReallySmallExit
    pextrd      $0x1, %xmm1, 4(%rdx)    // Store second destination pixel
    je          .LReallySmallExit
    pextrd      $0x2, %xmm1, 8(%rdx)    // Store third destination pixel
.LReallySmallExit:
    ret

    // Handle bigger blit operations (16+ pixels)
    ////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LBigBlit:
    // Align destination?
    testl       $0xF, %edx
    lddqu       (%rax), %xmm1           // Pre-load four source pixels
    jz          .LAligned

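    // The negq/andl pair below computes (16 - (dst & 15)) & 15, i.e. how many
    // bytes (0, 4, 8 or 12) must be blitted before %rdx reaches a 16-byte
    // boundary; .LAlignDone later shifts this right by 2 to get the pixel count.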
    movq        %rdx, %rdi              // Calculate alignment of destination pointer
    negq        %rdi
    andl        $0xF, %edi

    // Handle 1-3 pixels to align destination
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    jz          .LAlignDone             // If all alphas are zero, just skip
    lddqu       (%rdx), %xmm5           // Load four destination pixels
    jc          .LAlignStore            // If all alphas are opaque, just store

    // Handle mixed alphas (calculate and scale)
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    psllw       $8, %xmm3               // Filter out red and blue components
    pmulhuw     %xmm4, %xmm3            // Scale red and blue
    movdqa      %xmm5, %xmm2
    psrlw       $8, %xmm2               // Filter out alpha and green components
    pmullw      %xmm4, %xmm2            // Scale alpha and green

    pblendvb    %xmm3, %xmm2            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm2, %xmm1            // Add source and destination pixels together

.LAlignStore:
    cmpl        $8, %edi                // Check how many pixels should be written
    jb          .LAlignPixelsLeft1
    ja          .LAlignPixelsLeft3
    pblendw     $0x0F, %xmm1, %xmm5     // Blend two pixels
    jmp         .LAlignStorePixels

.LAlignPixelsLeft1:
    pblendw     $0x03, %xmm1, %xmm5     // Blend one pixel
    jmp         .LAlignStorePixels

.LAlignPixelsLeft3:
    pblendw     $0x3F, %xmm1, %xmm5     // Blend three pixels

.LAlignStorePixels:
    movdqu      %xmm5, (%rdx)           // Store destination pixels

.LAlignDone:
    addq        %rdi, %rax              // Adjust pointers and pixel count
    addq        %rdi, %rdx
    shrq        $2, %rdi
    lddqu       (%rax), %xmm1           // Pre-load new source pixels (after alignment)
    subl        %edi, %ecx

.LAligned:                              // Destination is guaranteed to be 16 byte aligned
    xorq        %rdi, %rdi              // Reset offset to zero
    subl        $8, %ecx                // Decrease counter (Reserve four pixels for the cleanup)
    testl       $0xF, %eax              // Check alignment of source pointer
    jz          .LAlignedLoop

    // Source not aligned to destination
    ////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LUnalignedLoop:                        // Main loop for unaligned, handles eight pixels per iteration
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero00
    lddqu       16(%rax, %rdi), %xmm2   // Pre-load four source pixels
    jz          .LAlphaZero00
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels

.LAlphaZero00:
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero01
    lddqu       32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero01
    movdqa      %xmm2, 16(%rdx, %rdi)   // Store four destination pixels

.LAlphaZero01:
    addq        $32, %rdi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LUnalignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup0

    .p2align 4
.LAlphaNotOpaqueOrZero00:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    lddqu       16(%rax, %rdi), %xmm2   // Pre-load four source pixels
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels

    // Handle next four pixels
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero01
    lddqu       32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero02
    movdqa      %xmm2, 16(%rdx, %rdi)   // Store four destination pixels
.LAlphaZero02:
    addq        $32, %rdi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LUnalignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup0

    .p2align 4
.LAlphaNotOpaqueOrZero01:
    movdqa      16(%rdx, %rdi), %xmm5   // Load four destination pixels
    EXTRACT_ALPHA(xmm2, xmm1)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    lddqu       32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    addq        $32, %rdi
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm2            // Add source and destination pixels together
    subl        $8, %ecx
    movdqa      %xmm2, -16(%rdx, %rdi)  // Store four destination pixels
    jae         .LUnalignedLoop
    addl        $8, %ecx                // Adjust pixel count

    // Cleanup - handle pending pixels from loop
.LLoopCleanup0:
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero02
    jz          .LAlphaZero03
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LAlphaZero03:
    addq        $16, %rdi
    subl        $4, %ecx
    js          .LSmallRemaining        // Reuse code from small loop
    lddqu       (%rax, %rdi), %xmm1     // Pre-load four source pixels
    jmp         .LLoopCleanup0

.LAlphaNotOpaqueOrZero02:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addq        $16, %rdi
    subl        $4, %ecx
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, -16(%rdx, %rdi)  // Store four destination pixels
    js          .LSmallRemaining        // Reuse code from small loop
    lddqu       (%rax, %rdi), %xmm1     // Pre-load four source pixels
    jmp         .LLoopCleanup0

    // Source aligned to destination
    ////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LAlignedLoop:                          // Main loop for aligned, handles eight pixels per iteration
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero10
    movdqa      16(%rax, %rdi), %xmm2   // Pre-load four source pixels
    jz          .LAlphaZero10
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels

.LAlphaZero10:
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero11
    movdqa      32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero11
    movdqa      %xmm2, 16(%rdx, %rdi)   // Store four destination pixels

.LAlphaZero11:
    addq        $32, %rdi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LAlignedLoop
    jmp         .LLoopCleanup1

    .p2align 4
.LAlphaNotOpaqueOrZero10:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    movdqa      16(%rax, %rdi), %xmm2   // Pre-load four source pixels
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels

    // Handle next four pixels
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero11
    movdqa      32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero12
    movdqa      %xmm2, 16(%rdx, %rdi)   // Store four destination pixels
.LAlphaZero12:
    addq        $32, %rdi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LAlignedLoop
    jmp         .LLoopCleanup1

    .p2align 4
.LAlphaNotOpaqueOrZero11:
    movdqa      16(%rdx, %rdi), %xmm5   // Load four destination pixels
    EXTRACT_ALPHA(xmm2, xmm1)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha
    movdqa      32(%rax, %rdi), %xmm1   // Pre-load four source pixels

    addq        $32, %rdi
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm2            // Add source and destination pixels together
    subl        $8, %ecx
    movdqa      %xmm2, -16(%rdx, %rdi)  // Store four destination pixels
    jae         .LAlignedLoop

    // Cleanup - handle four pending pixels from loop
.LLoopCleanup1:
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero12
    jz          .LAlphaZero13
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LAlphaZero13:
    addl        $8, %ecx                // Adjust offset and pixel count
    jz          .LExit
    addq        $16, %rdi
    jmp         .LRemainLoop1

.LAlphaNotOpaqueOrZero12:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addl        $8, %ecx                // Adjust offset and pixel count
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
    jz          .LExit
    addq        $16, %rdi

    // Handle last 1-7 pixels
.LRemainLoop1:
    movdqa      (%rax, %rdi), %xmm1     // Load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LRemainAlphaNotOpaqueOrZero1
    jz          .LRemainAlphaZero1

    // All alphas were opaque (copy)
    subl        $4, %ecx                // Check if we have more than four pixels left
    jle         .LRemainStore
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
    addq        $16, %rdi
    jmp         .LRemainLoop1

    // All alphas were zero (skip)
    .p2align 4
.LRemainAlphaZero1:
    subl        $4, %ecx                // Check if we have more than four pixels left
    jle         .LExit
    addq        $16, %rdi
    jmp         .LRemainLoop1

    // Handle mixed alphas (calculate and scale)
    .p2align 4
.LRemainAlphaNotOpaqueOrZero1:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    subl        $4, %ecx
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    jle         .LRemainStore
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
    addq        $16, %rdi
    jmp         .LRemainLoop1

    // Store the last 1-4 pixels
    .p2align 4
.LRemainStore:
    jz          .LRemainFull
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    cmpl        $-2, %ecx               // Check how many pixels should be written
    jb          .LRemainPixelsLeft11
    ja          .LRemainPixelsLeft13
    pblendw     $0x0F, %xmm1, %xmm5
    movdqa      %xmm5, (%rdx, %rdi)     // Store last 2 destination pixels
.LExit:
    ret

.LRemainPixelsLeft11:
    pblendw     $0x03, %xmm1, %xmm5
    movdqa      %xmm5, (%rdx, %rdi)     // Store last destination pixel
    ret

.LRemainPixelsLeft13:
    pblendw     $0x3F, %xmm1, %xmm5
    movdqa      %xmm5, (%rdx, %rdi)     // Store last 3 destination pixels
    ret

.LRemainFull:
    movdqa      %xmm1, (%rdx, %rdi)     // Store last 4 destination pixels
    ret

    .cfi_endproc
#ifndef __clang__
    .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm
#endif

    // Constants for SSE code
#ifndef __clang__
    .section .rodata
#endif
    .p2align 4
.LAlphaCheckMask:
    .long   0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000
.LInverseAlphaCalc:
    .word   256, 256, 256, 256, 256, 256, 256, 256
.LResultMergeMask:
    .long   0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF
#endif