Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 /* | |
| 2 * Copyright 2013 The Android Open Source Project | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license that can be | |
| 5 * found in the LICENSE file. | |
| 6 */ | |
| 7 | |
| 8 #if !defined(_MSC_VER) | |
| 9 | |
| 10 #define CFI_PUSH(REG) \ | |
| 11 .cfi_adjust_cfa_offset 4; \ | |
| 12 .cfi_rel_offset REG, 0 | |
| 13 | |
| 14 #define CFI_POP(REG) \ | |
| 15 .cfi_adjust_cfa_offset -4;\ | |
| 16 .cfi_restore REG | |
| 17 | |
| 18 #define PUSH(REG) pushl REG; CFI_PUSH (REG) | |
| 19 #define POP(REG) popl REG; CFI_POP (REG) | |
| 20 | |
| 21 /* | |
| 22 * void S32A_Opaque_BlitRow32_SSE4_asm(SkPMColor* SK_RESTRICT dst, | |
| 23 * const SkPMColor* SK_RESTRICT src, | |
| 24 * int count, U8CPU alpha) | |
| 25 * | |
| 26 * The primary optimization comes from checking the source pixels' alpha value. | |
| 27 * If the alpha is zero, the pixel can be skipped entirely. | |
| 28 * If the alpha is fully opaque, the pixel can be copied directly to the destination. | |
| 29 * According to collected statistics, these two cases are the most common. | |
| 30 * The main loops use pre-loading and unrolling in an attempt to reduce the | |
| 31 * worst-case memory latency. | |
| 32 */ | |
| 33 | |
| 34 .section .text.sse4,"ax",@progbits | |
| 35 .type S32A_Opaque_BlitRow32_SSE4_asm, @function | |
| 36 .globl S32A_Opaque_BlitRow32_SSE4_asm | |
| 37 | |
| 38 .p2align 4 | |
| 39 S32A_Opaque_BlitRow32_SSE4_asm: | |
| 40 .cfi_startproc | |
| 41 movl 8(%esp), %eax // Source pointer | |
| 42 movl 12(%esp), %ecx // Pixel count | |
| 43 movl 4(%esp), %edx // Destination pointer | |
| 44 prefetcht0 (%eax) | |
| 45 | |
| 46 // Setup SSE constants | |
| 47 pcmpeqd %xmm7, %xmm7 // 0xFF000000 mask to check alpha | |
| 48 pcmpeqw %xmm6, %xmm6 // 16-bit 256 to calculate inv. alpha | |
|
mtklein
2014/05/16 18:06:38
Does the interlaced instruction scheduling here re
henrik.smiding
2014/05/20 15:10:29
On a Haswell core, probably not. On a Silvermont/A
| |
| 49 pslld $24, %xmm7 | |
| 50 pcmpeqw %xmm0, %xmm0 // 0x00FF00FF mask (Must be in xmm0 because of pblendvb) | |
| 51 psrlw $15, %xmm6 | |
| 52 psrlw $8, %xmm0 | |
| 53 subl $4, %ecx // Check if we have only 0-3 pixels | |
| 54 psllw $8, %xmm6 | |
| 55 js .LReallySmall | |
| 56 PUSH(%edi) | |
| 57 cmpl $11, %ecx // Do we have enough pixels to run the main loop? | |
| 58 ja .LBigBlit | |
| 59 | |
| 60 // Handle small blits (4-15 pixels) | |
| 61 // ******************************** | |
| 62 xorl %edi, %edi // Reset offset to zero | |
| 63 | |
| 64 .LSmallLoop: | |
| 65 lddqu (%eax, %edi), %xmm1 // Load four source pixels | |
| 66 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque | |
|
mtklein
2014/05/16 18:06:38
Is this the sort of place intrinsics fail us? I g
henrik.smiding
2014/05/20 15:10:29
That's correct. It was all about not making the wo
| |
| 67 ja .LSmallAlphaNotOpaqueOrZero | |
| 68 jz .LSmallAlphaZero | |
| 69 movdqu %xmm1, (%edx, %edi) // Store four destination pixels | |
| 70 .LSmallAlphaZero: | |
| 71 addl $16, %edi | |
| 72 subl $4, %ecx // Check if there are four additional pixels, at least | |
| 73 jns .LSmallLoop | |
| 74 jmp .LSmallRemaining | |
| 75 | |
| 76 // Handle mixed alphas (calculate and scale) | |
| 77 .p2align 4 | |
| 78 .LSmallAlphaNotOpaqueOrZero: | |
| 79 lddqu (%edx, %edi), %xmm5 // Load four destination pixels | |
| 80 | |
| 81 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 82 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
| 83 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 84 movdqa %xmm6, %xmm4 | |
| 85 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 86 movdqa %xmm5, %xmm3 | |
| 87 psubw %xmm2, %xmm4 // Finalize alpha calculations | |
| 88 | |
| 89 psllw $8, %xmm5 // Filter out red and blue components | |
| 90 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 91 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 92 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 93 | |
| 94 addl $16, %edi | |
| 95 subl $4, %ecx // Check if we can store all four pixels | |
| 96 pblendvb %xmm0, %xmm5, %xmm3 | |
| 97 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 98 movdqu %xmm1, -16(%edx, %edi) // Store four destination pixels | |
| 99 jns .LSmallLoop | |
| 100 | |
| 101 // Handle the last 0-3 pixels (also used by the big unaligned loop) | |
| 102 .LSmallRemaining: | |
| 103 cmpl $-4, %ecx // Check if we are done | |
| 104 je .LSmallExit | |
| 105 sall $2, %ecx // Calculate offset for last pixels | |
| 106 addl %ecx, %edi | |
| 107 | |
| 108 lddqu (%eax, %edi), %xmm1 // Load last four source pixels (overlapping) | |
|
mtklein
2014/05/16 18:06:38
I was expecting we'd fall back on non-SIMD or do s
henrik.smiding
2014/05/20 15:10:29
I've improved the comments a bit.
| |
| 109 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque | |
| 110 jc .LSmallRemainingStoreAll// If all alphas are opaque, just store | |
| 111 jz .LSmallExit | |
| 112 | |
| 113 // Handle mixed alphas (calculate and scale) | |
|
mtklein
2014/05/16 18:06:38
Can we share or macro away this big blend block?
henrik.smiding
2014/05/20 15:10:29
Done. I replaced about 200 lines of code with macr
| |
| 114 lddqu (%edx, %edi), %xmm5 // Load last four destination pixels (overlapping) | |
| 115 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 116 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
| 117 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 118 movdqa %xmm6, %xmm4 | |
| 119 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 120 movdqa %xmm5, %xmm3 | |
| 121 psubw %xmm2, %xmm4 // Finalize alpha calculations | |
| 122 | |
| 123 psllw $8, %xmm3 // Filter out red and blue components | |
| 124 pmulhuw %xmm4, %xmm3 // Scale red and blue | |
| 125 movdqa %xmm5, %xmm2 | |
| 126 psrlw $8, %xmm2 // Filter out alpha and green components | |
| 127 pmullw %xmm4, %xmm2 // Scale alpha and green | |
| 128 | |
| 129 cmpl $-8, %ecx // Check how many pixels should be written | |
| 130 pblendvb %xmm0, %xmm3, %xmm2 // Combine results | |
| 131 paddb %xmm2, %xmm1 // Add source and destination pixels together | |
| 132 jb .LSmallPixelsLeft1 | |
| 133 ja .LSmallPixelsLeft3 | |
| 134 pblendw $0xF0, %xmm1, %xmm5 | |
| 135 movdqu %xmm5, (%edx, %edi) // Store last two destination pixels | |
| 136 .LSmallExit: | |
| 137 POP(%edi) | |
| 138 ret | |
| 139 | |
| 140 .LSmallPixelsLeft1: | |
| 141 pblendw $0xC0, %xmm1, %xmm5 | |
| 142 movdqu %xmm5, (%edx, %edi) // Store last destination pixel | |
| 143 POP(%edi) | |
| 144 ret | |
| 145 | |
| 146 .LSmallPixelsLeft3: | |
| 147 pblendw $0xFC, %xmm1, %xmm5 | |
| 148 movdqu %xmm5, (%edx, %edi) // Store last three destination pixels | |
| 149 POP(%edi) | |
| 150 ret | |
| 151 | |
| 152 .LSmallRemainingStoreAll: | |
| 153 movdqu %xmm1, (%edx, %edi) // Store last destination pixels (overwrite) | |
| 154 POP(%edi) | |
| 155 ret | |
| 156 | |
| 157 // Handle really small blits (0-3 pixels) | |
| 158 // ************************************** | |
| 159 .LReallySmall: | |
| 160 addl $4, %ecx | |
| 161 jle .LReallySmallExit | |
| 162 pcmpeqd %xmm1, %xmm1 | |
| 163 cmp $2, %ecx // Check how many pixels should be read | |
| 164 pinsrd $0x0, (%eax), %xmm1 // Load one source pixel | |
| 165 pinsrd $0x0, (%edx), %xmm5 // Load one destination pixel | |
| 166 jb .LReallySmallCalc | |
| 167 pinsrd $0x1, 4(%eax), %xmm1 // Load second source pixel | |
| 168 pinsrd $0x1, 4(%edx), %xmm5 // Load second destination pixel | |
| 169 je .LReallySmallCalc | |
| 170 pinsrd $0x2, 8(%eax), %xmm1 // Load third source pixel | |
| 171 pinsrd $0x2, 8(%edx), %xmm5 // Load third destination pixel | |
| 172 | |
| 173 .LReallySmallCalc: | |
| 174 ptest %xmm7, %xmm1 // Check if all alphas are opaque | |
| 175 jc .LReallySmallStore // If all alphas are opaque, just store | |
| 176 | |
| 177 // Handle mixed alphas (calculate and scale) | |
| 178 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 179 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
| 180 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 181 movdqa %xmm6, %xmm4 | |
| 182 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 183 movdqa %xmm5, %xmm3 | |
| 184 psubw %xmm2, %xmm4 // Finalize alpha calculations | |
| 185 | |
| 186 pand %xmm0, %xmm5 // Filter out red and blue components | |
| 187 pmullw %xmm4, %xmm5 // Scale red and blue | |
| 188 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 189 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 190 | |
| 191 psrlw $8, %xmm5 // Combine results | |
| 192 pblendvb %xmm0, %xmm5, %xmm3 | |
| 193 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 194 | |
| 195 .LReallySmallStore: | |
| 196 cmp $2, %ecx // Check how many pixels should be written | |
| 197 pextrd $0x0, %xmm1, (%edx) // Store one destination pixel | |
| 198 jb .LReallySmallExit | |
| 199 pextrd $0x1, %xmm1, 4(%edx) // Store second destination pixel | |
| 200 je .LReallySmallExit | |
| 201 pextrd $0x2, %xmm1, 8(%edx) // Store third destination pixel | |
| 202 .LReallySmallExit: | |
| 203 ret | |
| 204 | |
| 205 // Handle bigger blit operations (16+ pixels) | |
| 206 // ****************************************** | |
| 207 .p2align 4 | |
| 208 .LBigBlit: | |
| 209 // Align destination? | |
| 210 testl $0xF, %edx | |
| 211 lddqu (%eax), %xmm1 // Pre-load four source pixels | |
| 212 jz .LAligned | |
| 213 | |
| 214 movl %edx, %edi // Calculate alignment of destination pointer | |
| 215 negl %edi | |
| 216 andl $0xF, %edi | |
| 217 | |
| 218 // Handle 1-3 pixels to align destination | |
| 219 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque | |
|
mtklein
2014/05/16 18:06:38
Do you think we're benefitting by having everythin
henrik.smiding
2014/05/20 15:10:29
That would kill performance of short blits, like 1
| |
| 220 jz .LAlignDone // If all alphas are zero, just skip | |
| 221 lddqu (%edx), %xmm5 // Load four destination pixels | |
| 222 jc .LAlignStore // If all alphas are opaque, just store | |
| 223 | |
| 224 // Handle mixed alphas (calculate and scale) | |
| 225 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 226 psrlw $8, %xmm2 // Discard red and blue | |
| 227 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 228 movdqa %xmm6, %xmm4 | |
| 229 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 230 movdqa %xmm5, %xmm3 | |
| 231 psubw %xmm2, %xmm4 // Finalize alpha calculations | |
| 232 | |
| 233 psllw $8, %xmm3 // Filter out red and blue components | |
| 234 pmulhuw %xmm4, %xmm3 // Scale red and blue | |
| 235 movdqa %xmm5, %xmm2 | |
| 236 psrlw $8, %xmm2 // Filter out alpha and green components | |
| 237 pmullw %xmm4, %xmm2 // Scale alpha and green | |
| 238 | |
| 239 pblendvb %xmm0, %xmm3, %xmm2 // Combine results | |
| 240 paddb %xmm2, %xmm1 // Add source and destination pixels together | |
| 241 | |
| 242 .LAlignStore: | |
| 243 cmp $8, %edi // Check how many pixels should be written | |
| 244 jb .LAlignPixelsLeft1 | |
| 245 ja .LAlignPixelsLeft3 | |
| 246 pblendw $0x0F, %xmm1, %xmm5 // Blend two pixels | |
| 247 jmp .LAlignStorePixels | |
| 248 | |
| 249 .LAlignPixelsLeft1: | |
| 250 pblendw $0x03, %xmm1, %xmm5 // Blend one pixel | |
| 251 jmp .LAlignStorePixels | |
| 252 | |
| 253 .LAlignPixelsLeft3: | |
| 254 pblendw $0x3F, %xmm1, %xmm5 // Blend three pixels | |
| 255 | |
| 256 .LAlignStorePixels: | |
| 257 movdqu %xmm5, (%edx) // Store destination pixels | |
| 258 | |
| 259 .LAlignDone: | |
| 260 addl %edi, %eax // Adjust pointers and pixel count | |
| 261 addl %edi, %edx | |
| 262 shrl $2, %edi | |
| 263 lddqu (%eax), %xmm1 // Pre-load new source pixels (after alignment) | |
| 264 subl %edi, %ecx | |
| 265 | |
| 266 .LAligned: // Destination is guaranteed to be 16-byte aligned | |
| 267 xorl %edi, %edi // Reset offset to zero | |
| 268 subl $8, %ecx // Decrease counter (Reserve four pixels for the cleanup) | |
| 269 testl $0xF, %eax // Check alignment of source pointer | |
| 270 jz .LAlignedLoop | |
| 271 | |
| 272 // Source not aligned to destination | |
| 273 // ********************************* | |
| 274 .p2align 4 | |
| 275 .LUnalignedLoop: // Main loop for unaligned, handles eight pixels per iteration | |
| 276 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque | |
| 277 ja .LAlphaNotOpaqueOrZero00 | |
| 278 lddqu 16(%eax, %edi), %xmm2 // Pre-load four source pixels | |
| 279 jz .LAlphaZero00 | |
| 280 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 281 | |
| 282 .LAlphaZero00: | |
| 283 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque | |
| 284 ja .LAlphaNotOpaqueOrZero01 | |
| 285 lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 286 jz .LAlphaZero01 | |
| 287 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels | |
| 288 | |
| 289 .LAlphaZero01: | |
| 290 addl $32, %edi // Adjust offset and pixel count | |
| 291 subl $8, %ecx | |
| 292 jae .LUnalignedLoop | |
| 293 addl $8, %ecx // Adjust pixel count | |
| 294 jmp .LLoopCleanup0 | |
| 295 | |
| 296 .p2align 4 | |
| 297 .LAlphaNotOpaqueOrZero00: | |
| 298 movdqa (%edx, %edi), %xmm5 // Load four destination pixels | |
| 299 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 300 psrlw $8, %xmm2 // Discard red and blue | |
| 301 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 302 movdqa %xmm6, %xmm4 | |
| 303 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 304 movdqa %xmm5, %xmm3 | |
| 305 psubw %xmm2, %xmm4 // Finalize alpha calculations | |
| 306 | |
| 307 psllw $8, %xmm5 // Filter out red and blue components | |
| 308 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 309 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 310 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 311 | |
| 312 lddqu 16(%eax, %edi), %xmm2 // Pre-load four source pixels | |
| 313 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 314 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 315 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 316 | |
| 317 // Handle next four pixels | |
| 318 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque | |
| 319 ja .LAlphaNotOpaqueOrZero01 | |
| 320 lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 321 jz .LAlphaZero02 | |
| 322 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels | |
| 323 .LAlphaZero02: | |
| 324 addl $32, %edi // Adjust offset and pixel count | |
| 325 subl $8, %ecx | |
| 326 jae .LUnalignedLoop | |
| 327 addl $8, %ecx // Adjust pixel count | |
| 328 jmp .LLoopCleanup0 | |
| 329 | |
| 330 .p2align 4 | |
| 331 .LAlphaNotOpaqueOrZero01: | |
| 332 movdqa 16(%edx, %edi), %xmm5 // Load four destination pixels | |
| 333 | |
| 334 movdqa %xmm2, %xmm1 // Clone source pixels to extract alpha | |
| 335 psrlw $8, %xmm1 // Discard red and blue | |
| 336 pshufhw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (high) | |
| 337 movdqa %xmm6, %xmm4 | |
| 338 pshuflw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (low) | |
| 339 movdqa %xmm5, %xmm3 | |
| 340 psubw %xmm1, %xmm4 // Finalize alpha calculations | |
| 341 | |
| 342 psllw $8, %xmm5 // Filter out red and blue components | |
| 343 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 344 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 345 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 346 | |
| 347 lddqu 32(%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 348 addl $32, %edi | |
| 349 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 350 paddb %xmm3, %xmm2 // Add source and destination pixels together | |
| 351 subl $8, %ecx | |
| 352 movdqa %xmm2, -16(%edx, %edi) // Store four destination pixels | |
| 353 jae .LUnalignedLoop | |
| 354 addl $8, %ecx // Adjust pixel count | |
| 355 | |
| 356 // Cleanup - handle pending pixels from loop | |
| 357 .LLoopCleanup0: | |
| 358 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque | |
| 359 ja .LAlphaNotOpaqueOrZero02 | |
| 360 jz .LAlphaZero03 | |
| 361 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 362 .LAlphaZero03: | |
| 363 addl $16, %edi | |
| 364 subl $4, %ecx | |
| 365 js .LSmallRemaining // Reuse code from small loop | |
| 366 lddqu (%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 367 jmp .LLoopCleanup0 | |
| 368 | |
| 369 .LAlphaNotOpaqueOrZero02: | |
| 370 movdqa (%edx, %edi), %xmm5 // Load four destination pixels | |
| 371 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 372 psrlw $8, %xmm2 // Discard red and blue | |
| 373 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 374 movdqa %xmm6, %xmm4 | |
| 375 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 376 movdqa %xmm5, %xmm3 | |
| 377 psubw %xmm2, %xmm4 // Finalize alpha calculations | |
| 378 | |
| 379 psllw $8, %xmm5 // Filter out red and blue components | |
| 380 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 381 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 382 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 383 | |
| 384 addl $16, %edi | |
| 385 subl $4, %ecx | |
| 386 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 387 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 388 movdqa %xmm1, -16(%edx, %edi) // Store four destination pixels | |
| 389 js .LSmallRemaining // Reuse code from small loop | |
| 390 lddqu (%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 391 jmp .LLoopCleanup0 | |
| 392 | |
| 393 // Source aligned to destination | |
| 394 // ***************************** | |
| 395 .p2align 4 | |
| 396 .LAlignedLoop: // Main loop for aligned, handles eight pixels per iteration | |
| 397 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque | |
| 398 ja .LAlphaNotOpaqueOrZero10 | |
| 399 movdqa 16(%eax, %edi), %xmm2 // Pre-load four source pixels | |
| 400 jz .LAlphaZero10 | |
| 401 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 402 | |
| 403 .LAlphaZero10: | |
| 404 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque | |
| 405 ja .LAlphaNotOpaqueOrZero11 | |
| 406 movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 407 jz .LAlphaZero11 | |
| 408 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels | |
| 409 | |
| 410 .LAlphaZero11: | |
| 411 addl $32, %edi // Adjust offset and pixel count | |
| 412 subl $8, %ecx | |
| 413 jae .LAlignedLoop | |
| 414 jmp .LLoopCleanup1 | |
| 415 | |
| 416 .p2align 4 | |
| 417 .LAlphaNotOpaqueOrZero10: | |
| 418 movdqa (%edx, %edi), %xmm5 // Load four destination pixels | |
| 419 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 420 psrlw $8, %xmm2 // Discard red and blue | |
| 421 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 422 movdqa %xmm6, %xmm4 | |
| 423 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 424 movdqa %xmm5, %xmm3 | |
| 425 psubw %xmm2, %xmm4 // Finalize alpha calculations | |
| 426 | |
| 427 psllw $8, %xmm5 // Filter out red and blue components | |
| 428 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 429 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 430 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 431 | |
| 432 movdqa 16(%eax, %edi), %xmm2 // Pre-load four source pixels | |
| 433 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 434 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 435 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 436 | |
| 437 // Handle next four pixels | |
| 438 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque | |
| 439 ja .LAlphaNotOpaqueOrZero11 | |
| 440 movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 441 jz .LAlphaZero12 | |
| 442 movdqa %xmm2, 16(%edx, %edi) // Store four destination pixels | |
| 443 .LAlphaZero12: | |
| 444 addl $32, %edi // Adjust offset and pixel count | |
| 445 subl $8, %ecx | |
| 446 jae .LAlignedLoop | |
| 447 jmp .LLoopCleanup1 | |
| 448 | |
| 449 .p2align 4 | |
| 450 .LAlphaNotOpaqueOrZero11: | |
| 451 movdqa 16(%edx, %edi), %xmm5 // Load four destination pixels | |
| 452 | |
| 453 movdqa %xmm2, %xmm1 // Clone source pixels to extract alpha | |
| 454 psrlw $8, %xmm1 // Discard red and blue | |
| 455 pshufhw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (high) | |
| 456 movdqa %xmm6, %xmm4 | |
| 457 pshuflw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (low) | |
| 458 movdqa %xmm5, %xmm3 | |
| 459 psubw %xmm1, %xmm4 // Finalize alpha calculations | |
| 460 | |
| 461 psllw $8, %xmm5 // Filter out red and blue components | |
| 462 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 463 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 464 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 465 movdqa 32(%eax, %edi), %xmm1 // Pre-load four source pixels | |
| 466 | |
| 467 addl $32, %edi | |
| 468 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 469 paddb %xmm3, %xmm2 // Add source and destination pixels together | |
| 470 subl $8, %ecx | |
| 471 movdqa %xmm2, -16(%edx, %edi) // Store four destination pixels | |
| 472 jae .LAlignedLoop | |
| 473 | |
| 474 // Cleanup - handle four pending pixels from loop | |
| 475 .LLoopCleanup1: | |
| 476 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque | |
| 477 ja .LAlphaNotOpaqueOrZero12 | |
| 478 jz .LAlphaZero13 | |
| 479 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 480 .LAlphaZero13: | |
| 481 addl $8, %ecx // Adjust pixel count | |
| 482 jz .LExit | |
| 483 addl $16, %edi | |
| 484 jmp .LRemainLoop1 | |
| 485 | |
| 486 .LAlphaNotOpaqueOrZero12: | |
| 487 movdqa (%edx, %edi), %xmm5 // Load four destination pixels | |
| 488 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 489 psrlw $8, %xmm2 // Discard red and blue | |
| 490 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 491 movdqa %xmm6, %xmm4 | |
| 492 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 493 movdqa %xmm5, %xmm3 | |
| 494 psubw %xmm2, %xmm4 // Finalize alpha calculations | |
| 495 | |
| 496 psllw $8, %xmm5 // Filter out red and blue components | |
| 497 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 498 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 499 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 500 | |
| 501 addl $8, %ecx // Adjust pixel count | |
| 502 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 503 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 504 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 505 jz .LExit | |
| 506 addl $16, %edi | |
| 507 | |
| 508 // Handle last 1-7 pixels | |
| 509 .LRemainLoop1: | |
| 510 movdqa (%eax, %edi), %xmm1 // Load four source pixels | |
| 511 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque | |
| 512 ja .LRemainAlphaNotOpaqueOrZero1 | |
| 513 jz .LRemainAlphaZero1 | |
| 514 | |
| 515 // All alphas were opaque (copy) | |
| 516 subl $4, %ecx // Check if we have more than four pixels left | |
| 517 jle .LRemainStore | |
| 518 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 519 addl $16, %edi | |
| 520 jmp .LRemainLoop1 | |
| 521 | |
| 522 // All alphas were zero (skip) | |
| 523 .p2align 4 | |
| 524 .LRemainAlphaZero1: | |
| 525 subl $4, %ecx // Check if we have more than four pixels left | |
| 526 jle .LExit | |
| 527 addl $16, %edi | |
| 528 jmp .LRemainLoop1 | |
| 529 | |
| 530 // Handle mixed alphas (calculate and scale) | |
| 531 .p2align 4 | |
| 532 .LRemainAlphaNotOpaqueOrZero1: | |
| 533 movdqa (%edx, %edi), %xmm5 // Load four destination pixels | |
| 534 | |
| 535 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 536 psrlw $8, %xmm2 // Discard red and blue | |
| 537 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 538 movdqa %xmm6, %xmm4 | |
| 539 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 540 movdqa %xmm5, %xmm3 | |
| 541 psubw %xmm2, %xmm4 // Finalize alpha calculations | |
| 542 | |
| 543 psllw $8, %xmm5 // Filter out red and blue components | |
| 544 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 545 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 546 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 547 | |
| 548 subl $4, %ecx | |
| 549 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 550 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 551 jle .LRemainStore | |
| 552 movdqa %xmm1, (%edx, %edi) // Store four destination pixels | |
| 553 addl $16, %edi | |
| 554 jmp .LRemainLoop1 | |
| 555 | |
| 556 // Store the last 1-4 pixels | |
| 557 .p2align 4 | |
| 558 .LRemainStore: | |
| 559 jz .LRemainFull | |
| 560 movdqa (%edx, %edi), %xmm5 // Load four destination pixels | |
| 561 cmp $-2, %ecx // Check how many pixels should be written | |
| 562 jb .LRemainPixelsLeft11 | |
| 563 ja .LRemainPixelsLeft13 | |
| 564 pblendw $0x0F, %xmm1, %xmm5 | |
| 565 movdqa %xmm5, (%edx, %edi) // Store last 2 destination pixels | |
| 566 .LExit: | |
| 567 POP(%edi) // Exit | |
| 568 ret | |
| 569 | |
| 570 .LRemainPixelsLeft11: | |
| 571 pblendw $0x03, %xmm1, %xmm5 | |
| 572 movdqa %xmm5, (%edx, %edi) // Store last destination pixel | |
| 573 POP(%edi) // Exit | |
| 574 ret | |
| 575 | |
| 576 .LRemainPixelsLeft13: | |
| 577 pblendw $0x3F, %xmm1, %xmm5 | |
| 578 movdqa %xmm5, (%edx, %edi) // Store last 3 destination pixels | |
| 579 POP(%edi) // Exit | |
| 580 ret | |
| 581 | |
| 582 .LRemainFull: | |
| 583 movdqa %xmm1, (%edx, %edi) // Store last 4 destination pixels | |
| 584 POP(%edi) // Exit | |
| 585 ret | |
| 586 | |
| 587 .cfi_endproc | |
| 588 .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm | |
| 589 #endif | |
| OLD | NEW |