Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 /* | |
| 2 * Copyright 2013 The Android Open Source Project | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license that can be | |
| 5 * found in the LICENSE file. | |
| 6 */ | |
| 7 | |
| 8 #if !defined(_MSC_VER) | |
| 9 | |
| 10 /* | |
| 11 * void S32A_Opaque_BlitRow32_SSE4_asm(SkPMColor* SK_RESTRICT dst, | |
| 12 * const SkPMColor* SK_RESTRICT src, | |
| 13 * int count, U8CPU alpha) | |
| 14 * | |
| 15 * The primary optimization comes from checking the source pixels' alpha value. | |
| 16 * If the alpha is zero, the pixel can be skipped entirely. | |
| 17 * If the alpha is fully opaque, the pixel can be copied directly to the destination. | |
| 18 * According to collected statistics, these two cases are the most common. | |
| 19 * The main loop(s) uses pre-loading and unrolling in an attempt to reduce the | |
| 20 * worst-case memory latency. | |
| * | |
| * Arguments as read by the prologue: %rdi = dst, %rsi = src, %edx = count. | |
| * The alpha argument (fourth integer argument, %ecx under the System V AMD64 | |
| * calling convention) is never read: %ecx is overwritten with the pixel count. | |
| * | |
| * Register roles after the prologue: | |
| * %rax source pointer, %rdx destination pointer | |
| * %rdi byte offset applied to both pointers (also scratch while aligning) | |
| * %ecx pixel counter, biased differently by each code path | |
| * %xmm7 alpha check mask, %xmm6 256 in each 16-bit lane, %xmm0 blend mask | |
| * %xmm1, %xmm2 source pixels (each is also reused as scratch) | |
| * %xmm5 destination pixels, %xmm3 and %xmm4 scratch | |
| 21 */ | |
| 22 | |
| 23 .section .text.sse4,"ax",@progbits | |
| 24 .type S32A_Opaque_BlitRow32_SSE4_asm, @function | |
| 25 .globl S32A_Opaque_BlitRow32_SSE4_asm | |
| 26 | |
| 27 .p2align 4 | |
| 28 S32A_Opaque_BlitRow32_SSE4_asm: | |
| 29 .cfi_startproc | |
| 30 prefetcht0 (%rsi) | |
| 31 movl %edx, %ecx // Pixel count | |
| 32 movq %rdi, %rdx // Destination pointer | |
| 33 movq %rsi, %rax // Source pointer | |
| 34 | |
| 35 // Setup SSE constants | |
| 36 movdqa .LAlphaCheckMask(%rip), %xmm7 // 0xFF000000 mask to check alpha | |
| 37 movdqa .LInverseAlphaCalc(%rip), %xmm6// 16-bit 256 to calculate inv. alpha | |
| 38 movdqa .LResultMergeMask(%rip), %xmm0 // 0x00FF00FF mask (Must be in xmm0 because of pblendvb) | |
| 39 | |
| 40 subl $4, %ecx // ecx = count - 4; negative means we have only 0-3 pixels | |
| 41 js .LReallySmall | |
| 42 cmpl $11, %ecx // Do we have enough pixels (16 or more) to run the main loop? | |
| 43 ja .LBigBlit | |
| 44 | |
| 45 // Handle small blits (4-15 pixels) | |
| 46 // ******************************** | |
| 47 xorq %rdi, %rdi // Reset offset to zero | |
| 48 | |
| 49 .LSmallLoop: | |
| 50 lddqu (%rax, %rdi), %xmm1 // Load four source pixels | |
| 51 ptest %xmm7, %xmm1 // Check alphas: ZF = all are zero, CF = all are opaque | |
| 52 ja .LSmallAlphaNotOpaqueOrZero | |
| 53 jz .LSmallAlphaZero | |
| 54 movdqu %xmm1, (%rdx, %rdi) // All opaque: store the four source pixels as destination pixels | |
| 55 .LSmallAlphaZero: | |
| 56 addq $16, %rdi | |
| 57 subl $4, %ecx // Check if there are four additional pixels, at least | |
| 58 jns .LSmallLoop | |
| 59 jmp .LSmallRemaining | |
| 60 | |
| 61 // Handle mixed alphas (calculate and scale) | |
| 62 .p2align 4 | |
| 63 .LSmallAlphaNotOpaqueOrZero: | |
| 64 lddqu (%rdx, %rdi), %xmm5 // Load four destination pixels | |
| 65 | |
| 66 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 67 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
| 68 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 69 movdqa %xmm6, %xmm4 | |
| 70 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 71 movdqa %xmm5, %xmm3 | |
| 72 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - alpha in each 16-bit lane) | |
| 73 | |
| 74 psllw $8, %xmm5 // Filter out red and blue components | |
| 75 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 76 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 77 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 78 | |
| 79 addq $16, %rdi | |
| 80 subl $4, %ecx // Count down four pixels; the sign flag is tested by the jns below | |
| 81 pblendvb %xmm0, %xmm5, %xmm3 | |
| 82 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 83 movdqu %xmm1, -16(%rdx, %rdi) // Store four destination pixels (offset was already advanced) | |
| 84 jns .LSmallLoop | |
| 85 | |
| 86 // Handle the last 0-3 pixels (also used by the big unaligned loop) | |
| 87 .LSmallRemaining: | |
| 88 cmpl $-4, %ecx // Check if we are done (ecx = pixels left - 4) | |
| 89 je .LSmallExit | |
| 90 sall $2, %ecx // Calculate (negative) byte offset for last pixels | |
| 91 movslq %ecx, %rcx | |
| 92 addq %rcx, %rdi | |
| 93 | |
| 94 lddqu (%rax, %rdi), %xmm1 // Load last four source pixels (overlapping) | |
| 95 ptest %xmm7, %xmm1 // Check alphas: ZF = all are zero, CF = all are opaque | |
| 96 jc .LSmallRemainingStoreAll// If all alphas are opaque, just store | |
| 97 jz .LSmallExit | |
| 98 | |
| 99 // Handle mixed alphas (calculate and scale) | |
| 100 lddqu (%rdx, %rdi), %xmm5 // Load last four destination pixels (overlapping) | |
| 101 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 102 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
| 103 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 104 movdqa %xmm6, %xmm4 | |
| 105 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 106 movdqa %xmm5, %xmm3 | |
| 107 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - alpha in each 16-bit lane) | |
| 108 | |
| 109 psllw $8, %xmm3 // Filter out red and blue components | |
| 110 pmulhuw %xmm4, %xmm3 // Scale red and blue | |
| 111 movdqa %xmm5, %xmm2 | |
| 112 psrlw $8, %xmm2 // Filter out alpha and green components | |
| 113 pmullw %xmm4, %xmm2 // Scale alpha and green | |
| 114 | |
| 115 cmpl $-8, %ecx // Check how many pixels should be written (ecx is -4, -8 or -12 after the shift) | |
| 116 pblendvb %xmm0, %xmm3, %xmm2 // Combine results | |
| 117 paddb %xmm2, %xmm1 // Add source and destination pixels together | |
| 118 jb .LSmallPixelsLeft1 | |
| 119 ja .LSmallPixelsLeft3 | |
| 120 pblendw $0xF0, %xmm1, %xmm5 | |
| 121 movdqu %xmm5, (%rdx, %rdi) // Store last two destination pixels | |
| 122 .LSmallExit: | |
| 123 ret | |
| 124 | |
| 125 .LSmallPixelsLeft1: | |
| 126 pblendw $0xC0, %xmm1, %xmm5 | |
| 127 movdqu %xmm5, (%rdx, %rdi) // Store last destination pixel | |
| 128 ret | |
| 129 | |
| 130 .LSmallPixelsLeft3: | |
| 131 pblendw $0xFC, %xmm1, %xmm5 | |
| 132 movdqu %xmm5, (%rdx, %rdi) // Store last three destination pixels | |
| 133 ret | |
| 134 | |
| 135 .LSmallRemainingStoreAll: | |
| 136 movdqu %xmm1, (%rdx, %rdi) // Store last destination pixels (overwrite) | |
| 137 ret | |
| 138 | |
| 139 // Handle really small blits (0-3 pixels) | |
| 140 // ************************************** | |
| 141 .LReallySmall: | |
| 142 addl $4, %ecx | |
| 143 jle .LReallySmallExit | |
| 144 pcmpeqd %xmm1, %xmm1 | |
| 145 cmpl $2, %ecx // Check how many pixels should be read (flags are used by the jb and je below) | |
| 146 pinsrd $0x0, (%rax), %xmm1 // Load one source pixel | |
| 147 pinsrd $0x0, (%rdx), %xmm5 // Load one destination pixel | |
| 148 jb .LReallySmallCalc | |
| 149 pinsrd $0x1, 4(%rax), %xmm1 // Load second source pixel | |
| 150 pinsrd $0x1, 4(%rdx), %xmm5 // Load second destination pixel | |
| 151 je .LReallySmallCalc | |
| 152 pinsrd $0x2, 8(%rax), %xmm1 // Load third source pixel | |
| 153 pinsrd $0x2, 8(%rdx), %xmm5 // Load third destination pixel | |
| 154 | |
| 155 .LReallySmallCalc: | |
| 156 ptest %xmm7, %xmm1 // Check if all alphas are opaque (lanes that were not loaded are all-ones from pcmpeqd) | |
| 157 jc .LReallySmallStore // If all alphas are opaque, just store | |
| 158 | |
| 159 // Handle mixed alphas (calculate and scale) | |
| 160 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 161 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green | |
| 162 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 163 movdqa %xmm6, %xmm4 | |
| 164 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 165 movdqa %xmm5, %xmm3 | |
| 166 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - alpha in each 16-bit lane) | |
| 167 | |
| 168 pand %xmm0, %xmm5 // Filter out red and blue components | |
| 169 pmullw %xmm4, %xmm5 // Scale red and blue | |
| 170 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 171 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 172 | |
| 173 psrlw $8, %xmm5 // Combine results (shift scaled red and blue back down first) | |
| 174 pblendvb %xmm0, %xmm5, %xmm3 | |
| 175 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 176 | |
| 177 .LReallySmallStore: | |
| 178 cmpl $2, %ecx // Check how many pixels should be written | |
| 179 pextrd $0x0, %xmm1, (%rdx) // Store one destination pixel | |
| 180 jb .LReallySmallExit | |
| 181 pextrd $0x1, %xmm1, 4(%rdx) // Store second destination pixel | |
| 182 je .LReallySmallExit | |
| 183 pextrd $0x2, %xmm1, 8(%rdx) // Store third destination pixel | |
| 184 .LReallySmallExit: | |
| 185 ret | |
| 186 | |
| 187 // Handle bigger blit operations (16+ pixels) | |
| 188 // ****************************************** | |
| 189 .p2align 4 | |
| 190 .LBigBlit: | |
| 191 // Align destination? | |
| 192 testl $0xF, %edx | |
| 193 lddqu (%rax), %xmm1 // Pre-load four source pixels | |
| 194 jz .LAligned | |
| 195 | |
| 196 movq %rdx, %rdi // Calculate number of bytes needed to align the destination pointer | |
| 197 negq %rdi | |
| 198 andl $0xF, %edi | |
| 199 | |
| 200 // Handle 1-3 pixels to align destination | |
| 201 ptest %xmm7, %xmm1 // Check alphas: ZF = all are zero, CF = all are opaque | |
| 202 jz .LAlignDone // If all alphas are zero, just skip | |
| 203 lddqu (%rdx), %xmm5 // Load four destination pixels | |
| 204 jc .LAlignStore // If all alphas are opaque, just store | |
| 205 | |
| 206 // Handle mixed alphas (calculate and scale) | |
| 207 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 208 psrlw $8, %xmm2 // Discard red and blue | |
| 209 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 210 movdqa %xmm6, %xmm4 | |
| 211 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 212 movdqa %xmm5, %xmm3 | |
| 213 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - alpha in each 16-bit lane) | |
| 214 | |
| 215 psllw $8, %xmm3 // Filter out red and blue components | |
| 216 pmulhuw %xmm4, %xmm3 // Scale red and blue | |
| 217 movdqa %xmm5, %xmm2 | |
| 218 psrlw $8, %xmm2 // Filter out alpha and green components | |
| 219 pmullw %xmm4, %xmm2 // Scale alpha and green | |
| 220 | |
| 221 pblendvb %xmm0, %xmm3, %xmm2 // Combine results | |
| 222 paddb %xmm2, %xmm1 // Add source and destination pixels together | |
| 223 | |
| 224 .LAlignStore: | |
| 225 cmpl $8, %edi // Check how many pixels should be written (edi = bytes up to alignment) | |
| 226 jb .LAlignPixelsLeft1 | |
| 227 ja .LAlignPixelsLeft3 | |
| 228 pblendw $0x0F, %xmm1, %xmm5 // Blend two pixels | |
| 229 jmp .LAlignStorePixels | |
| 230 | |
| 231 .LAlignPixelsLeft1: | |
| 232 pblendw $0x03, %xmm1, %xmm5 // Blend one pixel | |
| 233 jmp .LAlignStorePixels | |
| 234 | |
| 235 .LAlignPixelsLeft3: | |
| 236 pblendw $0x3F, %xmm1, %xmm5 // Blend three pixels | |
| 237 | |
| 238 .LAlignStorePixels: | |
| 239 movdqu %xmm5, (%rdx) // Store destination pixels | |
| 240 | |
| 241 .LAlignDone: | |
| 242 addq %rdi, %rax // Adjust pointers and pixel count | |
| 243 addq %rdi, %rdx | |
| 244 shrq $2, %rdi | |
| 245 lddqu (%rax), %xmm1 // Pre-load new source pixels (after alignment) | |
| 246 subl %edi, %ecx | |
| 247 | |
| 248 .LAligned: // Destination is guaranteed to be 16 byte aligned | |
| 249 xorq %rdi, %rdi // Reset offset to zero | |
| 250 subl $8, %ecx // Decrease counter (Reserve four pixels for the cleanup) | |
| 251 testl $0xF, %eax // Check alignment of source pointer | |
| 252 jz .LAlignedLoop | |
| 253 | |
| 254 // Source not aligned to destination | |
| 255 // ********************************* | |
| 256 .p2align 4 | |
| 257 .LUnalignedLoop: // Main loop for unaligned, handles eight pixels per iteration | |
| 258 ptest %xmm7, %xmm1 // Check alphas: ZF = all are zero, CF = all are opaque | |
| 259 ja .LAlphaNotOpaqueOrZero00 | |
| 260 lddqu 16(%rax, %rdi), %xmm2 // Pre-load four source pixels | |
| 261 jz .LAlphaZero00 | |
| 262 movdqa %xmm1, (%rdx, %rdi) // All opaque: store four destination pixels | |
| 263 | |
| 264 .LAlphaZero00: | |
| 265 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque | |
| 266 ja .LAlphaNotOpaqueOrZero01 | |
| 267 lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels | |
| 268 jz .LAlphaZero01 | |
| 269 movdqa %xmm2, 16(%rdx, %rdi) // All opaque: store four destination pixels | |
| 270 | |
| 271 .LAlphaZero01: | |
| 272 addq $32, %rdi // Adjust offset and pixel count | |
| 273 subl $8, %ecx | |
| 274 jae .LUnalignedLoop | |
| 275 addl $8, %ecx // Adjust pixel count (undo the last subtraction) | |
| 276 jmp .LLoopCleanup0 | |
| 277 | |
| 278 .p2align 4 | |
| 279 .LAlphaNotOpaqueOrZero00: | |
| 280 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels | |
| 281 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 282 psrlw $8, %xmm2 // Discard red and blue | |
| 283 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 284 movdqa %xmm6, %xmm4 | |
| 285 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 286 movdqa %xmm5, %xmm3 | |
| 287 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - alpha in each 16-bit lane) | |
| 288 | |
| 289 psllw $8, %xmm5 // Filter out red and blue components | |
| 290 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 291 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 292 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 293 | |
| 294 lddqu 16(%rax, %rdi), %xmm2 // Pre-load four source pixels | |
| 295 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 296 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 297 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
| 298 | |
| 299 // Handle next four pixels | |
| 300 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque | |
| 301 ja .LAlphaNotOpaqueOrZero01 | |
| 302 lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels | |
| 303 jz .LAlphaZero02 | |
| 304 movdqa %xmm2, 16(%rdx, %rdi) // All opaque: store four destination pixels | |
| 305 .LAlphaZero02: | |
| 306 addq $32, %rdi // Adjust offset and pixel count | |
| 307 subl $8, %ecx | |
| 308 jae .LUnalignedLoop | |
| 309 addl $8, %ecx // Adjust pixel count (undo the last subtraction) | |
| 310 jmp .LLoopCleanup0 | |
| 311 | |
| 312 .p2align 4 | |
| 313 .LAlphaNotOpaqueOrZero01: | |
| 314 movdqa 16(%rdx, %rdi), %xmm5 // Load four destination pixels | |
| 315 | |
| 316 movdqa %xmm2, %xmm1 // Clone source pixels to extract alpha | |
| 317 psrlw $8, %xmm1 // Discard red and blue | |
| 318 pshufhw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (high) | |
| 319 movdqa %xmm6, %xmm4 | |
| 320 pshuflw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (low) | |
| 321 movdqa %xmm5, %xmm3 | |
| 322 psubw %xmm1, %xmm4 // Finalize alpha calculations (256 - alpha in each 16-bit lane) | |
| 323 | |
| 324 psllw $8, %xmm5 // Filter out red and blue components | |
| 325 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 326 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 327 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 328 | |
| 329 lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels | |
| 330 addq $32, %rdi | |
| 331 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 332 paddb %xmm3, %xmm2 // Add source and destination pixels together | |
| 333 subl $8, %ecx | |
| 334 movdqa %xmm2, -16(%rdx, %rdi) // Store four destination pixels (offset was already advanced) | |
| 335 jae .LUnalignedLoop | |
| 336 addl $8, %ecx // Adjust pixel count (undo the last subtraction) | |
| 337 | |
| 338 // Cleanup - handle pending pixels from loop | |
| 339 .LLoopCleanup0: | |
| 340 ptest %xmm7, %xmm1 // Check alphas: ZF = all are zero, CF = all are opaque | |
| 341 ja .LAlphaNotOpaqueOrZero02 | |
| 342 jz .LAlphaZero03 | |
| 343 movdqa %xmm1, (%rdx, %rdi) // All opaque: store four destination pixels | |
| 344 .LAlphaZero03: | |
| 345 addq $16, %rdi | |
| 346 subl $4, %ecx | |
| 347 js .LSmallRemaining // Fewer than four pixels left: reuse code from small loop | |
| 348 lddqu (%rax, %rdi), %xmm1 // Pre-load four source pixels | |
| 349 jmp .LLoopCleanup0 | |
| 350 | |
| 351 .LAlphaNotOpaqueOrZero02: | |
| 352 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels | |
| 353 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 354 psrlw $8, %xmm2 // Discard red and blue | |
| 355 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 356 movdqa %xmm6, %xmm4 | |
| 357 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 358 movdqa %xmm5, %xmm3 | |
| 359 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - alpha in each 16-bit lane) | |
| 360 | |
| 361 psllw $8, %xmm5 // Filter out red and blue components | |
| 362 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 363 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 364 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 365 | |
| 366 addq $16, %rdi | |
| 367 subl $4, %ecx | |
| 368 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 369 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 370 movdqa %xmm1, -16(%rdx, %rdi) // Store four destination pixels (offset was already advanced) | |
| 371 js .LSmallRemaining // Fewer than four pixels left: reuse code from small loop | |
| 372 lddqu (%rax, %rdi), %xmm1 // Pre-load four source pixels | |
| 373 jmp .LLoopCleanup0 | |
| 374 | |
| 375 // Source aligned to destination | |
| 376 // ***************************** | |
| 377 .p2align 4 | |
| 378 .LAlignedLoop: // Main loop for aligned, handles eight pixels per iteration | |
| 379 ptest %xmm7, %xmm1 // Check alphas: ZF = all are zero, CF = all are opaque | |
| 380 ja .LAlphaNotOpaqueOrZero10 | |
| 381 movdqa 16(%rax, %rdi), %xmm2 // Pre-load four source pixels (aligned load) | |
| 382 jz .LAlphaZero10 | |
| 383 movdqa %xmm1, (%rdx, %rdi) // All opaque: store four destination pixels | |
| 384 | |
| 385 .LAlphaZero10: | |
| 386 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque | |
| 387 ja .LAlphaNotOpaqueOrZero11 | |
| 388 movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels (aligned load) | |
| 389 jz .LAlphaZero11 | |
| 390 movdqa %xmm2, 16(%rdx, %rdi) // All opaque: store four destination pixels | |
| 391 | |
| 392 .LAlphaZero11: | |
| 393 addq $32, %rdi // Adjust offset and pixel count | |
| 394 subl $8, %ecx | |
| 395 jae .LAlignedLoop | |
| 396 jmp .LLoopCleanup1 | |
| 397 | |
| 398 .p2align 4 | |
| 399 .LAlphaNotOpaqueOrZero10: | |
| 400 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels | |
| 401 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 402 psrlw $8, %xmm2 // Discard red and blue | |
| 403 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 404 movdqa %xmm6, %xmm4 | |
| 405 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 406 movdqa %xmm5, %xmm3 | |
| 407 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - alpha in each 16-bit lane) | |
| 408 | |
| 409 psllw $8, %xmm5 // Filter out red and blue components | |
| 410 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 411 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 412 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 413 | |
| 414 movdqa 16(%rax, %rdi), %xmm2 // Pre-load four source pixels (aligned load) | |
| 415 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 416 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 417 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
| 418 | |
| 419 // Handle next four pixels | |
| 420 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque | |
| 421 ja .LAlphaNotOpaqueOrZero11 | |
| 422 movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels (aligned load) | |
| 423 jz .LAlphaZero12 | |
| 424 movdqa %xmm2, 16(%rdx, %rdi) // All opaque: store four destination pixels | |
| 425 .LAlphaZero12: | |
| 426 addq $32, %rdi // Adjust offset and pixel count | |
| 427 subl $8, %ecx | |
| 428 jae .LAlignedLoop | |
| 429 jmp .LLoopCleanup1 | |
| 430 | |
| 431 .p2align 4 | |
| 432 .LAlphaNotOpaqueOrZero11: | |
| 433 movdqa 16(%rdx, %rdi), %xmm5 // Load four destination pixels | |
| 434 | |
| 435 movdqa %xmm2, %xmm1 // Clone source pixels to extract alpha | |
| 436 psrlw $8, %xmm1 // Discard red and blue | |
| 437 pshufhw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (high) | |
| 438 movdqa %xmm6, %xmm4 | |
| 439 pshuflw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (low) | |
| 440 movdqa %xmm5, %xmm3 | |
| 441 psubw %xmm1, %xmm4 // Finalize alpha calculations (256 - alpha in each 16-bit lane) | |
| 442 | |
| 443 psllw $8, %xmm5 // Filter out red and blue components | |
| 444 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 445 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 446 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 447 movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels (aligned load) | |
| 448 | |
| 449 addq $32, %rdi | |
| 450 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 451 paddb %xmm3, %xmm2 // Add source and destination pixels together | |
| 452 subl $8, %ecx | |
| 453 movdqa %xmm2, -16(%rdx, %rdi) // Store four destination pixels (offset was already advanced) | |
| 454 jae .LAlignedLoop | |
| 455 | |
| 456 // Cleanup - handle four pending pixels from loop | |
| 457 .LLoopCleanup1: | |
| 458 ptest %xmm7, %xmm1 // Check alphas: ZF = all are zero, CF = all are opaque | |
| 459 ja .LAlphaNotOpaqueOrZero12 | |
| 460 jz .LAlphaZero13 | |
| 461 movdqa %xmm1, (%rdx, %rdi) // All opaque: store four destination pixels | |
| 462 .LAlphaZero13: | |
| 463 addl $8, %ecx // Adjust pixel count; zero means that no pixels are left | |
| 464 jz .LExit | |
| 465 addq $16, %rdi | |
| 466 jmp .LRemainLoop1 | |
| 467 | |
| 468 .LAlphaNotOpaqueOrZero12: | |
| 469 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels | |
| 470 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 471 psrlw $8, %xmm2 // Discard red and blue | |
| 472 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 473 movdqa %xmm6, %xmm4 | |
| 474 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 475 movdqa %xmm5, %xmm3 | |
| 476 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - alpha in each 16-bit lane) | |
| 477 | |
| 478 psllw $8, %xmm5 // Filter out red and blue components | |
| 479 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 480 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 481 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 482 | |
| 483 addl $8, %ecx // Adjust pixel count; the zero flag is tested by the jz below | |
| 484 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 485 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 486 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
| 487 jz .LExit | |
| 488 addq $16, %rdi | |
| 489 | |
| 490 // Handle last 1-7 pixels | |
| 491 .LRemainLoop1: | |
| 492 movdqa (%rax, %rdi), %xmm1 // Load four source pixels | |
| 493 ptest %xmm7, %xmm1 // Check alphas: ZF = all are zero, CF = all are opaque | |
| 494 ja .LRemainAlphaNotOpaqueOrZero1 | |
| 495 jz .LRemainAlphaZero1 | |
| 496 | |
| 497 // All alphas were opaque (copy) | |
| 498 subl $4, %ecx // Check if we have more than four pixels left | |
| 499 jle .LRemainStore | |
| 500 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
| 501 addq $16, %rdi | |
| 502 jmp .LRemainLoop1 | |
| 503 | |
| 504 // All alphas were zero (skip) | |
| 505 .p2align 4 | |
| 506 .LRemainAlphaZero1: | |
| 507 subl $4, %ecx // Check if we have more than four pixels left | |
| 508 jle .LExit | |
| 509 addq $16, %rdi | |
| 510 jmp .LRemainLoop1 | |
| 511 | |
| 512 // Handle mixed alphas (calculate and scale) | |
| 513 .p2align 4 | |
| 514 .LRemainAlphaNotOpaqueOrZero1: | |
| 515 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels | |
| 516 | |
| 517 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha | |
| 518 psrlw $8, %xmm2 // Discard red and blue | |
| 519 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high) | |
| 520 movdqa %xmm6, %xmm4 | |
| 521 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low) | |
| 522 movdqa %xmm5, %xmm3 | |
| 523 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - alpha in each 16-bit lane) | |
| 524 | |
| 525 psllw $8, %xmm5 // Filter out red and blue components | |
| 526 pmulhuw %xmm4, %xmm5 // Scale red and blue | |
| 527 psrlw $8, %xmm3 // Filter out alpha and green components | |
| 528 pmullw %xmm4, %xmm3 // Scale alpha and green | |
| 529 | |
| 530 subl $4, %ecx | |
| 531 pblendvb %xmm0, %xmm5, %xmm3 // Combine results | |
| 532 paddb %xmm3, %xmm1 // Add source and destination pixels together | |
| 533 jle .LRemainStore | |
| 534 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels | |
| 535 addq $16, %rdi | |
| 536 jmp .LRemainLoop1 | |
| 537 | |
| 538 // Store the last 1-4 pixels | |
| 539 .p2align 4 | |
| 540 .LRemainStore: | |
| 541 jz .LRemainFull | |
| 542 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels | |
| 543 cmpl $-2, %ecx // Check how many pixels should be written (ecx is -3, -2 or -1 for 1, 2 or 3 pixels) | |
| 544 jb .LRemainPixelsLeft11 | |
| 545 ja .LRemainPixelsLeft13 | |
| 546 pblendw $0x0F, %xmm1, %xmm5 | |
| 547 movdqa %xmm5, (%rdx, %rdi) // Store last 2 destination pixels | |
| 548 .LExit: | |
| 549 ret | |
| 550 | |
| 551 .LRemainPixelsLeft11: | |
| 552 pblendw $0x03, %xmm1, %xmm5 | |
| 553 movdqa %xmm5, (%rdx, %rdi) // Store last destination pixel | |
| 554 ret | |
| 555 | |
| 556 .LRemainPixelsLeft13: | |
| 557 pblendw $0x3F, %xmm1, %xmm5 | |
| 558 movdqa %xmm5, (%rdx, %rdi) // Store last 3 destination pixels | |
| 559 ret | |
| 560 | |
| 561 .LRemainFull: | |
| 562 movdqa %xmm1, (%rdx, %rdi) // Store last 4 destination pixels | |
| 563 ret | |
| 564 | |
| 565 .cfi_endproc | |
| 566 .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm | |
| 567 | |
| 568 // Constants for SSE code (three 16-byte values starting at a 16-byte boundary; loaded with movdqa at function entry) | |
| 569 .pushsection .rodata.sse4,"a",@progbits | |
| 570 .p2align 4 | |
| 571 .LAlphaCheckMask: | |

mtklein
2014/05/16 18:06:38
Looks like the differences here are:
1) calling
henrik.smiding
2014/05/20 15:10:29
I tested doing a position independent version in 3
| |
| 572 .long 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000 | |
| 573 .LInverseAlphaCalc: | |
| 574 .word 256, 256, 256, 256, 256, 256, 256, 256 | |
| 575 .LResultMergeMask: | |
| 576 .long 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF | |
| 577 .popsection | |
| 578 #endif | |
| OLD | NEW |