Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(644)

Side by Side Diff: src/opts/SkBlitRow_opts_SSE4_x64_asm.S

Issue 289473009: Add SSE4 optimization of S32A_Opaque_Blitrow (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright 2013 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #if !defined(_MSC_VER)
9
10 /*
11 * void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
12 * const SkPMColor* SK_RESTRICT src,
13 * int count, U8CPU alpha)
14 *
15 * The primary optimization comes from checking the source pixels' alpha value.
16 * If the alpha is zero, the pixel can be skipped entirely.
17 * If the alpha is fully opaque, the pixel can be copied directly to the destination.
18 * According to collected statistics, these two cases are the most common.
19 * The main loop(s) use pre-loading and unrolling in an attempt to reduce the
20 * worst-case memory latency.
21 */
22
23 .section .text.sse4,"ax",@progbits
24 .type S32A_Opaque_BlitRow32_SSE4_asm, @function
25 .globl S32A_Opaque_BlitRow32_SSE4_asm
26
27 .p2align 4
28 S32A_Opaque_BlitRow32_SSE4_asm: // Entry: %rdi = dst, %rsi = src, %edx = count
29 .cfi_startproc
30 prefetcht0 (%rsi) // Start fetching the first source pixels
31 movl %edx, %ecx // Pixel count (overwrites %ecx; its incoming value is never read)
32 movq %rdi, %rdx // Destination pointer
33 movq %rsi, %rax // Source pointer
34
35 // Setup SSE constants
36 movdqa .LAlphaCheckMask(%rip), %xmm7 // 0xFF000000 mask to check alpha
37 movdqa .LInverseAlphaCalc(%rip), %xmm6// 16-bit 256 to calculate inv. alpha
38 movdqa .LResultMergeMask(%rip), %xmm0 // 0x00FF00FF mask (Must be in xmm0 because of pblendvb)
39
40 subl $4, %ecx // Check if we have only 0-3 pixels (leaves %ecx = count - 4)
41 js .LReallySmall
42 cmpl $11, %ecx // Do we have enough pixels (16 or more) to run the main loop?
43 ja .LBigBlit
44
45 // Handle small blits (4-15 pixels)
46 // ********************************
47 xorq %rdi, %rdi // Reset offset to zero
48
49 .LSmallLoop: // Four pixels per pass; %rdi = byte offset, %ecx = pixels left - 4
50 lddqu (%rax, %rdi), %xmm1 // Load four source pixels
51 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
52 ja .LSmallAlphaNotOpaqueOrZero // Neither ZF nor CF set: alphas are mixed
53 jz .LSmallAlphaZero // ZF set: all four alphas are zero, leave destination untouched
54 movdqu %xmm1, (%rdx, %rdi) // Store four destination pixels (all opaque: plain copy)
55 .LSmallAlphaZero:
56 addq $16, %rdi
57 subl $4, %ecx // Check if there are four additional pixels, at least
58 jns .LSmallLoop
59 jmp .LSmallRemaining
60
61 // Handle mixed alphas (calculate and scale)
62 .p2align 4
63 .LSmallAlphaNotOpaqueOrZero:
64 lddqu (%rdx, %rdi), %xmm5 // Load four destination pixels
65
66 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha
67 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green
68 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)
69 movdqa %xmm6, %xmm4 // Start from 256 in every 16-bit lane
70 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)
71 movdqa %xmm5, %xmm3
72 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - source alpha)
73
74 psllw $8, %xmm5 // Filter out red and blue components
75 pmulhuw %xmm4, %xmm5 // Scale red and blue
76 psrlw $8, %xmm3 // Filter out alpha and green components
77 pmullw %xmm4, %xmm3 // Scale alpha and green
78
79 addq $16, %rdi
80 subl $4, %ecx // Count these four pixels; the sign flag is consumed by the jns below
81 pblendvb %xmm0, %xmm5, %xmm3 // Combine results
82 paddb %xmm3, %xmm1 // Add source and destination pixels together
83 movdqu %xmm1, -16(%rdx, %rdi) // Store four destination pixels
84 jns .LSmallLoop // Otherwise fall through to .LSmallRemaining
85
86 // Handle the last 0-3 pixels (also used by the big unaligned loop)
87 .LSmallRemaining: // On entry %ecx = pixels left - 4, i.e. -4..-1
88 cmpl $-4, %ecx // Check if we are done
89 je .LSmallExit
90 sall $2, %ecx // Calculate offset for last pixels (-12, -8 or -4 bytes)
91 movslq %ecx, %rcx
92 addq %rcx, %rdi // Step back so the final four pixels of the row are addressed
93
94 lddqu (%rax, %rdi), %xmm1 // Load last four source pixels (overlapping)
95 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
96 jc .LSmallRemainingStoreAll// If all alphas are opaque, just store
97 jz .LSmallExit // If all alphas are zero, nothing to write
98
99 // Handle mixed alphas (calculate and scale)
100 lddqu (%rdx, %rdi), %xmm5 // Load last four destination pixels (overlapping)
101 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha
102 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green
103 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)
104 movdqa %xmm6, %xmm4 // Start from 256 in every 16-bit lane
105 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)
106 movdqa %xmm5, %xmm3 // Work on a copy; %xmm5 is kept for the final blend
107 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - source alpha)
108
109 psllw $8, %xmm3 // Filter out red and blue components
110 pmulhuw %xmm4, %xmm3 // Scale red and blue
111 movdqa %xmm5, %xmm2
112 psrlw $8, %xmm2 // Filter out alpha and green components
113 pmullw %xmm4, %xmm2 // Scale alpha and green
114
115 cmpl $-8, %ecx // Check how many pixels should be written (unsigned compare of the byte offset)
116 pblendvb %xmm0, %xmm3, %xmm2 // Combine results
117 paddb %xmm2, %xmm1 // Add source and destination pixels together
118 jb .LSmallPixelsLeft1 // Offset -12: one pixel left
119 ja .LSmallPixelsLeft3 // Offset -4: three pixels left
120 pblendw $0xF0, %xmm1, %xmm5 // Offset -8: take the upper two pixels from the blended result
121 movdqu %xmm5, (%rdx, %rdi) // Store last two destination pixels
122 .LSmallExit:
123 ret
124
125 .LSmallPixelsLeft1:
126 pblendw $0xC0, %xmm1, %xmm5 // Take only the top pixel from the blended result
127 movdqu %xmm5, (%rdx, %rdi) // Store last destination pixel
128 ret
129
130 .LSmallPixelsLeft3:
131 pblendw $0xFC, %xmm1, %xmm5 // Take the upper three pixels from the blended result
132 movdqu %xmm5, (%rdx, %rdi) // Store last three destination pixels
133 ret
134
135 .LSmallRemainingStoreAll:
136 movdqu %xmm1, (%rdx, %rdi) // Store last destination pixels (overwrite)
137 ret
138
139 // Handle really small blits (0-3 pixels)
140 // **************************************
141 .LReallySmall:
142 addl $4, %ecx // Undo the earlier subtraction: %ecx = pixel count again
143 jle .LReallySmallExit // Nothing to do when the count is zero or negative
144 pcmpeqd %xmm1, %xmm1 // All ones, so lanes that are not loaded below test as opaque
145 cmpl $2, %ecx // Check how many pixels should be read
146 pinsrd $0x0, (%rax), %xmm1 // Load one source pixel
147 pinsrd $0x0, (%rdx), %xmm5 // Load one destination pixel
148 jb .LReallySmallCalc // Count is 1 (flags still come from the cmpl above)
149 pinsrd $0x1, 4(%rax), %xmm1 // Load second source pixel
150 pinsrd $0x1, 4(%rdx), %xmm5 // Load second destination pixel
151 je .LReallySmallCalc // Count is 2
152 pinsrd $0x2, 8(%rax), %xmm1 // Load third source pixel
153 pinsrd $0x2, 8(%rdx), %xmm5 // Load third destination pixel
154
155 .LReallySmallCalc:
156 ptest %xmm7, %xmm1 // Check if all alphas are opaque
157 jc .LReallySmallStore // If all alphas are opaque, just store
158
159 // Handle mixed alphas (calculate and scale)
160 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha
161 psrlw $8, %xmm2 // Discard red and blue, leaving alpha and green
162 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)
163 movdqa %xmm6, %xmm4 // Start from 256 in every 16-bit lane
164 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)
165 movdqa %xmm5, %xmm3
166 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - source alpha)
167
168 pand %xmm0, %xmm5 // Filter out red and blue components
169 pmullw %xmm4, %xmm5 // Scale red and blue
170 psrlw $8, %xmm3 // Filter out alpha and green components
171 pmullw %xmm4, %xmm3 // Scale alpha and green
172
173 psrlw $8, %xmm5 // Combine results
174 pblendvb %xmm0, %xmm5, %xmm3
175 paddb %xmm3, %xmm1 // Add source and destination pixels together
176
177 .LReallySmallStore:
178 cmpl $2, %ecx // Check how many pixels should be written
179 pextrd $0x0, %xmm1, (%rdx) // Store one destination pixel
180 jb .LReallySmallExit // Count is 1
181 pextrd $0x1, %xmm1, 4(%rdx) // Store second destination pixel
182 je .LReallySmallExit // Count is 2
183 pextrd $0x2, %xmm1, 8(%rdx) // Store third destination pixel
184 .LReallySmallExit:
185 ret
186
187 // Handle bigger blit operations (16+ pixels)
188 // ******************************************
189 .p2align 4
190 .LBigBlit:
191 // Align destination?
192 testl $0xF, %edx // Zero flag set when the destination is already 16-byte aligned
193 lddqu (%rax), %xmm1 // Pre-load four source pixels
194 jz .LAligned
195
196 movq %rdx, %rdi // Calculate alignment of destination pointer
197 negq %rdi
198 andl $0xF, %edi // %rdi = bytes up to the next 16-byte boundary
199
200 // Handle 1-3 pixels to align destination
201 // NOTE(review): presumably dst is 4-byte aligned, so %rdi is 4, 8 or 12 here - confirm
202 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
202 jz .LAlignDone // If all alphas are zero, just skip
203 lddqu (%rdx), %xmm5 // Load four destination pixels
204 jc .LAlignStore // If all alphas are opaque, just store
205
206 // Handle mixed alphas (calculate and scale)
207 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha
208 psrlw $8, %xmm2 // Discard red and blue
209 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)
210 movdqa %xmm6, %xmm4 // Start from 256 in every 16-bit lane
211 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)
212 movdqa %xmm5, %xmm3 // Work on a copy; %xmm5 is kept for the partial store
213 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - source alpha)
214
215 psllw $8, %xmm3 // Filter out red and blue components
216 pmulhuw %xmm4, %xmm3 // Scale red and blue
217 movdqa %xmm5, %xmm2
218 psrlw $8, %xmm2 // Filter out alpha and green components
219 pmullw %xmm4, %xmm2 // Scale alpha and green
220
221 pblendvb %xmm0, %xmm3, %xmm2 // Combine results
222 paddb %xmm2, %xmm1 // Add source and destination pixels together
223
224 .LAlignStore:
225 cmpl $8, %edi // Check how many pixels should be written
226 jb .LAlignPixelsLeft1 // Fewer than 8 bytes: one pixel
227 ja .LAlignPixelsLeft3 // More than 8 bytes: three pixels
228 pblendw $0x0F, %xmm1, %xmm5 // Blend two pixels
229 jmp .LAlignStorePixels
230
231 .LAlignPixelsLeft1:
232 pblendw $0x03, %xmm1, %xmm5 // Blend one pixel
233 jmp .LAlignStorePixels
234
235 .LAlignPixelsLeft3:
236 pblendw $0x3F, %xmm1, %xmm5 // Blend three pixels
237
238 .LAlignStorePixels:
239 movdqu %xmm5, (%rdx) // Store destination pixels
240
241 .LAlignDone:
242 addq %rdi, %rax // Adjust pointers and pixel count
243 addq %rdi, %rdx
244 shrq $2, %rdi // Bytes -> pixels
245 lddqu (%rax), %xmm1 // Pre-load new source pixels (after alignment)
246 subl %edi, %ecx
247
248 .LAligned: // Destination is guaranteed to be 16 byte aligned
249 xorq %rdi, %rdi // Reset offset to zero
250 subl $8, %ecx // Decrease counter (Reserve four pixels for the cleanup)
251 testl $0xF, %eax // Check alignment of source pointer
252 jz .LAlignedLoop // Otherwise fall through into the unaligned-source loop
253
254 // Source not aligned to destination
255 // *********************************
256 .p2align 4
257 .LUnalignedLoop: // Main loop for unaligned, handles eight pixels per iteration
258 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
259 ja .LAlphaNotOpaqueOrZero00 // Mixed alphas in the first four pixels
260 lddqu 16(%rax, %rdi), %xmm2 // Pre-load four source pixels
261 jz .LAlphaZero00 // All zero: skip the store (flags still come from the ptest)
262 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
263
264 .LAlphaZero00:
265 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque
266 ja .LAlphaNotOpaqueOrZero01 // Mixed alphas in the second four pixels
267 lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels
268 jz .LAlphaZero01 // All zero: skip the store
269 movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels
270
271 .LAlphaZero01:
272 addq $32, %rdi // Adjust offset and pixel count
273 subl $8, %ecx
274 jae .LUnalignedLoop // No borrow: enough pixels remain for another pass
275 addl $8, %ecx // Adjust pixel count
276 jmp .LLoopCleanup0
277
278 .p2align 4
279 .LAlphaNotOpaqueOrZero00: // Blend the first four pixels (source in %xmm1)
280 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels
281 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha
282 psrlw $8, %xmm2 // Discard red and blue
283 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)
284 movdqa %xmm6, %xmm4 // Start from 256 in every 16-bit lane
285 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)
286 movdqa %xmm5, %xmm3
287 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - source alpha)
288
289 psllw $8, %xmm5 // Filter out red and blue components
290 pmulhuw %xmm4, %xmm5 // Scale red and blue
291 psrlw $8, %xmm3 // Filter out alpha and green components
292 pmullw %xmm4, %xmm3 // Scale alpha and green
293
294 lddqu 16(%rax, %rdi), %xmm2 // Pre-load four source pixels
295 pblendvb %xmm0, %xmm5, %xmm3 // Combine results
296 paddb %xmm3, %xmm1 // Add source and destination pixels together
297 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
298
299 // Handle next four pixels
300 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque
301 ja .LAlphaNotOpaqueOrZero01
302 lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels
303 jz .LAlphaZero02
304 movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels
305 .LAlphaZero02:
306 addq $32, %rdi // Adjust offset and pixel count
307 subl $8, %ecx
308 jae .LUnalignedLoop
309 addl $8, %ecx // Adjust pixel count
310 jmp .LLoopCleanup0
311
312 .p2align 4
313 .LAlphaNotOpaqueOrZero01: // Blend the second four pixels (source in %xmm2, %xmm1 is scratch)
314 movdqa 16(%rdx, %rdi), %xmm5 // Load four destination pixels
315
316 movdqa %xmm2, %xmm1 // Clone source pixels to extract alpha
317 psrlw $8, %xmm1 // Discard red and blue
318 pshufhw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (high)
319 movdqa %xmm6, %xmm4 // Start from 256 in every 16-bit lane
320 pshuflw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (low)
321 movdqa %xmm5, %xmm3
322 psubw %xmm1, %xmm4 // Finalize alpha calculations (256 - source alpha)
323
324 psllw $8, %xmm5 // Filter out red and blue components
325 pmulhuw %xmm4, %xmm5 // Scale red and blue
326 psrlw $8, %xmm3 // Filter out alpha and green components
327 pmullw %xmm4, %xmm3 // Scale alpha and green
328
329 lddqu 32(%rax, %rdi), %xmm1 // Pre-load four source pixels
330 addq $32, %rdi
331 pblendvb %xmm0, %xmm5, %xmm3 // Combine results
332 paddb %xmm3, %xmm2 // Add source and destination pixels together
333 subl $8, %ecx // Carry flag from here is consumed by the jae below
334 movdqa %xmm2, -16(%rdx, %rdi) // Store four destination pixels
335 jae .LUnalignedLoop
336 addl $8, %ecx // Adjust pixel count
337
338 // Cleanup - handle pending pixels from loop
339 .LLoopCleanup0: // %xmm1 holds four pre-loaded source pixels for offset %rdi
340 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
341 ja .LAlphaNotOpaqueOrZero02 // Mixed alphas
342 jz .LAlphaZero03 // All zero: skip the store
343 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
344 .LAlphaZero03:
345 addq $16, %rdi
346 subl $4, %ecx
347 js .LSmallRemaining // Reuse code from small loop
348 lddqu (%rax, %rdi), %xmm1 // Pre-load four source pixels
349 jmp .LLoopCleanup0
350
351 .LAlphaNotOpaqueOrZero02:
352 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels
353 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha
354 psrlw $8, %xmm2 // Discard red and blue
355 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)
356 movdqa %xmm6, %xmm4 // Start from 256 in every 16-bit lane
357 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)
358 movdqa %xmm5, %xmm3
359 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - source alpha)
360
361 psllw $8, %xmm5 // Filter out red and blue components
362 pmulhuw %xmm4, %xmm5 // Scale red and blue
363 psrlw $8, %xmm3 // Filter out alpha and green components
364 pmullw %xmm4, %xmm3 // Scale alpha and green
365
366 addq $16, %rdi
367 subl $4, %ecx // Sign flag from here is consumed by the js below
368 pblendvb %xmm0, %xmm5, %xmm3 // Combine results
369 paddb %xmm3, %xmm1 // Add source and destination pixels together
370 movdqa %xmm1, -16(%rdx, %rdi) // Store four destination pixels
371 js .LSmallRemaining // Reuse code from small loop
372 lddqu (%rax, %rdi), %xmm1 // Pre-load four source pixels
373 jmp .LLoopCleanup0
374
375 // Source aligned to destination
376 // *****************************
377 .p2align 4
378 .LAlignedLoop: // Main loop for aligned, handles eight pixels per iteration
379 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
380 ja .LAlphaNotOpaqueOrZero10 // Mixed alphas in the first four pixels
381 movdqa 16(%rax, %rdi), %xmm2 // Pre-load four source pixels
382 jz .LAlphaZero10 // All zero: skip the store (flags still come from the ptest)
383 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
384
385 .LAlphaZero10:
386 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque
387 ja .LAlphaNotOpaqueOrZero11 // Mixed alphas in the second four pixels
388 movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels
389 jz .LAlphaZero11 // All zero: skip the store
390 movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels
391
392 .LAlphaZero11:
393 addq $32, %rdi // Adjust offset and pixel count
394 subl $8, %ecx
395 jae .LAlignedLoop // No borrow: enough pixels remain for another pass
396 jmp .LLoopCleanup1 // %ecx is left 8 too low; .LLoopCleanup1 adds it back
397
398 .p2align 4
399 .LAlphaNotOpaqueOrZero10: // Blend the first four pixels (source in %xmm1)
400 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels
401 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha
402 psrlw $8, %xmm2 // Discard red and blue
403 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)
404 movdqa %xmm6, %xmm4 // Start from 256 in every 16-bit lane
405 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)
406 movdqa %xmm5, %xmm3
407 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - source alpha)
408
409 psllw $8, %xmm5 // Filter out red and blue components
410 pmulhuw %xmm4, %xmm5 // Scale red and blue
411 psrlw $8, %xmm3 // Filter out alpha and green components
412 pmullw %xmm4, %xmm3 // Scale alpha and green
413
414 movdqa 16(%rax, %rdi), %xmm2 // Pre-load four source pixels
415 pblendvb %xmm0, %xmm5, %xmm3 // Combine results
416 paddb %xmm3, %xmm1 // Add source and destination pixels together
417 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
418
419 // Handle next four pixels
420 ptest %xmm7, %xmm2 // Check if all alphas are zero or opaque
421 ja .LAlphaNotOpaqueOrZero11
422 movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels
423 jz .LAlphaZero12
424 movdqa %xmm2, 16(%rdx, %rdi) // Store four destination pixels
425 .LAlphaZero12:
426 addq $32, %rdi // Adjust offset and pixel count
427 subl $8, %ecx
428 jae .LAlignedLoop
429 jmp .LLoopCleanup1
430
431 .p2align 4
432 .LAlphaNotOpaqueOrZero11: // Blend the second four pixels (source in %xmm2, %xmm1 is scratch)
433 movdqa 16(%rdx, %rdi), %xmm5 // Load four destination pixels
434
435 movdqa %xmm2, %xmm1 // Clone source pixels to extract alpha
436 psrlw $8, %xmm1 // Discard red and blue
437 pshufhw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (high)
438 movdqa %xmm6, %xmm4 // Start from 256 in every 16-bit lane
439 pshuflw $0xF5, %xmm1, %xmm1 // Repeat alpha for scaling (low)
440 movdqa %xmm5, %xmm3
441 psubw %xmm1, %xmm4 // Finalize alpha calculations (256 - source alpha)
442
443 psllw $8, %xmm5 // Filter out red and blue components
444 pmulhuw %xmm4, %xmm5 // Scale red and blue
445 psrlw $8, %xmm3 // Filter out alpha and green components
446 pmullw %xmm4, %xmm3 // Scale alpha and green
447 movdqa 32(%rax, %rdi), %xmm1 // Pre-load four source pixels
448
449 addq $32, %rdi
450 pblendvb %xmm0, %xmm5, %xmm3 // Combine results
451 paddb %xmm3, %xmm2 // Add source and destination pixels together
452 subl $8, %ecx // Carry flag from here is consumed by the jae below
453 movdqa %xmm2, -16(%rdx, %rdi) // Store four destination pixels
454 jae .LAlignedLoop // Otherwise fall through to .LLoopCleanup1
455
456 // Cleanup - handle four pending pixels from loop
457 .LLoopCleanup1: // %xmm1 holds four pre-loaded source pixels for offset %rdi
458 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
459 ja .LAlphaNotOpaqueOrZero12 // Mixed alphas
460 jz .LAlphaZero13 // All zero: skip the store
461 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
462 .LAlphaZero13:
463 addl $8, %ecx // Adjust offset and pixel count (%ecx = pixels left after these four)
464 jz .LExit
465 addq $16, %rdi
466 jmp .LRemainLoop1
467
468 .LAlphaNotOpaqueOrZero12:
469 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels
470 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha
471 psrlw $8, %xmm2 // Discard red and blue
472 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)
473 movdqa %xmm6, %xmm4 // Start from 256 in every 16-bit lane
474 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)
475 movdqa %xmm5, %xmm3
476 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - source alpha)
477
478 psllw $8, %xmm5 // Filter out red and blue components
479 pmulhuw %xmm4, %xmm5 // Scale red and blue
480 psrlw $8, %xmm3 // Filter out alpha and green components
481 pmullw %xmm4, %xmm3 // Scale alpha and green
482
483 addl $8, %ecx // Adjust offset and pixel count (zero flag is consumed by the jz below)
484 pblendvb %xmm0, %xmm5, %xmm3 // Combine results
485 paddb %xmm3, %xmm1 // Add source and destination pixels together
486 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
487 jz .LExit
488 addq $16, %rdi // Fall through to .LRemainLoop1 with the offset on the next four pixels
489
490 // Handle last 1-7 pixels
491 // NOTE(review): the aligned 16-byte source load below always reads four pixels,
491 // even when fewer remain - confirm reading past the last source pixel is acceptable
491 .LRemainLoop1: // %ecx = pixels left
492 movdqa (%rax, %rdi), %xmm1 // Load four source pixels
493 ptest %xmm7, %xmm1 // Check if all alphas are zero or opaque
494 ja .LRemainAlphaNotOpaqueOrZero1
495 jz .LRemainAlphaZero1
496
497 // All alphas were opaque (copy)
498 subl $4, %ecx // Check if we have more than four pixels left
499 jle .LRemainStore // Four or fewer: final (possibly partial) store
500 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
501 addq $16, %rdi
502 jmp .LRemainLoop1
503
504 // All alphas were zero (skip)
505 .p2align 4
506 .LRemainAlphaZero1:
507 subl $4, %ecx // Check if we have more than four pixels left
508 jle .LExit
509 addq $16, %rdi
510 jmp .LRemainLoop1
511
512 // Handle mixed alphas (calculate and scale)
513 .p2align 4
514 .LRemainAlphaNotOpaqueOrZero1:
515 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels
516
517 movdqa %xmm1, %xmm2 // Clone source pixels to extract alpha
518 psrlw $8, %xmm2 // Discard red and blue
519 pshufhw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (high)
520 movdqa %xmm6, %xmm4 // Start from 256 in every 16-bit lane
521 pshuflw $0xF5, %xmm2, %xmm2 // Repeat alpha for scaling (low)
522 movdqa %xmm5, %xmm3
523 psubw %xmm2, %xmm4 // Finalize alpha calculations (256 - source alpha)
524
525 psllw $8, %xmm5 // Filter out red and blue components
526 pmulhuw %xmm4, %xmm5 // Scale red and blue
527 psrlw $8, %xmm3 // Filter out alpha and green components
528 pmullw %xmm4, %xmm3 // Scale alpha and green
529
530 subl $4, %ecx // Flags from here are consumed by the jle below
531 pblendvb %xmm0, %xmm5, %xmm3 // Combine results
532 paddb %xmm3, %xmm1 // Add source and destination pixels together
533 jle .LRemainStore // Four or fewer were left: final (possibly partial) store
534 movdqa %xmm1, (%rdx, %rdi) // Store four destination pixels
535 addq $16, %rdi
536 jmp .LRemainLoop1
537
538 // Store the last 1-4 pixels
539 // Entered with flags from "subl $4, %ecx": %ecx = pixels to write - 4 (-3..0)
539 // NOTE(review): the partial cases load 16 destination bytes, blend, and store all 16,
539 // so pixels past the end of the row are rewritten with the values just read - confirm
539 // nothing else can modify them in between
539 .p2align 4
540 .LRemainStore:
541 jz .LRemainFull // Exactly four pixels: store the whole register
542 movdqa (%rdx, %rdi), %xmm5 // Load four destination pixels
543 cmpl $-2, %ecx // Check how many pixels should be written
544 jb .LRemainPixelsLeft11 // -3: one pixel
545 ja .LRemainPixelsLeft13 // -1: three pixels
546 pblendw $0x0F, %xmm1, %xmm5 // -2: take the lower two pixels from the result
547 movdqa %xmm5, (%rdx, %rdi) // Store last 2 destination pixels
548 .LExit:
549 ret
550
551 .LRemainPixelsLeft11:
552 pblendw $0x03, %xmm1, %xmm5 // Take only the lowest pixel from the result
553 movdqa %xmm5, (%rdx, %rdi) // Store last destination pixel
554 ret
555
556 .LRemainPixelsLeft13:
557 pblendw $0x3F, %xmm1, %xmm5 // Take the lower three pixels from the result
558 movdqa %xmm5, (%rdx, %rdi) // Store last 3 destination pixels
559 ret
560
561 .LRemainFull:
562 movdqa %xmm1, (%rdx, %rdi) // Store last 4 destination pixels
563 ret
564
565 .cfi_endproc
566 .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm
567
568 // Constants for SSE code
569 .pushsection .rodata.sse4,"a",@progbits
570 .p2align 4
571 .LAlphaCheckMask: // Alpha byte of each of four 32-bit pixels; loaded into %xmm7 for ptest
mtklein 2014/05/16 18:06:38 Looks like the differences here are: 1) calling
henrik.smiding 2014/05/20 15:10:29 I tested doing a position independent version in 3
572 .long 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000
573 .LInverseAlphaCalc: // 256 in each 16-bit lane; loaded into %xmm6, source alpha is subtracted from it
574 .word 256, 256, 256, 256, 256, 256, 256, 256
575 .LResultMergeMask: // Byte selector loaded into %xmm0 for pblendvb (bytes 0 and 2 of each pixel)
576 .long 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF
577 .popsection
578 #endif
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698